diff --git a/docs/conf.py b/docs/conf.py index 88311b9e7..7df2562bc 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -49,9 +49,9 @@ # 'details', #'exception_hierarchy', - # for pdf - # 'rst2pdf.pdfbuilder' + # 'rst2pdf.pdfbuilder', + 'sphinx.ext.mathjax' ] # Add any paths that contain templates here, relative to this directory. templates_path = [] #'_templates'] diff --git a/docs/reward.rst b/docs/reward.rst index 049962952..0c35cafc7 100644 --- a/docs/reward.rst +++ b/docs/reward.rst @@ -86,6 +86,7 @@ At time of writing the available reward functions is : - :class:`LinesReconnectedReward` - :class:`N1Reward` - :class:`RedispReward` +- :class:`ShapedReward` In the provided reward you have also some convenience functions to combine different reward. These are: diff --git a/grid2op/Reward/__init__.py b/grid2op/Reward/__init__.py index 54bacfb73..e023a2f52 100644 --- a/grid2op/Reward/__init__.py +++ b/grid2op/Reward/__init__.py @@ -19,13 +19,14 @@ "AlarmReward", "N1Reward", # TODO it would be better to have a specific package for this, but in the mean time i put it here + "ShapedReward", "L2RPNSandBoxScore", "L2RPNWCCI2022ScoreFun", "AlertReward", "_AlarmScore", "_NewRenewableSourcesUsageScore", "_AlertCostScore", - "_AlertTrustScore" + "_AlertTrustScore", ] from grid2op.Reward.constantReward import ConstantReward @@ -49,6 +50,7 @@ from grid2op.Reward.alarmReward import AlarmReward from grid2op.Reward._alarmScore import _AlarmScore from grid2op.Reward.n1Reward import N1Reward +from grid2op.Reward.shapedReward import ShapedReward from grid2op.Reward.l2rpn_wcci2022_scorefun import L2RPNWCCI2022ScoreFun from grid2op.Reward.alertReward import AlertReward from grid2op.Reward._newRenewableSourcesUsageScore import _NewRenewableSourcesUsageScore diff --git a/grid2op/Reward/shapedReward.py b/grid2op/Reward/shapedReward.py new file mode 100644 index 000000000..b2b0b64fa --- /dev/null +++ b/grid2op/Reward/shapedReward.py @@ -0,0 +1,87 @@ +import numpy as np +from 
grid2op.Reward.baseReward import BaseReward +from grid2op.dtypes import dt_float +import math + + +class ShapedReward(BaseReward): + """ + This reward is based on the cumulative sum of all overflowing line loads, which the agent aims to minimize. + + This reward is computed as follows: + + - We first calculate the coefficient u, which summarizes the (overflowing) line loads. + + - If rho_max <= 1, i.e., there is currently no overflow and the loads of all lines are within the allowed bounds, u is + calculated as: + + u = max(rho_max - 0.5, 0) ; where rho_max is the maximum line loading (rho) over all power lines of the grid + + (If rho_max - 0.5 is positive or zero, u equals :math:`rho_max - 0.5`, otherwise u is 0.) + + - If rho_max > 1, u is calculated as: + + u = Σ (rho_i - 0.5) for each i in [1, n] and rho_i > 1 ; where `n` is the number of power lines in the grid and Σ denotes summation. + + Then, utilizing u calculated above, we take into account offline lines and apply exponential decay to obtain the shaped reward r as: + + r = exp(-u - 0.5*n_offline) ; + where n_offline is the number of lines which are currently offline as a result of an overflow or of the agent’s actions (i.e., + we do not consider lines that are offline because of maintenance or opponent attacks). + + Examples + --------- + You can use this reward in any environment with: + + .. 
code-block:: python + + import grid2op + from grid2op.Reward import ShapedReward + + # then you create your environment with it: + NAME_OF_THE_ENVIRONMENT = "l2rpn_case14_sandbox" + env = grid2op.make(NAME_OF_THE_ENVIRONMENT, reward_class=ShapedReward) + # and do a step with a "do nothing" action + obs = env.reset() + obs, reward, done, info = env.step(env.action_space()) + # the reward is computed with the ShapedReward class + + """ + def __init__(self, logger=None): + BaseReward.__init__(self, logger=logger) + + def initialize(self, env): + # Reward bounds: exp(-x) with x >= 0 lies in (0, 1]; 0 is returned when the episode is over or a step failed. + self.reward_min = dt_float(0.0) + self.reward_max = dt_float(1.0) + + def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): + # A finished episode or an erroneous step gets the minimum reward; otherwise use the shaped reward. + if is_done or has_error: + return self.reward_min + return self.line_overflowing_sum(env) + + @staticmethod + def line_overflowing_sum(env): + """Return the shaped reward exp(-u - 0.5 * n_offline) computed from ``env.current_obs``.""" + obs = env.current_obs + rho = obs.rho + rho_max = rho.max() + if rho_max <= 1: + # no overflow: only a maximum loading above 0.5 is penalised + u = max(rho_max - 0.5, 0) + else: + # overflow: vectorized sum of (rho_i - 0.5) over the lines with rho_i > 1 + u = np.sum(rho[rho > 1] - 0.5) + + lines_disconnected = np.sum(np.logical_not(obs.line_status)) + lines_in_maintenance = np.sum(obs.time_next_maintenance == 0) + # NOTE(review): ">= 0" presumably also counts lines attacked at an earlier step, not only lines attacked right now; confirm against the observation documentation + lines_under_attack = np.sum(obs.time_since_last_attack >= 0) + # clamp at 0: the two subtracted counts can exceed the number of disconnected lines, and a negative value would push the reward above 1 + n_offline = max(lines_disconnected - lines_in_maintenance - lines_under_attack, 0) + + # cast to dt_float so that both return paths of __call__ yield the same float type + reward = dt_float(math.exp(-u - 0.5 * n_offline)) + return reward + \ No newline at end of file diff --git a/grid2op/tests/test_Reward.py b/grid2op/tests/test_Reward.py index c76c55f03..995caf049 100644 --- a/grid2op/tests/test_Reward.py +++ b/grid2op/tests/test_Reward.py @@ -18,7 +18,7 @@ from grid2op.Parameters import Parameters from grid2op.Runner import Runner from grid2op.Agent import BaseAgent - +from grid2op.Reward.shapedReward import ShapedReward import warnings @@ -88,6 +88,10 @@ def _reward_type(self): class TestLoadingLinesCapacityReward(TestLoadingReward, unittest.TestCase): def _reward_type(self): return LinesCapacityReward + +class TestLoadingShapedReward(TestLoadingReward, unittest.TestCase): + def _reward_type(self): + return
ShapedReward class TestDistanceReward(TestLoadingReward, unittest.TestCase):