From 98c9c02738ede94868fd0d3238f3d0ae00a15a95 Mon Sep 17 00:00:00 2001 From: munsif135 Date: Thu, 16 May 2024 10:03:06 +0530 Subject: [PATCH 1/8] added file related to shaped reward function --- grid2op/Reward/shapedReward.py | 67 ++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 grid2op/Reward/shapedReward.py diff --git a/grid2op/Reward/shapedReward.py b/grid2op/Reward/shapedReward.py new file mode 100644 index 000000000..abf5b6b49 --- /dev/null +++ b/grid2op/Reward/shapedReward.py @@ -0,0 +1,67 @@ +import numpy as np +from grid2op.Reward.baseReward import BaseReward +from grid2op.dtypes import dt_float +import math + + +class ShapedReward(BaseReward): + """ + This reward is based on the cumulative sum of all overflowing line loads, which the agent aims to minimize. + + we first calculate the coefficient u, which summarizes the (overflowing) line loads. + If Rho_max < 1,i.e, there is currently no overflow, and line loads of all lines are within the allowed bounds, u is + calculated as + + u = max(Rho_max-0.5, 0) ( If Rho_max - 0.5 is positive or zero, it will return Rho_max - 0.5, else it will return 0.) 
+ + If Rho_max > 1, u is calculated as + + u = sum of (rho_i - 0.5) for each i in the range [1, n] where rho_i > 1, n is the number of power lines in the grid + + Then, utilizing u calculated above, we take into account offline lines and apply exponential decay to obtain + the shaped reward r as + + r =exp(-u - 0.5*n_offline) + + n_offline is the number of lines which are currently offline as a result of an overflow or agent’s actions (i.e., + we do not consider lines that are offline because of maintenance or opponent attacks) + + """ + def __init__(self, logger=None): + BaseReward.__init__(self, logger=logger) + + + def initialize(self, env): + self.reward_min = dt_float(0.0) + + + def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): + + if not is_done and not has_error: + res = self.line_overflowing_sum(env) + + else: + res = self.reward_min + + return res + + + @staticmethod + def line_overflowing_sum(env): + obs = env.current_obs + + if obs.rho.max()<=1: + u = max([obs.rho.max()- 0.5, 0]) + + else: + u = sum([rho-0.5 for rho in obs.rho if rho>1]) + + lines_disconnected = np.sum(obs.line_status == False) + lines_in_maintenance = np.sum(obs.time_next_maintenance==0) + lines_under_attack = np.sum(obs.time_since_last_attack>=0) + n_offline = lines_disconnected - lines_in_maintenance - lines_under_attack + + reward = math.exp(-u-0.5*n_offline) + + return reward + \ No newline at end of file From bd3fe0c9cf1e695cdae9af0544cceb353ccb0bb0 Mon Sep 17 00:00:00 2001 From: Gajithira Puvanendran Date: Mon, 20 May 2024 11:59:19 +0530 Subject: [PATCH 2/8] added documentation for Shaped Reward --- docs/conf.py | 3 ++- docs/reward.rst | 1 + grid2op/Reward/shapedReward.py | 29 ++++++++++++++++++++++++----- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 55664ee2b..820391b98 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -51,7 +51,8 @@ #'exception_hierarchy', # for pdf - # 'rst2pdf.pdfbuilder' + # 
'rst2pdf.pdfbuilder', + 'sphinx.ext.mathjax' ] # Add any paths that contain templates here, relative to this directory. templates_path = [] #'_templates'] diff --git a/docs/reward.rst b/docs/reward.rst index 049962952..0c35cafc7 100644 --- a/docs/reward.rst +++ b/docs/reward.rst @@ -86,6 +86,7 @@ At time of writing the available reward functions is : - :class:`LinesReconnectedReward` - :class:`N1Reward` - :class:`RedispReward` +- :class:`ShapedReward` In the provided reward you have also some convenience functions to combine different reward. These are: diff --git a/grid2op/Reward/shapedReward.py b/grid2op/Reward/shapedReward.py index abf5b6b49..b788e2e1a 100644 --- a/grid2op/Reward/shapedReward.py +++ b/grid2op/Reward/shapedReward.py @@ -8,13 +8,15 @@ class ShapedReward(BaseReward): """ This reward is based on the cumulative sum of all overflowing line loads, which the agent aims to minimize. - we first calculate the coefficient u, which summarizes the (overflowing) line loads. - If Rho_max < 1,i.e, there is currently no overflow, and line loads of all lines are within the allowed bounds, u is - calculated as + This rewards is computed as followed: + - We first calculate the coefficient u, which summarizes the (overflowing) line loads. + - If rho_max <= 1,i.e, there is currently no overflow, and line loads of all lines are within the allowed bounds, u is + calculated as; - u = max(Rho_max-0.5, 0) ( If Rho_max - 0.5 is positive or zero, it will return Rho_max - 0.5, else it will return 0.) + u = max(rho_max-0.5, 0) + If rho_max - 0.5 is positive or zero, it will return Rho_max - 0.5, else it will return 0.) 
- If Rho_max > 1, u is calculated as + - If rho_max > 1, u is calculated as; u = sum of (rho_i - 0.5) for each i in the range [1, n] where rho_i > 1, n is the number of power lines in the grid @@ -25,6 +27,23 @@ class ShapedReward(BaseReward): n_offline is the number of lines which are currently offline as a result of an overflow or agent’s actions (i.e., we do not consider lines that are offline because of maintenance or opponent attacks) + + Examples + --------- + You can use this reward in any environment with: + + .. code-block:: python + + import grid2op + from grid2op.Reward import ShapedReward + + # then you create your environment with it: + NAME_OF_THE_ENVIRONMENT = "l2rpn_case14_sandbox" + env = grid2op.make(NAME_OF_THE_ENVIRONMENT,reward_class=ShapedReward) + # and do a step with a "do nothing" action + obs = env.reset() + obs, reward, done, info = env.step(env.action_space()) + # the reward is computed with the ShapedReward class """ def __init__(self, logger=None): From df2de289c3c4215919cb6032f161e8d961c92800 Mon Sep 17 00:00:00 2001 From: Gajithira Puvanendran Date: Wed, 22 May 2024 12:32:39 +0530 Subject: [PATCH 3/8] added shapedReward class to render in the sphinx documentation --- grid2op/Reward/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/grid2op/Reward/__init__.py b/grid2op/Reward/__init__.py index 54bacfb73..9c233960d 100644 --- a/grid2op/Reward/__init__.py +++ b/grid2op/Reward/__init__.py @@ -18,6 +18,7 @@ "EpisodeDurationReward", "AlarmReward", "N1Reward", + "ShapedReward", # TODO it would be better to have a specific package for this, but in the mean time i put it here "L2RPNSandBoxScore", "L2RPNWCCI2022ScoreFun", @@ -25,7 +26,7 @@ "_AlarmScore", "_NewRenewableSourcesUsageScore", "_AlertCostScore", - "_AlertTrustScore" + "_AlertTrustScore", ] from grid2op.Reward.constantReward import ConstantReward @@ -49,6 +50,7 @@ from grid2op.Reward.alarmReward import AlarmReward from grid2op.Reward._alarmScore import 
_AlarmScore from grid2op.Reward.n1Reward import N1Reward +from grid2op.Reward.shapedReward import ShapedReward from grid2op.Reward.l2rpn_wcci2022_scorefun import L2RPNWCCI2022ScoreFun from grid2op.Reward.alertReward import AlertReward from grid2op.Reward._newRenewableSourcesUsageScore import _NewRenewableSourcesUsageScore From f3e0f71264efbc3862340209dcf2802d32565a1c Mon Sep 17 00:00:00 2001 From: Gajithira Puvanendran Date: Wed, 22 May 2024 12:33:23 +0530 Subject: [PATCH 4/8] modified the doc string to remove latex expressions --- grid2op/Reward/shapedReward.py | 35 ++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/grid2op/Reward/shapedReward.py b/grid2op/Reward/shapedReward.py index b788e2e1a..864f1d64d 100644 --- a/grid2op/Reward/shapedReward.py +++ b/grid2op/Reward/shapedReward.py @@ -8,25 +8,28 @@ class ShapedReward(BaseReward): """ This reward is based on the cumulative sum of all overflowing line loads, which the agent aims to minimize. - This rewards is computed as followed: + This reward is computed as follows: + - We first calculate the coefficient u, which summarizes the (overflowing) line loads. - - If rho_max <= 1,i.e, there is currently no overflow, and line loads of all lines are within the allowed bounds, u is - calculated as; - - u = max(rho_max-0.5, 0) - If rho_max - 0.5 is positive or zero, it will return Rho_max - 0.5, else it will return 0.) - - If rho_max > 1, u is calculated as; - - u = sum of (rho_i - 0.5) for each i in the range [1, n] where rho_i > 1, n is the number of power lines in the grid - + - If rho_max <= 1, i.e., there is currently no overflow, and line loads of all lines are within the allowed bounds, u is + calculated as: + + u = max(rho_max - 0.5, 0) ; where rho_max is the maximum overflow value in the grid + + (If rho_max - 0.5 is positive or zero, it will return rho_max - 0.5, else it will return 0.) 
+ + - If rho_max > 1, u is calculated as: + + u = sum(rho_i - 0.5) for each i in [1, n] and rho_i > 1 ; where `n` is the number of power lines in the grid. + Then, utilizing u calculated above, we take into account offline lines and apply exponential decay to obtain - the shaped reward r as - - r =exp(-u - 0.5*n_offline) - - n_offline is the number of lines which are currently offline as a result of an overflow or agent’s actions (i.e., - we do not consider lines that are offline because of maintenance or opponent attacks) + the shaped reward r as: + + r = exp(-u - 0.5^n_offline) + + where n_offline is the number of lines which are currently offline as a result of an overflow or agent’s actions (i.e., + we do not consider lines that are offline because of maintenance or opponent attacks). Examples --------- From 9726d82a562b6a88edfe9243f5671e6af1c412cd Mon Sep 17 00:00:00 2001 From: munsif135 Date: Mon, 27 May 2024 14:43:48 +0530 Subject: [PATCH 5/8] added files related to shaped reward and test shaped reward --- grid2op/Reward/shapedReward.py | 4 ++-- grid2op/tests/test_Reward.py | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/grid2op/Reward/shapedReward.py b/grid2op/Reward/shapedReward.py index 864f1d64d..e49d25596 100644 --- a/grid2op/Reward/shapedReward.py +++ b/grid2op/Reward/shapedReward.py @@ -21,12 +21,12 @@ class ShapedReward(BaseReward): - If rho_max > 1, u is calculated as: - u = sum(rho_i - 0.5) for each i in [1, n] and rho_i > 1 ; where `n` is the number of power lines in the grid. + u = Σ (rho_i - 0.5) for each i in [1, n] and rho_i > 1 ; where `n` is the number of power lines in the grid. 
Then, utilizing u calculated above, we take into account offline lines and apply exponential decay to obtain the shaped reward r as: - r = exp(-u - 0.5^n_offline) + r = e\ :sup:`(-u - 0.5.n_offline)` where n_offline is the number of lines which are currently offline as a result of an overflow or agent’s actions (i.e., we do not consider lines that are offline because of maintenance or opponent attacks). diff --git a/grid2op/tests/test_Reward.py b/grid2op/tests/test_Reward.py index c76c55f03..7482e3ed3 100644 --- a/grid2op/tests/test_Reward.py +++ b/grid2op/tests/test_Reward.py @@ -18,7 +18,8 @@ from grid2op.Parameters import Parameters from grid2op.Runner import Runner from grid2op.Agent import BaseAgent - +print("shaped") +from grid2op.Reward.shapedReward import ShapedReward import warnings @@ -88,6 +89,10 @@ def _reward_type(self): class TestLoadingLinesCapacityReward(TestLoadingReward, unittest.TestCase): def _reward_type(self): return LinesCapacityReward + +class TestLoadingShapedReward(TestLoadingReward, unittest.TestCase): + def _reward_type(self): + return ShapedReward class TestDistanceReward(TestLoadingReward, unittest.TestCase): From 7b8df06fb0b417faff31599b8ec472b74d844b24 Mon Sep 17 00:00:00 2001 From: munsif135 Date: Thu, 30 May 2024 09:23:37 +0530 Subject: [PATCH 6/8] removed unwanted blanks --- docs/conf.py | 1 - grid2op/Reward/__init__.py | 1 - grid2op/tests/test_Reward.py | 1 - 3 files changed, 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 820391b98..582d4d1b3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -49,7 +49,6 @@ # 'details', #'exception_hierarchy', - # for pdf # 'rst2pdf.pdfbuilder', 'sphinx.ext.mathjax' diff --git a/grid2op/Reward/__init__.py b/grid2op/Reward/__init__.py index 9c233960d..c38d7094c 100644 --- a/grid2op/Reward/__init__.py +++ b/grid2op/Reward/__init__.py @@ -19,7 +19,6 @@ "AlarmReward", "N1Reward", "ShapedReward", - # TODO it would be better to have a specific package for this, but in the mean time i put it 
here "L2RPNSandBoxScore", "L2RPNWCCI2022ScoreFun", "AlertReward", diff --git a/grid2op/tests/test_Reward.py b/grid2op/tests/test_Reward.py index 7482e3ed3..995caf049 100644 --- a/grid2op/tests/test_Reward.py +++ b/grid2op/tests/test_Reward.py @@ -18,7 +18,6 @@ from grid2op.Parameters import Parameters from grid2op.Runner import Runner from grid2op.Agent import BaseAgent -print("shaped") from grid2op.Reward.shapedReward import ShapedReward import warnings From 7afbb4d955743fca908e061f42c6476e865904cd Mon Sep 17 00:00:00 2001 From: munsif135 Date: Thu, 6 Jun 2024 12:51:55 +0530 Subject: [PATCH 7/8] added deleted comment in __init__ --- grid2op/Reward/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/grid2op/Reward/__init__.py b/grid2op/Reward/__init__.py index c38d7094c..e023a2f52 100644 --- a/grid2op/Reward/__init__.py +++ b/grid2op/Reward/__init__.py @@ -18,6 +18,7 @@ "EpisodeDurationReward", "AlarmReward", "N1Reward", "ShapedReward", + # TODO it would be better to have a specific package for this, but in the mean time i put it here "L2RPNSandBoxScore", "L2RPNWCCI2022ScoreFun", From 2bdf90a5eec7c72b0cec0102e50f617b48ca10ff Mon Sep 17 00:00:00 2001 From: Gajithira Puvanendran Date: Fri, 7 Jun 2024 16:25:57 +0530 Subject: [PATCH 8/8] modified doc string to include few more clarifications --- grid2op/Reward/shapedReward.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/grid2op/Reward/shapedReward.py b/grid2op/Reward/shapedReward.py index e49d25596..b2b0b64fa 100644 --- a/grid2op/Reward/shapedReward.py +++ b/grid2op/Reward/shapedReward.py @@ -21,14 +21,12 @@ class ShapedReward(BaseReward): - If rho_max > 1, u is calculated as: - u = Σ (rho_i - 0.5) for each i in [1, n] and rho_i > 1 ; where `n` is the number of power lines in the grid. + u = Σ (rho_i - 0.5) for each i in [1, n] and rho_i > 1 ; where `n` is the number of power lines in the grid and Σ denotes summation. 
- Then, utilizing u calculated above, we take into account offline lines and apply exponential decay to obtain - the shaped reward r as: + Then, utilizing u calculated above, we take into account offline lines and apply exponential decay to obtain the shaped reward r as: - r = e\ :sup:`(-u - 0.5.n_offline)` - - where n_offline is the number of lines which are currently offline as a result of an overflow or agent’s actions (i.e., + r = exp(-u - 0.5*n_offline) ; + where n_offline is the number of lines which are currently offline as a result of an overflow or agent’s actions (i.e., we do not consider lines that are offline because of maintenance or opponent attacks). Examples