From 98c9c02738ede94868fd0d3238f3d0ae00a15a95 Mon Sep 17 00:00:00 2001
From: munsif135 <mohammed.munsif@rootcodelabs.com>
Date: Thu, 16 May 2024 10:03:06 +0530
Subject: [PATCH 1/8] added file related to shaped reward function

---
 grid2op/Reward/shapedReward.py | 67 ++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 grid2op/Reward/shapedReward.py

diff --git a/grid2op/Reward/shapedReward.py b/grid2op/Reward/shapedReward.py
new file mode 100644
index 000000000..abf5b6b49
--- /dev/null
+++ b/grid2op/Reward/shapedReward.py
@@ -0,0 +1,67 @@
+import numpy as np
+from grid2op.Reward.baseReward import BaseReward
+from grid2op.dtypes import dt_float
+import math
+
+
+class ShapedReward(BaseReward):
+    """
+    This reward is based on the cumulative sum of all overflowing line loads, which the agent aims to minimize.
+    
+    we first calculate the coefficient u, which summarizes the (overflowing) line loads.
+    If Rho_max < 1,i.e, there is currently no overflow, and line loads of all lines are within the allowed bounds, u is
+    calculated as
+    
+        u = max(Rho_max-0.5, 0) ( If Rho_max - 0.5 is positive or zero, it will return Rho_max - 0.5, else it will return 0.)
+    
+    If Rho_max > 1,  u is calculated as
+    
+        u = sum of (rho_i - 0.5) for each i in the range [1, n] where rho_i > 1, n is the number of power lines in the grid
+        
+    Then, utilizing u calculated above, we take into account offline lines and apply exponential decay to obtain
+    the shaped reward r as
+        
+        r =exp(-u - 0.5*n_offline)
+        
+    n_offline is the number of lines which are currently offline as a result of an overflow or agent’s actions (i.e.,
+    we do not consider lines that are offline because of maintenance or opponent attacks)
+        
+    """
+    def __init__(self, logger=None):
+        """Build the reward; ``logger`` is passed unchanged to ``BaseReward.__init__``."""
+        BaseReward.__init__(self, logger=logger)
+
+    def initialize(self, env):
+        """Declare the reward range: 0 on failure, at most exp(0) = 1 (``env`` is unused)."""
+        self.reward_min, self.reward_max = dt_float(0.0), dt_float(1.0)
+
+    def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous):
+        """Score one step: ``reward_min`` on error or episode end, else the shaped reward.
+
+        Only ``env``, ``has_error`` and ``is_done`` are read; the other arguments are ignored.
+        """
+        if is_done or has_error:
+            return self.reward_min
+
+        return self.line_overflowing_sum(env)
+    
+    
+    @staticmethod
+    def line_overflowing_sum(env):
+        """Return the shaped reward exp(-u - 0.5 * n_offline), as ``dt_float``, for ``env.current_obs``."""
+        obs = env.current_obs
+        rho_max = obs.rho.max()
+        if rho_max <= 1:
+            u = max(rho_max - 0.5, 0)  # no overflow: only the most loaded line matters
+        else:
+            u = np.sum(obs.rho[obs.rho > 1] - 0.5)  # overflow: every overflowing line contributes
+
+        lines_disconnected = np.sum(np.logical_not(obs.line_status))
+        lines_in_maintenance = np.sum(obs.time_next_maintenance == 0)
+        # NOTE(review): `>= 0` also matches lines attacked in the past if -1 means "never attacked"; confirm whether `== 0` is intended.
+        lines_under_attack = np.sum(obs.time_since_last_attack >= 0)
+        # The three counts are taken independently, so clamp at 0: a negative value would
+        # push the reward above exp(0) = 1 instead of applying a penalty.
+        n_offline = max(lines_disconnected - lines_in_maintenance - lines_under_attack, 0)
+        return dt_float(math.exp(-u - 0.5 * n_offline))
+    
\ No newline at end of file

From bd3fe0c9cf1e695cdae9af0544cceb353ccb0bb0 Mon Sep 17 00:00:00 2001
From: Gajithira Puvanendran <gajipuvi@gmail.com>
Date: Mon, 20 May 2024 11:59:19 +0530
Subject: [PATCH 2/8] added documentation for Shaped Reward

---
 docs/conf.py                   |  3 ++-
 docs/reward.rst                |  1 +
 grid2op/Reward/shapedReward.py | 29 ++++++++++++++++++++++++-----
 3 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index 55664ee2b..820391b98 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -51,7 +51,8 @@
     #'exception_hierarchy',
 
     # for pdf
-    # 'rst2pdf.pdfbuilder'
+    # 'rst2pdf.pdfbuilder',
+    'sphinx.ext.mathjax'
 ]
 # Add any paths that contain templates here, relative to this directory.
 templates_path = [] #'_templates']
diff --git a/docs/reward.rst b/docs/reward.rst
index 049962952..0c35cafc7 100644
--- a/docs/reward.rst
+++ b/docs/reward.rst
@@ -86,6 +86,7 @@ At time of writing the available reward functions is :
 - :class:`LinesReconnectedReward`
 - :class:`N1Reward`
 - :class:`RedispReward`
+- :class:`ShapedReward`
 
 In the provided reward you have also some convenience functions to combine different reward. These are:
 
diff --git a/grid2op/Reward/shapedReward.py b/grid2op/Reward/shapedReward.py
index abf5b6b49..b788e2e1a 100644
--- a/grid2op/Reward/shapedReward.py
+++ b/grid2op/Reward/shapedReward.py
@@ -8,13 +8,15 @@ class ShapedReward(BaseReward):
     """
     This reward is based on the cumulative sum of all overflowing line loads, which the agent aims to minimize.
     
-    we first calculate the coefficient u, which summarizes the (overflowing) line loads.
-    If Rho_max < 1,i.e, there is currently no overflow, and line loads of all lines are within the allowed bounds, u is
-    calculated as
+    This rewards is computed as followed:
+    - We first calculate the coefficient u, which summarizes the (overflowing) line loads.
+    - If rho_max <= 1,i.e, there is currently no overflow, and line loads of all lines are within the allowed bounds, u is
+    calculated as;
     
-        u = max(Rho_max-0.5, 0) ( If Rho_max - 0.5 is positive or zero, it will return Rho_max - 0.5, else it will return 0.)
+        u = max(rho_max-0.5, 0) 
+     If rho_max - 0.5 is positive or zero, it will return Rho_max - 0.5, else it will return 0.)
     
-    If Rho_max > 1,  u is calculated as
+    - If rho_max > 1,  u is calculated as;
     
         u = sum of (rho_i - 0.5) for each i in the range [1, n] where rho_i > 1, n is the number of power lines in the grid
         
@@ -25,6 +27,23 @@ class ShapedReward(BaseReward):
         
     n_offline is the number of lines which are currently offline as a result of an overflow or agent’s actions (i.e.,
     we do not consider lines that are offline because of maintenance or opponent attacks)
+
+    Examples
+    ---------
+    You can use this reward in any environment with:
+
+    .. code-block:: python
+
+        import grid2op
+        from grid2op.Reward import ShapedReward
+
+        # then you create your environment with it:
+        NAME_OF_THE_ENVIRONMENT = "l2rpn_case14_sandbox"
+        env = grid2op.make(NAME_OF_THE_ENVIRONMENT,reward_class=ShapedReward)
+        # and do a step with a "do nothing" action
+        obs = env.reset()
+        obs, reward, done, info = env.step(env.action_space())
+        # the reward is computed with the ShapedReward class
         
     """
     def __init__(self, logger=None):

From df2de289c3c4215919cb6032f161e8d961c92800 Mon Sep 17 00:00:00 2001
From: Gajithira Puvanendran <gajipuvi@gmail.com>
Date: Wed, 22 May 2024 12:32:39 +0530
Subject: [PATCH 3/8] added shapedReward class to render in the sphinx
 documentation

---
 grid2op/Reward/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/grid2op/Reward/__init__.py b/grid2op/Reward/__init__.py
index 54bacfb73..9c233960d 100644
--- a/grid2op/Reward/__init__.py
+++ b/grid2op/Reward/__init__.py
@@ -18,6 +18,7 @@
     "EpisodeDurationReward",
     "AlarmReward",
     "N1Reward",
+    "ShapedReward",
     # TODO it would be better to have a specific package for this, but in the mean time i put it here
     "L2RPNSandBoxScore",
     "L2RPNWCCI2022ScoreFun",
@@ -25,7 +26,7 @@
     "_AlarmScore",
     "_NewRenewableSourcesUsageScore",
     "_AlertCostScore",
-    "_AlertTrustScore"
+    "_AlertTrustScore",
 ]
 
 from grid2op.Reward.constantReward import ConstantReward
@@ -49,6 +50,7 @@
 from grid2op.Reward.alarmReward import AlarmReward
 from grid2op.Reward._alarmScore import _AlarmScore
 from grid2op.Reward.n1Reward import N1Reward
+from grid2op.Reward.shapedReward import ShapedReward
 from grid2op.Reward.l2rpn_wcci2022_scorefun import L2RPNWCCI2022ScoreFun
 from grid2op.Reward.alertReward import AlertReward
 from grid2op.Reward._newRenewableSourcesUsageScore import _NewRenewableSourcesUsageScore

From f3e0f71264efbc3862340209dcf2802d32565a1c Mon Sep 17 00:00:00 2001
From: Gajithira Puvanendran <gajipuvi@gmail.com>
Date: Wed, 22 May 2024 12:33:23 +0530
Subject: [PATCH 4/8] modified the doc string to remove latex expressions

---
 grid2op/Reward/shapedReward.py | 35 ++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/grid2op/Reward/shapedReward.py b/grid2op/Reward/shapedReward.py
index b788e2e1a..864f1d64d 100644
--- a/grid2op/Reward/shapedReward.py
+++ b/grid2op/Reward/shapedReward.py
@@ -8,25 +8,28 @@ class ShapedReward(BaseReward):
     """
     This reward is based on the cumulative sum of all overflowing line loads, which the agent aims to minimize.
     
-    This rewards is computed as followed:
+    This reward is computed as follows:
+
     - We first calculate the coefficient u, which summarizes the (overflowing) line loads.
-    - If rho_max <= 1,i.e, there is currently no overflow, and line loads of all lines are within the allowed bounds, u is
-    calculated as;
-    
-        u = max(rho_max-0.5, 0) 
-     If rho_max - 0.5 is positive or zero, it will return Rho_max - 0.5, else it will return 0.)
     
-    - If rho_max > 1,  u is calculated as;
-    
-        u = sum of (rho_i - 0.5) for each i in the range [1, n] where rho_i > 1, n is the number of power lines in the grid
-        
+    - If rho_max <= 1, i.e., there is currently no overflow and the line loads of all lines are within the allowed bounds, u is
+    calculated as:
+
+          u = max(rho_max - 0.5, 0) ; where rho_max is the maximum line load (rho) over all power lines of the grid
+
+      (If rho_max - 0.5 is positive or zero, u equals rho_max - 0.5; otherwise u is 0.)
+
+    - If  rho_max > 1, u is calculated as:
+
+          u = sum(rho_i - 0.5) for each i in [1, n] and rho_i > 1 ; where  `n` is the number of power lines in the grid.
+
     Then, utilizing u calculated above, we take into account offline lines and apply exponential decay to obtain
-    the shaped reward r as
-        
-        r =exp(-u - 0.5*n_offline)
-        
-    n_offline is the number of lines which are currently offline as a result of an overflow or agent’s actions (i.e.,
-    we do not consider lines that are offline because of maintenance or opponent attacks)
+    the shaped reward r as:
+
+        r = exp(-u - 0.5^n_offline)
+
+    where n_offline is the number of lines which are currently offline as a result of an overflow or agent’s actions (i.e.,
+    we do not consider lines that are offline because of maintenance or opponent attacks).
 
     Examples
     ---------

From 9726d82a562b6a88edfe9243f5671e6af1c412cd Mon Sep 17 00:00:00 2001
From: munsif135 <mohammed.munsif@rootcodelabs.com>
Date: Mon, 27 May 2024 14:43:48 +0530
Subject: [PATCH 5/8] added files related to shaped reward and test shaped
 reward

---
 grid2op/Reward/shapedReward.py | 4 ++--
 grid2op/tests/test_Reward.py   | 7 ++++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/grid2op/Reward/shapedReward.py b/grid2op/Reward/shapedReward.py
index 864f1d64d..e49d25596 100644
--- a/grid2op/Reward/shapedReward.py
+++ b/grid2op/Reward/shapedReward.py
@@ -21,12 +21,12 @@ class ShapedReward(BaseReward):
 
     - If  rho_max > 1, u is calculated as:
 
-          u = sum(rho_i - 0.5) for each i in [1, n] and rho_i > 1 ; where  `n` is the number of power lines in the grid.
+          u =  Σ (rho_i - 0.5) for each i in [1, n] and rho_i > 1 ; where  `n` is the number of power lines in the grid.
 
     Then, utilizing u calculated above, we take into account offline lines and apply exponential decay to obtain
     the shaped reward r as:
 
-        r = exp(-u - 0.5^n_offline)
+        r = e\ :sup:`(-u - 0.5.n_offline)`
 
     where n_offline is the number of lines which are currently offline as a result of an overflow or agent’s actions (i.e.,
     we do not consider lines that are offline because of maintenance or opponent attacks).
diff --git a/grid2op/tests/test_Reward.py b/grid2op/tests/test_Reward.py
index c76c55f03..7482e3ed3 100644
--- a/grid2op/tests/test_Reward.py
+++ b/grid2op/tests/test_Reward.py
@@ -18,7 +18,8 @@
 from grid2op.Parameters import Parameters
 from grid2op.Runner import Runner
 from grid2op.Agent import BaseAgent
-
+print("shaped")
+from grid2op.Reward.shapedReward import ShapedReward
 import warnings
 
 
@@ -88,6 +89,10 @@ def _reward_type(self):
 class TestLoadingLinesCapacityReward(TestLoadingReward, unittest.TestCase):
     def _reward_type(self):
         return LinesCapacityReward
+    
+class TestLoadingShapedReward(TestLoadingReward, unittest.TestCase):
+    def _reward_type(self):
+        return ShapedReward  # reward class under test; presumably consumed by the inherited TestLoadingReward tests
 
 
 class TestDistanceReward(TestLoadingReward, unittest.TestCase):

From 7b8df06fb0b417faff31599b8ec472b74d844b24 Mon Sep 17 00:00:00 2001
From: munsif135 <mohammed.munsif@rootcodelabs.com>
Date: Thu, 30 May 2024 09:23:37 +0530
Subject: [PATCH 6/8] removed unwanted blanks

---
 docs/conf.py                 | 1 -
 grid2op/Reward/__init__.py   | 1 -
 grid2op/tests/test_Reward.py | 1 -
 3 files changed, 3 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index 820391b98..582d4d1b3 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -49,7 +49,6 @@
 
     # 'details',
     #'exception_hierarchy',
-
     # for pdf
     # 'rst2pdf.pdfbuilder',
     'sphinx.ext.mathjax'
diff --git a/grid2op/Reward/__init__.py b/grid2op/Reward/__init__.py
index 9c233960d..c38d7094c 100644
--- a/grid2op/Reward/__init__.py
+++ b/grid2op/Reward/__init__.py
@@ -19,7 +19,6 @@
     "AlarmReward",
     "N1Reward",
     "ShapedReward",
-    # TODO it would be better to have a specific package for this, but in the mean time i put it here
     "L2RPNSandBoxScore",
     "L2RPNWCCI2022ScoreFun",
     "AlertReward",
diff --git a/grid2op/tests/test_Reward.py b/grid2op/tests/test_Reward.py
index 7482e3ed3..995caf049 100644
--- a/grid2op/tests/test_Reward.py
+++ b/grid2op/tests/test_Reward.py
@@ -18,7 +18,6 @@
 from grid2op.Parameters import Parameters
 from grid2op.Runner import Runner
 from grid2op.Agent import BaseAgent
-print("shaped")
 from grid2op.Reward.shapedReward import ShapedReward
 import warnings
 

From 7afbb4d955743fca908e061f42c6476e865904cd Mon Sep 17 00:00:00 2001
From: munsif135 <mohammed.munsif@rootcodelabs.com>
Date: Thu, 6 Jun 2024 12:51:55 +0530
Subject: [PATCH 7/8] added deleted comment in __init__

---
 grid2op/Reward/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/grid2op/Reward/__init__.py b/grid2op/Reward/__init__.py
index c38d7094c..e023a2f52 100644
--- a/grid2op/Reward/__init__.py
+++ b/grid2op/Reward/__init__.py
@@ -18,6 +18,7 @@
     "EpisodeDurationReward",
     "AlarmReward",
     "N1Reward",
+    # TODO it would be better to have a specific package for this, but in the mean time i put it here
     "ShapedReward",
     "L2RPNSandBoxScore",
     "L2RPNWCCI2022ScoreFun",

From 2bdf90a5eec7c72b0cec0102e50f617b48ca10ff Mon Sep 17 00:00:00 2001
From: Gajithira Puvanendran <gajipuvi@gmail.com>
Date: Fri, 7 Jun 2024 16:25:57 +0530
Subject: [PATCH 8/8] modified doc string to include few more clarifications

---
 grid2op/Reward/shapedReward.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/grid2op/Reward/shapedReward.py b/grid2op/Reward/shapedReward.py
index e49d25596..b2b0b64fa 100644
--- a/grid2op/Reward/shapedReward.py
+++ b/grid2op/Reward/shapedReward.py
@@ -21,14 +21,12 @@ class ShapedReward(BaseReward):
 
     - If  rho_max > 1, u is calculated as:
 
-          u =  Σ (rho_i - 0.5) for each i in [1, n] and rho_i > 1 ; where  `n` is the number of power lines in the grid.
+          u = Σ (rho_i - 0.5) over every i in [1, n] such that rho_i > 1 ; where `n` is the number of power lines in the grid and Σ denotes summation.
 
-    Then, utilizing u calculated above, we take into account offline lines and apply exponential decay to obtain
-    the shaped reward r as:
+    Then, utilizing u calculated above, we take into account offline lines and apply exponential decay to obtain the shaped reward r as:
 
-        r = e\ :sup:`(-u - 0.5.n_offline)`
-
-    where n_offline is the number of lines which are currently offline as a result of an overflow or agent’s actions (i.e.,
+        r = exp(-u - 0.5*n_offline) ; 
+        where n_offline is the number of lines which are currently offline as a result of an overflow or agent’s actions (i.e.,
     we do not consider lines that are offline because of maintenance or opponent attacks).
 
     Examples