DeNA · ikki407 · Feb 10, 2022 · Mar 12, 2021 · Mar 12, 2021 · Mar 17, 2021
diff --git a/README.md b/README.md
@@ -113,5 +113,5 @@ NOTE: Default opponent AI is random agent implemented in `evaluation.py`. You ca
 
 ## Use Cases
 
-*   [Month 1 Winner in Hungry Geese (Kaggle)](https://www.kaggle.com/c/hungry-geese/discussion/222941)
-*   [The 5th solution in Google Research Football with Manchester City F.C. (Kaggle)](https://www.kaggle.com/c/google-football/discussion/203412)
+*   [The 1st place solution in Hungry Geese (Kaggle)](https://www.kaggle.com/c/hungry-geese/discussion/263279)
+*   [The 5th place solution in Google Research Football with Manchester City F.C. (Kaggle)](https://www.kaggle.com/c/google-football/discussion/203412)
diff --git a/config.yaml b/config.yaml
@@ -10,6 +10,7 @@ train_args:
     observation: False
     gamma: 0.8
     forward_steps: 16
+    burn_in_steps: 0  # for RNNs
     compress_steps: 4
     entropy_regularization: 1.0e-1
     entropy_regularization_decay: 0.1

diff --git a/handyrl/agent.py b/handyrl/agent.py
@@ -34,16 +34,17 @@ def print_outputs(env, prob, v):
     if hasattr(env, 'print_outputs'):
         env.print_outputs(prob, v)
     else:
-        print('v = %f' % v)
-        print('p = %s' % (prob * 1000).astype(int))
+        if v is not None:
+            print('v = %f' % v)
+        if prob is not None:
+            print('p = %s' % (prob * 1000).astype(int))
 
 
 class Agent:
-    def __init__(self, model, observation=False, temperature=0.0):
+    def __init__(self, model, temperature=0.0):
         # model might be a neural net, or some planning algorithm such as game tree search
         self.model = model
         self.hidden = None
-        self.observation = observation
         self.temperature = temperature
 
     def reset(self, env, show=False):
@@ -73,12 +74,10 @@ def action(self, env, player, show=False):
             return random.choices(np.arange(len(p)), weights=softmax(p / self.temperature))[0]
 
     def observe(self, env, player, show=False):
-        v = None
-        if self.observation:
-            outputs = self.plan(env.observation(player))
-            v = outputs.get('value', None)
-            if show:
-                print_outputs(env, None, v)
+        outputs = self.plan(env.observation(player))
+        v = outputs.get('value', None)
+        if show:
+            print_outputs(env, None, v)
         return v if v is not None else [0.0]
 
 
@@ -101,5 +100,5 @@ def plan(self, obs):
 
 
 class SoftAgent(Agent):
-    def __init__(self, model, observation=False):
-        super().__init__(model, observation=observation, temperature=1.0)
+    def __init__(self, model):
+        super().__init__(model, temperature=1.0)
diff --git a/handyrl/connection.py b/handyrl/connection.py
@@ -2,7 +2,6 @@
 # Licensed under The MIT License [see LICENSE for details]
 
 import io
-import time
 import struct
 import socket
 import pickle

diff --git a/handyrl/environment.py b/handyrl/environment.py
@@ -77,6 +77,13 @@ def turn(self):
     def turns(self):
         return [self.turn()]
 
+    #
+    # Should be defined when players other than the turn players
+    # also need to observe the environment (mainly for RNN models)
+    #
+    def observers(self):
+        return []
+
     #
     # Should be defined in all games
     #

diff --git a/handyrl/envs/geister.py b/handyrl/envs/geister.py
@@ -34,16 +34,10 @@ def __init__(self, input_dim, hidden_dim, kernel_size, bias):
         )
 
     def init_hidden(self, input_size, batch_size):
-        if batch_size is None:  # for inference
-            return tuple([
-                np.zeros((self.hidden_dim, *input_size), dtype=np.float32),
-                np.zeros((self.hidden_dim, *input_size), dtype=np.float32)
-            ])
-        else:  # for training
-            return tuple([
-                torch.zeros(*batch_size, self.hidden_dim, *input_size),
-                torch.zeros(*batch_size, self.hidden_dim, *input_size)
-            ])
+        return tuple([
+            torch.zeros(*batch_size, self.hidden_dim, *input_size),
+            torch.zeros(*batch_size, self.hidden_dim, *input_size)
+        ])
 
     def forward(self, input_tensor, cur_state):
         h_cur, c_cur = cur_state
@@ -63,6 +57,11 @@ def forward(self, input_tensor, cur_state):
         return h_next, c_next
 
 
+# Deep Repeated Conv-LSTM (https://arxiv.org/abs/1901.03559)
+# increases expressive power with fewer parameters
+# by applying the same multi-layer convolutional LSTM stack repeatedly.
+# When num_repeats=1, it is simply a multi-layer Conv-LSTM.
+
 class DRC(nn.Module):
     def __init__(self, num_layers, input_dim, hidden_dim, kernel_size=3, bias=True):
         super().__init__()
@@ -93,7 +92,7 @@ def forward(self, x, hidden, num_repeats):
         hs, cs = hidden
         for _ in range(num_repeats):
             for i, block in enumerate(self.blocks):
-                hs[i], cs[i] = block(x, (hs[i], cs[i]))
+                hs[i], cs[i] = block(hs[i - 1] if i > 0 else x, (hs[i], cs[i]))
 
         return hs[-1], (hs, cs)
 
@@ -145,7 +144,7 @@ def __init__(self):
         self.head_v = ScalarHead((filters * 2, 6, 6), 1, 1)
         self.head_r = ScalarHead((filters * 2, 6, 6), 1, 1)
 
-    def init_hidden(self, batch_size=None):
+    def init_hidden(self, batch_size=[]):
         return self.body.init_hidden(self.input_size[1:], batch_size)
 
     def forward(self, x, hidden):
@@ -448,6 +447,8 @@ def legal(self, action):
         if self.turn_count < 0:
             layout = action - 4 * 6 * 6
             return 0 <= layout < 70
+        elif not 0 <= action < 4 * 6 * 6:
+            return False
 
         pos_from = self.action2from(action, self.color)
         pos_to = self.action2to(action, self.color)

diff --git a/handyrl/evaluation.py b/handyrl/evaluation.py
@@ -76,7 +76,7 @@ def observe(self, player):
         return send_recv(self.conn, ('observe', [player]))
 
 
-def exec_match(env, agents, critic, show=False, game_args={}):
+def exec_match(env, agents, critic=None, show=False, game_args={}):
     ''' match with shared game environment '''
     if env.reset(game_args):
         return None
@@ -88,11 +88,12 @@ def exec_match(env, agents, critic, show=False, game_args={}):
         if show and critic is not None:
             print('cv = ', critic.observe(env, None, show=False)[0])
         turn_players = env.turns()
+        observers = env.observers()
         actions = {}
         for p, agent in agents.items():
             if p in turn_players:
                 actions[p] = agent.action(env, p, show=show)
-            else:
+            elif p in observers:
                 agent.observe(env, p, show=show)
         if env.step(actions):
             return None
@@ -104,7 +105,7 @@ def exec_match(env, agents, critic, show=False, game_args={}):
     return outcome
 
 
-def exec_network_match(env, network_agents, critic, show=False, game_args={}):
+def exec_network_match(env, network_agents, critic=None, show=False, game_args={}):
     ''' match with divided game environment '''
     if env.reset(game_args):
         return None
@@ -117,12 +118,13 @@ def exec_network_match(env, network_agents, critic, show=False, game_args={}):
         if show and critic is not None:
             print('cv = ', critic.observe(env, None, show=False)[0])
         turn_players = env.turns()
+        observers = env.observers()
         actions = {}
         for p, agent in network_agents.items():
             if p in turn_players:
                 action = agent.action(p)
                 actions[p] = env.str2action(action, p)
-            else:
+            elif p in observers:
                 agent.observe(p)
         if env.step(actions):
             return None
@@ -161,9 +163,9 @@ def execute(self, models, args):
             if model is None:
                 agents[p] = build_agent(opponent, self.env)
             else:
-                agents[p] = Agent(model, self.args['observation'])
+                agents[p] = Agent(model)
 
-        outcome = exec_match(self.env, agents, None)
+        outcome = exec_match(self.env, agents)
         if outcome is None:
             print('None episode in evaluation!')
             return None
@@ -277,10 +279,78 @@ def network_match_acception(n, env_args, num_agents, port):
     return agents_list
 
 
-def get_model(env, model_path):
+class OnnxModel:
+    def __init__(self, model_path):
+        self.model_path = model_path
+        self.ort_session = None
+
+    def _open_session(self):
+        import os
+        os.environ['OMP_NUM_THREADS'] = '1'
+        os.environ['OMP_WAIT_POLICY'] = 'PASSIVE'
+
+        import onnxruntime
+        opts = onnxruntime.SessionOptions()
+        opts.intra_op_num_threads = 1
+        opts.inter_op_num_threads = 1
+        opts.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
+
+        self.ort_session = onnxruntime.InferenceSession(self.model_path, sess_options=opts)
+
+    def init_hidden(self):
+        if self.ort_session is None:
+            self._open_session()
+        hidden_inputs = [y for y in self.ort_session.get_inputs() if y.name.startswith('hidden')]
+        if len(hidden_inputs) == 0:
+            return None
+        import numpy as np
+        type_map = {
+            'tensor(float)': np.float32,
+            'tensor(int64)': np.int64,
+        }
+        hidden_tensors = [np.zeros(y.shape[1:], dtype=type_map[y.type]) for y in hidden_inputs]
+        return hidden_tensors
+
+    def inference(self, x, hidden=None, batch_input=False):
+        # numpy array -> numpy array
+        if self.ort_session is None:
+            self._open_session()
+
+        ort_inputs = {}
+        ort_input_names = [y.name for y in self.ort_session.get_inputs()]
+
+        import numpy as np
+        def insert_input(y):
+            y = y if batch_input else np.expand_dims(y, 0)
+            ort_inputs[ort_input_names[len(ort_inputs)]] = y
+        from .util import map_r
+        map_r(x, lambda y: insert_input(y))
+        if hidden is not None:
+            map_r(hidden, lambda y: insert_input(y))
+        ort_outputs = self.ort_session.run(None, ort_inputs)
+        if not batch_input:
+            ort_outputs = [o.squeeze(0) for o in ort_outputs]
+
+        ort_output_names = [y.name for y in self.ort_session.get_outputs()]
+        outputs = {name: ort_outputs[i] for i, name in enumerate(ort_output_names)}
+
+        hidden_outputs = []
+        for k in list(outputs.keys()):
+            if k.startswith('hidden'):
+                hidden_outputs.append(outputs.pop(k))
+        if len(hidden_outputs) == 0:
+            hidden_outputs = None
+
+        outputs = {**outputs, 'hidden': hidden_outputs}
+        return outputs
+
+
+def load_model(model_path, model):
+    if model_path.endswith('.onnx'):
+        model = OnnxModel(model_path)
+        return model
     import torch
     from .model import ModelWrapper
-    model = env.net()
     model.load_state_dict(torch.load(model_path))
     model.eval()
     return ModelWrapper(model)
@@ -290,7 +360,7 @@ def client_mp_child(env_args, model_path, conn):
     env = make_env(env_args)
     agent = build_agent(model_path, env)
     if agent is None:
-        model = get_model(env, model_path)
+        model = load_model(model_path, env.net())
         agent = Agent(model)
     NetworkAgentClient(agent, env, conn).run()
 
@@ -306,7 +376,8 @@ def eval_main(args, argv):
 
     agent1 = build_agent(model_path, env)
     if agent1 is None:
-        agent1 = Agent(get_model(env, model_path))
+        model = load_model(model_path, env.net())
+        agent1 = Agent(model)
     critic = None
 
     print('%d process, %d games' % (num_process, num_games))

diff --git a/handyrl/generation.py b/handyrl/generation.py
@@ -29,32 +29,35 @@ def generate(self, models, args):
             return None
 
         while not self.env.terminal():
-            moment_keys = ['observation', 'policy', 'action_mask', 'action', 'value', 'reward', 'return']
+            moment_keys = ['observation', 'selected_prob', 'action_mask', 'action', 'value', 'reward', 'return']
             moment = {key: {p: None for p in self.env.players()} for key in moment_keys}
 
             turn_players = self.env.turns()
+            observers = self.env.observers()
             for player in self.env.players():
-                if player in turn_players or self.args['observation']:
-                    obs = self.env.observation(player)
-                    model = models[player]
-                    outputs = model.inference(obs, hidden[player])
-                    hidden[player] = outputs.get('hidden', None)
-                    v = outputs.get('value', None)
-
-                    moment['observation'][player] = obs
-                    moment['value'][player] = v
-
-                    if player in turn_players:
-                        p_ = outputs['policy']
-                        legal_actions = self.env.legal_actions(player)
-                        action_mask = np.ones_like(p_) * 1e32
-                        action_mask[legal_actions] = 0
-                        p = p_ - action_mask
-                        action = random.choices(legal_actions, weights=softmax(p[legal_actions]))[0]
-
-                        moment['policy'][player] = p
-                        moment['action_mask'][player] = action_mask
-                        moment['action'][player] = action
+                if player not in turn_players + observers:
+                    continue
+
+                obs = self.env.observation(player)
+                model = models[player]
+                outputs = model.inference(obs, hidden[player])
+                hidden[player] = outputs.get('hidden', None)
+                v = outputs.get('value', None)
+
+                moment['observation'][player] = obs
+                moment['value'][player] = v
+
+                if player in turn_players:
+                    p_ = outputs['policy']
+                    legal_actions = self.env.legal_actions(player)
+                    action_mask = np.ones_like(p_) * 1e32
+                    action_mask[legal_actions] = 0
+                    p = softmax(p_ - action_mask)
+                    action = random.choices(legal_actions, weights=p[legal_actions])[0]
+
+                    moment['selected_prob'][player] = p[action]
+                    moment['action_mask'][player] = action_mask
+                    moment['action'][player] = action
 
             err = self.env.step(moment['action'])
             if err:

diff --git a/handyrl/model.py b/handyrl/model.py
@@ -37,7 +37,11 @@ def __init__(self, model):
 
     def init_hidden(self, batch_size=None):
         if hasattr(self.model, 'init_hidden'):
-            return self.model.init_hidden(batch_size)
+            if batch_size is None:  # for inference
+                hidden = self.model.init_hidden([])
+                return map_r(hidden, lambda h: h.detach().numpy() if isinstance(h, torch.Tensor) else h)
+            else:  # for training
+                return self.model.init_hidden(batch_size)
         return None
 
     def forward(self, *args, **kwargs):