From 28f960c1fce03c2519b4390f1bd96fe46c934d4a Mon Sep 17 00:00:00 2001 From: YuriCat Date: Fri, 12 Mar 2021 23:57:33 +0900 Subject: [PATCH 01/57] feature: burn-in steps --- config.yaml | 1 + handyrl/train.py | 41 ++++++++++++++++++++++++----------------- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/config.yaml b/config.yaml index 7ff60d42..5e747dc4 100755 --- a/config.yaml +++ b/config.yaml @@ -10,6 +10,7 @@ train_args: observation: False gamma: 0.8 forward_steps: 16 + burn_in_steps: 8 compress_steps: 4 entropy_regularization: 1.0e-1 entropy_regularization_decay: 0.1 diff --git a/handyrl/train.py b/handyrl/train.py index 186755e9..0d65fe51 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -89,18 +89,19 @@ def replace_none(a, b): # pad each array if step length is short if len(tmask) < args['forward_steps']: - pad_len = args['forward_steps'] - len(tmask) - obs = map_r(obs, lambda o: np.pad(o, [(0, pad_len)] + [(0, 0)] * (len(o.shape) - 1), 'constant', constant_values=0)) - p = np.pad(p, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - v = np.concatenate([v, np.tile(oc, [pad_len, 1, 1])]) - act = np.pad(act, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - rew = np.pad(rew, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - ret = np.pad(ret, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - emask = np.pad(emask, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - tmask = np.pad(tmask, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - omask = np.pad(omask, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - amask = np.pad(amask, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=1e32) - progress = np.pad(progress, [(0, pad_len), (0, 0)], 'constant', constant_values=1) + pad_len_b = args['burn_in_steps'] - (ep['train_start'] - ep['start']) + pad_len = args['forward_steps'] - len(tmask) - pad_len_b + obs = map_r(obs, lambda o: np.pad(o, [(pad_len_b, pad_len)] + [(0, 0)] * (len(o.shape) - 1), 'constant', constant_values=0)) + p = np.pad(p, [(pad_len_b, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) + v = np.pad(np.concatenate([v, np.tile(oc, [pad_len, 1, 1])]), [(pad_len_b, 0), (0, 0), (0, 0)], 'constant', constant_values=0) + act = np.pad(act, [(pad_len_b, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) + rew = np.pad(rew, [(pad_len_b, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) + ret = np.pad(ret, [(pad_len_b, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) + emask = np.pad(emask, [(pad_len_b, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) + tmask = np.pad(tmask, [(pad_len_b, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) + omask = np.pad(omask, [(pad_len_b, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) + amask = np.pad(amask, [(pad_len_b, pad_len), (0, 0), (0, 0)], 'constant', constant_values=1e32) + progress = np.pad(progress, [(pad_len_b, pad_len), (0, 0)], 'constant', constant_values=1) obss.append(obs) datum.append((p, v, act, oc, rew, ret, emask, tmask, omask, amask, progress)) @@ -162,7 +163,11 @@ def forward_prediction(model, hidden, batch, args): hidden_ = map_r(hidden_, lambda h: h.sum(1)) # (..., B * 1, ...) else: hidden_ = map_r(hidden_, lambda h: h.view(-1, *h.size()[2:])) # (..., B * P, ...) - outputs_ = model(obs, hidden_) + if t < args['burn_in_steps']: + with torch.no_grad(): + outputs_= model(obs, hidden_) + else: + outputs_ = model(obs, hidden_) for k, o in outputs_.items(): if k == 'hidden': next_hidden = outputs_['hidden'] @@ -290,16 +295,18 @@ def select_episode(self): if random.random() < accept_rate: break ep = self.episodes[ep_idx] - turn_candidates = 1 + max(0, ep['steps'] - self.args['forward_steps']) # change start turn by sequence length - st = random.randrange(turn_candidates) - ed = min(st + self.args['forward_steps'], ep['steps']) + trained_steps = self.args['forward_steps'] - self.args['burn_in_steps'] + turn_candidates = 1 + max(0, ep['steps'] - trained_steps) # change start turn by sequence length + st_train = random.randrange(turn_candidates) + ed = min(st_train + trained_steps, ep['steps']) + st = max(0, st_train - self.args['burn_in_steps']) st_block = st // self.args['compress_steps'] ed_block = (ed - 1) // self.args['compress_steps'] + 1 ep_minimum = { 'args': ep['args'], 'outcome': ep['outcome'], 'moment': ep['moment'][st_block:ed_block], 'base': st_block * self.args['compress_steps'], - 'start': st, 'end': ed, 'total': ep['steps'] + 'start': st, 'end': ed, 'train_start': st_train, 'total': ep['steps'] } return ep_minimum From a52e9dd6d40455e5255ee1fa669edb3d8b618620 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sat, 13 Mar 2021 02:38:09 +0900 Subject: [PATCH 02/57] feature: set model.eval() in burn_in steps --- handyrl/train.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/handyrl/train.py b/handyrl/train.py index 0d65fe51..88ac83bc 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -164,9 +164,12 @@ def forward_prediction(model, hidden, batch, args): else: hidden_ = map_r(hidden_, lambda h: h.view(-1, *h.size()[2:])) # (..., B * P, ...) if t < args['burn_in_steps']: + model.eval() with torch.no_grad(): outputs_= model(obs, hidden_) else: + if not model.training: + model.train() outputs_ = model(obs, hidden_) for k, o in outputs_.items(): if k == 'hidden': From 4b7120616a23350abd6a473257ac914841b3a75f Mon Sep 17 00:00:00 2001 From: YuriCat Date: Wed, 20 Oct 2021 14:52:32 +0900 Subject: [PATCH 03/57] feature: add observers() method in environments --- handyrl/agent.py | 17 +++++++---------- handyrl/environment.py | 6 ++++++ handyrl/evaluation.py | 8 +++++--- handyrl/generation.py | 3 ++- 4 files changed, 20 insertions(+), 14 deletions(-) diff --git a/handyrl/agent.py b/handyrl/agent.py index 72f1778b..30f63fea 100755 --- a/handyrl/agent.py +++ b/handyrl/agent.py @@ -39,11 +39,10 @@ def print_outputs(env, prob, v): class Agent: - def __init__(self, model, observation=False, temperature=0.0): + def __init__(self, model, temperature=0.0): # model might be a neural net, or some planning algorithm such as game tree search self.model = model self.hidden = None - self.observation = observation self.temperature = temperature def reset(self, env, show=False): @@ -73,12 +72,10 @@ def action(self, env, player, show=False): return random.choices(np.arange(len(p)), weights=softmax(p / self.temperature))[0] def observe(self, env, player, show=False): - v = None - if self.observation: - outputs = self.plan(env.observation(player)) - v = outputs.get('value', None) - if show: - print_outputs(env, None, v) + outputs = self.plan(env.observation(player)) + v = outputs.get('value', None) + if show: + print_outputs(env, None, v) return v if v is not None else [0.0] @@ -101,5 +98,5 @@ def plan(self, obs): class SoftAgent(Agent): - def __init__(self, model, observation=False): - super().__init__(model, observation=observation, temperature=1.0) + def __init__(self, model): + super().__init__(model, temperature=1.0) diff --git a/handyrl/environment.py b/handyrl/environment.py index f470e816..75aaeacc 100755 --- a/handyrl/environment.py +++ b/handyrl/environment.py @@ -77,6 +77,12 @@ def turn(self): def turns(self): return [self.turn()] + # + # Should be defined if you use multiplayer simultaneous action game + # + def observers(self): + return [] + # # Should be defined in all games # diff --git a/handyrl/evaluation.py b/handyrl/evaluation.py index f4b564cc..962de334 100755 --- a/handyrl/evaluation.py +++ b/handyrl/evaluation.py @@ -88,11 +88,12 @@ def exec_match(env, agents, critic, show=False, game_args={}): if show and critic is not None: print('cv = ', critic.observe(env, None, show=False)[0]) turn_players = env.turns() + observers = env.observers() actions = {} for p, agent in agents.items(): if p in turn_players: actions[p] = agent.action(env, p, show=show) - else: + elif p in observers: agent.observe(env, p, show=show) if env.step(actions): return None @@ -117,12 +118,13 @@ def exec_network_match(env, network_agents, critic, show=False, game_args={}): if show and critic is not None: print('cv = ', critic.observe(env, None, show=False)[0]) turn_players = env.turns() + observers = env.observers() actions = {} for p, agent in network_agents.items(): if p in turn_players: action = agent.action(p) actions[p] = env.str2action(action, p) - else: + elif p in observers: agent.observe(p) if env.step(actions): return None @@ -161,7 +163,7 @@ def execute(self, models, args): if model is None: agents[p] = build_agent(opponent, self.env) else: - agents[p] = Agent(model, self.args['observation']) + agents[p] = Agent(model) outcome = exec_match(self.env, agents, None) if outcome is None: diff --git a/handyrl/generation.py b/handyrl/generation.py index 63b7e553..79b0a1df 100755 --- a/handyrl/generation.py +++ b/handyrl/generation.py @@ -33,8 +33,9 @@ def generate(self, models, args): moment = {key: {p: None for p in self.env.players()} for key in moment_keys} turn_players = self.env.turns() + observers = self.env.observers() for player in self.env.players(): - if player in turn_players or self.args['observation']: + if player in turn_players or player in observers: obs = self.env.observation(player) model = models[player] outputs = model.inference(obs, hidden[player]) From 23ff9f9575cb523a0bc0efb7d6e2cad39bd5b38e Mon Sep 17 00:00:00 2001 From: YuriCat Date: Thu, 21 Oct 2021 02:30:51 +0900 Subject: [PATCH 04/57] feature: steps = burn_in_steps + forward_steps --- handyrl/train.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/handyrl/train.py b/handyrl/train.py index f0345957..c20c4a1c 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -89,9 +89,10 @@ def replace_none(a, b): progress = np.arange(ep['start'], ep['end'], dtype=np.float32)[..., np.newaxis] / ep['total'] # pad each array if step length is short - if len(tmask) < args['forward_steps']: + batch_steps = args['burn_in_steps'] + args['forward_steps'] + if len(tmask) < batch_steps: pad_len_b = args['burn_in_steps'] - (ep['train_start'] - ep['start']) - pad_len = args['forward_steps'] - len(tmask) - pad_len_b + pad_len = batch_steps - len(tmask) - pad_len_b obs = map_r(obs, lambda o: np.pad(o, [(pad_len_b, pad_len)] + [(0, 0)] * (len(o.shape) - 1), 'constant', constant_values=0)) p = np.pad(p, [(pad_len_b, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) v = np.pad(np.concatenate([v, np.tile(oc, [pad_len, 1, 1])]), [(pad_len_b, 0), (0, 0), (0, 0)], 'constant', constant_values=0) @@ -225,6 +226,9 @@ def compose_losses(outputs, log_selected_policies, total_advantages, targets, ba def compute_loss(batch, model, hidden, args): outputs = forward_prediction(model, hidden, batch, args) + batch = map_r(batch, lambda v: v[args['burn_in_steps']:]) + outputs = map_r(outputs, lambda v: v[args['burn_in_steps']:]) + actions = batch['action'] emasks = batch['episode_mask'] clip_rho_threshold, clip_c_threshold = 1.0, 1.0 @@ -296,10 +300,9 @@ def select_episode(self): if random.random() < accept_rate: break ep = self.episodes[ep_idx] - trained_steps = self.args['forward_steps'] - self.args['burn_in_steps'] - turn_candidates = 1 + max(0, ep['steps'] - trained_steps) # change start turn by sequence length + turn_candidates = 1 + max(0, ep['steps'] - self.args['forward_steps']) # change start turn by sequence length st_train = random.randrange(turn_candidates) - ed = min(st_train + trained_steps, ep['steps']) + ed = min(st_train + self.args['forward_steps'], ep['steps']) st = max(0, st_train - self.args['burn_in_steps']) st_block = st // self.args['compress_steps'] ed_block = (ed - 1) // self.args['compress_steps'] + 1 From 6151ea73b779db68c4878199c42c32a23a2cad55 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Tue, 9 Nov 2021 16:40:10 +0900 Subject: [PATCH 05/57] feature: unnecessary hidden argument --- handyrl/envs/kaggle/hungry_geese.py | 2 +- handyrl/envs/tictactoe.py | 2 +- handyrl/model.py | 12 ++++++++++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/handyrl/envs/kaggle/hungry_geese.py b/handyrl/envs/kaggle/hungry_geese.py index 7ca39ca0..a5ae5b00 100644 --- a/handyrl/envs/kaggle/hungry_geese.py +++ b/handyrl/envs/kaggle/hungry_geese.py @@ -45,7 +45,7 @@ def __init__(self): self.head_p = nn.Linear(filters, 4, bias=False) self.head_v = nn.Linear(filters * 2, 1, bias=False) - def forward(self, x, _=None): + def forward(self, x): h = F.relu_(self.conv0(x)) for block in self.blocks: h = F.relu_(h + block(h)) diff --git a/handyrl/envs/tictactoe.py b/handyrl/envs/tictactoe.py index c6403b7f..be441f7b 100755 --- a/handyrl/envs/tictactoe.py +++ b/handyrl/envs/tictactoe.py @@ -59,7 +59,7 @@ def __init__(self): self.head_p = Head((filters, 3, 3), 2, 9) self.head_v = Head((filters, 3, 3), 1, 1) - def forward(self, x, hidden=None): + def forward(self, x): h = F.relu(self.conv(x)) for block in self.blocks: h = F.relu(block(h)) diff --git a/handyrl/model.py b/handyrl/model.py index 54780867..7efb2e23 100755 --- a/handyrl/model.py +++ b/handyrl/model.py @@ -35,13 +35,21 @@ def __init__(self, model): super().__init__() self.model = model + def get_argument_names(f): + return f.__code__.co_varnames[:f.__code__.co_argcount] + self.forward_args = get_argument_names(self.model.forward) + def init_hidden(self, batch_size=None): if hasattr(self.model, 'init_hidden'): return self.model.init_hidden(batch_size) return None - def forward(self, *args, **kwargs): - return self.model.forward(*args, **kwargs) + def forward(self, x, hidden, **kwargs): + # Remove 'hidden' input if it will not accepted + if 'hidden' not in self.forward_args: + return self.model.forward(x, **kwargs) + else: + return self.model.forward(x, hidden, **kwargs) def inference(self, x, hidden, **kwargs): # numpy array -> numpy array From 216734eef593edc9bb2a09399d20fc4317c39bb3 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Fri, 26 Nov 2021 11:18:53 +0900 Subject: [PATCH 06/57] chore: check whether burn_in_steps > 0 before slicing batch and outputs --- handyrl/train.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/handyrl/train.py b/handyrl/train.py index bbd41912..fd2d7d0c 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -226,8 +226,9 @@ def compose_losses(outputs, log_selected_policies, total_advantages, targets, ba def compute_loss(batch, model, hidden, args): outputs = forward_prediction(model, hidden, batch, args) - batch = map_r(batch, lambda v: v[args['burn_in_steps']:]) - outputs = map_r(outputs, lambda v: v[args['burn_in_steps']:]) + if args['burn_in_steps'] > 0: + batch = map_r(batch, lambda v: v[args['burn_in_steps']:]) + outputs = map_r(outputs, lambda v: v[args['burn_in_steps']:]) actions = batch['action'] emasks = batch['episode_mask'] From 126399a4352f891d6378720d46589dfb4bc85a70 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sun, 9 Jan 2022 22:28:37 +0900 Subject: [PATCH 07/57] feature: use load_model() function --- handyrl/evaluation.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/handyrl/evaluation.py b/handyrl/evaluation.py index b00db9f6..dea4cad3 100755 --- a/handyrl/evaluation.py +++ b/handyrl/evaluation.py @@ -277,7 +277,7 @@ def network_match_acception(n, env_args, num_agents, port): return agents_list -def get_model(env, model_path): +def load_model(model_path, env): import torch from .model import ModelWrapper model = env.net() @@ -290,7 +290,7 @@ def client_mp_child(env_args, model_path, conn): env = make_env(env_args) agent = build_agent(model_path, env) if agent is None: - model = get_model(env, model_path) + model = load_model(model_path, env) agent = Agent(model) NetworkAgentClient(agent, env, conn).run() @@ -306,7 +306,8 @@ def eval_main(args, argv): agent1 = build_agent(model_path, env) if agent1 is None: - agent1 = Agent(get_model(env, model_path)) + model = load_model(model_path, env) + agent1 = Agent(model) critic = None print('%d process, %d games' % (num_process, num_games)) From cfc1a39030062ee573e0b6559d1b89a9ba5b8546 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sun, 9 Jan 2022 23:04:36 +0900 Subject: [PATCH 08/57] feature: store selected probability only (IMPORTANT) --- handyrl/generation.py | 8 ++++---- handyrl/train.py | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/handyrl/generation.py b/handyrl/generation.py index 63b7e553..f426e5d2 100755 --- a/handyrl/generation.py +++ b/handyrl/generation.py @@ -29,7 +29,7 @@ def generate(self, models, args): return None while not self.env.terminal(): - moment_keys = ['observation', 'policy', 'action_mask', 'action', 'value', 'reward', 'return'] + moment_keys = ['observation', 'selected_prob', 'action_mask', 'action', 'value', 'reward', 'return'] moment = {key: {p: None for p in self.env.players()} for key in moment_keys} turn_players = self.env.turns() @@ -49,10 +49,10 @@ def generate(self, models, args): legal_actions = self.env.legal_actions(player) action_mask = np.ones_like(p_) * 1e32 action_mask[legal_actions] = 0 - p = p_ - action_mask - action = random.choices(legal_actions, weights=softmax(p[legal_actions]))[0] + p = softmax(p_ - action_mask) + action = random.choices(legal_actions, weights=p[legal_actions])[0] - moment['policy'][player] = p + moment['selected_prob'][player] = p[action] moment['action_mask'][player] = action_mask moment['action'][player] = action diff --git a/handyrl/train.py b/handyrl/train.py index b933ebaa..1e76ac1f 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -58,19 +58,19 @@ def replace_none(a, b): players = [random.choice(players)] obs_zeros = map_r(moments[0]['observation'][moments[0]['turn'][0]], lambda o: np.zeros_like(o)) # template for padding - p_zeros = np.zeros_like(moments[0]['policy'][moments[0]['turn'][0]]) # template for padding + amask_zeros = np.zeros_like(moments[0]['action_mask'][moments[0]['turn'][0]]) # template for padding # data that is chainge by training configuration if args['turn_based_training'] and not args['observation']: obs = [[m['observation'][m['turn'][0]]] for m in moments] - p = np.array([[m['policy'][m['turn'][0]]] for m in moments]) + prob = np.array([[[m['selected_prob'][m['turn'][0]]]] for m in moments]) act = np.array([[m['action'][m['turn'][0]]] for m in moments], dtype=np.int64)[..., np.newaxis] amask = np.array([[m['action_mask'][m['turn'][0]]] for m in moments]) else: obs = [[replace_none(m['observation'][player], obs_zeros) for player in players] for m in moments] - p = np.array([[replace_none(m['policy'][player], p_zeros) for player in players] for m in moments]) + prob = np.array([[[replace_none(m['selected_prob'][player], 1.0)] for player in players] for m in moments]) act = np.array([[replace_none(m['action'][player], 0) for player in players] for m in moments], dtype=np.int64)[..., np.newaxis] - amask = np.array([[replace_none(m['action_mask'][player], p_zeros + 1e32) for player in players] for m in moments]) + amask = np.array([[replace_none(m['action_mask'][player], amask_zeros + 1e32) for player in players] for m in moments]) # reshape observation obs = rotate(rotate(obs)) # (T, P, ..., ...) -> (P, ..., T, ...) -> (..., T, P, ...) @@ -83,7 +83,7 @@ def replace_none(a, b): oc = np.array([ep['outcome'][player] for player in players], dtype=np.float32).reshape(1, len(players), -1) emask = np.ones((len(moments), 1, 1), dtype=np.float32) # episode mask - tmask = np.array([[[m['policy'][player] is not None] for player in players] for m in moments], dtype=np.float32) + tmask = np.array([[[m['selected_prob'][player] is not None] for player in players] for m in moments], dtype=np.float32) omask = np.array([[[m['value'][player] is not None] for player in players] for m in moments], dtype=np.float32) progress = np.arange(ep['start'], ep['end'], dtype=np.float32)[..., np.newaxis] / ep['total'] @@ -92,7 +92,7 @@ def replace_none(a, b): if len(tmask) < args['forward_steps']: pad_len = args['forward_steps'] - len(tmask) obs = map_r(obs, lambda o: np.pad(o, [(0, pad_len)] + [(0, 0)] * (len(o.shape) - 1), 'constant', constant_values=0)) - p = np.pad(p, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) + prob = np.pad(prob, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=1) v = np.concatenate([v, np.tile(oc, [pad_len, 1, 1])]) act = np.pad(act, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) rew = np.pad(rew, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) @@ -104,14 +104,14 @@ def replace_none(a, b): progress = np.pad(progress, [(0, pad_len), (0, 0)], 'constant', constant_values=1) obss.append(obs) - datum.append((p, v, act, oc, rew, ret, emask, tmask, omask, amask, progress)) + datum.append((prob, v, act, oc, rew, ret, emask, tmask, omask, amask, progress)) obs = to_torch(bimap_r(obs_zeros, rotate(obss), lambda _, o: np.array(o))) - p, v, act, oc, rew, ret, emask, tmask, omask, amask, progress = [to_torch(np.array(val)) for val in zip(*datum)] + prob, v, act, oc, rew, ret, emask, tmask, omask, amask, progress = [to_torch(np.array(val)) for val in zip(*datum)] return { 'observation': obs, - 'policy': p, 'value': v, + 'selected_prob': prob, 'value': v, 'action': act, 'outcome': oc, 'reward': rew, 'return': ret, 'episode_mask': emask, @@ -209,7 +209,7 @@ def compute_loss(batch, model, hidden, args): emasks = batch['episode_mask'] clip_rho_threshold, clip_c_threshold = 1.0, 1.0 - log_selected_b_policies = F.log_softmax(batch['policy'] , dim=-1).gather(-1, actions) * emasks + log_selected_b_policies = torch.log(torch.clamp(batch['selected_prob'], 1e-16, 1)) * emasks log_selected_t_policies = F.log_softmax(outputs['policy'], dim=-1).gather(-1, actions) * emasks # thresholds of importance sampling From e4a74c1b419cb16c819ab13d23731cec4643101b Mon Sep 17 00:00:00 2001 From: YuriCat Date: Tue, 11 Jan 2022 19:51:53 +0900 Subject: [PATCH 09/57] feature: add OnnxModel class to infer with ONNX models --- handyrl/evaluation.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/handyrl/evaluation.py b/handyrl/evaluation.py index b00db9f6..ddf488a8 100755 --- a/handyrl/evaluation.py +++ b/handyrl/evaluation.py @@ -277,6 +277,49 @@ def network_match_acception(n, env_args, num_agents, port): return agents_list +class OnnxModel: + def __init__(self, model_path, output_keys): + self.model_path = model_path + self.output_keys = output_keys + self.ort_session = None + + def init_hidden(self): + # TODO RNN + return None + + def inference(self, x, hidden=None, batch_input=False): + # numpy array -> numpy array + if self.ort_session is None: + import os + os.environ['OMP_NUM_THREADS'] = '1' + os.environ['OMP_WAIT_POLICY'] = 'PASSIVE' + + import onnxruntime + opts = onnxruntime.SessionOptions() + opts.intra_op_num_threads = 1 + opts.inter_op_num_threads = 1 + opts.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL + + self.ort_session = onnxruntime.InferenceSession(self.model_path, sess_options=opts) + + ort_inputs = {} + ort_input_names = [y.name for y in self.ort_session.get_inputs()] + def insert_input(y): + y = y if batch_input else np.expand_dims(y, 0) + ort_inputs[ort_input_names[len(ort_inputs)]] = y + from .util import map_r + map_r(x, lambda y: insert_input(y)) + if hidden is not None: + map_r(hidden, lambda y: insert_input(y)) + ort_outputs = self.ort_session.run(None, ort_inputs) + if not batch_input: + ort_outputs = [o.squeeze(0) for o in ort_outputs] + + assert len(self.output_keys) == len(outputs) + outputs = {key: outputs[i] for i, key in enumerate(self.output_keys)} + return outputs + + def get_model(env, model_path): import torch from .model import ModelWrapper From 5900c308950080e9aa96c1a03dc90226982fd253 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Tue, 11 Jan 2022 19:57:34 +0900 Subject: [PATCH 10/57] fix: load ONNX model when selecting ONNX model path --- handyrl/evaluation.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/handyrl/evaluation.py b/handyrl/evaluation.py index ddf488a8..7461286a 100755 --- a/handyrl/evaluation.py +++ b/handyrl/evaluation.py @@ -278,9 +278,8 @@ def network_match_acception(n, env_args, num_agents, port): class OnnxModel: - def __init__(self, model_path, output_keys): + def __init__(self, model_path): self.model_path = model_path - self.output_keys = output_keys self.ort_session = None def init_hidden(self): @@ -315,12 +314,16 @@ def insert_input(y): if not batch_input: ort_outputs = [o.squeeze(0) for o in ort_outputs] - assert len(self.output_keys) == len(outputs) - outputs = {key: outputs[i] for i, key in enumerate(self.output_keys)} + ort_outputs_names = [y.name for y in self.ort_session.get_outputs()] + assert len(ort_outputs_names) == len(outputs) + outputs = {key: outputs[i] for i, key in enumerate(self.output_names)} return outputs def get_model(env, model_path): + if model_path.endswith('.onnx'): + model = OnnxModel(model_path) + return model import torch from .model import ModelWrapper model = env.net() From 1cb017643f1dea8a04cc5a2a0c14ae17cc8d3fab Mon Sep 17 00:00:00 2001 From: YuriCat Date: Wed, 12 Jan 2022 19:24:03 +0900 Subject: [PATCH 11/57] feature: support hidden state inputs in OnnxModel --- handyrl/evaluation.py | 55 ++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/handyrl/evaluation.py b/handyrl/evaluation.py index 7461286a..247f6fd2 100755 --- a/handyrl/evaluation.py +++ b/handyrl/evaluation.py @@ -282,27 +282,42 @@ def __init__(self, model_path): self.model_path = model_path self.ort_session = None + def _open_session(self): + import os + os.environ['OMP_NUM_THREADS'] = '1' + os.environ['OMP_WAIT_POLICY'] = 'PASSIVE' + + import onnxruntime + opts = onnxruntime.SessionOptions() + opts.intra_op_num_threads = 1 + opts.inter_op_num_threads = 1 + opts.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL + + self.ort_session = onnxruntime.InferenceSession(self.model_path, sess_options=opts) + def init_hidden(self): - # TODO RNN - return None + if self.ort_session is None: + self._open_session() + hidden_inputs = [y for y in self.ort_session.get_inputs() if y.name.startswith('hidden')] + if len(hidden_inputs) == 0: + return None + import numpy as np + type_map = { + 'tensor(float)': np.float32, + 'tensor(int64)': np.int64, + } + hidden_tensors = [np.zeros(y.shape[1:], dtype=type_map[y.type]) for y in hidden_inputs] + return hidden_tensors def inference(self, x, hidden=None, batch_input=False): # numpy array -> numpy array if self.ort_session is None: - import os - os.environ['OMP_NUM_THREADS'] = '1' - os.environ['OMP_WAIT_POLICY'] = 'PASSIVE' - - import onnxruntime - opts = onnxruntime.SessionOptions() - opts.intra_op_num_threads = 1 - opts.inter_op_num_threads = 1 - opts.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL - - self.ort_session = onnxruntime.InferenceSession(self.model_path, sess_options=opts) + self._open_session() ort_inputs = {} ort_input_names = [y.name for y in self.ort_session.get_inputs()] + + import numpy as np def insert_input(y): y = y if batch_input else np.expand_dims(y, 0) ort_inputs[ort_input_names[len(ort_inputs)]] = y @@ -314,9 +329,17 @@ def insert_input(y): if not batch_input: ort_outputs = [o.squeeze(0) for o in ort_outputs] - ort_outputs_names = [y.name for y in self.ort_session.get_outputs()] - assert len(ort_outputs_names) == len(outputs) - outputs = {key: outputs[i] for i, key in enumerate(self.output_names)} + ort_output_names = [y.name for y in self.ort_session.get_outputs()] + outputs = {name: ort_outputs[i] for i, name in enumerate(ort_output_names)} + + hidden_outputs = [] + for k in list(outputs.keys()): + if k.startswith('hidden'): + hidden_outputs.append(outputs.pop(k)) + if len(hidden_outputs) == 0: + hidden_outputs = None + + outputs = {**outputs, 'hidden': hidden_outputs} return outputs From 6bc68bec9a0335aa951793db7ac8f0ac66f0402c Mon Sep 17 00:00:00 2001 From: YuriCat Date: Thu, 13 Jan 2022 01:33:37 +0900 Subject: [PATCH 12/57] feature: link the 1st place solution in Hungry Geese competition --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 05c7eb64..351e7246 100755 --- a/README.md +++ b/README.md @@ -113,5 +113,5 @@ NOTE: Default opponent AI is random agent implemented in `evaluation.py`. You ca ## Use Cases -* [Month 1 Winner in Hungry Geese (Kaggle)](https://www.kaggle.com/c/hungry-geese/discussion/222941) -* [The 5th solution in Google Research Football with Manchester City F.C. (Kaggle)](https://www.kaggle.com/c/google-football/discussion/203412) +* [The 1st place solution in Hungry Geese (Kaggle)](https://www.kaggle.com/c/hungry-geese/discussion/263279) +* [The 5th place solution in Google Research Football with Manchester City F.C. (Kaggle)](https://www.kaggle.com/c/google-football/discussion/203412) From b68c737d08a68818b465854af1228c54fbd4cd5d Mon Sep 17 00:00:00 2001 From: YuriCat Date: Thu, 13 Jan 2022 02:11:42 +0900 Subject: [PATCH 13/57] chore: set default burn_in_steps=0 --- config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.yaml b/config.yaml index 5863cc58..2bf65d85 100755 --- a/config.yaml +++ b/config.yaml @@ -10,7 +10,7 @@ train_args: observation: False gamma: 0.8 forward_steps: 16 - burn_in_steps: 8 + burn_in_steps: 0 # for RNNs compress_steps: 4 entropy_regularization: 1.0e-1 entropy_regularization_decay: 0.1 From 57b6a54a056670b06a156c9e66453cde8ac8e150 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Thu, 13 Jan 2022 02:21:55 +0900 Subject: [PATCH 14/57] chore: update redundunt code when receiving hidden outputs --- handyrl/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handyrl/train.py b/handyrl/train.py index b933ebaa..bb34193c 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -154,7 +154,7 @@ def forward_prediction(model, hidden, batch, args): outputs_ = model(obs, hidden_) for k, o in outputs_.items(): if k == 'hidden': - next_hidden = outputs_['hidden'] + next_hidden = o else: outputs[k] = outputs.get(k, []) + [o] next_hidden = bimap_r(next_hidden, hidden, lambda nh, h: nh.view(h.size(0), -1, *h.size()[2:])) # (..., B, P or 1, ...) From e8dcb8e1478d2cff9e256439a911db60358ce960 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Fri, 14 Jan 2022 16:57:07 +0900 Subject: [PATCH 15/57] feature: true DRC reffered to PyTorch official LSTM implementation --- handyrl/envs/geister.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handyrl/envs/geister.py b/handyrl/envs/geister.py index 747beec2..69acfeab 100755 --- a/handyrl/envs/geister.py +++ b/handyrl/envs/geister.py @@ -93,7 +93,7 @@ def forward(self, x, hidden, num_repeats): hs, cs = hidden for _ in range(num_repeats): for i, block in enumerate(self.blocks): - hs[i], cs[i] = block(x, (hs[i], cs[i])) + hs[i], cs[i] = block(hs[i - 1] if i > 0 else x, (hs[i], cs[i])) return hs[-1], (hs, cs) From 12cd56fdd1990c4be5cb22c0891a02e4cf15ded8 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sat, 15 Jan 2022 16:31:58 +0900 Subject: [PATCH 16/57] feature: add an explanation of DRC net and a link to the paper --- handyrl/envs/geister.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/handyrl/envs/geister.py b/handyrl/envs/geister.py index 747beec2..71114e27 100755 --- a/handyrl/envs/geister.py +++ b/handyrl/envs/geister.py @@ -63,6 +63,11 @@ def forward(self, input_tensor, cur_state): return h_next, c_next +# Deep Repeated Conv-LSTM (https://arxiv.org/abs/1901.03559) +# increases expressive power with fewer parameters +# by repeatedly computing multi-layer convolutional LSTM. +# When num_repeats=1, it is simply a multi-layer Conv-LSTM. + class DRC(nn.Module): def __init__(self, num_layers, input_dim, hidden_dim, kernel_size=3, bias=True): super().__init__() From 3ccdc942553ef0eddb5eaa3662434bbed4d8a4f8 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sun, 16 Jan 2022 02:01:14 +0900 Subject: [PATCH 17/57] chore: remove unused import --- handyrl/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/handyrl/train.py b/handyrl/train.py index bb34193c..6275eb2e 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -26,7 +26,6 @@ from .model import to_torch, to_gpu, ModelWrapper from .losses import compute_target from .connection import MultiProcessJobExecutor -from .connection import accept_socket_connections from .worker import WorkerCluster, WorkerServer From b697ea14e16c4cd569f5beed34d7ba692e7d4cc2 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Mon, 17 Jan 2022 09:36:12 +0900 Subject: [PATCH 18/57] chore: update legal action check in Geister --- handyrl/envs/geister.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/handyrl/envs/geister.py b/handyrl/envs/geister.py index 747beec2..ad279ffa 100755 --- a/handyrl/envs/geister.py +++ b/handyrl/envs/geister.py @@ -448,6 +448,8 @@ def legal(self, action): if self.turn_count < 0: layout = action - 4 * 6 * 6 return 0 <= layout < 70 + elif not 0 <= action < 4 * 6 * 6: + return False pos_from = self.action2from(action, self.color) pos_to = self.action2to(action, self.color) From 60e602d379430fac7cf48d31da687f0980ec7d38 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sat, 22 Jan 2022 02:50:03 +0900 Subject: [PATCH 19/57] fix: return model even if using non-trainable models --- handyrl/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/handyrl/train.py b/handyrl/train.py index b933ebaa..11f0b752 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -349,8 +349,8 @@ def shutdown(self): def train(self): if self.optimizer is None: # non-parametric model - print() - return + time.sleep(0.1) + return self.model batch_cnt, data_cnt, loss_sum = 0, 0, {} if self.gpu > 0: @@ -395,7 +395,7 @@ def run(self): if len(self.episodes) < self.args['minimum_episodes']: time.sleep(1) continue - if self.steps == 0: + if self.steps == 0 and self.optimizer is not None: self.batcher.run() print('started training') model = self.train() From bcb4b37184ffeb1ac6c8b15f3338d208db65d77a Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sat, 22 Jan 2022 03:24:09 +0900 Subject: [PATCH 20/57] feature: (idea) create observation mask from observation!=None --- handyrl/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handyrl/train.py b/handyrl/train.py index b933ebaa..79ae2afc 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -84,7 +84,7 @@ def replace_none(a, b): emask = np.ones((len(moments), 1, 1), dtype=np.float32) # episode mask tmask = np.array([[[m['policy'][player] is not None] for player in players] for m in moments], dtype=np.float32) - omask = np.array([[[m['value'][player] is not None] for player in players] for m in moments], dtype=np.float32) + omask = np.array([[[m['observation'][player] is not None] for player in players] for m in moments], dtype=np.float32) progress = np.arange(ep['start'], ep['end'], dtype=np.float32)[..., np.newaxis] / ep['total'] From c1d0235119380eb3261d9a5994c542832d29fdca Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sat, 22 Jan 2022 09:20:13 +0900 Subject: [PATCH 21/57] fix: observers() explanation --- handyrl/environment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handyrl/environment.py b/handyrl/environment.py index 2a144f0c..02bb836c 100755 --- a/handyrl/environment.py +++ b/handyrl/environment.py @@ -78,7 +78,7 @@ def turns(self): return [self.turn()] # - # Should be defined if you use multiplayer simultaneous action game + # Should be defined if players except turn player also observe game states # def observers(self): return [] From 2446802994885df91b7a9125bb180e45cbdb2d73 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sat, 22 Jan 2022 09:30:37 +0900 Subject: [PATCH 22/57] chore: update generation player loop --- handyrl/generation.py | 44 ++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/handyrl/generation.py b/handyrl/generation.py index 79b0a1df..3ad3b76e 100755 --- a/handyrl/generation.py +++ b/handyrl/generation.py @@ -35,27 +35,29 @@ def generate(self, models, args): turn_players = self.env.turns() observers = self.env.observers() for player in self.env.players(): - if player in turn_players or player in observers: - obs = self.env.observation(player) - model = models[player] - outputs = model.inference(obs, hidden[player]) - hidden[player] = outputs.get('hidden', None) - v = outputs.get('value', None) - - moment['observation'][player] = obs - moment['value'][player] = v - - if player in turn_players: - p_ = outputs['policy'] - legal_actions = self.env.legal_actions(player) - action_mask = np.ones_like(p_) * 1e32 - action_mask[legal_actions] = 0 - p = p_ - action_mask - action = random.choices(legal_actions, weights=softmax(p[legal_actions]))[0] - - moment['policy'][player] = p - moment['action_mask'][player] = action_mask - moment['action'][player] = action + if player not in turn_players + observers: + continue + + obs = self.env.observation(player) + model = models[player] + outputs = model.inference(obs, hidden[player]) + hidden[player] = outputs.get('hidden', None) + v = outputs.get('value', None) + + moment['observation'][player] = obs + moment['value'][player] = v + + if player in turn_players: + p_ = outputs['policy'] + legal_actions = self.env.legal_actions(player) + action_mask = np.ones_like(p_) * 1e32 + action_mask[legal_actions] = 0 + p = p_ - action_mask + action = random.choices(legal_actions, weights=softmax(p[legal_actions]))[0] + + moment['policy'][player] = p + moment['action_mask'][player] = action_mask + moment['action'][player] = action err = self.env.step(moment['action']) if err: From 1f3f3e213f683bd69685d48d2a86969b5797282c Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sat, 22 Jan 2022 21:38:46 +0900 Subject: [PATCH 23/57] feature: there is cases if we have no value outputs nor policy outputs --- handyrl/agent.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/handyrl/agent.py b/handyrl/agent.py index 72f1778b..f26dfb87 100755 --- a/handyrl/agent.py +++ b/handyrl/agent.py @@ -34,8 +34,10 @@ def print_outputs(env, prob, v): if hasattr(env, 'print_outputs'): env.print_outputs(prob, v) else: - print('v = %f' % v) - print('p = %s' % (prob * 1000).astype(int)) + if v is not None: + print('v = %f' % v) + if prob is not None: + print('p = %s' % (prob * 1000).astype(int)) class Agent: From 4e346819aa06c6dfe6c39848f1959a036e195a51 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sat, 22 Jan 2022 21:47:44 +0900 Subject: [PATCH 24/57] feature: load_model(model_path, env.net()) looks cool and easy --- handyrl/evaluation.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/handyrl/evaluation.py b/handyrl/evaluation.py index dea4cad3..9668b32e 100755 --- a/handyrl/evaluation.py +++ b/handyrl/evaluation.py @@ -277,10 +277,9 @@ def network_match_acception(n, env_args, num_agents, port): return agents_list -def load_model(model_path, env): +def load_model(model_path, model): import torch from .model import ModelWrapper - model = env.net() model.load_state_dict(torch.load(model_path)) model.eval() return ModelWrapper(model) @@ -306,7 +305,7 @@ def eval_main(args, argv): agent1 = build_agent(model_path, env) if agent1 is None: - model = load_model(model_path, env) + model = load_model(model_path, env.net()) agent1 = Agent(model) critic = None From 46560e80c9835e253f3c639df0d08468ab0bc253 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sat, 22 Jan 2022 21:59:48 +0900 Subject: [PATCH 25/57] chore: fix typo chainge --- handyrl/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handyrl/train.py b/handyrl/train.py index 6275eb2e..8d2d22d5 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -59,7 +59,7 @@ def replace_none(a, b): obs_zeros = map_r(moments[0]['observation'][moments[0]['turn'][0]], lambda o: np.zeros_like(o)) # template for padding p_zeros = np.zeros_like(moments[0]['policy'][moments[0]['turn'][0]]) # template for padding - # data that is chainge by training configuration + # data that is changed by training configuration if args['turn_based_training'] and not args['observation']: obs = [[m['observation'][m['turn'][0]]] for m in moments] p = np.array([[m['policy'][m['turn'][0]]] for m in moments]) From 2aeb7e29a8a7fa0572e70aee76946ab7b5ef5a97 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sun, 23 Jan 2022 06:08:54 +0900 Subject: [PATCH 26/57] fix: eval-client mode --- handyrl/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handyrl/evaluation.py b/handyrl/evaluation.py index 9668b32e..a6d0ddf1 100755 --- a/handyrl/evaluation.py +++ b/handyrl/evaluation.py @@ -289,7 +289,7 @@ def client_mp_child(env_args, model_path, conn): env = make_env(env_args) agent = build_agent(model_path, env) if agent is None: - model = load_model(model_path, env) + model = load_model(model_path, env.net()) agent = Agent(model) NetworkAgentClient(agent, env, conn).run() From 951d363e6775b0b8601a64a9e80ca1dce0cd5ddc Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sun, 23 Jan 2022 16:43:17 +0900 Subject: [PATCH 27/57] feature: define only torch-based initialized hidden state --- handyrl/envs/geister.py | 16 +++++----------- handyrl/model.py | 6 +++++- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/handyrl/envs/geister.py b/handyrl/envs/geister.py index 71114e27..5af0371b 100755 --- a/handyrl/envs/geister.py +++ b/handyrl/envs/geister.py @@ -34,16 +34,10 @@ def __init__(self, input_dim, hidden_dim, kernel_size, bias): ) def init_hidden(self, input_size, batch_size): - if batch_size is None: # for inference - return tuple([ - np.zeros((self.hidden_dim, *input_size), dtype=np.float32), - np.zeros((self.hidden_dim, *input_size), dtype=np.float32) - ]) - else: # for training - return tuple([ - torch.zeros(*batch_size, self.hidden_dim, *input_size), - torch.zeros(*batch_size, self.hidden_dim, *input_size) - ]) + return tuple([ + torch.zeros(*batch_size, self.hidden_dim, *input_size), + torch.zeros(*batch_size, self.hidden_dim, *input_size) + ]) def forward(self, input_tensor, cur_state): h_cur, c_cur = cur_state @@ -150,7 +144,7 @@ def __init__(self): self.head_v = ScalarHead((filters * 2, 6, 6), 1, 1) self.head_r = ScalarHead((filters * 2, 6, 6), 1, 1) - def init_hidden(self, batch_size=None): + def init_hidden(self, batch_size=[]): return self.body.init_hidden(self.input_size[1:], batch_size) def forward(self, x, hidden): diff --git a/handyrl/model.py b/handyrl/model.py index 621d703f..9eb7b94b 100755 --- a/handyrl/model.py +++ b/handyrl/model.py @@ -37,7 +37,11 @@ def __init__(self, model): def init_hidden(self, batch_size=None): if hasattr(self.model, 'init_hidden'): - return self.model.init_hidden(batch_size) + if batch_size is None: # for inference + hidden = self.model.init_hidden([]) + return map_r(hidden, lambda h: h.detach().numpy() if isinstance(h, torch.Tensor) else h) + else: # for training + return self.model.init_hidden(batch_size) return None def forward(self, *args, **kwargs): From fffc42df311ff12b77fa84c386449cb49350bd2d Mon Sep 17 00:00:00 2001 From: YuriCat Date: Tue, 25 Jan 2022 03:00:05 +0900 Subject: [PATCH 28/57] chore: change the position of debug output --- handyrl/worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handyrl/worker.py b/handyrl/worker.py index 58cd12f7..984cc9a6 100755 --- a/handyrl/worker.py +++ b/handyrl/worker.py @@ -206,8 +206,8 @@ def entry_server(port): print('finished entry server') def worker_server(port): - conn_acceptor = accept_socket_connections(port=port, timeout=0.3) print('started worker server %d' % port) + conn_acceptor = accept_socket_connections(port=port, timeout=0.3) while not self.shutdown_flag: # use super class's flag conn = next(conn_acceptor) if conn is not None: From 897ea32240e8a800b8829d1e3d50f5c43f51bab6 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Tue, 25 Jan 2022 03:16:14 +0900 Subject: [PATCH 29/57] feature: assing cumulative worker index --- handyrl/worker.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/handyrl/worker.py b/handyrl/worker.py index 58cd12f7..7048097c 100755 --- a/handyrl/worker.py +++ b/handyrl/worker.py @@ -84,8 +84,8 @@ def run(self): send_recv(self.conn, ('result', result)) -def make_worker_args(args, n_ga, gaid, wid, conn): - return args, conn, wid * n_ga + gaid +def make_worker_args(args, n_ga, gaid, base_wid, wid, conn): + return args, conn, base_wid + wid * n_ga + gaid def open_worker(args, conn, wid): @@ -107,10 +107,12 @@ def __init__(self, args, conn, gaid): n_pro, n_ga = args['worker']['num_parallel'], args['worker']['num_gathers'] num_workers_per_gather = (n_pro // n_ga) + int(gaid < n_pro % n_ga) + base_wid = args['worker'].get('base_worker_id', 0) + worker_conns = open_multiprocessing_connections( num_workers_per_gather, open_worker, - functools.partial(make_worker_args, args, n_ga, gaid) + functools.partial(make_worker_args, args, n_ga, gaid, base_wid) ) for conn in worker_conns: @@ -188,6 +190,7 @@ class WorkerServer(QueueCommunicator): def __init__(self, args): super().__init__() self.args = args + self.total_worker_count = 0 def run(self): # prepare listening connections @@ -199,6 +202,8 @@ def entry_server(port): if conn is not None: worker_args = conn.recv() print('accepted connection from %s!' % worker_args['address']) + worker_args['base_worker_id'] = self.total_worker_count + self.total_worker_count += worker_args['num_parallel'] args = copy.deepcopy(self.args) args['worker'] = worker_args conn.send(args) From 8dbf79a49799e8c26f7c6ed454eb6fd266b3b074 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Tue, 25 Jan 2022 04:29:58 +0900 Subject: [PATCH 30/57] chore: remove ugly comment --- handyrl/worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handyrl/worker.py b/handyrl/worker.py index 984cc9a6..796196df 100755 --- a/handyrl/worker.py +++ b/handyrl/worker.py @@ -208,7 +208,7 @@ def entry_server(port): def worker_server(port): print('started worker server %d' % port) conn_acceptor = accept_socket_connections(port=port, timeout=0.3) - while not self.shutdown_flag: # use super class's flag + while not self.shutdown_flag: conn = next(conn_acceptor) if conn is not None: self.add_connection(conn) From 3db83769028f8ed5a6b923913de055c273ff9a1e Mon Sep 17 00:00:00 2001 From: YuriCat Date: Tue, 25 Jan 2022 18:51:07 +0900 Subject: [PATCH 31/57] chore: update description of observers() --- handyrl/environment.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/handyrl/environment.py b/handyrl/environment.py index 02bb836c..080a27b1 100755 --- a/handyrl/environment.py +++ b/handyrl/environment.py @@ -78,7 +78,8 @@ def turns(self): return [self.turn()] # - # Should be defined if players except turn player also observe game states + # Should be defined if there are other players besides the turn player + # who should observe the environment (mainly with RNNs) # def observers(self): return [] From d2b0dc4271e5768279ed6eed87a9a8ee75fa9e77 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Tue, 25 Jan 2022 19:00:33 +0900 Subject: [PATCH 32/57] feature: add make_onnx_model.py --- scripts/make_onnx_model.py | 61 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 scripts/make_onnx_model.py diff --git a/scripts/make_onnx_model.py b/scripts/make_onnx_model.py new file mode 100644 index 00000000..73da3452 --- /dev/null +++ b/scripts/make_onnx_model.py @@ -0,0 +1,61 @@ + +# Usage: python3 script/make_onnx_model.py MODEL_PATH + +import sys +import yaml +import torch + +sys.path.append('./') + +from handyrl.environment import make_env +from handyrl.model import to_torch, ModelWrapper +from handyrl.util import map_r + + +model_path = sys.argv[-1] +saved_model_path = model_path + '.onnx' + +with open('config.yaml') as f: + args = yaml.safe_load(f) + +env = make_env(args['env_args']) +model = env.net() +model.load_state_dict(torch.load(model_path), strict=False) +model.eval() +model = ModelWrapper(model) + +model.load_state_dict(torch.load(model_path), strict=False) +model.eval() +print('loaded PyTorch model from %s' % model_path) + +env.reset() +obs = to_torch(env.observation(player=env.turn())) +obs = map_r(obs, lambda x: x.unsqueeze(0)) + +hidden = model.init_hidden([1]) +inputs = obs, hidden + +# You can specify meaningful names for the inputs here. +input_names = [] +map_r(obs, lambda y: input_names.append('input.' + str(len(input_names)))) + +hidden_names = [] +if hidden is not None: + map_r(hidden, lambda y: hidden_names.append('hidden.' + str(len(hidden_names)))) + input_names += hidden_names + +outputs = model(*inputs) +output_names = list(outputs.keys()) +if 'hidden' in output_names: + index = output_names.index('hidden') + output_names = output_names[:index] + [name + 'o' for name in hidden_names] + output_names[index+1:] + +print('input =', input_names) +print('output =', output_names) + +dynamic_axes = {name: {0: 'batch_size'} for name in (input_names + output_names)} + +torch.onnx.export(model, inputs, saved_model_path, + input_names=input_names, output_names=output_names, + dynamic_axes=dynamic_axes) +print('saved ONNX model to %s' % saved_model_path) From 154d2839affcd8bfb7e2e2a36f199a21b8c4e87a Mon Sep 17 00:00:00 2001 From: YuriCat Date: Tue, 25 Jan 2022 19:56:25 +0900 Subject: [PATCH 33/57] feature: add scripts/aux_swa.py --- scripts/aux_swa.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 scripts/aux_swa.py diff --git a/scripts/aux_swa.py b/scripts/aux_swa.py new file mode 100644 index 00000000..fb2bd59e --- /dev/null +++ b/scripts/aux_swa.py @@ -0,0 +1,56 @@ + +# Usage: python3 script/aux_swa.py [FINAL_EPOCH] [EPOCHS] [EPOCH_STEP] + +import os +import sys + +sys.path.append('./') + +import yaml + +import torch +from torch.optim.swa_utils import AveragedModel + +from handyrl.environment import make_env + + +# +# SWA (running equal averaging) +# + +model_dir = 'models' +saved_model_path = os.path.join('models', 'swa.pth') + +ed, length = int(sys.argv[1]), int(sys.argv[2]) +step = 1 +if len(sys.argv) >= 4: + step + +model_ids = [str(i) + '.pth' for i in range(ed - length + 1, ed + 1, step)] + +with open('config.yaml') as f: + args = yaml.safe_load(f) + +env = make_env(args['env_args']) +model = env.net() +model.load_state_dict(torch.load(os.path.join(model_dir, model_ids[0])), strict=True) + +def _avg_fn(averaged_model_parameter, model_parameter, num_averaged): + return averaged_model_parameter + (model_parameter - averaged_model_parameter) / (num_averaged + 1) + +swa_model = AveragedModel(model, avg_fn=_avg_fn) + +for model_id in model_ids: + model.load_state_dict(torch.load(os.path.join(model_dir, model_id)), strict=True) + swa_model.update_parameters(model) + +torch.save(swa_model.module.state_dict(), saved_model_path) + +print('Saved %s' % saved_model_path) + +# +# Test (load in strict=True) +# + +model = env.net() +model.load_state_dict(torch.load(saved_model_path), strict=True) From acd77f7dee002bb3d9c6b441a6fcc6e5e76022d9 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Tue, 25 Jan 2022 21:20:04 +0900 Subject: [PATCH 34/57] feature: accept smaller number of arguments by checking argument count --- handyrl/model.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/handyrl/model.py b/handyrl/model.py index bdcbd4bf..83103fb8 100755 --- a/handyrl/model.py +++ b/handyrl/model.py @@ -35,10 +35,6 @@ def __init__(self, model): super().__init__() self.model = model - def get_argument_names(f): - return f.__code__.co_varnames[:f.__code__.co_argcount] - self.forward_args = get_argument_names(self.model.forward) - def init_hidden(self, batch_size=None): if hasattr(self.model, 'init_hidden'): if batch_size is None: # for inference @@ -49,10 +45,12 @@ def init_hidden(self, batch_size=None): return None def forward(self, x, hidden, **kwargs): - # Remove 'hidden' input if it will not accepted - if 'hidden' not in self.forward_args: - return self.model.forward(x, **kwargs) + if self.model.forward.__code__.co_argcount == 1 + 1: + # ignore hidden state inputs if the number of arguments is just one + assert len(kwargs) == 0 + return self.model.forward(x) else: + # otherwize, users should prepare an argument for hidden states return self.model.forward(x, hidden, **kwargs) def inference(self, x, hidden, **kwargs): From 4fab02620b86edaa5629281460c2714247488b84 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Wed, 26 Jan 2022 02:18:07 +0900 Subject: [PATCH 35/57] chore: change variable names --- handyrl/train.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/handyrl/train.py b/handyrl/train.py index d0a8001c..d20d057e 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -91,18 +91,18 @@ def replace_none(a, b): batch_steps = args['burn_in_steps'] + args['forward_steps'] if len(tmask) < batch_steps: pad_len_b = args['burn_in_steps'] - (ep['train_start'] - ep['start']) - pad_len = batch_steps - len(tmask) - pad_len_b - obs = map_r(obs, lambda o: np.pad(o, [(pad_len_b, pad_len)] + [(0, 0)] * (len(o.shape) - 1), 'constant', constant_values=0)) - p = np.pad(p, [(pad_len_b, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - v = np.pad(np.concatenate([v, np.tile(oc, [pad_len, 1, 1])]), [(pad_len_b, 0), (0, 0), (0, 0)], 'constant', constant_values=0) - act = np.pad(act, [(pad_len_b, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - rew = np.pad(rew, [(pad_len_b, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - ret = np.pad(ret, [(pad_len_b, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - emask = np.pad(emask, [(pad_len_b, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - tmask = np.pad(tmask, [(pad_len_b, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - omask = np.pad(omask, [(pad_len_b, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - amask = np.pad(amask, [(pad_len_b, pad_len), (0, 0), (0, 0)], 'constant', constant_values=1e32) - progress = np.pad(progress, [(pad_len_b, pad_len), (0, 0)], 'constant', constant_values=1) + pad_len_a = batch_steps - len(tmask) - pad_len_b + obs = map_r(obs, lambda o: np.pad(o, [(pad_len_b, pad_len_a)] + [(0, 0)] * (len(o.shape) - 1), 'constant', constant_values=0)) + p = np.pad(p, [(pad_len_b, pad_len_a), (0, 0), (0, 0)], 'constant', constant_values=0) + v = np.concatenate([np.pad(v, [(pad_len_b, 0), (0, 0), (0, 0)], 'constant', constant_values=0), np.tile(oc, [pad_len_a, 1, 1])]) + act = np.pad(act, [(pad_len_b, pad_len_a), (0, 0), (0, 0)], 'constant', constant_values=0) + rew = np.pad(rew, [(pad_len_b, pad_len_a), (0, 0), (0, 0)], 'constant', constant_values=0) + ret = np.pad(ret, [(pad_len_b, pad_len_a), (0, 0), (0, 0)], 'constant', constant_values=0) + emask = np.pad(emask, [(pad_len_b, pad_len_a), (0, 0), (0, 0)], 'constant', constant_values=0) + tmask = np.pad(tmask, [(pad_len_b, pad_len_a), (0, 0), (0, 0)], 'constant', constant_values=0) + omask = np.pad(omask, [(pad_len_b, pad_len_a), (0, 0), (0, 0)], 'constant', constant_values=0) + amask = np.pad(amask, [(pad_len_b, pad_len_a), (0, 0), (0, 0)], 'constant', constant_values=1e32) + progress = np.pad(progress, [(pad_len_b, pad_len_a), (0, 0)], 'constant', constant_values=1) obss.append(obs) datum.append((p, v, act, oc, rew, ret, emask, tmask, omask, amask, progress)) @@ -289,16 +289,16 @@ def select_episode(self): break ep = self.episodes[ep_idx] turn_candidates = 1 + max(0, ep['steps'] - self.args['forward_steps']) # change start turn by sequence length - st_train = random.randrange(turn_candidates) - ed = min(st_train + self.args['forward_steps'], ep['steps']) - st = max(0, st_train - self.args['burn_in_steps']) + train_st = random.randrange(turn_candidates) + st = max(0, train_st - self.args['burn_in_steps']) + ed = min(train_st + self.args['forward_steps'], ep['steps']) st_block = st // self.args['compress_steps'] ed_block = (ed - 1) // self.args['compress_steps'] + 1 ep_minimum = { 'args': ep['args'], 'outcome': ep['outcome'], 'moment': ep['moment'][st_block:ed_block], 'base': st_block * self.args['compress_steps'], - 'start': st, 'end': ed, 'train_start': st_train, 'total': ep['steps'], + 'start': st, 'end': ed, 'train_start': train_st, 'total': ep['steps'], } return ep_minimum From 190852ad9eda7e655c57106cc1ab1ec61c086ae9 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Wed, 26 Jan 2022 18:16:35 +0900 Subject: [PATCH 36/57] feature: remove duplicate loading and unnecessary ModelWrapper --- scripts/make_onnx_model.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/scripts/make_onnx_model.py b/scripts/make_onnx_model.py index 73da3452..494365cb 100644 --- a/scripts/make_onnx_model.py +++ b/scripts/make_onnx_model.py @@ -8,7 +8,7 @@ sys.path.append('./') from handyrl.environment import make_env -from handyrl.model import to_torch, ModelWrapper +from handyrl.model import to_torch from handyrl.util import map_r @@ -20,10 +20,6 @@ env = make_env(args['env_args']) model = env.net() -model.load_state_dict(torch.load(model_path), strict=False) -model.eval() -model = ModelWrapper(model) - model.load_state_dict(torch.load(model_path), strict=False) model.eval() print('loaded PyTorch model from %s' % model_path) @@ -32,7 +28,7 @@ obs = to_torch(env.observation(player=env.turn())) obs = map_r(obs, lambda x: x.unsqueeze(0)) -hidden = model.init_hidden([1]) +hidden = model.init_hidden([1]) if hasattr(model, 'init_hidden') else None inputs = obs, hidden # You can specify meaningful names for the inputs here. From 09f74c796d0595cb5aff5db9becc6688525a220d Mon Sep 17 00:00:00 2001 From: YuriCat Date: Wed, 26 Jan 2022 18:17:48 +0900 Subject: [PATCH 37/57] fix: use turns()[0] instead of turn() --- scripts/make_onnx_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/make_onnx_model.py b/scripts/make_onnx_model.py index 494365cb..d9f9d551 100644 --- a/scripts/make_onnx_model.py +++ b/scripts/make_onnx_model.py @@ -25,7 +25,7 @@ print('loaded PyTorch model from %s' % model_path) env.reset() -obs = to_torch(env.observation(player=env.turn())) +obs = to_torch(env.observation(player=env.turns()[0])) obs = map_r(obs, lambda x: x.unsqueeze(0)) hidden = model.init_hidden([1]) if hasattr(model, 'init_hidden') else None From c804ce718049b2fd630f25fcaf17a9f81fc6a028 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Wed, 26 Jan 2022 18:59:19 +0900 Subject: [PATCH 38/57] chore: fix typo --- handyrl/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handyrl/model.py b/handyrl/model.py index 83103fb8..60c5fe36 100755 --- a/handyrl/model.py +++ b/handyrl/model.py @@ -50,7 +50,7 @@ def forward(self, x, hidden, **kwargs): assert len(kwargs) == 0 return self.model.forward(x) else: - # otherwize, users should prepare an argument for hidden states + # otherwise, users should prepare an argument for hidden states return self.model.forward(x, hidden, **kwargs) def inference(self, x, hidden, **kwargs): From ddcb6f725c7f0fe9e21f7c74919cdc3b8fbe8013 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Wed, 26 Jan 2022 19:57:15 +0900 Subject: [PATCH 39/57] fix: set skip setting from argument --- scripts/aux_swa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/aux_swa.py b/scripts/aux_swa.py index fb2bd59e..6c10a164 100644 --- a/scripts/aux_swa.py +++ b/scripts/aux_swa.py @@ -24,7 +24,7 @@ ed, length = int(sys.argv[1]), int(sys.argv[2]) step = 1 if len(sys.argv) >= 4: - step + step = int(sys.argv[3]) model_ids = [str(i) + '.pth' for i in range(ed - length + 1, ed + 1, step)] From b7397bc4fe733bd944d236a50cb75b82904dd467 Mon Sep 17 00:00:00 2001 From: Ikki Tanaka Date: Thu, 27 Jan 2022 10:47:05 +0900 Subject: [PATCH 40/57] Revert "feature: hidden hidden argument count" --- handyrl/envs/kaggle/hungry_geese.py | 2 +- handyrl/envs/tictactoe.py | 2 +- handyrl/model.py | 10 ++-------- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/handyrl/envs/kaggle/hungry_geese.py b/handyrl/envs/kaggle/hungry_geese.py index 060bdb7d..0a663adc 100644 --- a/handyrl/envs/kaggle/hungry_geese.py +++ b/handyrl/envs/kaggle/hungry_geese.py @@ -45,7 +45,7 @@ def __init__(self): self.head_p = nn.Linear(filters, 4, bias=False) self.head_v = nn.Linear(filters * 2, 1, bias=False) - def forward(self, x): + def forward(self, x, _=None): h = F.relu_(self.conv0(x)) for block in self.blocks: h = F.relu_(h + block(h)) diff --git a/handyrl/envs/tictactoe.py b/handyrl/envs/tictactoe.py index 1d594021..2c27809c 100755 --- a/handyrl/envs/tictactoe.py +++ b/handyrl/envs/tictactoe.py @@ -59,7 +59,7 @@ def __init__(self): self.head_p = Head((filters, 3, 3), 2, 9) self.head_v = Head((filters, 3, 3), 1, 1) - def forward(self, x): + def forward(self, x, hidden=None): h = F.relu(self.conv(x)) for block in self.blocks: h = F.relu(block(h)) diff --git a/handyrl/model.py b/handyrl/model.py index 83103fb8..9eb7b94b 100755 --- a/handyrl/model.py +++ b/handyrl/model.py @@ -44,14 +44,8 @@ def init_hidden(self, batch_size=None): return self.model.init_hidden(batch_size) return None - def forward(self, x, hidden, **kwargs): - if self.model.forward.__code__.co_argcount == 1 + 1: - # ignore hidden state inputs if the number of arguments is just one - assert len(kwargs) == 0 - return self.model.forward(x) - else: - # otherwize, users should prepare an argument for hidden states - return self.model.forward(x, hidden, **kwargs) + def forward(self, *args, **kwargs): + return self.model.forward(*args, **kwargs) def inference(self, x, hidden, **kwargs): # numpy array -> numpy array From 8d7e6537e6559e2a5dd26090d88b37352f4911ed Mon Sep 17 00:00:00 2001 From: YuriCat Date: Thu, 27 Jan 2022 14:38:47 +0900 Subject: [PATCH 41/57] chore: style fix in scripts/make_onnx_model.py --- scripts/make_onnx_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/make_onnx_model.py b/scripts/make_onnx_model.py index d9f9d551..664a253c 100644 --- a/scripts/make_onnx_model.py +++ b/scripts/make_onnx_model.py @@ -43,8 +43,8 @@ outputs = model(*inputs) output_names = list(outputs.keys()) if 'hidden' in output_names: - index = output_names.index('hidden') - output_names = output_names[:index] + [name + 'o' for name in hidden_names] + output_names[index+1:] + index = output_names.index('hidden') + output_names = output_names[:index] + [name + 'o' for name in hidden_names] + output_names[index+1:] print('input =', input_names) print('output =', output_names) From 2a8f79506bd01ea5235991524b7abffb503c4757 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Thu, 27 Jan 2022 18:45:29 +0900 Subject: [PATCH 42/57] Revert "chore: fix typo" --- handyrl/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handyrl/model.py b/handyrl/model.py index 60c5fe36..83103fb8 100755 --- a/handyrl/model.py +++ b/handyrl/model.py @@ -50,7 +50,7 @@ def forward(self, x, hidden, **kwargs): assert len(kwargs) == 0 return self.model.forward(x) else: - # otherwise, users should prepare an argument for hidden states + # otherwize, users should prepare an argument for hidden states return self.model.forward(x, hidden, **kwargs) def inference(self, x, hidden, **kwargs): From 24bbfaedbe8ce769499fa373c42a586f977e0c4c Mon Sep 17 00:00:00 2001 From: YuriCat Date: Fri, 28 Jan 2022 20:07:34 +0900 Subject: [PATCH 43/57] feature: add scripts/win_rate_plot.py --- scripts/win_rate_plot.py | 115 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 scripts/win_rate_plot.py diff --git a/scripts/win_rate_plot.py b/scripts/win_rate_plot.py new file mode 100644 index 00000000..d14da07d --- /dev/null +++ b/scripts/win_rate_plot.py @@ -0,0 +1,115 @@ + +# Usage: python3 scripts/win_rate_plot.py train-log-002.txt all + +# You should not include figures generated by this script in your paper, because +# 1. This version of HandyRL doesn't display all the results of the matches. +# 2. Smoothing method in this script is not a simple moving average. + +import os.path, sys, csv +import numpy as np + + +n = 15 + +def kernel(n): + a = np.array(list(range(1, 1 + (n+1)//2)) + list(range(1 + n//2,1,-1))) + return a / a.sum() + + +def get_wp_list(path): + opponents = set() + epoch_data_list = [{}] + epoch_list = [0] + step_list = [0] + game_list = [0] + + f = open(path) + lines = f.readlines() + prev_line = '' + + for line in lines: + if line.startswith('updated'): + epoch_data_list.append({}) + epoch_list.append(len(epoch_list)) + step_list.append(int(line.split('(')[1].rstrip().rstrip(')'))) + if line.startswith('win rate'): + elms = line.split() + opponent = elms[2].lstrip('(').rstrip(')') + games = int(elms[-1].lstrip('(').rstrip(')')) + wp = float(elms[-4]) if games > 0 else 0.0 + epoch_data_list[-1][opponent] = {'w': games * wp, 'n': games} + opponents.add(opponent) + if line.startswith('epoch '): + print(line, len(epoch_list)) + if ' ' in prev_line: + game = int(prev_line.split()[-1]) + game_list.append(game) + + prev_line = line + + game_list = game_list[:len(epoch_data_list)] + + clipped_epoch_list = epoch_list[n//2:-n//2+1] + clipped_step_list = step_list[n//2:-n//2+1] + clipped_game_list = game_list[n//2:-n//2+1] + null_data = {'w': 0, 'n': 0} + kn = kernel(n) + averaged_wp_lists = {} + start_epoch = {} + for opponent in opponents: + win_list = [e.get(opponent, null_data)['w'] for e in epoch_data_list] + n_list = [e.get(opponent, null_data)['n'] for e in epoch_data_list] + averaged_win_list = np.convolve(win_list, kn, mode='valid') + averaged_n_list = np.convolve(n_list, kn, mode='valid') + 1e-6 + averaged_wp_lists[opponent] = averaged_win_list / averaged_n_list + try: + start_epoch[opponent] = next(i for i, n in enumerate(n_list) if n >= 1) + except: + start_epoch[opponent] = 0 + return clipped_epoch_list, clipped_step_list, clipped_game_list, averaged_wp_lists, start_epoch + + +import matplotlib.pyplot as plt +import seaborn as sns +flatui = ["#9b59b6", "#95a5a6", "#34495e", "#3498db", "#e74c3c", "#2ecc71", "#b22222"] +sns.set_palette(sns.color_palette(flatui, 24)) + +clipped_epoch_list, clipped_step_list, clipped_game_list, averaged_wp_lists, start_epoch = get_wp_list(sys.argv[1]) + +opponents_ = list(averaged_wp_lists.keys()) +opponents = sorted(opponents_, key=lambda o: averaged_wp_lists[o][-1], reverse=True) + +fig = plt.figure() +ax = fig.add_subplot(1, 1, 1) + +last_win_rate = {} +for opponent in opponents: + if opponent == 'total': + continue + wp_list = averaged_wp_lists[opponent] + start = start_epoch[opponent] + # ax.plot(clipped_epoch_list[start:], wp_list[start:], label=opponent) + ax.plot(clipped_game_list[start:], wp_list[start:], label=opponent) + last_win_rate[opponent] = wp_list[-1] + +ax.set_xlabel('Games', size=14) +ax.set_ylabel('Win rate', size=14) +ax.set_title(sys.argv[2]) +ax.set_ylim(0, 1) +ax.legend() + +# Major ticks every 20, minor ticks every 5 +major_ticks = np.linspace(0, 1, 11) +minor_ticks = np.linspace(0, 1, 21) + +ax.set_yticks(major_ticks) +ax.set_yticks(minor_ticks, minor=True) + +# A corresponding grid +plt.grid(which='minor', color='gray', alpha=0.5, + linestyle='--', linewidth=0.5) +plt.grid(which='major', color='gray', alpha=0.5, + linestyle='--', linewidth=1) + +fig.tight_layout() +plt.show() From e023aa0ac40ec42ca14f2762d297df9aad5cb7fe Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sun, 30 Jan 2022 20:35:54 +0900 Subject: [PATCH 44/57] feature: use Tensor.dim() instead of len(Tensor.size()) --- handyrl/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handyrl/train.py b/handyrl/train.py index 031baa35..8b759da3 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -146,7 +146,7 @@ def forward_prediction(model, hidden, batch, args): for t in range(batch['turn_mask'].size(1)): obs = map_r(observations, lambda o: o[:, t].reshape(-1, *o.size()[3:])) # (..., B * P, ...) omask_ = batch['observation_mask'][:, t] - omask = map_r(hidden, lambda h: omask_.view(*h.size()[:2], *([1] * (len(h.size()) - 2)))) + omask = map_r(hidden, lambda h: omask_.view(*h.size()[:2], *([1] * (h.dim() - 2)))) hidden_ = bimap_r(hidden, omask, lambda h, m: h * m) # (..., B, P, ...) if args['turn_based_training'] and not args['observation']: hidden_ = map_r(hidden_, lambda h: h.sum(1)) # (..., B * 1, ...) From 1f15ba902b56d8dfc1770e06258c4d336a35d935 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sun, 30 Jan 2022 22:11:45 +0900 Subject: [PATCH 45/57] chore: style fix in train.py --- handyrl/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handyrl/train.py b/handyrl/train.py index 8b759da3..857055c8 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -155,7 +155,7 @@ def forward_prediction(model, hidden, batch, args): if t < args['burn_in_steps']: model.eval() with torch.no_grad(): - outputs_= model(obs, hidden_) + outputs_ = model(obs, hidden_) else: if not model.training: model.train() From 8a28df741136c3262e1fb9712ea65c9906dd0e85 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sun, 30 Jan 2022 22:22:15 +0900 Subject: [PATCH 46/57] feature: set batch_shape, and use flatten and unflatten --- handyrl/train.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/handyrl/train.py b/handyrl/train.py index 031baa35..d768740e 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -135,16 +135,18 @@ def forward_prediction(model, hidden, batch, args): """ observations = batch['observation'] # (B, T, P, ...) + batch_shape = batch['action'].size()[:3] # (B, T, P or 1) if hidden is None: # feed-forward neural network obs = map_r(observations, lambda o: o.view(-1, *o.size()[3:])) outputs = model(obs, None) + outputs = map_r(outputs, lambda o: o.unflatten(0, batch_shape)) else: # sequential computation with RNN outputs = {} - for t in range(batch['turn_mask'].size(1)): - obs = map_r(observations, lambda o: o[:, t].reshape(-1, *o.size()[3:])) # (..., B * P, ...) + for t in range(batch_shape[1]): + obs = map_r(observations, lambda o: o[:, t].flatten(0, 1)) # (..., B * P, ...) omask_ = batch['observation_mask'][:, t] omask = map_r(hidden, lambda h: omask_.view(*h.size()[:2], *([1] * (len(h.size()) - 2)))) hidden_ = bimap_r(hidden, omask, lambda h, m: h * m) # (..., B, P, ...) @@ -160,17 +162,16 @@ def forward_prediction(model, hidden, batch, args): if not model.training: model.train() outputs_ = model(obs, hidden_) + outputs_ = map_r(outputs_, lambda o: o.unflatten(0, (batch_shape[0], -1))) # (..., B, P or 1, ...) for k, o in outputs_.items(): if k == 'hidden': next_hidden = o else: outputs[k] = outputs.get(k, []) + [o] - next_hidden = bimap_r(next_hidden, hidden, lambda nh, h: nh.view(h.size(0), -1, *h.size()[2:])) # (..., B, P or 1, ...) hidden = trimap_r(hidden, next_hidden, omask, lambda h, nh, m: h * (1 - m) + nh * m) outputs = {k: torch.stack(o, dim=1) for k, o in outputs.items() if o[0] is not None} for k, o in outputs.items(): - o = o.view(*batch['turn_mask'].size()[:2], -1, o.size(-1)) if k == 'policy': # gather turn player's policies outputs[k] = o.mul(batch['turn_mask']).sum(2, keepdim=True) - batch['action_mask'] From 927d7eb5bfd5021c26236cef0caae0f2626b479c Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sun, 30 Jan 2022 22:27:58 +0900 Subject: [PATCH 47/57] feature: fix dimension description --- handyrl/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/handyrl/train.py b/handyrl/train.py index d768740e..9a707a75 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -134,7 +134,7 @@ def forward_prediction(model, hidden, batch, args): tuple: batch outputs of neural network """ - observations = batch['observation'] # (B, T, P, ...) + observations = batch['observation'] # (B, T, P or 1, ...) batch_shape = batch['action'].size()[:3] # (B, T, P or 1) if hidden is None: @@ -146,7 +146,7 @@ def forward_prediction(model, hidden, batch, args): # sequential computation with RNN outputs = {} for t in range(batch_shape[1]): - obs = map_r(observations, lambda o: o[:, t].flatten(0, 1)) # (..., B * P, ...) + obs = map_r(observations, lambda o: o[:, t].flatten(0, 1)) # (..., B * P or 1, ...) omask_ = batch['observation_mask'][:, t] omask = map_r(hidden, lambda h: omask_.view(*h.size()[:2], *([1] * (len(h.size()) - 2)))) hidden_ = bimap_r(hidden, omask, lambda h, m: h * m) # (..., B, P, ...) @@ -162,7 +162,7 @@ def forward_prediction(model, hidden, batch, args): if not model.training: model.train() outputs_ = model(obs, hidden_) - outputs_ = map_r(outputs_, lambda o: o.unflatten(0, (batch_shape[0], -1))) # (..., B, P or 1, ...) + outputs_ = map_r(outputs_, lambda o: o.unflatten(0, (batch_shape[0], batch_shape[2]))) # (..., B, P or 1, ...) for k, o in outputs_.items(): if k == 'hidden': next_hidden = o From f1b57e39fa43ef23f1576d0d38dfbba2ce546418 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sun, 30 Jan 2022 22:50:51 +0900 Subject: [PATCH 48/57] chore: use flatten in feed forward computation --- handyrl/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handyrl/train.py b/handyrl/train.py index 9a707a75..2fe4b486 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -139,7 +139,7 @@ def forward_prediction(model, hidden, batch, args): if hidden is None: # feed-forward neural network - obs = map_r(observations, lambda o: o.view(-1, *o.size()[3:])) + obs = map_r(observations, lambda o: o.flatten(0, 2)) outputs = model(obs, None) outputs = map_r(outputs, lambda o: o.unflatten(0, batch_shape)) else: From 037ae152acedb855ebd02220b8c6751bf93089f1 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sun, 30 Jan 2022 23:20:17 +0900 Subject: [PATCH 49/57] chore: use flatten in RNN computation --- handyrl/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handyrl/train.py b/handyrl/train.py index 2fe4b486..b6a0ce48 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -153,7 +153,7 @@ def forward_prediction(model, hidden, batch, args): if args['turn_based_training'] and not args['observation']: hidden_ = map_r(hidden_, lambda h: h.sum(1)) # (..., B * 1, ...) else: - hidden_ = map_r(hidden_, lambda h: h.view(-1, *h.size()[2:])) # (..., B * P, ...) + hidden_ = map_r(hidden_, lambda h: h.flatten(0, 1)) # (..., B * P, ...) if t < args['burn_in_steps']: model.eval() with torch.no_grad(): From 62e910a09d4f2fd934572b49eb3344eab9c0a33c Mon Sep 17 00:00:00 2001 From: YuriCat Date: Mon, 31 Jan 2022 09:38:20 +0900 Subject: [PATCH 50/57] feature: save_model.pth.onnx -> saved_model.onnx --- scripts/make_onnx_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/make_onnx_model.py b/scripts/make_onnx_model.py index d9f9d551..05e52b1f 100644 --- a/scripts/make_onnx_model.py +++ b/scripts/make_onnx_model.py @@ -13,7 +13,7 @@ model_path = sys.argv[-1] -saved_model_path = model_path + '.onnx' +saved_model_path = model_path.rstrip('.pth') + '.onnx' with open('config.yaml') as f: args = yaml.safe_load(f) From 20d059c823854a6b86cc4ca75a188661185dbaa4 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Mon, 31 Jan 2022 13:00:53 +0900 Subject: [PATCH 51/57] chore: paper -> academic paper --- scripts/win_rate_plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/win_rate_plot.py b/scripts/win_rate_plot.py index d14da07d..9d11c8e0 100644 --- a/scripts/win_rate_plot.py +++ b/scripts/win_rate_plot.py @@ -1,7 +1,7 @@ # Usage: python3 scripts/win_rate_plot.py train-log-002.txt all -# You should not include figures generated by this script in your paper, because +# You should not include figures generated by this script in your academic paper, because # 1. This version of HandyRL doesn't display all the results of the matches. # 2. Smoothing method in this script is not a simple moving average. From 6740725d739a869d65caf33c494d13dfb00160cf Mon Sep 17 00:00:00 2001 From: YuriCat Date: Mon, 31 Jan 2022 18:31:44 +0900 Subject: [PATCH 52/57] feature: add batch-shape description for feedforward net --- handyrl/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/handyrl/train.py b/handyrl/train.py index b6a0ce48..0aecacf0 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -134,14 +134,14 @@ def forward_prediction(model, hidden, batch, args): tuple: batch outputs of neural network """ - observations = batch['observation'] # (B, T, P or 1, ...) + observations = batch['observation'] # (..., B, T, P or 1, ...) batch_shape = batch['action'].size()[:3] # (B, T, P or 1) if hidden is None: # feed-forward neural network - obs = map_r(observations, lambda o: o.flatten(0, 2)) + obs = map_r(observations, lambda o: o.flatten(0, 2)) # (..., B * T * P or 1, ...) outputs = model(obs, None) - outputs = map_r(outputs, lambda o: o.unflatten(0, batch_shape)) + outputs = map_r(outputs, lambda o: o.unflatten(0, batch_shape)) # (..., B, T, P or 1, ...) else: # sequential computation with RNN outputs = {} From bfb56ba2b8cd96fcc0933b5aa72c900ed083f9a4 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Mon, 31 Jan 2022 19:26:34 +0900 Subject: [PATCH 53/57] fix: consider the case when training policies of two players --- handyrl/train.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/handyrl/train.py b/handyrl/train.py index 0aecacf0..86a28198 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -174,7 +174,10 @@ def forward_prediction(model, hidden, batch, args): for k, o in outputs.items(): if k == 'policy': # gather turn player's policies - outputs[k] = o.mul(batch['turn_mask']).sum(2, keepdim=True) - batch['action_mask'] + o = o.mul(batch['turn_mask']) + if o.size(2) > 1 and batch_shape[2] == 1: # turn-alternating batch + o = o.sum(2, keepdim=True) + outputs[k] = o - batch['action_mask'] else: # mask valid target values and cumulative rewards outputs[k] = o.mul(batch['observation_mask']) From c5dd1c4ae5d63042ad0ba7296c29e48792480b2e Mon Sep 17 00:00:00 2001 From: YuriCat Date: Mon, 31 Jan 2022 19:41:45 +0900 Subject: [PATCH 54/57] chore: change position of comment --- handyrl/train.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/handyrl/train.py b/handyrl/train.py index 86a28198..78d472d7 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -173,10 +173,9 @@ def forward_prediction(model, hidden, batch, args): for k, o in outputs.items(): if k == 'policy': - # gather turn player's policies o = o.mul(batch['turn_mask']) if o.size(2) > 1 and batch_shape[2] == 1: # turn-alternating batch - o = o.sum(2, keepdim=True) + o = o.sum(2, keepdim=True) # gather turn player's policies outputs[k] = o - batch['action_mask'] else: # mask valid target values and cumulative rewards From e43df89da28098f471a615bf963bde07e0a91429 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Tue, 1 Feb 2022 17:43:47 +0900 Subject: [PATCH 55/57] chore: since critic is not necessary, it should be None at default --- handyrl/evaluation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/handyrl/evaluation.py b/handyrl/evaluation.py index c57a1712..64f3306a 100755 --- a/handyrl/evaluation.py +++ b/handyrl/evaluation.py @@ -76,7 +76,7 @@ def observe(self, player): return send_recv(self.conn, ('observe', [player])) -def exec_match(env, agents, critic, show=False, game_args={}): +def exec_match(env, agents, critic=None, show=False, game_args={}): ''' match with shared game environment ''' if env.reset(game_args): return None @@ -105,7 +105,7 @@ def exec_match(env, agents, critic, show=False, game_args={}): return outcome -def exec_network_match(env, network_agents, critic, show=False, game_args={}): +def exec_network_match(env, network_agents, critic=None, show=False, game_args={}): ''' match with divided game environment ''' if env.reset(game_args): return None @@ -165,7 +165,7 @@ def execute(self, models, args): else: agents[p] = Agent(model) - outcome = exec_match(self.env, agents, None) + outcome = exec_match(self.env, agents) if outcome is None: print('None episode in evaluation!') return None From 39cfe4cc99fb0996c24ff580373172656a3a2c9b Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sat, 5 Feb 2022 19:45:41 +0900 Subject: [PATCH 56/57] chore: remove unused imports from scripts/win_rate_plot.py --- scripts/win_rate_plot.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/win_rate_plot.py b/scripts/win_rate_plot.py index 9d11c8e0..7d8e67e3 100644 --- a/scripts/win_rate_plot.py +++ b/scripts/win_rate_plot.py @@ -3,9 +3,9 @@ # You should not include figures generated by this script in your academic paper, because # 1. This version of HandyRL doesn't display all the results of the matches. -# 2. Smoothing method in this script is not a simple moving average. +# 2. Smoothing method in this script is not a simple moving average. -import os.path, sys, csv +import sys import numpy as np From af94d9b0052054d6cecff79669b2dd85fe53be8d Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sat, 5 Feb 2022 20:10:20 +0900 Subject: [PATCH 57/57] chore: remove unused import from connection.py --- handyrl/connection.py | 1 - 1 file changed, 1 deletion(-) diff --git a/handyrl/connection.py b/handyrl/connection.py index 28805d00..e5a163af 100755 --- a/handyrl/connection.py +++ b/handyrl/connection.py @@ -2,7 +2,6 @@ # Licensed under The MIT License [see LICENSE for details] import io -import time import struct import socket import pickle