diff --git a/README.md b/README.md index 05c7eb64..351e7246 100755 --- a/README.md +++ b/README.md @@ -113,5 +113,5 @@ NOTE: Default opponent AI is random agent implemented in `evaluation.py`. You ca ## Use Cases -* [Month 1 Winner in Hungry Geese (Kaggle)](https://www.kaggle.com/c/hungry-geese/discussion/222941) -* [The 5th solution in Google Research Football with Manchester City F.C. (Kaggle)](https://www.kaggle.com/c/google-football/discussion/203412) +* [The 1st place solution in Hungry Geese (Kaggle)](https://www.kaggle.com/c/hungry-geese/discussion/263279) +* [The 5th place solution in Google Research Football with Manchester City F.C. (Kaggle)](https://www.kaggle.com/c/google-football/discussion/203412) diff --git a/config.yaml b/config.yaml index c141869e..2bf65d85 100755 --- a/config.yaml +++ b/config.yaml @@ -10,6 +10,7 @@ train_args: observation: False gamma: 0.8 forward_steps: 16 + burn_in_steps: 0 # for RNNs compress_steps: 4 entropy_regularization: 1.0e-1 entropy_regularization_decay: 0.1 diff --git a/handyrl/agent.py b/handyrl/agent.py index 72f1778b..4af84f5e 100755 --- a/handyrl/agent.py +++ b/handyrl/agent.py @@ -34,16 +34,17 @@ def print_outputs(env, prob, v): if hasattr(env, 'print_outputs'): env.print_outputs(prob, v) else: - print('v = %f' % v) - print('p = %s' % (prob * 1000).astype(int)) + if v is not None: + print('v = %f' % v) + if prob is not None: + print('p = %s' % (prob * 1000).astype(int)) class Agent: - def __init__(self, model, observation=False, temperature=0.0): + def __init__(self, model, temperature=0.0): # model might be a neural net, or some planning algorithm such as game tree search self.model = model self.hidden = None - self.observation = observation self.temperature = temperature def reset(self, env, show=False): @@ -73,12 +74,10 @@ def action(self, env, player, show=False): return random.choices(np.arange(len(p)), weights=softmax(p / self.temperature))[0] def observe(self, env, player, show=False): - v = None - if self.observation: - outputs = self.plan(env.observation(player)) - v = outputs.get('value', None) - if show: - print_outputs(env, None, v) + outputs = self.plan(env.observation(player)) + v = outputs.get('value', None) + if show: + print_outputs(env, None, v) return v if v is not None else [0.0] @@ -101,5 +100,5 @@ def plan(self, obs): class SoftAgent(Agent): - def __init__(self, model, observation=False): - super().__init__(model, observation=observation, temperature=1.0) + def __init__(self, model): + super().__init__(model, temperature=1.0) diff --git a/handyrl/connection.py b/handyrl/connection.py index 28805d00..e5a163af 100755 --- a/handyrl/connection.py +++ b/handyrl/connection.py @@ -2,7 +2,6 @@ # Licensed under The MIT License [see LICENSE for details] import io -import time import struct import socket import pickle diff --git a/handyrl/environment.py b/handyrl/environment.py index 9bca1713..080a27b1 100755 --- a/handyrl/environment.py +++ b/handyrl/environment.py @@ -77,6 +77,13 @@ def turn(self): def turns(self): return [self.turn()] + # + # Should be defined if there are other players besides the turn player + # who should observe the environment (mainly with RNNs) + # + def observers(self): + return [] + # # Should be defined in all games # diff --git a/handyrl/envs/geister.py b/handyrl/envs/geister.py index 747beec2..a82bd6af 100755 --- a/handyrl/envs/geister.py +++ b/handyrl/envs/geister.py @@ -34,16 +34,10 @@ def __init__(self, input_dim, hidden_dim, kernel_size, bias): ) def init_hidden(self, input_size, batch_size): - if batch_size is None: # for inference - return tuple([ - np.zeros((self.hidden_dim, *input_size), dtype=np.float32), - np.zeros((self.hidden_dim, *input_size), dtype=np.float32) - ]) - else: # for training - return tuple([ - torch.zeros(*batch_size, self.hidden_dim, *input_size), - torch.zeros(*batch_size, self.hidden_dim, *input_size) - ]) + return tuple([ + torch.zeros(*batch_size, self.hidden_dim, *input_size), + torch.zeros(*batch_size, self.hidden_dim, *input_size) + ]) def forward(self, input_tensor, cur_state): h_cur, c_cur = cur_state @@ -63,6 +57,11 @@ def forward(self, input_tensor, cur_state): return h_next, c_next +# Deep Repeated Conv-LSTM (https://arxiv.org/abs/1901.03559) +# increases expressive power with fewer parameters +# by repeatedly computing multi-layer convolutional LSTM. +# When num_repeats=1, it is simply a multi-layer Conv-LSTM. + class DRC(nn.Module): def __init__(self, num_layers, input_dim, hidden_dim, kernel_size=3, bias=True): super().__init__() @@ -93,7 +92,7 @@ def forward(self, x, hidden, num_repeats): hs, cs = hidden for _ in range(num_repeats): for i, block in enumerate(self.blocks): - hs[i], cs[i] = block(x, (hs[i], cs[i])) + hs[i], cs[i] = block(hs[i - 1] if i > 0 else x, (hs[i], cs[i])) return hs[-1], (hs, cs) @@ -145,7 +144,7 @@ def __init__(self): self.head_v = ScalarHead((filters * 2, 6, 6), 1, 1) self.head_r = ScalarHead((filters * 2, 6, 6), 1, 1) - def init_hidden(self, batch_size=None): + def init_hidden(self, batch_size=[]): return self.body.init_hidden(self.input_size[1:], batch_size) def forward(self, x, hidden): @@ -448,6 +447,8 @@ def legal(self, action): if self.turn_count < 0: layout = action - 4 * 6 * 6 return 0 <= layout < 70 + elif not 0 <= action < 4 * 6 * 6: + return False pos_from = self.action2from(action, self.color) pos_to = self.action2to(action, self.color) diff --git a/handyrl/evaluation.py b/handyrl/evaluation.py index b00db9f6..64f3306a 100755 --- a/handyrl/evaluation.py +++ b/handyrl/evaluation.py @@ -76,7 +76,7 @@ def observe(self, player): return send_recv(self.conn, ('observe', [player])) -def exec_match(env, agents, critic, show=False, game_args={}): +def exec_match(env, agents, critic=None, show=False, game_args={}): ''' match with shared game environment ''' if env.reset(game_args): return None @@ -88,11 +88,12 @@ def exec_match(env, agents, critic, show=False, game_args={}): if show and critic is not None: print('cv = ', critic.observe(env, None, show=False)[0]) turn_players = env.turns() + observers = env.observers() actions = {} for p, agent in agents.items(): if p in turn_players: actions[p] = agent.action(env, p, show=show) - else: + elif p in observers: agent.observe(env, p, show=show) if env.step(actions): return None @@ -104,7 +105,7 @@ def exec_match(env, agents, critic, show=False, game_args={}): return outcome -def exec_network_match(env, network_agents, critic, show=False, game_args={}): +def exec_network_match(env, network_agents, critic=None, show=False, game_args={}): ''' match with divided game environment ''' if env.reset(game_args): return None @@ -117,12 +118,13 @@ def exec_network_match(env, network_agents, critic, show=False, game_args={}): if show and critic is not None: print('cv = ', critic.observe(env, None, show=False)[0]) turn_players = env.turns() + observers = env.observers() actions = {} for p, agent in network_agents.items(): if p in turn_players: action = agent.action(p) actions[p] = env.str2action(action, p) - else: + elif p in observers: agent.observe(p) if env.step(actions): return None @@ -161,9 +163,9 @@ def execute(self, models, args): if model is None: agents[p] = build_agent(opponent, self.env) else: - agents[p] = Agent(model, self.args['observation']) + agents[p] = Agent(model) - outcome = exec_match(self.env, agents, None) + outcome = exec_match(self.env, agents) if outcome is None: print('None episode in evaluation!') return None @@ -277,10 +279,78 @@ def network_match_acception(n, env_args, num_agents, port): return agents_list -def get_model(env, model_path): +class OnnxModel: + def __init__(self, model_path): + self.model_path = model_path + self.ort_session = None + + def _open_session(self): + import os + os.environ['OMP_NUM_THREADS'] = '1' + os.environ['OMP_WAIT_POLICY'] = 'PASSIVE' + + import onnxruntime + opts = onnxruntime.SessionOptions() + opts.intra_op_num_threads = 1 + opts.inter_op_num_threads = 1 + opts.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL + + self.ort_session = onnxruntime.InferenceSession(self.model_path, sess_options=opts) + + def init_hidden(self): + if self.ort_session is None: + self._open_session() + hidden_inputs = [y for y in self.ort_session.get_inputs() if y.name.startswith('hidden')] + if len(hidden_inputs) == 0: + return None + import numpy as np + type_map = { + 'tensor(float)': np.float32, + 'tensor(int64)': np.int64, + } + hidden_tensors = [np.zeros(y.shape[1:], dtype=type_map[y.type]) for y in hidden_inputs] + return hidden_tensors + + def inference(self, x, hidden=None, batch_input=False): + # numpy array -> numpy array + if self.ort_session is None: + self._open_session() + + ort_inputs = {} + ort_input_names = [y.name for y in self.ort_session.get_inputs()] + + import numpy as np + def insert_input(y): + y = y if batch_input else np.expand_dims(y, 0) + ort_inputs[ort_input_names[len(ort_inputs)]] = y + from .util import map_r + map_r(x, lambda y: insert_input(y)) + if hidden is not None: + map_r(hidden, lambda y: insert_input(y)) + ort_outputs = self.ort_session.run(None, ort_inputs) + if not batch_input: + ort_outputs = [o.squeeze(0) for o in ort_outputs] + + ort_output_names = [y.name for y in self.ort_session.get_outputs()] + outputs = {name: ort_outputs[i] for i, name in enumerate(ort_output_names)} + + hidden_outputs = [] + for k in list(outputs.keys()): + if k.startswith('hidden'): + hidden_outputs.append(outputs.pop(k)) + if len(hidden_outputs) == 0: + hidden_outputs = None + + outputs = {**outputs, 'hidden': hidden_outputs} + return outputs + + +def load_model(model_path, model): + if model_path.endswith('.onnx'): + model = OnnxModel(model_path) + return model import torch from .model import ModelWrapper - model = env.net() model.load_state_dict(torch.load(model_path)) model.eval() return ModelWrapper(model) @@ -290,7 +360,7 @@ def client_mp_child(env_args, model_path, conn): env = make_env(env_args) agent = build_agent(model_path, env) if agent is None: - model = get_model(env, model_path) + model = load_model(model_path, env.net()) agent = Agent(model) NetworkAgentClient(agent, env, conn).run() @@ -306,7 +376,8 @@ def eval_main(args, argv): agent1 = build_agent(model_path, env) if agent1 is None: - agent1 = Agent(get_model(env, model_path)) + model = load_model(model_path, env.net()) + agent1 = Agent(model) critic = None print('%d process, %d games' % (num_process, num_games)) diff --git a/handyrl/generation.py b/handyrl/generation.py index 63b7e553..e03e857b 100755 --- a/handyrl/generation.py +++ b/handyrl/generation.py @@ -29,32 +29,35 @@ def generate(self, models, args): return None while not self.env.terminal(): - moment_keys = ['observation', 'policy', 'action_mask', 'action', 'value', 'reward', 'return'] + moment_keys = ['observation', 'selected_prob', 'action_mask', 'action', 'value', 'reward', 'return'] moment = {key: {p: None for p in self.env.players()} for key in moment_keys} turn_players = self.env.turns() + observers = self.env.observers() for player in self.env.players(): - if player in turn_players or self.args['observation']: - obs = self.env.observation(player) - model = models[player] - outputs = model.inference(obs, hidden[player]) - hidden[player] = outputs.get('hidden', None) - v = outputs.get('value', None) - - moment['observation'][player] = obs - moment['value'][player] = v - - if player in turn_players: - p_ = outputs['policy'] - legal_actions = self.env.legal_actions(player) - action_mask = np.ones_like(p_) * 1e32 - action_mask[legal_actions] = 0 - p = p_ - action_mask - action = random.choices(legal_actions, weights=softmax(p[legal_actions]))[0] - - moment['policy'][player] = p - moment['action_mask'][player] = action_mask - moment['action'][player] = action + if player not in turn_players + observers: + continue + + obs = self.env.observation(player) + model = models[player] + outputs = model.inference(obs, hidden[player]) + hidden[player] = outputs.get('hidden', None) + v = outputs.get('value', None) + + moment['observation'][player] = obs + moment['value'][player] = v + + if player in turn_players: + p_ = outputs['policy'] + legal_actions = self.env.legal_actions(player) + action_mask = np.ones_like(p_) * 1e32 + action_mask[legal_actions] = 0 + p = softmax(p_ - action_mask) + action = random.choices(legal_actions, weights=p[legal_actions])[0] + + moment['selected_prob'][player] = p[action] + moment['action_mask'][player] = action_mask + moment['action'][player] = action err = self.env.step(moment['action']) if err: diff --git a/handyrl/model.py b/handyrl/model.py index 621d703f..9eb7b94b 100755 --- a/handyrl/model.py +++ b/handyrl/model.py @@ -37,7 +37,11 @@ def __init__(self, model): def init_hidden(self, batch_size=None): if hasattr(self.model, 'init_hidden'): - return self.model.init_hidden(batch_size) + if batch_size is None: # for inference + hidden = self.model.init_hidden([]) + return map_r(hidden, lambda h: h.detach().numpy() if isinstance(h, torch.Tensor) else h) + else: # for training + return self.model.init_hidden(batch_size) return None def forward(self, *args, **kwargs): diff --git a/handyrl/train.py b/handyrl/train.py index b933ebaa..07dbf81f 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -26,7 +26,6 @@ from .model import to_torch, to_gpu, ModelWrapper from .losses import compute_target from .connection import MultiProcessJobExecutor -from .connection import accept_socket_connections from .worker import WorkerCluster, WorkerServer @@ -58,19 +57,19 @@ def replace_none(a, b): players = [random.choice(players)] obs_zeros = map_r(moments[0]['observation'][moments[0]['turn'][0]], lambda o: np.zeros_like(o)) # template for padding - p_zeros = np.zeros_like(moments[0]['policy'][moments[0]['turn'][0]]) # template for padding + amask_zeros = np.zeros_like(moments[0]['action_mask'][moments[0]['turn'][0]]) # template for padding - # data that is chainge by training configuration + # data that is changed by training configuration if args['turn_based_training'] and not args['observation']: obs = [[m['observation'][m['turn'][0]]] for m in moments] - p = np.array([[m['policy'][m['turn'][0]]] for m in moments]) + prob = np.array([[[m['selected_prob'][m['turn'][0]]]] for m in moments]) act = np.array([[m['action'][m['turn'][0]]] for m in moments], dtype=np.int64)[..., np.newaxis] amask = np.array([[m['action_mask'][m['turn'][0]]] for m in moments]) else: obs = [[replace_none(m['observation'][player], obs_zeros) for player in players] for m in moments] - p = np.array([[replace_none(m['policy'][player], p_zeros) for player in players] for m in moments]) + prob = np.array([[[replace_none(m['selected_prob'][player], 1.0)] for player in players] for m in moments]) act = np.array([[replace_none(m['action'][player], 0) for player in players] for m in moments], dtype=np.int64)[..., np.newaxis] - amask = np.array([[replace_none(m['action_mask'][player], p_zeros + 1e32) for player in players] for m in moments]) + amask = np.array([[replace_none(m['action_mask'][player], amask_zeros + 1e32) for player in players] for m in moments]) # reshape observation obs = rotate(rotate(obs)) # (T, P, ..., ...) -> (P, ..., T, ...) -> (..., T, P, ...) @@ -83,35 +82,37 @@ def replace_none(a, b): oc = np.array([ep['outcome'][player] for player in players], dtype=np.float32).reshape(1, len(players), -1) emask = np.ones((len(moments), 1, 1), dtype=np.float32) # episode mask - tmask = np.array([[[m['policy'][player] is not None] for player in players] for m in moments], dtype=np.float32) - omask = np.array([[[m['value'][player] is not None] for player in players] for m in moments], dtype=np.float32) + tmask = np.array([[[m['selected_prob'][player] is not None] for player in players] for m in moments], dtype=np.float32) + omask = np.array([[[m['observation'][player] is not None] for player in players] for m in moments], dtype=np.float32) progress = np.arange(ep['start'], ep['end'], dtype=np.float32)[..., np.newaxis] / ep['total'] # pad each array if step length is short - if len(tmask) < args['forward_steps']: - pad_len = args['forward_steps'] - len(tmask) - obs = map_r(obs, lambda o: np.pad(o, [(0, pad_len)] + [(0, 0)] * (len(o.shape) - 1), 'constant', constant_values=0)) - p = np.pad(p, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - v = np.concatenate([v, np.tile(oc, [pad_len, 1, 1])]) - act = np.pad(act, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - rew = np.pad(rew, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - ret = np.pad(ret, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - emask = np.pad(emask, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - tmask = np.pad(tmask, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - omask = np.pad(omask, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - amask = np.pad(amask, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=1e32) - progress = np.pad(progress, [(0, pad_len), (0, 0)], 'constant', constant_values=1) + batch_steps = args['burn_in_steps'] + args['forward_steps'] + if len(tmask) < batch_steps: + pad_len_b = args['burn_in_steps'] - (ep['train_start'] - ep['start']) + pad_len_a = batch_steps - len(tmask) - pad_len_b + obs = map_r(obs, lambda o: np.pad(o, [(pad_len_b, pad_len_a)] + [(0, 0)] * (len(o.shape) - 1), 'constant', constant_values=0)) + prob = np.pad(prob, [(pad_len_b, pad_len_a), (0, 0), (0, 0)], 'constant', constant_values=1) + v = np.concatenate([np.pad(v, [(pad_len_b, 0), (0, 0), (0, 0)], 'constant', constant_values=0), np.tile(oc, [pad_len_a, 1, 1])]) + act = np.pad(act, [(pad_len_b, pad_len_a), (0, 0), (0, 0)], 'constant', constant_values=0) + rew = np.pad(rew, [(pad_len_b, pad_len_a), (0, 0), (0, 0)], 'constant', constant_values=0) + ret = np.pad(ret, [(pad_len_b, pad_len_a), (0, 0), (0, 0)], 'constant', constant_values=0) + emask = np.pad(emask, [(pad_len_b, pad_len_a), (0, 0), (0, 0)], 'constant', constant_values=0) + tmask = np.pad(tmask, [(pad_len_b, pad_len_a), (0, 0), (0, 0)], 'constant', constant_values=0) + omask = np.pad(omask, [(pad_len_b, pad_len_a), (0, 0), (0, 0)], 'constant', constant_values=0) + amask = np.pad(amask, [(pad_len_b, pad_len_a), (0, 0), (0, 0)], 'constant', constant_values=1e32) + progress = np.pad(progress, [(pad_len_b, pad_len_a), (0, 0)], 'constant', constant_values=1) obss.append(obs) - datum.append((p, v, act, oc, rew, ret, emask, tmask, omask, amask, progress)) + datum.append((prob, v, act, oc, rew, ret, emask, tmask, omask, amask, progress)) obs = to_torch(bimap_r(obs_zeros, rotate(obss), lambda _, o: np.array(o))) - p, v, act, oc, rew, ret, emask, tmask, omask, amask, progress = [to_torch(np.array(val)) for val in zip(*datum)] + prob, v, act, oc, rew, ret, emask, tmask, omask, amask, progress = [to_torch(np.array(val)) for val in zip(*datum)] return { 'observation': obs, - 'policy': p, 'value': v, + 'selected_prob': prob, 'value': v, 'action': act, 'outcome': oc, 'reward': rew, 'return': ret, 'episode_mask': emask, @@ -133,39 +134,49 @@ def forward_prediction(model, hidden, batch, args): tuple: batch outputs of neural network """ - observations = batch['observation'] # (B, T, P, ...) + observations = batch['observation'] # (..., B, T, P or 1, ...) + batch_shape = batch['action'].size()[:3] # (B, T, P or 1) if hidden is None: # feed-forward neural network - obs = map_r(observations, lambda o: o.view(-1, *o.size()[3:])) + obs = map_r(observations, lambda o: o.flatten(0, 2)) # (..., B * T * P or 1, ...) outputs = model(obs, None) + outputs = map_r(outputs, lambda o: o.unflatten(0, batch_shape)) # (..., B, T, P or 1, ...) else: # sequential computation with RNN outputs = {} - for t in range(batch['turn_mask'].size(1)): - obs = map_r(observations, lambda o: o[:, t].reshape(-1, *o.size()[3:])) # (..., B * P, ...) + for t in range(batch_shape[1]): + obs = map_r(observations, lambda o: o[:, t].flatten(0, 1)) # (..., B * P or 1, ...) omask_ = batch['observation_mask'][:, t] - omask = map_r(hidden, lambda h: omask_.view(*h.size()[:2], *([1] * (len(h.size()) - 2)))) + omask = map_r(hidden, lambda h: omask_.view(*h.size()[:2], *([1] * (h.dim() - 2)))) hidden_ = bimap_r(hidden, omask, lambda h, m: h * m) # (..., B, P, ...) if args['turn_based_training'] and not args['observation']: hidden_ = map_r(hidden_, lambda h: h.sum(1)) # (..., B * 1, ...) else: - hidden_ = map_r(hidden_, lambda h: h.view(-1, *h.size()[2:])) # (..., B * P, ...) - outputs_ = model(obs, hidden_) + hidden_ = map_r(hidden_, lambda h: h.flatten(0, 1)) # (..., B * P, ...) + if t < args['burn_in_steps']: + model.eval() + with torch.no_grad(): + outputs_ = model(obs, hidden_) + else: + if not model.training: + model.train() + outputs_ = model(obs, hidden_) + outputs_ = map_r(outputs_, lambda o: o.unflatten(0, (batch_shape[0], batch_shape[2]))) # (..., B, P or 1, ...) for k, o in outputs_.items(): if k == 'hidden': - next_hidden = outputs_['hidden'] + next_hidden = o else: outputs[k] = outputs.get(k, []) + [o] - next_hidden = bimap_r(next_hidden, hidden, lambda nh, h: nh.view(h.size(0), -1, *h.size()[2:])) # (..., B, P or 1, ...) hidden = trimap_r(hidden, next_hidden, omask, lambda h, nh, m: h * (1 - m) + nh * m) outputs = {k: torch.stack(o, dim=1) for k, o in outputs.items() if o[0] is not None} for k, o in outputs.items(): - o = o.view(*batch['turn_mask'].size()[:2], -1, o.size(-1)) if k == 'policy': - # gather turn player's policies - outputs[k] = o.mul(batch['turn_mask']).sum(2, keepdim=True) - batch['action_mask'] + o = o.mul(batch['turn_mask']) + if o.size(2) > 1 and batch_shape[2] == 1: # turn-alternating batch + o = o.sum(2, keepdim=True) # gather turn player's policies + outputs[k] = o - batch['action_mask'] else: # mask valid target values and cumulative rewards outputs[k] = o.mul(batch['observation_mask']) @@ -205,11 +216,15 @@ def compose_losses(outputs, log_selected_policies, total_advantages, targets, ba def compute_loss(batch, model, hidden, args): outputs = forward_prediction(model, hidden, batch, args) + if args['burn_in_steps'] > 0: + batch = map_r(batch, lambda v: v[args['burn_in_steps']:]) + outputs = map_r(outputs, lambda v: v[args['burn_in_steps']:]) + actions = batch['action'] emasks = batch['episode_mask'] clip_rho_threshold, clip_c_threshold = 1.0, 1.0 - log_selected_b_policies = F.log_softmax(batch['policy'] , dim=-1).gather(-1, actions) * emasks + log_selected_b_policies = torch.log(torch.clamp(batch['selected_prob'], 1e-16, 1)) * emasks log_selected_t_policies = F.log_softmax(outputs['policy'], dim=-1).gather(-1, actions) * emasks # thresholds of importance sampling @@ -277,15 +292,16 @@ def select_episode(self): break ep = self.episodes[ep_idx] turn_candidates = 1 + max(0, ep['steps'] - self.args['forward_steps']) # change start turn by sequence length - st = random.randrange(turn_candidates) - ed = min(st + self.args['forward_steps'], ep['steps']) + train_st = random.randrange(turn_candidates) + st = max(0, train_st - self.args['burn_in_steps']) + ed = min(train_st + self.args['forward_steps'], ep['steps']) st_block = st // self.args['compress_steps'] ed_block = (ed - 1) // self.args['compress_steps'] + 1 ep_minimum = { 'args': ep['args'], 'outcome': ep['outcome'], 'moment': ep['moment'][st_block:ed_block], 'base': st_block * self.args['compress_steps'], - 'start': st, 'end': ed, 'total': ep['steps'], + 'start': st, 'end': ed, 'train_start': train_st, 'total': ep['steps'], } return ep_minimum @@ -349,8 +365,8 @@ def shutdown(self): def train(self): if self.optimizer is None: # non-parametric model - print() - return + time.sleep(0.1) + return self.model batch_cnt, data_cnt, loss_sum = 0, 0, {} if self.gpu > 0: @@ -395,7 +411,7 @@ def run(self): if len(self.episodes) < self.args['minimum_episodes']: time.sleep(1) continue - if self.steps == 0: + if self.steps == 0 and self.optimizer is not None: self.batcher.run() print('started training') model = self.train() diff --git a/handyrl/worker.py b/handyrl/worker.py index 58cd12f7..7fe28bc7 100755 --- a/handyrl/worker.py +++ b/handyrl/worker.py @@ -84,8 +84,8 @@ def run(self): send_recv(self.conn, ('result', result)) -def make_worker_args(args, n_ga, gaid, wid, conn): - return args, conn, wid * n_ga + gaid +def make_worker_args(args, n_ga, gaid, base_wid, wid, conn): + return args, conn, base_wid + wid * n_ga + gaid def open_worker(args, conn, wid): @@ -107,10 +107,12 @@ def __init__(self, args, conn, gaid): n_pro, n_ga = args['worker']['num_parallel'], args['worker']['num_gathers'] num_workers_per_gather = (n_pro // n_ga) + int(gaid < n_pro % n_ga) + base_wid = args['worker'].get('base_worker_id', 0) + worker_conns = open_multiprocessing_connections( num_workers_per_gather, open_worker, - functools.partial(make_worker_args, args, n_ga, gaid) + functools.partial(make_worker_args, args, n_ga, gaid, base_wid) ) for conn in worker_conns: @@ -188,6 +190,7 @@ class WorkerServer(QueueCommunicator): def __init__(self, args): super().__init__() self.args = args + self.total_worker_count = 0 def run(self): # prepare listening connections @@ -199,6 +202,8 @@ def entry_server(port): if conn is not None: worker_args = conn.recv() print('accepted connection from %s!' % worker_args['address']) + worker_args['base_worker_id'] = self.total_worker_count + self.total_worker_count += worker_args['num_parallel'] args = copy.deepcopy(self.args) args['worker'] = worker_args conn.send(args) @@ -206,9 +211,9 @@ def entry_server(port): print('finished entry server') def worker_server(port): - conn_acceptor = accept_socket_connections(port=port, timeout=0.3) print('started worker server %d' % port) - while not self.shutdown_flag: # use super class's flag + conn_acceptor = accept_socket_connections(port=port, timeout=0.3) + while not self.shutdown_flag: conn = next(conn_acceptor) if conn is not None: self.add_connection(conn) diff --git a/scripts/aux_swa.py b/scripts/aux_swa.py new file mode 100644 index 00000000..6c10a164 --- /dev/null +++ b/scripts/aux_swa.py @@ -0,0 +1,56 @@ + +# Usage: python3 script/aux_swa.py [FINAL_EPOCH] [EPOCHS] [EPOCH_STEP] + +import os +import sys + +sys.path.append('./') + +import yaml + +import torch +from torch.optim.swa_utils import AveragedModel + +from handyrl.environment import make_env + + +# +# SWA (running equal averaging) +# + +model_dir = 'models' +saved_model_path = os.path.join('models', 'swa.pth') + +ed, length = int(sys.argv[1]), int(sys.argv[2]) +step = 1 +if len(sys.argv) >= 4: + step = int(sys.argv[3]) + +model_ids = [str(i) + '.pth' for i in range(ed - length + 1, ed + 1, step)] + +with open('config.yaml') as f: + args = yaml.safe_load(f) + +env = make_env(args['env_args']) +model = env.net() +model.load_state_dict(torch.load(os.path.join(model_dir, model_ids[0])), strict=True) + +def _avg_fn(averaged_model_parameter, model_parameter, num_averaged): + return averaged_model_parameter + (model_parameter - averaged_model_parameter) / (num_averaged + 1) + +swa_model = AveragedModel(model, avg_fn=_avg_fn) + +for model_id in model_ids: + model.load_state_dict(torch.load(os.path.join(model_dir, model_id)), strict=True) + swa_model.update_parameters(model) + +torch.save(swa_model.module.state_dict(), saved_model_path) + +print('Saved %s' % saved_model_path) + +# +# Test (load in strict=True) +# + +model = env.net() +model.load_state_dict(torch.load(saved_model_path), strict=True) diff --git a/scripts/make_onnx_model.py b/scripts/make_onnx_model.py new file mode 100644 index 00000000..9a0d8892 --- /dev/null +++ b/scripts/make_onnx_model.py @@ -0,0 +1,57 @@ + +# Usage: python3 script/make_onnx_model.py MODEL_PATH + +import sys +import yaml +import torch + +sys.path.append('./') + +from handyrl.environment import make_env +from handyrl.model import to_torch +from handyrl.util import map_r + + +model_path = sys.argv[-1] +saved_model_path = model_path.rstrip('.pth') + '.onnx' + +with open('config.yaml') as f: + args = yaml.safe_load(f) + +env = make_env(args['env_args']) +model = env.net() +model.load_state_dict(torch.load(model_path), strict=False) +model.eval() +print('loaded PyTorch model from %s' % model_path) + +env.reset() +obs = to_torch(env.observation(player=env.turns()[0])) +obs = map_r(obs, lambda x: x.unsqueeze(0)) + +hidden = model.init_hidden([1]) if hasattr(model, 'init_hidden') else None +inputs = obs, hidden + +# You can specify meaningful names for the inputs here. +input_names = [] +map_r(obs, lambda y: input_names.append('input.' + str(len(input_names)))) + +hidden_names = [] +if hidden is not None: + map_r(hidden, lambda y: hidden_names.append('hidden.' + str(len(hidden_names)))) + input_names += hidden_names + +outputs = model(*inputs) +output_names = list(outputs.keys()) +if 'hidden' in output_names: + index = output_names.index('hidden') + output_names = output_names[:index] + [name + 'o' for name in hidden_names] + output_names[index+1:] + +print('input =', input_names) +print('output =', output_names) + +dynamic_axes = {name: {0: 'batch_size'} for name in (input_names + output_names)} + +torch.onnx.export(model, inputs, saved_model_path, + input_names=input_names, output_names=output_names, + dynamic_axes=dynamic_axes) +print('saved ONNX model to %s' % saved_model_path) diff --git a/scripts/win_rate_plot.py b/scripts/win_rate_plot.py new file mode 100644 index 00000000..7d8e67e3 --- /dev/null +++ b/scripts/win_rate_plot.py @@ -0,0 +1,115 @@ + +# Usage: python3 scripts/win_rate_plot.py train-log-002.txt all + +# You should not include figures generated by this script in your academic paper, because +# 1. This version of HandyRL doesn't display all the results of the matches. +# 2. Smoothing method in this script is not a simple moving average. + +import sys +import numpy as np + + +n = 15 + +def kernel(n): + a = np.array(list(range(1, 1 + (n+1)//2)) + list(range(1 + n//2,1,-1))) + return a / a.sum() + + +def get_wp_list(path): + opponents = set() + epoch_data_list = [{}] + epoch_list = [0] + step_list = [0] + game_list = [0] + + f = open(path) + lines = f.readlines() + prev_line = '' + + for line in lines: + if line.startswith('updated'): + epoch_data_list.append({}) + epoch_list.append(len(epoch_list)) + step_list.append(int(line.split('(')[1].rstrip().rstrip(')'))) + if line.startswith('win rate'): + elms = line.split() + opponent = elms[2].lstrip('(').rstrip(')') + games = int(elms[-1].lstrip('(').rstrip(')')) + wp = float(elms[-4]) if games > 0 else 0.0 + epoch_data_list[-1][opponent] = {'w': games * wp, 'n': games} + opponents.add(opponent) + if line.startswith('epoch '): + print(line, len(epoch_list)) + if ' ' in prev_line: + game = int(prev_line.split()[-1]) + game_list.append(game) + + prev_line = line + + game_list = game_list[:len(epoch_data_list)] + + clipped_epoch_list = epoch_list[n//2:-n//2+1] + clipped_step_list = step_list[n//2:-n//2+1] + clipped_game_list = game_list[n//2:-n//2+1] + null_data = {'w': 0, 'n': 0} + kn = kernel(n) + averaged_wp_lists = {} + start_epoch = {} + for opponent in opponents: + win_list = [e.get(opponent, null_data)['w'] for e in epoch_data_list] + n_list = [e.get(opponent, null_data)['n'] for e in epoch_data_list] + averaged_win_list = np.convolve(win_list, kn, mode='valid') + averaged_n_list = np.convolve(n_list, kn, mode='valid') + 1e-6 + averaged_wp_lists[opponent] = averaged_win_list / averaged_n_list + try: + start_epoch[opponent] = next(i for i, n in enumerate(n_list) if n >= 1) + except: + start_epoch[opponent] = 0 + return clipped_epoch_list, clipped_step_list, clipped_game_list, averaged_wp_lists, start_epoch + + +import matplotlib.pyplot as plt +import seaborn as sns +flatui = ["#9b59b6", "#95a5a6", "#34495e", "#3498db", "#e74c3c", "#2ecc71", "#b22222"] +sns.set_palette(sns.color_palette(flatui, 24)) + +clipped_epoch_list, clipped_step_list, clipped_game_list, averaged_wp_lists, start_epoch = get_wp_list(sys.argv[1]) + +opponents_ = list(averaged_wp_lists.keys()) +opponents = sorted(opponents_, key=lambda o: averaged_wp_lists[o][-1], reverse=True) + +fig = plt.figure() +ax = fig.add_subplot(1, 1, 1) + +last_win_rate = {} +for opponent in opponents: + if opponent == 'total': + continue + wp_list = averaged_wp_lists[opponent] + start = start_epoch[opponent] + # ax.plot(clipped_epoch_list[start:], wp_list[start:], label=opponent) + ax.plot(clipped_game_list[start:], wp_list[start:], label=opponent) + last_win_rate[opponent] = wp_list[-1] + +ax.set_xlabel('Games', size=14) +ax.set_ylabel('Win rate', size=14) +ax.set_title(sys.argv[2]) +ax.set_ylim(0, 1) +ax.legend() + +# Major ticks every 20, minor ticks every 5 +major_ticks = np.linspace(0, 1, 11) +minor_ticks = np.linspace(0, 1, 21) + +ax.set_yticks(major_ticks) +ax.set_yticks(minor_ticks, minor=True) + +# A corresponding grid +plt.grid(which='minor', color='gray', alpha=0.5, + linestyle='--', linewidth=0.5) +plt.grid(which='major', color='gray', alpha=0.5, + linestyle='--', linewidth=1) + +fig.tight_layout() +plt.show()