From e934a3970103d7551f2fd667891124528e6b3e94 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Fri, 3 Dec 2021 23:35:28 +0900 Subject: [PATCH 1/5] feature: compute total reward in evaluator --- handyrl/evaluation.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/handyrl/evaluation.py b/handyrl/evaluation.py index 51d3c6c5..4f58588f 100755 --- a/handyrl/evaluation.py +++ b/handyrl/evaluation.py @@ -78,10 +78,12 @@ def observe(self, player): def exec_match(env, agents, critic, show=False, game_args={}): ''' match with shared game environment ''' + total_rewards = {} if env.reset(game_args): return None for agent in agents.values(): agent.reset(env, show=show) + total_rewards[p] = 0 while not env.terminal(): if show: view(env) @@ -98,6 +100,8 @@ def exec_match(env, agents, critic, show=False, game_args={}): return None if show: view_transition(env) + for p, reward in env.reward().items(): + total_rewards[p] += np.array(reward).reshape(-1) outcome = env.outcome() if show: print('final outcome = %s' % outcome) @@ -106,11 +110,13 @@ def exec_match(env, agents, critic, show=False, game_args={}): def exec_network_match(env, network_agents, critic, show=False, game_args={}): ''' match with divided game environment ''' + total_rewards = {} if env.reset(game_args): return None for p, agent in network_agents.items(): info = env.diff_info(p) agent.update(info, True) + total_rewards[p] = 0 while not env.terminal(): if show: view(env) @@ -126,6 +132,8 @@ def exec_network_match(env, network_agents, critic, show=False, game_args={}): agent.observe(p) if env.step(actions): return None + for p, reward in env.reward().items(): + total_rewards[p] += np.array(reward).reshape(-1) for p, agent in network_agents.items(): info = env.diff_info(p) agent.update(info, False) From 650d0c570c3f301ea5ddb2b125179331e2d94638 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sun, 5 Dec 2021 04:18:38 +0900 Subject: [PATCH 2/5] fix: compute total reward in evaluator --- handyrl/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handyrl/evaluation.py b/handyrl/evaluation.py index 4f58588f..f28452c8 100755 --- a/handyrl/evaluation.py +++ b/handyrl/evaluation.py @@ -81,7 +81,7 @@ def exec_match(env, agents, critic, show=False, game_args={}): total_rewards = {} if env.reset(game_args): return None - for agent in agents.values(): + for p, agent in agents.items(): agent.reset(env, show=show) total_rewards[p] = 0 while not env.terminal(): From 86220519fe825ba8a1024e607de84dd7ee1a0eaa Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sun, 5 Dec 2021 04:18:58 +0900 Subject: [PATCH 3/5] feature: compute total reward in generator --- handyrl/generation.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/handyrl/generation.py b/handyrl/generation.py index 63b7e553..3b07af74 100755 --- a/handyrl/generation.py +++ b/handyrl/generation.py @@ -21,8 +21,10 @@ def generate(self, models, args): # episode generation moments = [] hidden = {} + total_rewards = {} for player in self.env.players(): hidden[player] = models[player].init_hidden() + total_rewards[player] = 0 err = self.env.reset() if err: @@ -63,6 +65,7 @@ def generate(self, models, args): reward = self.env.reward() for player in self.env.players(): moment['reward'][player] = reward.get(player, None) + total_rewards[player] += np.array(reward.get(player, 0)).reshape(-1) moment['turn'] = turn_players moments.append(moment) @@ -73,12 +76,13 @@ def generate(self, models, args): for player in self.env.players(): ret = 0 for i, m in reversed(list(enumerate(moments))): - ret = (m['reward'][player] or 0) + self.args['gamma'] * ret + ret = np.array(m['reward'][player] or 0) + np.array(self.args['gamma']) * ret moments[i]['return'][player] = ret episode = { 'args': args, 'steps': len(moments), 'outcome': self.env.outcome(), + 'total_reward': total_rewards, 'moment': [ bz2.compress(pickle.dumps(moments[i:i+self.args['compress_steps']])) for i in range(0, len(moments), self.args['compress_steps']) From d548de864f868f12a03218039cea952fa4806be9 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Wed, 15 Dec 2021 12:31:23 +0900 Subject: [PATCH 4/5] feature: return game result (and other stats?) as a dict --- handyrl/evaluation.py | 21 +++++++++++---------- handyrl/train.py | 2 +- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/handyrl/evaluation.py b/handyrl/evaluation.py index 8b038f1a..1757bfe4 100755 --- a/handyrl/evaluation.py +++ b/handyrl/evaluation.py @@ -105,7 +105,7 @@ def exec_match(env, agents, critic, show=False, game_args={}): outcome = env.outcome() if show: print('final outcome = %s' % outcome) - return outcome + return {'outcome': outcome, 'total_reward': total_rewards} def exec_network_match(env, network_agents, critic, show=False, game_args={}): @@ -140,7 +140,7 @@ def exec_network_match(env, network_agents, critic, show=False, game_args={}): outcome = env.outcome() for p, agent in network_agents.items(): agent.outcome(outcome[p]) - return outcome + return {'outcome': outcome, 'total_reward': total_rewards} def build_agent(raw, env): @@ -171,11 +171,11 @@ def execute(self, models, args): else: agents[p] = Agent(model, self.args['observation']) - outcome = exec_match(self.env, agents, None) - if outcome is None: + result = exec_match(self.env, agents, None) + if result is None: print('None episode in evaluation!') return None - return {'args': args, 'result': outcome, 'opponent': opponent} + return {'args': args, 'opponent': opponent, **result} def wp_func(results): @@ -197,10 +197,10 @@ def eval_process_mp_child(agents, critic, env_args, index, in_queue, out_queue, print('*** Game %d ***' % g) agent_map = {env.players()[p]: agents[ai] for p, ai in enumerate(agent_ids)} if isinstance(list(agent_map.values())[0], NetworkAgent): - outcome = exec_network_match(env, agent_map, critic, show=show, game_args=game_args) + result = exec_network_match(env, agent_map, critic, show=show, game_args=game_args) else: - outcome = exec_match(env, agent_map, critic, show=show, game_args=game_args) - out_queue.put((pat_idx, agent_ids, outcome)) + result = exec_match(env, agent_map, critic, show=show, game_args=game_args) + out_queue.put((pat_idx, agent_ids, result)) out_queue.put(None) @@ -247,8 +247,9 @@ def evaluate_mp(env, agents, critic, env_args, args_patterns, num_process, num_g if ret is None: finished_cnt += 1 continue - pat_idx, agent_ids, outcome = ret - if outcome is not None: + pat_idx, agent_ids, result = ret + if result is not None: + outcome = result['outcome'] for idx, p in enumerate(env.players()): agent_id = agent_ids[idx] oc = outcome[p] diff --git a/handyrl/train.py b/handyrl/train.py index 35815031..cc3282c0 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -505,7 +505,7 @@ def feed_results(self, results): continue for p in result['args']['player']: model_id = result['args']['model_id'][p] - res = result['result'][p] + res = result['outcome'][p] n, r, r2 = self.results.get(model_id, (0, 0, 0)) self.results[model_id] = n + 1, r + res, r2 + res ** 2 From b3cac06b906c9e4044c18363ee18c458522b0b33 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sat, 12 Feb 2022 17:20:03 +0900 Subject: [PATCH 5/5] fix: import numpy in evaluation.py --- handyrl/evaluation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/handyrl/evaluation.py b/handyrl/evaluation.py index 6df83625..e3ed1634 100755 --- a/handyrl/evaluation.py +++ b/handyrl/evaluation.py @@ -7,6 +7,8 @@ import time import multiprocessing as mp +import numpy as np + from .environment import prepare_env, make_env from .connection import send_recv, accept_socket_connections, connect_socket_connection from .agent import RandomAgent, RuleBasedAgent, Agent, EnsembleAgent, SoftAgent @@ -312,7 +314,6 @@ def init_hidden(self): hidden_inputs = [y for y in self.ort_session.get_inputs() if y.name.startswith('hidden')] if len(hidden_inputs) == 0: return None - import numpy as np type_map = { 'tensor(float)': np.float32, 'tensor(int64)': np.int64, @@ -328,7 +329,6 @@ def inference(self, x, hidden=None, batch_input=False): ort_inputs = {} ort_input_names = [y.name for y in self.ort_session.get_inputs()] - import numpy as np def insert_input(y): y = y if batch_input else np.expand_dims(y, 0) ort_inputs[ort_input_names[len(ort_inputs)]] = y