From 3f9fd7a12264ea51b567239a3703d7cd8a456c3d Mon Sep 17 00:00:00 2001
From: James P Taggart
Date: Sat, 2 May 2020 00:09:21 -0400
Subject: [PATCH] Add files via upload

---
 agent.py       |  42 ++++++++
 brain_dqn.py   | 235 ++++++++++++++++++++++++++++++++++++++++++++++++++
 train_model.py |  58 ++++++++++++
 3 files changed, 335 insertions(+)
 create mode 100644 agent.py
 create mode 100644 brain_dqn.py
 create mode 100644 train_model.py

diff --git a/agent.py b/agent.py
new file mode 100644
index 0000000..b9bdf1e
--- /dev/null
+++ b/agent.py
@@ -0,0 +1,42 @@
+"""Wrapper classes that give every model a consistent API for the training environment."""
+import numpy as np
+
+from .models.brain_dqn import BrainDQN
+
+
+class Agent_DQN:
+    """Adapter exposing a BrainDQN model through the interface used by train()."""
+
+    def __init__(self):
+        # The model is built in initialize(), once the number of actions is known.
+        self.model = None
+
+    def initialize(self, actions):
+        """Create the underlying BrainDQN for `actions` discrete actions."""
+        self.model = BrainDQN(actions)
+
+    def setInitState(self, observation):
+        """Seed the model's frame stack from a single preprocessed observation."""
+        self.model.setInitState(observation)
+        # Stacking frames shaped (H, W, 1) leaves a singleton axis; remove it so the
+        # state matches the network input (H, W, 4).
+        self.model.currentState = np.squeeze(self.model.currentState)
+
+    def getAction(self):
+        """Return the index of the action picked by the model."""
+        # BrainDQN.getAction() yields a one-hot vector; convert it to an index.
+        return np.argmax(self.model.getAction())
+
+    def setPerception(self, next_observation, action, reward, is_done):
+        """Store one transition in the model, which trains once it has observed enough.
+
+        `action` may be an action index (as returned by getAction) or an already
+        one-hot encoded vector.
+        """
+        if np.ndim(action) == 0:
+            # The network's action placeholder is [None, actions], so the replay
+            # memory must hold one-hot vectors rather than bare indices.
+            one_hot = np.zeros(self.model.actions)
+            one_hot[action] = 1
+            action = one_hot
+        self.model.setPerception(next_observation, action, reward, is_done)
diff --git a/brain_dqn.py b/brain_dqn.py
new file mode 100644
index 0000000..905ca17
--- /dev/null
+++ b/brain_dqn.py
@@ -0,0 +1,235 @@
+# -----------------------------
+# File: Deep Q-Learning Algorithm
+# Author: Flood Sung
+# Date: 2016.3.21
+# -----------------------------
+
+import os
+import random
+from collections import deque
+
+import numpy as np
+import tensorflow as tf
+
+# Hyper Parameters:
+FRAME_PER_ACTION = 1
+GAMMA = 0.95  # discount factor applied to the next state's best Q-value
+OBSERVE = 50000.  # timesteps to observe before training
+EXPLORE = 1000000.  # frames over which to anneal epsilon
+FINAL_EPSILON = 0.1  # 0.001 # final value of epsilon
+INITIAL_EPSILON = 1.0  # 0.01 # starting value of epsilon
+REPLAY_MEMORY = 40000  # number of previous transitions to remember
+BATCH_SIZE = 32  # size of minibatch
+UPDATE_TIME = 10000  # timesteps between target-network updates
+SAVE_TIME = 10000  # timesteps between checkpoints
+CHECKPOINT_DIR = "./savedweights"
+
+
+class BrainDQN:
+    """Deep Q-network agent with replay memory and a separate target network."""
+
+    def __init__(self, actions):
+        """Build the networks, start a session and restore saved weights if present.
+
+        actions: number of discrete actions, i.e. the width of the Q-value output.
+        """
+        # init replay memory; maxlen discards the oldest transition once full
+        self.replayMemory = deque(maxlen=REPLAY_MEMORY)
+        # init some parameters
+        self.timeStep = 0
+        self.epsilon = INITIAL_EPSILON
+        self.actions = actions
+        # init Q network
+        (self.stateInput, self.QValue,
+         self.W_conv1, self.b_conv1, self.W_conv2, self.b_conv2,
+         self.W_conv3, self.b_conv3, self.W_fc1, self.b_fc1,
+         self.W_fc2, self.b_fc2) = self.createQNetwork()
+
+        # init Target Q Network
+        (self.stateInputT, self.QValueT,
+         self.W_conv1T, self.b_conv1T, self.W_conv2T, self.b_conv2T,
+         self.W_conv3T, self.b_conv3T, self.W_fc1T, self.b_fc1T,
+         self.W_fc2T, self.b_fc2T) = self.createQNetwork()
+
+        # ops copying every online-network variable into the target network
+        self.copyTargetQNetworkOperation = [
+            self.W_conv1T.assign(self.W_conv1),
+            self.b_conv1T.assign(self.b_conv1),
+            self.W_conv2T.assign(self.W_conv2),
+            self.b_conv2T.assign(self.b_conv2),
+            self.W_conv3T.assign(self.W_conv3),
+            self.b_conv3T.assign(self.b_conv3),
+            self.W_fc1T.assign(self.W_fc1),
+            self.b_fc1T.assign(self.b_fc1),
+            self.W_fc2T.assign(self.W_fc2),
+            self.b_fc2T.assign(self.b_fc2),
+        ]
+
+        self.createTrainingMethod()
+
+        # saving and loading networks
+        self.saver = tf.train.Saver()
+        self.session = tf.InteractiveSession()
+        self.session.run(tf.global_variables_initializer())
+        # Saver.save() raises if the parent directory is missing, so create it now.
+        os.makedirs(CHECKPOINT_DIR, exist_ok=True)
+        checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
+        if checkpoint and checkpoint.model_checkpoint_path:
+            self.saver.restore(self.session, checkpoint.model_checkpoint_path)
+            print("Successfully loaded:", checkpoint.model_checkpoint_path)
+        else:
+            print("Could not find old network weights")
+
+    def createQNetwork(self):
+        """Create one convolutional Q-network.
+
+        Returns the state placeholder, the Q-value tensor and every weight and
+        bias variable, in layer order.
+        """
+        # network weights
+        W_conv1 = self.weight_variable([8, 8, 4, 32])
+        b_conv1 = self.bias_variable([32])
+
+        W_conv2 = self.weight_variable([4, 4, 32, 64])
+        b_conv2 = self.bias_variable([64])
+
+        W_conv3 = self.weight_variable([3, 3, 64, 64])
+        b_conv3 = self.bias_variable([64])
+
+        # 3136 = 7 * 7 * 64, the flattened conv3 output for an 84x84 input
+        W_fc1 = self.weight_variable([3136, 512])
+        b_fc1 = self.bias_variable([512])
+
+        W_fc2 = self.weight_variable([512, self.actions])
+        b_fc2 = self.bias_variable([self.actions])
+
+        # input layer
+        stateInput = tf.placeholder("float", [None, 84, 84, 4])
+
+        # hidden layers
+        h_conv1 = tf.nn.relu(self.conv2d(stateInput, W_conv1, 4) + b_conv1)
+        h_conv2 = tf.nn.relu(self.conv2d(h_conv1, W_conv2, 2) + b_conv2)
+        h_conv3 = tf.nn.relu(self.conv2d(h_conv2, W_conv3, 1) + b_conv3)
+        h_conv3_shape = h_conv3.get_shape().as_list()
+        print("dimension:", h_conv3_shape[1] * h_conv3_shape[2] * h_conv3_shape[3])
+        h_conv3_flat = tf.reshape(h_conv3, [-1, 3136])
+        h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat, W_fc1) + b_fc1)
+
+        # Q Value layer
+        QValue = tf.matmul(h_fc1, W_fc2) + b_fc2
+
+        return (stateInput, QValue, W_conv1, b_conv1, W_conv2, b_conv2,
+                W_conv3, b_conv3, W_fc1, b_fc1, W_fc2, b_fc2)
+
+    def copyTargetQNetwork(self):
+        """Overwrite the target network's variables with the online network's."""
+        self.session.run(self.copyTargetQNetworkOperation)
+
+    def createTrainingMethod(self):
+        """Define the squared-error loss on the chosen action's Q-value and its optimizer."""
+        self.actionInput = tf.placeholder("float", [None, self.actions])
+        self.yInput = tf.placeholder("float", [None])
+        # actionInput is one-hot, so this selects Q(s, a) for the action taken
+        Q_Action = tf.reduce_sum(tf.multiply(self.QValue, self.actionInput), axis=1)
+        self.cost = tf.reduce_mean(tf.square(self.yInput - Q_Action))
+        self.trainStep = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6).minimize(self.cost)
+
+    def trainQNetwork(self):
+        """Train on one random minibatch, then checkpoint / sync the target net when due."""
+        # Step 1: obtain random minibatch from replay memory
+        minibatch = random.sample(self.replayMemory, BATCH_SIZE)
+        state_batch = [data[0] for data in minibatch]
+        action_batch = [data[1] for data in minibatch]
+        reward_batch = [data[2] for data in minibatch]
+        nextState_batch = [data[3] for data in minibatch]
+        terminal_batch = [data[4] for data in minibatch]
+
+        # Step 2: calculate y, bootstrapping from the target network unless terminal
+        QValue_batch = self.QValueT.eval(feed_dict={self.stateInputT: nextState_batch})
+        y_batch = [
+            reward if terminal else reward + GAMMA * np.max(next_q)
+            for reward, terminal, next_q in zip(reward_batch, terminal_batch, QValue_batch)
+        ]
+
+        self.trainStep.run(feed_dict={
+            self.yInput: y_batch,
+            self.actionInput: action_batch,
+            self.stateInput: state_batch,
+        })
+
+        # save network every SAVE_TIME iterations
+        if self.timeStep % SAVE_TIME == 0:
+            self.saver.save(self.session, CHECKPOINT_DIR + '/network' + '-dqn',
+                            global_step=self.timeStep)
+
+        if self.timeStep % UPDATE_TIME == 0:
+            self.copyTargetQNetwork()
+
+    def setPerception(self, nextObservation, action, reward, terminal):
+        """Record a transition, train once past the observation phase, and advance time.
+
+        nextObservation: newest frame, shaped like one channel slice of the state.
+        action: one-hot action vector that led to nextObservation.
+        """
+        # Slide the frame window: drop the oldest frame (channel 0) and put the
+        # newest one last, so the channels stay in chronological order.
+        newState = np.append(self.currentState[:, :, 1:], nextObservation, axis=2)
+        self.replayMemory.append((self.currentState, action, reward, newState, terminal))
+        if self.timeStep > OBSERVE:
+            # Train the network
+            self.trainQNetwork()
+
+        # print info
+        if self.timeStep % 10000 == 0:
+            if self.timeStep <= OBSERVE:
+                state = "observe"
+            elif self.timeStep <= OBSERVE + EXPLORE:
+                state = "explore"
+            else:
+                state = "train"
+            print("TIMESTEP", self.timeStep, "/ STATE", state, "/ EPSILON", self.epsilon)
+
+        self.currentState = newState
+        self.timeStep += 1
+
+    def getAction(self):
+        """Choose an action epsilon-greedily and return it as a one-hot vector."""
+        action = np.zeros(self.actions)
+        if self.timeStep % FRAME_PER_ACTION == 0:
+            if random.random() <= self.epsilon:
+                action_index = random.randrange(self.actions)
+            else:
+                # only run the network when the greedy action is actually needed
+                QValue = self.QValue.eval(feed_dict={self.stateInput: [self.currentState]})[0]
+                action_index = np.argmax(QValue)
+        else:
+            action_index = 0  # do nothing
+        action[action_index] = 1
+
+        # change epsilon
+        if self.epsilon > FINAL_EPSILON and self.timeStep > OBSERVE:
+            self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
+
+        return action
+
+    def setInitState(self, observation):
+        """Start a new frame stack made of four copies of `observation`."""
+        self.currentState = np.stack((observation, observation, observation, observation), axis=2)
+
+    def weight_variable(self, shape):
+        """Return a variable initialised from a truncated normal (stddev 0.01)."""
+        initial = tf.truncated_normal(shape, stddev=0.01)
+        return tf.Variable(initial)
+
+    def bias_variable(self, shape):
+        """Return a variable initialised to the constant 0.01."""
+        initial = tf.constant(0.01, shape=shape)
+        return tf.Variable(initial)
+
+    def conv2d(self, x, W, stride):
+        """Apply a VALID-padded 2-D convolution with the same stride on both axes."""
+        return tf.nn.conv2d(x, W, strides=[1, stride, stride, 1], padding="VALID")
+
+    def max_pool_2x2(self, x):
+        """Apply 2x2 max pooling with stride 2 and SAME padding."""
+        return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")
diff --git a/train_model.py b/train_model.py
new file mode 100644
index 0000000..1bff901
--- /dev/null
+++ b/train_model.py
@@ -0,0 +1,58 @@
+"""Training loop shared by every agent."""
+import matplotlib.pyplot as plt
+
+import cv2
+from .models.brain_dqn import BrainDQN
+import numpy as np
+
+import gym
+
+
+def _preprocess(observation):
+    """Reduce a raw RGB frame to an 84x84x1 binary image (shrinks the problem for runtime)."""
+    # gym renders frames as RGB, so convert with RGB2GRAY rather than BGR2GRAY
+    observation = cv2.cvtColor(cv2.resize(observation, (84, 110)), cv2.COLOR_RGB2GRAY)
+    observation = observation[26:110, :]
+    ret, observation = cv2.threshold(observation, 1, 255, cv2.THRESH_BINARY)
+    return np.reshape(observation, (84, 84, 1))
+
+
+def train(agent, episodes_to_play, timesteps_alloted):
+    """Train `agent` on SpaceInvaders-v0 for a fixed number of fixed-length episodes.
+
+    agent: wrapper exposing initialize, setInitState, getAction and setPerception.
+    episodes_to_play: number of episodes to run.
+    timesteps_alloted: number of agent actions per episode.
+
+    Returns the list of total rewards, one entry per episode.
+    """
+    env = gym.make('SpaceInvaders-v0')
+    rewards = []
+    try:
+        env.reset()
+        agent.initialize(env.action_space.n)
+
+        # count down so the progress line shows how many episodes remain
+        for episodes_left in range(episodes_to_play - 1, -1, -1):
+            action0 = 0  # do nothing for first action
+            observation0, reward0, is_done, info = env.step(action0)
+            if is_done:
+                # never keep stepping a finished game; start a fresh one instead
+                observation0 = env.reset()
+            agent.setInitState(_preprocess(observation0))
+            total_reward = 0
+            for _ in range(timesteps_alloted):
+                action = agent.getAction()  # TODO: pass the state/observation explicitly?
+                next_observation, reward, is_done, info = env.step(action)
+                if is_done:
+                    next_observation = env.reset()
+                    reward = -10  # Penalize retries
+                agent.setPerception(_preprocess(next_observation), action, reward, is_done)
+                total_reward = total_reward + reward
+            rewards.append(total_reward)
+            print("T-" + str(episodes_left) + ": " + str(total_reward))
+    finally:
+        env.close()
+    print("Training Session Completed")
+    print("Rewards: " + str(rewards))
+    return rewards