JPT2 · JPT2 · May 2, 2020
diff --git a/agent.py b/agent.py
@@ -0,0 +1,23 @@
+from .models.brain_dqn import BrainDQN
+import numpy as np
+
+'''
+Wrapper class for models; provides a consistent API to the environment for training purposes.
+'''
+class Agent_DQN:
+    """Adapter that exposes a BrainDQN model through the interface the training loop uses.
+
+    The training loop works with integer action indices, while BrainDQN
+    produces and stores one-hot action vectors; this class converts between
+    the two in both directions.
+    """
+
+    def __init__(self):
+        # The model is created in initialize(), once the action count is known.
+        self.model = None
+
+    def initialize(self, actions):
+        """Create the underlying BrainDQN for `actions` discrete actions."""
+        self.model = BrainDQN(actions)
+
+    def setInitState(self, observation):
+        """Reset the model's frame stack so that it starts from `observation`."""
+        self.model.setInitState(observation)
+        # The model stacks four copies of the observation along axis 2; when the
+        # observation already carries a size-1 channel axis this leaves an extra
+        # size-1 dimension, which np.squeeze removes.
+        self.model.currentState = np.squeeze(self.model.currentState)
+
+    def getAction(self):
+        """Return the index of the action selected by the model.
+
+        The model returns a vector with a single 1 at the chosen action, so
+        argmax recovers the integer index.
+        """
+        return np.argmax(self.model.getAction())
+
+    def setPerception(self, next_observation, action, reward, is_done):
+        """Forward one transition to the model.
+
+        Args:
+            next_observation: observation received after taking `action`.
+            action: integer action index (as returned by getAction) or an
+                already one-hot encoded action vector.
+            reward: reward received for the transition.
+            is_done: whether the transition ended the episode.
+        """
+        # The model stores the action in its replay memory and later multiplies
+        # it element-wise with the Q-values, so it must be a one-hot vector of
+        # length `actions`; a bare index would not match that shape.
+        if np.ndim(action) == 0:
+            one_hot = np.zeros(self.model.actions)
+            one_hot[int(action)] = 1
+            action = one_hot
+        self.model.setPerception(next_observation, action, reward, is_done)
+
diff --git a/brain_dqn.py b/brain_dqn.py
@@ -0,0 +1,193 @@
+# -----------------------------
+# File: Deep Q-Learning Algorithm
+# Author: Flood Sung
+# Date: 2016.3.21
+# -----------------------------
+
+import tensorflow as tf 
+import numpy as np 
+import random
+from collections import deque 
+
+# Hyper Parameters:
+FRAME_PER_ACTION = 1 # an action is chosen only on timesteps divisible by this; on other timesteps action 0 is used
+GAMMA = 0.95 # discount factor applied to the target network's best next-state Q value
+OBSERVE = 50000. # timesteps to observe before training (and epsilon annealing) starts
+EXPLORE = 1000000. # frames over which to anneal epsilon
+FINAL_EPSILON = 0.1# final value of epsilon (alternative value left by the author: 0.001)
+INITIAL_EPSILON = 1.0# starting value of epsilon (alternative value left by the author: 0.01)
+REPLAY_MEMORY = 40000 # number of previous transitions to remember
+BATCH_SIZE = 32 # size of minibatch
+UPDATE_TIME = 10000 # the target network is refreshed on training timesteps divisible by this
+
+class BrainDQN:
+    """Deep Q-learning core: online Q network, target Q network and replay memory.
+
+    The online network selects actions and is the one being trained; the
+    target network supplies the bootstrap values for the training targets and
+    is overwritten with the online weights on a fixed schedule. The graph is
+    built with the TensorFlow 1.x API (placeholders and an InteractiveSession).
+    """
+
+    def __init__(self, actions):
+        """Build both networks and the training op, open a session, restore weights.
+
+        Args:
+            actions: number of discrete actions (width of the Q-value output).
+        """
+        # init replay memory; maxlen makes the deque drop its oldest transition
+        # by itself once REPLAY_MEMORY entries are stored
+        self.replayMemory = deque(maxlen=REPLAY_MEMORY)
+        # init some parameters
+        self.timeStep = 0
+        self.epsilon = INITIAL_EPSILON
+        self.actions = actions
+
+        # NOTE: the variables are created without explicit names, so their
+        # checkpoint names depend on creation order. Keep "online network
+        # first, target network second" or saved weights stop matching.
+        # init Q network
+        (self.stateInput, self.QValue,
+         self.W_conv1, self.b_conv1,
+         self.W_conv2, self.b_conv2,
+         self.W_conv3, self.b_conv3,
+         self.W_fc1, self.b_fc1,
+         self.W_fc2, self.b_fc2) = self.createQNetwork()
+
+        # init Target Q Network
+        (self.stateInputT, self.QValueT,
+         self.W_conv1T, self.b_conv1T,
+         self.W_conv2T, self.b_conv2T,
+         self.W_conv3T, self.b_conv3T,
+         self.W_fc1T, self.b_fc1T,
+         self.W_fc2T, self.b_fc2T) = self.createQNetwork()
+
+        online_vars = (self.W_conv1, self.b_conv1, self.W_conv2, self.b_conv2,
+                       self.W_conv3, self.b_conv3, self.W_fc1, self.b_fc1,
+                       self.W_fc2, self.b_fc2)
+        target_vars = (self.W_conv1T, self.b_conv1T, self.W_conv2T, self.b_conv2T,
+                       self.W_conv3T, self.b_conv3T, self.W_fc1T, self.b_fc1T,
+                       self.W_fc2T, self.b_fc2T)
+        # One assign op per parameter: target <- online.
+        self.copyTargetQNetworkOperation = [
+            target.assign(online) for target, online in zip(target_vars, online_vars)
+        ]
+
+        self.createTrainingMethod()
+
+        # saving and loading networks
+        self.saver = tf.train.Saver()
+        self.session = tf.InteractiveSession()
+        self.session.run(tf.global_variables_initializer())
+        checkpoint = tf.train.get_checkpoint_state("./savedweights")
+        if checkpoint and checkpoint.model_checkpoint_path:
+            self.saver.restore(self.session, checkpoint.model_checkpoint_path)
+            print("Successfully loaded:", checkpoint.model_checkpoint_path)
+        else:
+            print("Could not find old network weights")
+            # Fresh start: both networks were initialised independently at
+            # random, so align the target with the online network instead of
+            # bootstrapping from unrelated weights until the first scheduled copy.
+            self.copyTargetQNetwork()
+
+    def createQNetwork(self):
+        """Create one Q network and return its input placeholder, output and parameters.
+
+        The network takes a batch of 84x84x4 states and applies three VALID
+        convolutions (8x8 stride 4, 4x4 stride 2, 3x3 stride 1) followed by
+        two fully connected layers; the last one has one output per action.
+
+        Returns:
+            Tuple (stateInput, QValue, W_conv1, b_conv1, W_conv2, b_conv2,
+            W_conv3, b_conv3, W_fc1, b_fc1, W_fc2, b_fc2).
+        """
+        # network weights
+        W_conv1 = self.weight_variable([8, 8, 4, 32])
+        b_conv1 = self.bias_variable([32])
+
+        W_conv2 = self.weight_variable([4, 4, 32, 64])
+        b_conv2 = self.bias_variable([64])
+
+        W_conv3 = self.weight_variable([3, 3, 64, 64])
+        b_conv3 = self.bias_variable([64])
+
+        # 3136 = 7 * 7 * 64, the flattened size of the last conv layer for an
+        # 84x84 input with the strides used below
+        W_fc1 = self.weight_variable([3136, 512])
+        b_fc1 = self.bias_variable([512])
+
+        W_fc2 = self.weight_variable([512, self.actions])
+        b_fc2 = self.bias_variable([self.actions])
+
+        # input layer
+        stateInput = tf.placeholder("float", [None, 84, 84, 4])
+
+        # hidden layers
+        h_conv1 = tf.nn.relu(self.conv2d(stateInput, W_conv1, 4) + b_conv1)
+        h_conv2 = tf.nn.relu(self.conv2d(h_conv1, W_conv2, 2) + b_conv2)
+        h_conv3 = tf.nn.relu(self.conv2d(h_conv2, W_conv3, 1) + b_conv3)
+        h_conv3_shape = h_conv3.get_shape().as_list()
+        print("dimension:", h_conv3_shape[1] * h_conv3_shape[2] * h_conv3_shape[3])
+        h_conv3_flat = tf.reshape(h_conv3, [-1, 3136])
+        h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat, W_fc1) + b_fc1)
+
+        # Q Value layer
+        QValue = tf.matmul(h_fc1, W_fc2) + b_fc2
+
+        return (stateInput, QValue, W_conv1, b_conv1, W_conv2, b_conv2,
+                W_conv3, b_conv3, W_fc1, b_fc1, W_fc2, b_fc2)
+
+    def copyTargetQNetwork(self):
+        """Overwrite every target-network parameter with its online counterpart."""
+        self.session.run(self.copyTargetQNetworkOperation)
+
+    def createTrainingMethod(self):
+        """Define the loss and the optimiser step for the online network.
+
+        The loss is the mean squared difference between the target `yInput`
+        and the Q-value of the action that was taken. That Q-value is selected
+        by multiplying the network output with the one-hot `actionInput`.
+        """
+        self.actionInput = tf.placeholder("float", [None, self.actions])
+        self.yInput = tf.placeholder("float", [None])
+        Q_Action = tf.reduce_sum(tf.multiply(self.QValue, self.actionInput), reduction_indices=1)
+        self.cost = tf.reduce_mean(tf.square(self.yInput - Q_Action))
+        self.trainStep = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6).minimize(self.cost)
+
+    def trainQNetwork(self):
+        """Run one training step on a random minibatch, then save / sync on schedule."""
+        import os  # local import: only needed to create the checkpoint directory
+
+        # Step 1: obtain random minibatch from replay memory
+        minibatch = random.sample(self.replayMemory, BATCH_SIZE)
+        state_batch, action_batch, reward_batch, nextState_batch, terminal_batch = (
+            list(column) for column in zip(*minibatch)
+        )
+
+        # Step 2: calculate y. Terminal transitions use the bare reward; the
+        # others add the discounted best next-state value from the target network.
+        QValue_batch = self.QValueT.eval(feed_dict={self.stateInputT: nextState_batch})
+        y_batch = [
+            reward if terminal else reward + GAMMA * np.max(next_q)
+            for reward, terminal, next_q in zip(reward_batch, terminal_batch, QValue_batch)
+        ]
+
+        self.trainStep.run(feed_dict={
+            self.yInput: y_batch,
+            self.actionInput: action_batch,
+            self.stateInput: state_batch,
+        })
+
+        # save network every 10000 iterations
+        if self.timeStep % 10000 == 0:
+            # Make sure the target directory exists before saving into it.
+            os.makedirs('./savedweights', exist_ok=True)
+            self.saver.save(self.session, './savedweights/network' + '-dqn', global_step=self.timeStep)
+
+        if self.timeStep % UPDATE_TIME == 0:
+            self.copyTargetQNetwork()
+
+    def setPerception(self, nextObservation, action, reward, terminal):
+        """Record one transition, train if the observation phase is over, advance the state.
+
+        Args:
+            nextObservation: newest frame, with a trailing channel axis of size 1
+                so it can be appended to the frame stack along axis 2.
+            action: one-hot action vector of length `actions`; it is later
+                fed to the `actionInput` placeholder.
+            reward: reward received for the transition.
+            terminal: whether the transition ended the episode.
+        """
+        # Slide the frame window: drop the oldest frame (channel 0) and put the
+        # newest frame last, so the channels stay ordered oldest -> newest.
+        newState = np.append(self.currentState[:, :, 1:], nextObservation, axis=2)
+        self.replayMemory.append((self.currentState, action, reward, newState, terminal))
+        if self.timeStep > OBSERVE:
+            # Train the network
+            self.trainQNetwork()
+
+        # print info
+        if self.timeStep % 10000 == 0:
+            if self.timeStep <= OBSERVE:
+                state = "observe"
+            elif self.timeStep <= OBSERVE + EXPLORE:
+                state = "explore"
+            else:
+                state = "train"
+            print("TIMESTEP", self.timeStep, "/ STATE", state, "/ EPSILON", self.epsilon)
+
+        self.currentState = newState
+        self.timeStep += 1
+
+    def getAction(self):
+        """Pick an epsilon-greedy action for the current state.
+
+        Returns:
+            One-hot numpy vector of length `actions`.
+        """
+        action = np.zeros(self.actions)
+        if self.timeStep % FRAME_PER_ACTION == 0:
+            if random.random() <= self.epsilon:
+                action_index = random.randrange(self.actions)
+            else:
+                # Only run the network when the greedy action is actually needed.
+                QValue = self.QValue.eval(feed_dict={self.stateInput: [self.currentState]})[0]
+                action_index = np.argmax(QValue)
+        else:
+            action_index = 0  # action 0, treated as "do nothing" by the original author
+        action[action_index] = 1
+
+        # change epsilon: linear decay, one step per call once observing is over
+        if self.epsilon > FINAL_EPSILON and self.timeStep > OBSERVE:
+            self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
+
+        return action
+
+    def setInitState(self, observation):
+        """Start a new frame stack made of four copies of `observation` along axis 2."""
+        self.currentState = np.stack((observation, observation, observation, observation), axis=2)
+
+    def weight_variable(self, shape):
+        """Return a variable of `shape` initialised from a truncated normal (stddev 0.01)."""
+        initial = tf.truncated_normal(shape, stddev=0.01)
+        return tf.Variable(initial)
+
+    def bias_variable(self, shape):
+        """Return a variable of `shape` initialised to the constant 0.01."""
+        initial = tf.constant(0.01, shape=shape)
+        return tf.Variable(initial)
+
+    def conv2d(self, x, W, stride):
+        """Apply a VALID-padded 2D convolution with the same stride on both spatial axes."""
+        return tf.nn.conv2d(x, W, strides=[1, stride, stride, 1], padding="VALID")
+
+    def max_pool_2x2(self, x):
+        """Apply 2x2 max pooling with stride 2 and SAME padding (currently unused by the network)."""
+        return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")
diff --git a/train_model.py b/train_model.py
@@ -0,0 +1,51 @@
+import matplotlib.pyplot as plt
+
+import cv2
+from .models.brain_dqn import BrainDQN
+import numpy as np
+
+import gym
+
+'''
+Models are allowed to train for a fixed number of timesteps per episode, over a fixed number of episodes.
+'''
+def train(agent, episodes_to_play, timesteps_alloted):
+    """Train `agent` on SpaceInvaders-v0 for a fixed number of fixed-length episodes.
+
+    Args:
+        agent: object exposing initialize(actions), setInitState(observation),
+            getAction() and setPerception(observation, action, reward, is_done).
+        episodes_to_play: number of training episodes to run.
+        timesteps_alloted: number of agent-controlled environment steps per episode.
+
+    Returns:
+        List with the total reward (including game-over penalties) of each episode.
+    """
+    env = gym.make('SpaceInvaders-v0')
+    try:
+        env.reset()
+        agent.initialize(env.action_space.n)
+
+        # Define preprocess step to be applied for all agents (reduce scope of problem for runtime)
+        def preprocess(observation):
+            # cv2.resize takes (width, height), so the result is 110 rows x 84 columns.
+            # Gym delivers RGB frames, hence RGB2GRAY rather than BGR2GRAY.
+            observation = cv2.cvtColor(cv2.resize(observation, (84, 110)), cv2.COLOR_RGB2GRAY)
+            # Keep the bottom 84 rows so the frame becomes 84x84.
+            observation = observation[26:110, :]
+            # Binarise: every pixel brighter than 1 becomes 255.
+            _, observation = cv2.threshold(observation, 1, 255, cv2.THRESH_BINARY)
+            return np.reshape(observation, (84, 84, 1))
+
+        rewards = []
+
+        # Count down so the progress line shows how many episodes are left.
+        for episodes_left in range(episodes_to_play - 1, -1, -1):
+            # do nothing for first action, just to obtain a frame to start from
+            observation0, _, is_done, _ = env.step(0)
+            if is_done:
+                observation0 = env.reset()
+            agent.setInitState(preprocess(observation0))
+
+            total_reward = 0
+            for _ in range(timesteps_alloted):
+                action = agent.getAction()
+                next_observation, reward, is_done, _ = env.step(action)
+
+                if is_done:
+                    reward = -10 # Penalize retries
+                # Store the transition with the frame the environment really
+                # returned, including the last frame of a finished game.
+                agent.setPerception(preprocess(next_observation), action, reward, is_done)
+                if is_done:
+                    # Start the next game from a clean frame stack so frames of
+                    # the finished game do not leak into the new one.
+                    agent.setInitState(preprocess(env.reset()))
+                total_reward = total_reward + reward
+
+            rewards.append(total_reward)
+            print("T-" + str(episodes_left) + ": " + str(total_reward))
+        print("Training Session Completed")
+        print("Rewards: " + str(rewards))
+        return rewards
+    finally:
+        # Release the emulator even when training is interrupted by an error.
+        env.close()