Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from .models.brain_dqn import BrainDQN
import numpy as np

class Agent_DQN:
    """Wrapper around BrainDQN exposing a consistent agent API for training.

    The training loop talks to the agent in terms of integer action indices,
    while the wrapped model works with one-hot action vectors (its training
    graph multiplies the Q-values by the action vector to select one entry).
    This class converts between the two representations.
    """

    def __init__(self):
        # The network is created in initialize(), once the number of
        # available actions is known.
        self.model = None

    def initialize(self, actions):
        """Build the underlying model for an action space of size `actions`."""
        self.model = BrainDQN(actions)

    def setInitState(self, observation):
        """Seed the model's frame stack from the first observation."""
        self.model.setInitState(observation)
        # Drop any length-1 axes left over from stacking frames that carry an
        # explicit single-channel dimension.
        self.model.currentState = np.squeeze(self.model.currentState)

    def getAction(self):
        """Return the index of the action chosen by the model as a plain int."""
        return int(np.argmax(self.model.getAction()))

    def setPerception(self, next_observation, action, reward, is_done):
        """Record one transition in the model.

        `action` may be the integer index returned by getAction() or an
        already one-hot encoded vector. Indices are expanded to one-hot here
        because the model stores the action as-is in its replay memory and
        later feeds it to a placeholder with one column per action.
        """
        if np.ndim(action) == 0:
            one_hot = np.zeros(self.model.actions)
            one_hot[int(action)] = 1
            action = one_hot
        self.model.setPerception(next_observation, action, reward, is_done)

193 changes: 193 additions & 0 deletions brain_dqn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
# -----------------------------
# File: Deep Q-Learning Algorithm
# Author: Flood Sung
# Date: 2016.3.21
# -----------------------------

import os
import random
from collections import deque

import numpy as np
import tensorflow as tf

# Hyper Parameters:
FRAME_PER_ACTION = 1  # a new action is chosen every this many timesteps
GAMMA = 0.95  # discount applied to the bootstrapped next-state value
OBSERVE = 50000.  # timesteps to observe before training
EXPLORE = 1000000.  # frames over which to anneal epsilon
FINAL_EPSILON = 0.1  # final value of epsilon (0.001 was noted as an alternative)
INITIAL_EPSILON = 1.0  # starting value of epsilon (0.01 was noted as an alternative)
REPLAY_MEMORY = 40000  # number of previous transitions to remember
BATCH_SIZE = 32  # size of minibatch
UPDATE_TIME = 10000  # timesteps between copies of the Q network into the target network
SAVE_INTERVAL = 10000  # timesteps between checkpoints
LOG_INTERVAL = 10000  # timesteps between progress lines
CHECKPOINT_DIR = "./savedweights"  # where checkpoints are read from and written to


class BrainDQN:
    """Deep Q-learning model with a replay memory and a separate target network.

    The state is a stack of the four most recent frames along axis 2, ordered
    oldest first and newest last. Actions are exchanged with callers as one-hot
    vectors of length `actions`.
    """

    def __init__(self, actions):
        """Build both networks, open a session and restore a checkpoint if one exists.

        Args:
            actions: number of discrete actions (width of the Q-value output).
        """
        # init replay memory; maxlen makes the deque discard the oldest
        # transition automatically once it is full
        self.replayMemory = deque(maxlen=REPLAY_MEMORY)
        # init some parameters
        self.timeStep = 0
        self.epsilon = INITIAL_EPSILON
        self.actions = actions

        # NOTE: variables are created without explicit names, so the order in
        # which the networks (and the optimizer) are built determines the
        # auto-generated names stored in checkpoints. Keep this order stable.

        # init Q network
        (self.stateInput, self.QValue,
         self.W_conv1, self.b_conv1,
         self.W_conv2, self.b_conv2,
         self.W_conv3, self.b_conv3,
         self.W_fc1, self.b_fc1,
         self.W_fc2, self.b_fc2) = self.createQNetwork()

        # init Target Q Network
        (self.stateInputT, self.QValueT,
         self.W_conv1T, self.b_conv1T,
         self.W_conv2T, self.b_conv2T,
         self.W_conv3T, self.b_conv3T,
         self.W_fc1T, self.b_fc1T,
         self.W_fc2T, self.b_fc2T) = self.createQNetwork()

        # One assign op per parameter: target <- online.
        online_params = [
            self.W_conv1, self.b_conv1, self.W_conv2, self.b_conv2,
            self.W_conv3, self.b_conv3, self.W_fc1, self.b_fc1,
            self.W_fc2, self.b_fc2,
        ]
        target_params = [
            self.W_conv1T, self.b_conv1T, self.W_conv2T, self.b_conv2T,
            self.W_conv3T, self.b_conv3T, self.W_fc1T, self.b_fc1T,
            self.W_fc2T, self.b_fc2T,
        ]
        self.copyTargetQNetworkOperation = [
            target.assign(online)
            for target, online in zip(target_params, online_params)
        ]

        self.createTrainingMethod()

        # saving and loading networks
        self.saver = tf.train.Saver()
        self.session = tf.InteractiveSession()
        self.session.run(tf.global_variables_initializer())
        checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.session, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")

    def createQNetwork(self):
        """Create one Q network and return its input, output and parameters.

        Returns:
            A 12-tuple: (stateInput, QValue, W_conv1, b_conv1, W_conv2,
            b_conv2, W_conv3, b_conv3, W_fc1, b_fc1, W_fc2, b_fc2).
        """
        # network weights
        W_conv1 = self.weight_variable([8, 8, 4, 32])
        b_conv1 = self.bias_variable([32])

        W_conv2 = self.weight_variable([4, 4, 32, 64])
        b_conv2 = self.bias_variable([64])

        W_conv3 = self.weight_variable([3, 3, 64, 64])
        b_conv3 = self.bias_variable([64])

        # 3136 = 7 * 7 * 64: with VALID padding the 84x84 input shrinks to
        # 20x20, then 9x9, then 7x7 across the three convolutions.
        W_fc1 = self.weight_variable([3136, 512])
        b_fc1 = self.bias_variable([512])

        W_fc2 = self.weight_variable([512, self.actions])
        b_fc2 = self.bias_variable([self.actions])

        # input layer
        stateInput = tf.placeholder("float", [None, 84, 84, 4])

        # hidden layers
        h_conv1 = tf.nn.relu(self.conv2d(stateInput, W_conv1, 4) + b_conv1)
        h_conv2 = tf.nn.relu(self.conv2d(h_conv1, W_conv2, 2) + b_conv2)
        h_conv3 = tf.nn.relu(self.conv2d(h_conv2, W_conv3, 1) + b_conv3)

        h_conv3_shape = h_conv3.get_shape().as_list()
        print("dimension:", h_conv3_shape[1] * h_conv3_shape[2] * h_conv3_shape[3])
        h_conv3_flat = tf.reshape(h_conv3, [-1, 3136])
        h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat, W_fc1) + b_fc1)

        # Q Value layer
        QValue = tf.matmul(h_fc1, W_fc2) + b_fc2

        return (stateInput, QValue, W_conv1, b_conv1, W_conv2, b_conv2,
                W_conv3, b_conv3, W_fc1, b_fc1, W_fc2, b_fc2)

    def copyTargetQNetwork(self):
        """Copy the current Q-network parameters into the target network."""
        self.session.run(self.copyTargetQNetworkOperation)

    def createTrainingMethod(self):
        """Define the loss and the optimizer step for the Q network."""
        self.actionInput = tf.placeholder("float", [None, self.actions])
        self.yInput = tf.placeholder("float", [None])
        # The one-hot action vector masks out every Q-value except the one of
        # the action that was actually taken.
        Q_Action = tf.reduce_sum(tf.multiply(self.QValue, self.actionInput), axis=1)
        self.cost = tf.reduce_mean(tf.square(self.yInput - Q_Action))
        # Positional arguments: learning_rate, decay, momentum, epsilon.
        self.trainStep = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6).minimize(self.cost)

    def trainQNetwork(self):
        """Run one optimizer step on a random minibatch from the replay memory.

        Also writes a checkpoint every SAVE_INTERVAL timesteps and refreshes
        the target network every UPDATE_TIME timesteps.
        """
        # Step 1: obtain random minibatch from replay memory
        minibatch = random.sample(self.replayMemory, BATCH_SIZE)
        state_batch = [transition[0] for transition in minibatch]
        action_batch = [transition[1] for transition in minibatch]
        reward_batch = [transition[2] for transition in minibatch]
        nextState_batch = [transition[3] for transition in minibatch]
        terminal_batch = [transition[4] for transition in minibatch]

        # Step 2: calculate y
        # Terminal transitions have no successor, so their target is the bare
        # reward; otherwise bootstrap from the target network's best Q-value.
        QValue_batch = self.QValueT.eval(feed_dict={self.stateInputT: nextState_batch})
        y_batch = [
            reward if terminal else reward + GAMMA * np.max(next_q)
            for reward, terminal, next_q in zip(reward_batch, terminal_batch, QValue_batch)
        ]

        self.trainStep.run(feed_dict={
            self.yInput: y_batch,
            self.actionInput: action_batch,
            self.stateInput: state_batch,
        })

        # save network every SAVE_INTERVAL iterations
        if self.timeStep % SAVE_INTERVAL == 0:
            # Saver.save refuses to write when the parent directory is missing.
            os.makedirs(CHECKPOINT_DIR, exist_ok=True)
            self.saver.save(self.session, CHECKPOINT_DIR + '/network-dqn', global_step=self.timeStep)

        if self.timeStep % UPDATE_TIME == 0:
            self.copyTargetQNetwork()

    def setPerception(self, nextObservation, action, reward, terminal):
        """Store one transition, train once past the observe phase, advance the state.

        Args:
            nextObservation: newest frame, shaped (H, W, 1) so that it can be
                appended to the frame stack along axis 2.
            action: action taken, stored as-is in the replay memory and later
                fed to the one-hot action placeholder.
            reward: reward received for the action.
            terminal: truthy when the transition ended an episode.
        """
        # Slide the window: drop the oldest frame (index 0) and put the newest
        # frame last, so all four channels keep advancing over time.
        newState = np.append(self.currentState[:, :, 1:], nextObservation, axis=2)
        self.replayMemory.append((self.currentState, action, reward, newState, terminal))
        if self.timeStep > OBSERVE:
            # Train the network
            self.trainQNetwork()

        # print info
        if self.timeStep % LOG_INTERVAL == 0:
            if self.timeStep <= OBSERVE:
                state = "observe"
            elif self.timeStep <= OBSERVE + EXPLORE:
                state = "explore"
            else:
                state = "train"
            print("TIMESTEP", self.timeStep, "/ STATE", state, "/ EPSILON", self.epsilon)

        self.currentState = newState
        self.timeStep += 1

    def getAction(self):
        """Choose an action epsilon-greedily and return it as a one-hot vector."""
        action = np.zeros(self.actions)
        if self.timeStep % FRAME_PER_ACTION == 0:
            if random.random() <= self.epsilon:
                action_index = random.randrange(self.actions)
            else:
                # Only pay for a forward pass when the greedy action is needed.
                QValue = self.QValue.eval(feed_dict={self.stateInput: [self.currentState]})[0]
                action_index = np.argmax(QValue)
        else:
            action_index = 0  # do nothing
        action[action_index] = 1

        # change epsilon: anneal linearly once the observe phase is over,
        # never going below FINAL_EPSILON
        if self.epsilon > FINAL_EPSILON and self.timeStep > OBSERVE:
            step = (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
            self.epsilon = max(FINAL_EPSILON, self.epsilon - step)

        return action

    def setInitState(self, observation):
        """Initialise the frame stack with four copies of the first frame.

        Accepts either a 2-D frame (H, W) or a 3-D frame with an explicit
        channel axis (H, W, 1); both yield a state of shape (H, W, 4).
        """
        frames = (observation, observation, observation, observation)
        if np.ndim(observation) == 3:
            # The channel axis already exists, so join along it rather than
            # inserting a new axis (which would produce a 4-D state).
            self.currentState = np.concatenate(frames, axis=2)
        else:
            self.currentState = np.stack(frames, axis=2)

    def weight_variable(self, shape):
        """Return a variable of `shape` drawn from a truncated normal (stddev 0.01)."""
        initial = tf.truncated_normal(shape, stddev=0.01)
        return tf.Variable(initial)

    def bias_variable(self, shape):
        """Return a variable of `shape` filled with the constant 0.01."""
        initial = tf.constant(0.01, shape=shape)
        return tf.Variable(initial)

    def conv2d(self, x, W, stride):
        """Apply a VALID-padded 2-D convolution with the same stride on both spatial axes."""
        return tf.nn.conv2d(x, W, strides=[1, stride, stride, 1], padding="VALID")

    def max_pool_2x2(self, x):
        """Apply 2x2 max pooling with stride 2 and SAME padding."""
        return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")
51 changes: 51 additions & 0 deletions train_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import matplotlib.pyplot as plt

import cv2
from .models.brain_dqn import BrainDQN
import numpy as np

import gym

def train(agent, episodes_to_play, timesteps_alloted):
    """Train `agent` on SpaceInvaders for a fixed number of fixed-length episodes.

    Models are allowed to train for a fixed amount of timesteps for a fixed
    number of episodes. An "episode" here is a budget of `timesteps_alloted`
    environment steps; when the game ends inside that budget the environment
    is reset and the episode continues.

    Args:
        agent: object providing initialize(actions), setInitState(observation),
            getAction() and setPerception(observation, action, reward, is_done).
        episodes_to_play: number of episodes to run.
        timesteps_alloted: number of agent steps per episode.

    Returns:
        List with the total reward collected in each episode.
    """
    env = gym.make('SpaceInvaders-v0')

    # Define preprocess step to be applied for all agents (reduce scope of problem for runtime)
    def preprocess(observation):
        # Gym frames are RGB. Resize to 84 wide x 110 tall, convert to
        # grayscale, keep the bottom 84 rows, then binarise.
        observation = cv2.cvtColor(cv2.resize(observation, (84, 110)), cv2.COLOR_RGB2GRAY)
        observation = observation[26:110, :]
        ret, observation = cv2.threshold(observation, 1, 255, cv2.THRESH_BINARY)
        return np.reshape(observation, (84, 84, 1))

    rewards = []
    try:
        env.reset()

        actions = env.action_space.n
        agent.initialize(actions)

        for episode in range(episodes_to_play):
            action0 = 0  # do nothing for first action
            observation0, reward0, is_done, info = env.step(action0)
            if is_done:
                # Never keep stepping an environment that has already ended.
                observation0 = env.reset()
            agent.setInitState(preprocess(observation0))

            total_reward = 0
            for _ in range(timesteps_alloted):
                action = agent.getAction()  # Feel like state/observation should be passed here

                next_observation, reward, is_done, info = env.step(action)

                if is_done:
                    next_observation = env.reset()
                    reward = -10  # Penalize retries
                agent.setPerception(preprocess(next_observation), action, reward, is_done)
                total_reward = total_reward + reward

            rewards.append(total_reward)
            episodes_left = episodes_to_play - episode - 1
            print("T-" + str(episodes_left) + ": " + str(total_reward))
    finally:
        # Release the emulator even if training is interrupted or fails.
        env.close()

    print("Training Session Completed")
    print("Rewards: " + str(rewards))
    return rewards