Q-learn w/ function approximator and experience replay
q_experience_replay.py
import numpy as np
import argparse
import tensorflow as tf
import time
import random
# This is an improved version of Q_Basic. It has experience replay, and remembers previous transitions to train on again.
# In order to fix non-convergence problems, I manually put a reward of -200 when failing to reach 200 timesteps, and I
# run several supervised training updates (--training_iterations) after each episode.
# This model uses two Q networks: an old network that stays fixed for some number of episodes, while a new network is
# trained from a one-step lookahead to the old network. The old network occasionally updates itself to the new network.
# EDIT: it turns out the two-Q-network setup doesn't really help, so I commented it out.
class Q_Experience_Replay():
    def __init__(self, name, sess, num_actions, num_observations, num_hidden):
        self.num_actions = num_actions
        self.num_observations = num_observations
        self.num_hidden = num_hidden
        self.sess = sess
        self.observations_in = tf.placeholder(tf.float32, [None, num_observations])
        with tf.variable_scope(name):
            self.w1 = tf.Variable(self.xavier_initializer([num_observations, num_hidden]), name="w1")
            self.b1 = tf.Variable(self.xavier_initializer([num_hidden]), name="b1")
            self.w2 = tf.Variable(self.xavier_initializer([num_hidden, num_hidden]), name="w2")
            self.b2 = tf.Variable(self.xavier_initializer([num_hidden]), name="b2")
            self.w3 = tf.Variable(self.xavier_initializer([num_hidden, num_actions]), name="w3")
            self.b3 = tf.Variable(self.xavier_initializer([num_actions]), name="b3")
        self.h1 = tf.sigmoid(tf.matmul(self.observations_in, self.w1) + self.b1)
        self.h2 = tf.sigmoid(tf.matmul(self.h1, self.w2) + self.b2)
        self.estimated_values = tf.matmul(self.h2, self.w3) + self.b3
        self.tvars = tf.trainable_variables()
        # one-hot matrix of which action was taken
        self.action_in = tf.placeholder(tf.float32, [None, num_actions])
        # vector of size [timesteps]
        self.return_in = tf.placeholder(tf.float32, [None])
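        # pick out the predicted value of the action that was actually taken, and regress it toward the target return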
        guessed_action_value = tf.reduce_sum(self.estimated_values * self.action_in, reduction_indices=1)
        loss = tf.nn.l2_loss(guessed_action_value - self.return_in)
        self.debug = loss
        self.learning_rate = tf.placeholder(tf.float32)
        self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss)
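        # placeholders and assign ops used by transferParams() to copy weights in from another network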
        self.w1_placeholder = tf.placeholder(tf.float32, [num_observations, num_hidden])
        self.b1_placeholder = tf.placeholder(tf.float32, [num_hidden])
        self.w2_placeholder = tf.placeholder(tf.float32, [num_hidden, num_hidden])
        self.b2_placeholder = tf.placeholder(tf.float32, [num_hidden])
        self.w3_placeholder = tf.placeholder(tf.float32, [num_hidden, num_actions])
        self.b3_placeholder = tf.placeholder(tf.float32, [num_actions])
        self.w1_assign = self.w1.assign(self.w1_placeholder)
        self.b1_assign = self.b1.assign(self.b1_placeholder)
        self.w2_assign = self.w2.assign(self.w2_placeholder)
        self.b2_assign = self.b2.assign(self.b2_placeholder)
        self.w3_assign = self.w3.assign(self.w3_placeholder)
        self.b3_assign = self.b3.assign(self.b3_placeholder)
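    # Glorot/Xavier uniform initialization: samples from [-bound, bound] with bound = sqrt(6 / (fan_in + fan_out))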
    def xavier_initializer(self, shape):
        dim_sum = np.sum(shape)
        if len(shape) == 1:
            dim_sum += 1
        bound = np.sqrt(6.0 / dim_sum)
        return tf.random_uniform(shape, minval=-bound, maxval=bound)
    def getAction(self, observation):
        values = self.getValues(observation)
        return np.argmax(values[0], axis=0)

    def getValues(self, observation):
        observation_reshaped = np.reshape(observation, (1, self.num_observations))
        return self.sess.run(self.estimated_values, feed_dict={self.observations_in: observation_reshaped})

    def getBatchValues(self, observation):
        return self.sess.run(self.estimated_values, feed_dict={self.observations_in: observation})

    def update(self, observation, action, reward, learning_rate):
        estimated, _ = self.sess.run([self.debug, self.optimizer], feed_dict={self.learning_rate: learning_rate, self.observations_in: observation, self.action_in: action, self.return_in: reward})
        return estimated
    def transferParams(self, otherModel):
        # copy every layer's weights and biases from otherModel into this network, including the output layer (w3/b3)
        w1, b1, w2, b2, w3, b3 = self.sess.run([otherModel.w1, otherModel.b1, otherModel.w2, otherModel.b2, otherModel.w3, otherModel.b3])
        self.sess.run([self.w1_assign, self.b1_assign, self.w2_assign, self.b2_assign, self.w3_assign, self.b3_assign],
                      feed_dict={self.w1_placeholder: w1, self.b1_placeholder: b1, self.w2_placeholder: w2,
                                 self.b2_placeholder: b2, self.w3_placeholder: w3, self.b3_placeholder: b3})

def learn(env, args):
    num_actions = int(env.action_space.n)
    num_observations, = env.observation_space.shape
    sess = tf.Session()
    lr = args.learningrate
    model = Q_Experience_Replay("main", sess, num_actions, num_observations, args.hidden)
    old_model = Q_Experience_Replay("old", sess, num_actions, num_observations, args.hidden)
    sess.run(tf.initialize_all_variables())
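    # start the (currently unused) frozen "old" network with the same weights as the main network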
    old_model.transferParams(model)
    np.set_printoptions(precision=3, suppress=True)
    transitions = []
    epsilon = 1
    finished_learning = False
    for episode in xrange(args.episodes):
        observation = env.reset()
        epsilon = epsilon * args.epsilon_decay
        # epsilon = 0
        # totalreward = 0
        for frame in xrange(args.maxframes + 1):
            if args.render:
                env.render()
            if not finished_learning:
                print "%s : %s" % (np.array_str(observation), np.array_str(model.getValues(observation)))
            # epsilon-greedy actions
            action = model.getAction(observation)
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            old_observation = observation
            observation, reward, done, info = env.step(action)
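            # store the transition (s, a, reward, done, s'); episodes that end before maxframes get the negative "shame" reward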
            if done and frame < args.maxframes - 1:
                reward = args.shame
                transitions.append((old_observation, action, reward, True, observation))
            else:
                transitions.append((old_observation, action, reward, False, observation))
            if frame >= args.maxframes - 1:
                print "done"
                finished_learning = True
            if len(transitions) > args.memory_size:
                transitions.pop(0)
            if done:
                print "episode %d, lasted %d, eps %f, lr*1e4 %f" % (episode, frame, epsilon, lr * 10000)
                break
        if not finished_learning:
            if len(transitions) == args.memory_size:
                print "trained %f" % epsilon
                for x in xrange(args.training_iterations):
                    observation_history = np.zeros((0, num_observations))
                    action_history = np.zeros((0, num_actions))
                    TQ_history = np.array(())
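                    # sample a random minibatch of stored transitions and build Q-learning targets:
                    # TQ = reward + discount * max_a' Q(s', a'), or the "shame" reward at terminal states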
                    for transition in random.sample(transitions, args.batchsize):
                        old_observation, action, reward, done, next_observation = transition
                        # print "%s w/action %d gives reward %f to %s which was worth %s" % (np.array_str(old_observation), action, reward, np.array_str(next_observation), np.array_str(model.getValues(next_observation)))
                        TQ = reward + args.discount * np.amax(model.getValues(next_observation))
                        if done:
                            # a big negative reward when failing to reach the goal: this solves convergence problems
                            TQ = args.shame
                        old_observation_reshaped = np.reshape(old_observation, (1, num_observations))
                        observation_history = np.append(observation_history, old_observation_reshaped, axis=0)
                        action_onehot = np.zeros((1, num_actions))
                        action_onehot[:, action] = 1.0
                        action_history = np.append(action_history, action_onehot, axis=0)
                        TQ_history = np.append(TQ_history, TQ)
                    model.update(observation_history, action_history, TQ_history, lr)
                    lr *= args.learningrate_decay
                    # print "params"
Runner script (imports q_basic and q_experience_replay):
import gym
import os
import argparse
import q_basic
import q_experience_replay

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, default="Acrobot-v0", help="OpenAI gym env to run on")
    parser.add_argument("--submit", type=bool, default=False, help="record the trial for submission?")
    parser.add_argument("--render", type=bool, default=False, help="render the environment while training?")
    parser.add_argument("--episodes", type=int, default=2000, help="how many episodes to train for")
    parser.add_argument("--maxframes", type=int, default=200, help="max frames to run an episode for")
    parser.add_argument("--discount", type=float, default=0.95, help="discount factor for later timesteps")
    parser.add_argument("--hidden", type=int, default=64, help="nodes in hidden layer")
    parser.add_argument("--batchsize", type=int, default=64, help="batch size for experience replay")
    parser.add_argument("--training_iterations", type=int, default=50, help="how many training updates to run per episode")
    parser.add_argument("--updaterate", type=int, default=16, help="when to swap frozen and updated models")
    parser.add_argument("--epsilon_decay", type=float, default=0.99, help="scale-down factor for epsilon-greedy")
    parser.add_argument("--memory_size", type=int, default=300, help="memory size for experience replay")
    parser.add_argument("--shame", type=int, default=-200, help="'shame' negative reward when the agent doesn't reach the final timestep. used in CartPole")
    parser.add_argument("--accomplishment", type=int, default=1000, help="'accomplishment' positive reward when the agent reaches a final state. used in Acrobot")
    parser.add_argument("--learningrate", type=float, default=0.0001, help="learning rate")
parser.add_argument("--learningrate_decay", type=int, default=0.9995, help="decay learningrate by multiply") | |
    args = parser.parse_args()
    env = gym.make(args.env)
    if args.submit:
        env.monitor.start('monitor/', force=True)
    # q_basic.learn(env, args)
    q_experience_replay.learn(env, args)
    if args.submit:
        env.monitor.close()
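
A minimal example invocation, assuming the runner above is saved as main.py next to q_basic.py and q_experience_replay.py (the runner's file name is an assumption, not given in the gist):

    python main.py --env CartPole-v0 --episodes 2000 --maxframes 200

Note that --render and --submit are declared with type=bool, so argparse converts any non-empty string (even "False") to True; leave those flags off entirely to keep them disabled.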