@tambetm
Created May 7, 2016 12:14

Uses the dueling network architecture with Q-learning, as outlined in this paper:

Dueling Network Architectures for Deep Reinforcement Learning
Ziyu Wang, Tom Schaul, Matteo Hessel, Hado van Hasselt, Marc Lanctot, Nando de Freitas
http://arxiv.org/abs/1511.06581
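
The last Dense layer of the model outputs env.action_space.n + 1 values per state: the first is treated as the state value V(s) and the rest as per-action advantages A(s, a), which a Lambda layer combines into Q-values. As a rough NumPy sketch of the three --advantage modes (the combine_dueling helper below is illustrative only, not part of the script, and applies the paper's per-state aggregation):

import numpy as np

def combine_dueling(y, mode='naive'):
    # y: array of shape (batch, n_actions + 1); column 0 is V(s), the rest are A(s, a)
    v, a = y[:, :1], y[:, 1:]
    if mode == 'naive':
        return v + a                                  # Q = V + A
    elif mode == 'max':
        return v + a - a.max(axis=1, keepdims=True)   # Q = V + A - max_a A
    elif mode == 'avg':
        return v + a - a.mean(axis=1, keepdims=True)  # Q = V + A - mean_a A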

Command line:

python duel.py CartPole-v0 --gamma 0.995
import argparse
import gym
from gym.spaces import Box, Discrete
from keras.models import Model
from keras.layers import Input, Dense, Lambda
from keras.layers.normalization import BatchNormalization
from keras import backend as K
import numpy as np
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', type=int, default=100)
parser.add_argument('--hidden_size', type=int, default=100)
parser.add_argument('--layers', type=int, default=1)
parser.add_argument('--batch_norm', action="store_true", default=False)
parser.add_argument('--no_batch_norm', action="store_false", dest='batch_norm')
parser.add_argument('--min_train', type=int, default=10)
parser.add_argument('--train_repeat', type=int, default=10)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--tau', type=float, default=0.001)
parser.add_argument('--episodes', type=int, default=200)
parser.add_argument('--max_timesteps', type=int, default=200)
parser.add_argument('--activation', choices=['tanh', 'relu'], default='tanh')
parser.add_argument('--optimizer', choices=['adam', 'rmsprop'], default='adam')
#parser.add_argument('--optimizer_lr', type=float, default=0.001)
parser.add_argument('--exploration', type=float, default=0.1)
parser.add_argument('--advantage', choices=['naive', 'max', 'avg'], default='naive')
parser.add_argument('--display', action='store_true', default=True)
parser.add_argument('--no_display', dest='display', action='store_false')
parser.add_argument('--gym_record')
parser.add_argument('environment')
args = parser.parse_args()
env = gym.make(args.environment)
assert isinstance(env.observation_space, Box)
assert isinstance(env.action_space, Discrete)
if args.gym_record:
    env.monitor.start(args.gym_record)
def createLayers():
    x = Input(shape=env.observation_space.shape)
    if args.batch_norm:
        h = BatchNormalization()(x)
    else:
        h = x
    for i in xrange(args.layers):
        h = Dense(args.hidden_size, activation=args.activation)(h)
        if args.batch_norm and i != args.layers - 1:
            h = BatchNormalization()(h)
    y = Dense(env.action_space.n + 1)(h)
    if args.advantage == 'avg':
        z = Lambda(lambda a: K.expand_dims(a[:,0], dim=-1) + a[:,1:] - K.mean(a[:, 1:], keepdims=True), output_shape=(env.action_space.n,))(y)
    elif args.advantage == 'max':
        z = Lambda(lambda a: K.expand_dims(a[:,0], dim=-1) + a[:,1:] - K.max(a[:, 1:], keepdims=True), output_shape=(env.action_space.n,))(y)
    elif args.advantage == 'naive':
        z = Lambda(lambda a: K.expand_dims(a[:,0], dim=-1) + a[:,1:], output_shape=(env.action_space.n,))(y)
    else:
        assert False
    return x, z
x, z = createLayers()
model = Model(input=x, output=z)
model.summary()
model.compile(optimizer='adam', loss='mse')
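# second copy of the network, used as the slowly-updated target network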
x, z = createLayers()
target_model = Model(input=x, output=z)
target_model.set_weights(model.get_weights())
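# replay memory kept as parallel lists of transitions (state, action, reward, next state, terminal)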
prestates = []
actions = []
rewards = []
poststates = []
terminals = []
total_reward = 0
for i_episode in xrange(args.episodes):
    observation = env.reset()
    episode_reward = 0
    for t in xrange(args.max_timesteps):
        if args.display:
            env.render()
        if np.random.random() < args.exploration:
            action = env.action_space.sample()
        else:
            s = np.array([observation])
            q = model.predict(s, batch_size=1)
            #print "q:", q
            action = np.argmax(q[0])
            #print "action:", action
        prestates.append(observation)
        actions.append(action)
        observation, reward, done, info = env.step(action)
        episode_reward += reward
        #print "reward:", reward
        rewards.append(reward)
        poststates.append(observation)
        terminals.append(done)
        if len(prestates) > args.min_train:
            for k in xrange(args.train_repeat):
                if len(prestates) > args.batch_size:
                    indexes = np.random.choice(len(prestates), size=args.batch_size)
                else:
                    indexes = range(len(prestates))
                qpre = model.predict(np.array(prestates)[indexes])
                # bootstrap the Q-learning targets from the target network
                qpost = target_model.predict(np.array(poststates)[indexes])
                for i in xrange(len(indexes)):
                    if terminals[indexes[i]]:
                        qpre[i, actions[indexes[i]]] = rewards[indexes[i]]
                    else:
                        qpre[i, actions[indexes[i]]] = rewards[indexes[i]] + args.gamma * np.amax(qpost[i])
                model.train_on_batch(np.array(prestates)[indexes], qpre)
                # soft update of the target network toward the online network
                weights = model.get_weights()
                target_weights = target_model.get_weights()
                for i in xrange(len(weights)):
                    target_weights[i] = args.tau * weights[i] + (1 - args.tau) * target_weights[i]
                target_model.set_weights(target_weights)
        if done:
            break
    print "Episode {} finished after {} timesteps, episode reward {}".format(i_episode + 1, t + 1, episode_reward)
    total_reward += episode_reward
print "Average reward per episode {}".format(total_reward / args.episodes)
if args.gym_record:
    env.monitor.close()
@Sohojoe commented May 29, 2016

@tambetm - this is very cool!

I've been trying to reproduce it; my result is close but still below 195 (the CartPole-v0 solved threshold). Is this just a random artifact? It seems to solve much faster than other algorithms, so maybe I just need to run it for longer? That said, it's pretty slow and took over an hour on my MacBook Pro (CPU only). I was using Theano and will try again with TensorFlow.
https://gym.openai.com/evaluations/eval_GPY4SsOZRzqUdnthlrynHw
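
(For a longer run, the script's existing flags can be combined, e.g. --episodes and --gym_record; the output path below is only a placeholder:)

python duel.py CartPole-v0 --gamma 0.995 --episodes 500 --gym_record /tmp/cartpole-eval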
