@colinmorris
Last active August 2, 2017 08:13

algorithmic.py is a script that solves a few of the environments in the Algorithmic category of the OpenAI Gym.

Basically a semi-homemade policy gradients implementation, with some inspiration drawn from a few existing examples.

Can pretty consistently beat the Copy and DuplicatedInput envs. Could probably beat RepeatCopy with some tuning. Unlikely to beat the addition environments, because they look hard and this code is currently only set up to handle 1-dimensional input tapes. Could maybe beat Reverse with some changes, but it would be tricky: I've tried it, and it seems really hard to escape the local minimum that comes from just blindly guessing the output.

Hacks

This code is littered with the corpses of failed experiments. The main one that stuck was adjusting the base reward for non-writing actions from 0 to a negative value. Basically, I wanted to discourage dilly-dallying: ping-ponging left and right a few times and then writing an 'A' shouldn't give a higher total reward than just writing the 'A'. I think this is kind of justifiable because the algorithmic environments have (very harsh) time limits, so an action that accomplishes nothing is worse than neutral.
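
Concretely, the penalty just replaces a raw reward of 0 with a small negative number during the backwards discounting pass. Here's a minimal standalone sketch of the idea (the script's real logic lives in discounted_rewards below; the gamma and zero_penalty values are just the Copy-v0 config):

    import numpy as np

    def discounted_returns(rewards, gamma=0.5, zero_penalty=-0.6):
        # Accumulate a discounted running sum from the end of the episode,
        # treating "did nothing" steps (reward 0) as slightly negative.
        returns = np.zeros(len(rewards))
        running = 0.0
        for i in reversed(range(len(rewards))):
            r = zero_penalty if rewards[i] == 0 else rewards[i]
            running = gamma * running + r
            returns[i] = running
        return returns

    # Two wasted moves before writing the right character now earn negative
    # returns, so padding the episode with moves no longer pays off.
    print(discounted_returns([0, 0, 1]))  # -> approximately [-0.65, -0.1, 1.0]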

The policy network gets the current character as input, plus (for some environments) the previously read character and the previous action. The previous-action input is supplemented with an extra slot for 'no previous action', which seems to help.
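
Concretely, the input vector is just a concatenation of one-hot blocks. A rough sketch of the layout (pack_input is a made-up helper name; the script's actual packing happens in pack_x inside episode(), and the defaults assume Copy-v0's 6-symbol observations and 2+2+5 = 9 action slots):

    def pack_input(obs, last_char, last_action_onehot, nchars=6, actions_size=9):
        # Current character, one-hot.
        x = [0] * nchars
        x[obs] = 1
        # Previously read character, one-hot (all zeros on the first step).
        last_char_block = [0] * nchars
        if last_char is not None:
            last_char_block[last_char] = 1
        # Previous action as a multi-hot over the Tuple action space, plus one
        # extra slot that is set only when no action has been taken yet.
        if last_action_onehot is None:
            last_action_block = [0] * actions_size + [1]
        else:
            last_action_block = list(last_action_onehot) + [0]
        return x + last_char_block + last_action_block

    # First step of an episode: only the current character is hot, plus the
    # trailing 'no previous action' slot.
    pack_input(obs=3, last_char=None, last_action_onehot=None)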

from matplotlib import pyplot as plt
import tensorflow as tf
import numpy as np
import gym
import sys
# Or try 'DuplicatedInput-v0' or 'RepeatCopy-v0'
ENV_NAME = 'Copy-v0'
_CONFIGS = {
    'Copy-v0': dict(
        gamma=.5,
        zero_penalty=-.6
    ),
    'DuplicatedInput-v0': dict(
        zero_penalty=-.001,
        input_last_char=1,
        input_last_action=1,
        learning_rate=0.01,
    ),
    # Doesn't work :(
    'RepeatCopy-v0': dict(
        input_last_char=0,
        input_last_action=0,
        input_last_direction=1,
        completion_bonus=1.5,
        gamma=.7,
        learning_rate=0.005
    ),
}
_CONFIG = _CONFIGS[ENV_NAME]
N_EPISODES = 5000
# Bail out early if we beat the final level (by expanding the environment's
# current_length to its max of 30)
EARLY_EXIT = 0
MONITOR = 1
# Does setting this to high values encourage dilly-dallying? (And does ZERO_PENALTY offset that?)
GAMMA = _CONFIG.get('gamma', .9)
# How many episodes to render after training
RENDER_EPISODES = 0
# L2 weight penalty
L2 = 1
L2_WEIGHT = 0.001
# If an episode was successfully completed (i.e. all target characters were
# written to the tape), multiply all rewards for that episode by this amount,
# if set.
COMPLETION_BONUS = _CONFIG.get('completion_bonus', 0)
LEARNING_RATE = _CONFIG.get('learning_rate', 0.001)
INPUT_LAST_CHAR = _CONFIG.get('input_last_char', 0)
INPUT_LAST_ACTION = _CONFIG.get('input_last_action', 0)
# Subset of the above, just covering left/right (or neither). Enable at most
# one of these, not both.
INPUT_LAST_DIRECTION = _CONFIG.get('input_last_direction', 0)
assert not (INPUT_LAST_ACTION and INPUT_LAST_DIRECTION)
INPUT_LAST_REWARD = 0
N_HIDDEN = 40
TELESCOPING_REWARDS = 0
# This hurts things very badly. iunno why. Got the idea from here:
# https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5
# Maybe it makes more sense for that kind of problem where one episode
# can have thousands of transitions?
STANDARDIZE_REWARD = 0
# Setting this to -.2 or less seems to help a lot for copy (almost necessary)
# Sweet spot maybe around -.6
# Obviously a high value of this is not good for dedupe, but it can tolerate
# a penalty <= -.3. A very small penalty like -.01 might even help? Haven't
# collected a ton of data though.
ZERO_PENALTY = _CONFIG.get('zero_penalty', 0)
def bias_var(shape):
    initial = tf.constant(.1, shape=shape)
    return tf.Variable(initial)
def weight_var(shape, name):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial, name=name)
def episode(sess, env, pactions, nchars, actions_size, x, render=False):
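    # Run one episode under the current policy. Returns the per-step input
    # vectors, the concatenated one-hot action encodings, the raw rewards,
    # the total reward, and the number of steps taken.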
    obs = env.reset()
    # Last position represents no previous action taken
    last_action = [0] * actions_size + [1]
    last_char = None
    last_reward = 0
    rewards = []
    xs = []
    action_history = []
    total_reward = 0
    def pack_x():
        # Assemble the policy network's input vector: one-hot current char,
        # plus optional blocks for the last char, last action, last direction,
        # and last reward, depending on the INPUT_* flags.
        xarr = [0]*nchars
        xarr[obs] = 1
        if INPUT_LAST_CHAR:
            last_char_arr = [0]*nchars
            if last_char is not None:
                last_char_arr[last_char] = 1
            xarr += last_char_arr
        if INPUT_LAST_ACTION:
            xarr += last_action
        if INPUT_LAST_DIRECTION:
            # left, right, neither
            ld = last_action[:2] + [0]
            if 1 not in ld:
                ld[2] = 1
            xarr += ld
        if INPUT_LAST_REWARD:
            xarr += [last_reward]
        return xarr
    counter = 0
    for i in range(100):
        counter += 1
        if render:
            env.render()
        curr_x = pack_x()
        xs.append(curr_x)
        action_probs = sess.run(pactions, feed_dict={x: [curr_x]})
        # choose next action
        actions = []
        actions_onehot = [0] * actions_size
        onehot_offset = 0
        for i, probs in enumerate(action_probs):
            assert probs.shape[0] == 1
            probs = probs[0]
            # Sample from this sub-space's softmax distribution by counting how
            # many cumulative probabilities fall below a uniform random draw.
            sample = np.sum(np.cumsum(probs) < np.random.rand())
            actions_onehot[onehot_offset+sample] = 1
            onehot_offset += probs.shape[0]
            actions.append(sample)
        action_history.append(actions_onehot)
        assert env.action_space.contains(actions)
        last_char = obs
        obs, last_reward, done, _ = env.step(actions)
        total_reward += last_reward
        rewards.append(last_reward)
        last_action = actions_onehot + [0]
        if done:
            if render:
                env.render()
            break
    return xs, action_history, rewards, total_reward, counter
def policy_gradient(paction_cat, actions_size):
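    # Build the policy-gradient loss and train op: the negative log-probability
    # of each performed action, weighted by its discounted reward, plus an
    # optional L2 penalty. Returns the placeholders, the optimizer op, and some
    # intermediate tensors for debugging.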
    rewards = tf.placeholder(tf.float32, shape=[None])
    actions = tf.placeholder(tf.float32, shape=[None, actions_size])
    # For each transition, isolate the probabilities of the actions that
    # were actually performed.
    focused_probs = tf.reduce_sum(tf.mul(actions, paction_cat), reduction_indices=[1])
    # Starting from gradients that encourage the performed actions, multiply by
    # the 'advantage'. If the performed actions were bad, this should be a negative
    # multiplier, thus discouraging that set of actions.
    # (The particular loss function I'm using here is in imitation of this example:
    # https://github.com/kvfrans/openai-cartpole/blob/master/cartpole-policygradient.py
    # not sure if there are other reasonable ways to do it.)
    tempered = tf.mul(tf.log(focused_probs), rewards)
    #tempered = tf.mul(focused_probs, rewards)
    loss = -tf.reduce_sum(tempered)
    if L2:
        l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()])
        loss = loss + L2_WEIGHT * l2_loss
    # I have no idea what I'm doing
    optimizer = tf.train.AdamOptimizer(LEARNING_RATE, beta1=.8, beta2=.99).minimize(loss)
    #optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)
    #optimizer = tf.train.AdadeltaOptimizer(LEARNING_RATE).minimize(loss)
    return (
        rewards, actions, optimizer,
        focused_probs, tempered, loss,
    )
def discounted_rewards(r):
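    # Walk backwards through an episode's rewards, accumulating a running sum
    # discounted by GAMMA. Zero rewards are optionally replaced by ZERO_PENALTY,
    # and every return is scaled by COMPLETION_BONUS if the final step succeeded.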
    if TELESCOPING_REWARDS:
        running = 1
        for i, ree in enumerate(r):
            if ree == 1:
                r[i] = running
                running += 1
                #running = running*2 + 4
    r2 = np.zeros_like(r)
    running_sum = 0
    bonus = COMPLETION_BONUS if (COMPLETION_BONUS and r[-1] == 1) else 1
    for i in reversed(range(len(r))):
        # The min here is basically cheating :/
        #running_sum = min(running_sum * GAMMA,0) + r[i]
        rew = r[i]
        # Also cheating
        if ZERO_PENALTY and r[i] == 0:
            rew = ZERO_PENALTY
        running_sum = running_sum * GAMMA + rew
        r2[i] = running_sum * bonus
    return r2
if __name__ == '__main__':
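    # Build the environment and the policy network, train for N_EPISODES with
    # one gradient update per episode batch, then plot running averages of
    # reward, episode length, and success rate.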
    # TODO: This would all have been easier if I had just used np arrays throughout rather than
    # mixing them with lists. Bleh.
    sess = tf.InteractiveSession()
    env = gym.make(ENV_NAME)
    ''' experiment
    gym.envs.register(
        id='Reverse-v1',
        entry_point='gym.envs.algorithmic:ReverseEnv',
        kwargs={'base': 5},
        timestep_limit=200,
        reward_threshold=25.0,
    )
    env = gym.make('Reverse-v1')
    '''
    if MONITOR:
        env.monitor.start('/tmp/{}'.format(ENV_NAME), force=True)
    nchars = env.observation_space.n
    n_actions = len(env.action_space.spaces)
    # 2 + 2 + 5 (move head, write or nah, which char to write)
    actions_size = sum(space.n for space in env.action_space.spaces)
    # Currently seen char, last seen char, last action, last reward
    input_size = (nchars +
        (INPUT_LAST_CHAR * nchars) +
        # We'll add one more input which represents "no action taken" (i.e. this is the first step)
        INPUT_LAST_ACTION * (actions_size+1) +
        INPUT_LAST_DIRECTION * (3) +
        INPUT_LAST_REWARD * 1
    )
    x = tf.placeholder(tf.float32, shape=[None, input_size])
    w1 = weight_var([input_size, N_HIDDEN], 'w1')
    b1 = bias_var([N_HIDDEN])
    h1 = tf.nn.relu( tf.matmul(x, w1) + b1 )
    w2 = weight_var([N_HIDDEN, actions_size], 'w2')
    b2 = bias_var([actions_size])
    #noize = tf.truncated_normal([1, actions_size], mean=.6, stddev=.3) # Hardcoding batch size :/
    y = tf.nn.relu( tf.matmul(h1, w2) + b2 )
    #y = tf.add(tf.matmul(h1, w2), b2)
    #y = tf.add(tf.nn.relu( tf.matmul(h1, w2) + b2 ), noize)
    # Probabilities of each action
    pactions = []
    offset = 0
    for space in env.action_space.spaces:
        slice_ = tf.slice(y, [0, offset], [-1, space.n])
        paction = tf.nn.softmax(slice_)
        pactions.append(paction)
        offset += space.n
    paction_cat = tf.concat(1, pactions)
    reward_ph, actions_ph, optimizer,\
        focprobs, tempered, loss = policy_gradient(paction_cat, actions_size)
    sess.run(tf.initialize_all_variables())
    rewards_per_ep = []
    iters_per_ep = []
    loss_per_ep = []
    DEBUG = 0
    DEBUG_ITER = 14000
    env_length = env.current_length
    # Experimented with batching. Seemed to hurt more than help.
    EPISODES_PER_BATCH = 1
    # Experiment: give all actions in an episode the same score (1,-1) according to
    # whether the episode ended successfully. Didn't really work out that well.
    REWARD_EXPERIMENT = 0
    xs = []
    actions = []
    rewards = []
    successes = []
    for ep in range(N_EPISODES):
        epxs, epactions, eprewards, total_reward, iters = episode(sess, env, pactions, nchars, actions_size, x)
        if REWARD_EXPERIMENT:
            disco_rewards = [total_reward for _ in eprewards]
        else:
            disco_rewards = discounted_rewards(eprewards)
        successes.append(1 if eprewards[-1] == 1 else 0)
        xs += epxs
        actions += epactions
        rewards = np.concatenate([rewards, disco_rewards])
        if (ep+1) % EPISODES_PER_BATCH == 0:
            if DEBUG and ep > DEBUG_ITER:
                wut, but, why = sess.run([w1, b1, y], feed_dict={x:xs})
            if STANDARDIZE_REWARD:
                rewards = rewards - np.mean(rewards)
                std = np.std(rewards)
                if std:
                    rewards /= std
            if DEBUG and ep > DEBUG_ITER:
                pca, foc, temp, loz = sess.run([paction_cat, focprobs, tempered, loss],
                    feed_dict={x:xs, actions_ph: actions, reward_ph: rewards})
                if ep == DEBUG_ITER + 1:
                    import pdb; pdb.set_trace()
            lossval, _ = sess.run([loss, optimizer], feed_dict={x: xs, actions_ph: actions, reward_ph: rewards})
            loss_per_ep.append(lossval)
            if DEBUG and ep > DEBUG_ITER:
                pca2 = sess.run(paction_cat, feed_dict={x: xs})
            xs = []
            actions = []
            rewards = []
        rewards_per_ep.append(total_reward)
        iters_per_ep.append(iters)
        # Have we graduated to the next curriculum?
        if env.current_length != env_length:
            sys.stderr.write("Leveled up! Current length = {} (ep={})\n".format(
                env.current_length, ep))
            env_length = env.current_length
            if env.current_length == 30 and EARLY_EXIT:
                break
    if MONITOR:
        env.monitor.close()
    if RENDER_EPISODES:
        for i in range(RENDER_EPISODES):
            episode(sess, env, pactions, nchars, actions_size, x, render=True)
            if i != RENDER_EPISODES-1:
                print
                print "*" * 80
                print
    window = 200 + N_EPISODES/100
    ravg = lambda vals: np.convolve(vals, np.ones((window,))/window, mode='valid')
    running_avg = np.convolve(rewards_per_ep, np.ones( (window,))/window, mode='valid')
    iters_running_avg = np.convolve(iters_per_ep, np.ones( (window,))/window, mode='valid')
    running_loss = ravg(loss_per_ep)
    running_successes = ravg(successes)
    plt.plot(running_avg)
    plt.plot(iters_running_avg, 'g--')
    plt.plot(running_successes, 'c:')
    #plt.plot(running_loss, 'r:')
    print "Final avg reward around {:.2f}".format(running_avg[-1])
    plt.text(len(running_avg), running_avg[-1], '{:.1f}'.format(running_avg[-1]))
    if len(sys.argv) > 1:
        fname = sys.argv[1]
        print "Saving plot to {}".format(fname)
        plt.savefig(fname)
    else:
        plt.show()