@colinmorris
Last active August 2, 2017 08:13

algorithmic.py is a script that solves a few of the environments in the Algorithmic category of the OpenAI Gym.

Basically a semi-homemade policy gradients implementation, with some inspiration drawn from a few existing examples.

Can pretty consistently beat the Copy and DuplicatedInput envs. Could probably beat RepeatCopy with some tuning. Unlikely to beat the addition environments, because they look hard and this code is currently only set up to handle 1-dimensional input tapes. Could maybe beat Reverse with some changes, but it would be tricky: I've tried it, and it seems really hard to escape the local minimum that comes from just blindly guessing the output.

Hacks

This code is littered with the corpses of failed experiments. The main one that stuck was adjusting the base reward for non-writing actions from 0 to a negative value. Basically, I wanted to discourage dilly-dallying: ping-ponging left and right a few times and then writing an 'A' shouldn't give a higher total reward than just writing the 'A'. I think this is kind of justifiable because the algorithmic environments have (very harsh) time limits, so an action that accomplishes nothing is worse than neutral.
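
Concretely, the penalty just replaces a raw reward of 0 with a small negative number during the backwards discounting pass. Here's a minimal standalone sketch of the idea (the script's real logic lives in discounted_rewards below; the gamma and zero_penalty values are just the Copy-v0 config):

    import numpy as np

    def discounted_returns(rewards, gamma=0.5, zero_penalty=-0.6):
        # Accumulate a discounted running sum from the end of the episode,
        # treating "did nothing" steps (reward 0) as slightly negative.
        returns = np.zeros(len(rewards))
        running = 0.0
        for i in reversed(range(len(rewards))):
            r = zero_penalty if rewards[i] == 0 else rewards[i]
            running = gamma * running + r
            returns[i] = running
        return returns

    # Two wasted moves before writing the right character now earn negative
    # returns, so padding the episode with moves no longer pays off.
    print(discounted_returns([0, 0, 1]))  # -> approximately [-0.65, -0.1, 1.0]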

The policy network gets the current character as input, plus (for some environments) the previously read character and the previous action. The previous-action input is supplemented with an extra slot for 'no previous action', which seems to help.
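
Concretely, the input vector is just a concatenation of one-hot blocks. A rough sketch of the layout (pack_input is a made-up helper name; the script's actual packing happens in pack_x inside episode(), and the defaults assume Copy-v0's 6-symbol observations and 2+2+5 = 9 action slots):

    def pack_input(obs, last_char, last_action_onehot, nchars=6, actions_size=9):
        # Current character, one-hot.
        x = [0] * nchars
        x[obs] = 1
        # Previously read character, one-hot (all zeros on the first step).
        last_char_block = [0] * nchars
        if last_char is not None:
            last_char_block[last_char] = 1
        # Previous action as a multi-hot over the Tuple action space, plus one
        # extra slot that is set only when no action has been taken yet.
        if last_action_onehot is None:
            last_action_block = [0] * actions_size + [1]
        else:
            last_action_block = list(last_action_onehot) + [0]
        return x + last_char_block + last_action_block

    # First step of an episode: only the current character is hot, plus the
    # trailing 'no previous action' slot.
    pack_input(obs=3, last_char=None, last_action_onehot=None)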

from matplotlib import pyplot as plt
import tensorflow as tf
import numpy as np
import gym
import sys
# Or try 'DuplicatedInput-v0' or 'RepeatCopy-v0'
ENV_NAME = 'Copy-v0'
_CONFIGS = {
    'Copy-v0': dict(
        gamma=.5,
        zero_penalty=-.6
    ),
    'DuplicatedInput-v0': dict(
        zero_penalty=-.001,
        input_last_char=1,
        input_last_action=1,
        learning_rate=0.01,
    ),
    # Doesn't work :(
    'RepeatCopy-v0': dict(
        input_last_char=0,
        input_last_action=0,
        input_last_direction=1,
        completion_bonus=1.5,
        gamma=.7,
        learning_rate=0.005
    ),
}
_CONFIG = _CONFIGS[ENV_NAME]
N_EPISODES = 5000
# Bail out early if we beat the final level (by expanding the environment's
# current_length to its max of 30)
EARLY_EXIT = 0
MONITOR = 1
# Does setting this to high values encourage dilly-dallying? (And does ZERO_PENALTY offset that?)
GAMMA = _CONFIG.get('gamma', .9)
# How many episodes to render after training
RENDER_EPISODES = 0
# L2 weight penalty
L2 = 1
L2_WEIGHT = 0.001
# If an episode was successfully completed (i.e. all target characters were
# written to the tape), multiply all rewards for that episode by this amount,
# if set.
COMPLETION_BONUS = _CONFIG.get('completion_bonus', 0)
LEARNING_RATE = _CONFIG.get('learning_rate', 0.001)
INPUT_LAST_CHAR = _CONFIG.get('input_last_char', 0)
INPUT_LAST_ACTION = _CONFIG.get('input_last_action', 0)
# Subset of the above, just covering left/right (or neither). Enable at most
# one of these, not both.
INPUT_LAST_DIRECTION = _CONFIG.get('input_last_direction', 0)
assert not (INPUT_LAST_ACTION and INPUT_LAST_DIRECTION)
INPUT_LAST_REWARD = 0
N_HIDDEN = 40
TELESCOPING_REWARDS = 0
# This hurts things very badly. iunno why. Got the idea from here:
# https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5
# Maybe it makes more sense for that kind of problem where one episode
# can have thousands of transitions?
STANDARDIZE_REWARD = 0
# Setting this to -.2 or less seems to help a lot for copy (almost necessary)
# Sweet spot maybe around -.6
# Obviously a high value of this is not good for dedupe, but it can tolerate
# a penalty <= -.3. A very small penalty like -.01 might even help? Haven't
# collected a ton of data though.
ZERO_PENALTY = _CONFIG.get('zero_penalty', 0)
def bias_var(shape):
    initial = tf.constant(.1, shape=shape)
    return tf.Variable(initial)
def weight_var(shape, name):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial, name=name)
def episode(sess, env, pactions, nchars, actions_size, x, render=False):
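    # Run one episode under the current policy. Returns the per-step input
    # vectors, the concatenated one-hot action encodings, the raw rewards,
    # the total reward, and the number of steps taken.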
    obs = env.reset()
    # Last position represents no previous action taken
    last_action = [0] * actions_size + [1]
    last_char = None
    last_reward = 0
    rewards = []
    xs = []
    action_history = []
    total_reward = 0
    def pack_x():
        # Assemble the policy network's input vector: one-hot current char,
        # plus optional blocks for the last char, last action, last direction,
        # and last reward, depending on the INPUT_* flags.
        xarr = [0]*nchars
        xarr[obs] = 1
        if INPUT_LAST_CHAR:
            last_char_arr = [0]*nchars
            if last_char is not None:
                last_char_arr[last_char] = 1
            xarr += last_char_arr
        if INPUT_LAST_ACTION:
            xarr += last_action
        if INPUT_LAST_DIRECTION:
            # left, right, neither
            ld = last_action[:2] + [0]
            if 1 not in ld:
                ld[2] = 1
            xarr += ld
        if INPUT_LAST_REWARD:
            xarr += [last_reward]
        return xarr
    counter = 0
    for i in range(100):
        counter += 1
        if render:
            env.render()
        curr_x = pack_x()
        xs.append(curr_x)
        action_probs = sess.run(pactions, feed_dict={x: [curr_x]})
        # choose next action
        actions = []
        actions_onehot = [0] * actions_size
        onehot_offset = 0
        for i, probs in enumerate(action_probs):
            assert probs.shape[0] == 1
            probs = probs[0]
            # Sample from this sub-space's softmax distribution by counting how
            # many cumulative probabilities fall below a uniform random draw.
            sample = np.sum(np.cumsum(probs) < np.random.rand())
            actions_onehot[onehot_offset+sample] = 1
            onehot_offset += probs.shape[0]
            actions.append(sample)
        action_history.append(actions_onehot)
        assert env.action_space.contains(actions)
        last_char = obs
        obs, last_reward, done, _ = env.step(actions)
        total_reward += last_reward
        rewards.append(last_reward)
        last_action = actions_onehot + [0]
        if done:
            if render:
                env.render()
            break
    return xs, action_history, rewards, total_reward, counter
def policy_gradient(paction_cat, actions_size):
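    # Build the policy-gradient loss and train op: the negative log-probability
    # of each performed action, weighted by its discounted reward, plus an
    # optional L2 penalty. Returns the placeholders, the optimizer op, and some
    # intermediate tensors for debugging.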
    rewards = tf.placeholder(tf.float32, shape=[None])
    actions = tf.placeholder(tf.float32, shape=[None, actions_size])
    # For each transition, isolate the probabilities of the actions that
    # were actually performed.
    focused_probs = tf.reduce_sum(tf.mul(actions, paction_cat), reduction_indices=[1])
    # Starting from gradients that encourage the performed actions, multiply by
    # the 'advantage'. If the performed actions were bad, this should be a negative
    # multiplier, thus discouraging that set of actions.
    # (The particular loss function I'm using here is in imitation of this example:
    # https://github.com/kvfrans/openai-cartpole/blob/master/cartpole-policygradient.py
    # not sure if there are other reasonable ways to do it.)
    tempered = tf.mul(tf.log(focused_probs), rewards)
    #tempered = tf.mul(focused_probs, rewards)
    loss = -tf.reduce_sum(tempered)
    if L2:
        l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()])
        loss = loss + L2_WEIGHT * l2_loss
    # I have no idea what I'm doing
    optimizer = tf.train.AdamOptimizer(LEARNING_RATE, beta1=.8, beta2=.99).minimize(loss)
    #optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)
    #optimizer = tf.train.AdadeltaOptimizer(LEARNING_RATE).minimize(loss)
    return (
        rewards, actions, optimizer,
        focused_probs, tempered, loss,
    )
def discounted_rewards(r):
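    # Walk backwards through an episode's rewards, accumulating a running sum
    # discounted by GAMMA. Zero rewards are optionally replaced by ZERO_PENALTY,
    # and every return is scaled by COMPLETION_BONUS if the final step succeeded.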
    if TELESCOPING_REWARDS:
        running = 1
        for i, ree in enumerate(r):
            if ree == 1:
                r[i] = running
                running += 1
                #running = running*2 + 4
    r2 = np.zeros_like(r)
    running_sum = 0
    bonus = COMPLETION_BONUS if (COMPLETION_BONUS and r[-1] == 1) else 1
    for i in reversed(range(len(r))):
        # The min here is basically cheating :/
        #running_sum = min(running_sum * GAMMA,0) + r[i]
        rew = r[i]
        # Also cheating
        if ZERO_PENALTY and r[i] == 0:
            rew = ZERO_PENALTY
        running_sum = running_sum * GAMMA + rew
        r2[i] = running_sum * bonus
    return r2
if __name__ == '__main__':
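    # Build the environment and the policy network, train for N_EPISODES with
    # one gradient update per episode batch, then plot running averages of
    # reward, episode length, and success rate.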
    # TODO: This would all have been easier if I had just used np arrays throughout rather than
    # mixing them with lists. Bleh.
    sess = tf.InteractiveSession()
    env = gym.make(ENV_NAME)
    ''' experiment
    gym.envs.register(
        id='Reverse-v1',
        entry_point='gym.envs.algorithmic:ReverseEnv',
        kwargs={'base': 5},
        timestep_limit=200,
        reward_threshold=25.0,
    )
    env = gym.make('Reverse-v1')
    '''
    if MONITOR:
        env.monitor.start('/tmp/{}'.format(ENV_NAME), force=True)
    nchars = env.observation_space.n
    n_actions = len(env.action_space.spaces)
    # 2 + 2 + 5 (move head, write or nah, which char to write)
    actions_size = sum(space.n for space in env.action_space.spaces)
    # Currently seen char, last seen char, last action, last reward
    input_size = (nchars +
        (INPUT_LAST_CHAR * nchars) +
        # We'll add one more input which represents "no action taken" (i.e. this is the first step)
        INPUT_LAST_ACTION * (actions_size+1) +
        INPUT_LAST_DIRECTION * (3) +
        INPUT_LAST_REWARD * 1
    )
    x = tf.placeholder(tf.float32, shape=[None, input_size])
    w1 = weight_var([input_size, N_HIDDEN], 'w1')
    b1 = bias_var([N_HIDDEN])
    h1 = tf.nn.relu( tf.matmul(x, w1) + b1 )
    w2 = weight_var([N_HIDDEN, actions_size], 'w2')
    b2 = bias_var([actions_size])
    #noize = tf.truncated_normal([1, actions_size], mean=.6, stddev=.3) # Hardcoding batch size :/
    y = tf.nn.relu( tf.matmul(h1, w2) + b2 )
    #y = tf.add(tf.matmul(h1, w2), b2)
    #y = tf.add(tf.nn.relu( tf.matmul(h1, w2) + b2 ), noize)
    # Probabilities of each action
    pactions = []
    offset = 0
    for space in env.action_space.spaces:
        slice_ = tf.slice(y, [0, offset], [-1, space.n])
        paction = tf.nn.softmax(slice_)
        pactions.append(paction)
        offset += space.n
    paction_cat = tf.concat(1, pactions)
    reward_ph, actions_ph, optimizer,\
        focprobs, tempered, loss = policy_gradient(paction_cat, actions_size)
    sess.run(tf.initialize_all_variables())
    rewards_per_ep = []
    iters_per_ep = []
    loss_per_ep = []
    DEBUG = 0
    DEBUG_ITER = 14000
    env_length = env.current_length
    # Experimented with batching. Seemed to hurt more than help.
    EPISODES_PER_BATCH = 1
    # Experiment: give all actions in an episode the same score (1,-1) according to
    # whether the episode ended successfully. Didn't really work out that well.
    REWARD_EXPERIMENT = 0
    xs = []
    actions = []
    rewards = []
    successes = []
    for ep in range(N_EPISODES):
        epxs, epactions, eprewards, total_reward, iters = episode(sess, env, pactions, nchars, actions_size, x)
        if REWARD_EXPERIMENT:
            disco_rewards = [total_reward for _ in eprewards]
        else:
            disco_rewards = discounted_rewards(eprewards)
        successes.append(1 if eprewards[-1] == 1 else 0)
        xs += epxs
        actions += epactions
        rewards = np.concatenate([rewards, disco_rewards])
        if (ep+1) % EPISODES_PER_BATCH == 0:
            if DEBUG and ep > DEBUG_ITER:
                wut, but, why = sess.run([w1, b1, y], feed_dict={x:xs})
            if STANDARDIZE_REWARD:
                rewards = rewards - np.mean(rewards)
                std = np.std(rewards)
                if std:
                    rewards /= std
            if DEBUG and ep > DEBUG_ITER:
                pca, foc, temp, loz = sess.run([paction_cat, focprobs, tempered, loss],
                    feed_dict={x:xs, actions_ph: actions, reward_ph: rewards})
                if ep == DEBUG_ITER + 1:
                    import pdb; pdb.set_trace()
            lossval, _ = sess.run([loss, optimizer], feed_dict={x: xs, actions_ph: actions, reward_ph: rewards})
            loss_per_ep.append(lossval)
            if DEBUG and ep > DEBUG_ITER:
                pca2 = sess.run(paction_cat, feed_dict={x: xs})
            xs = []
            actions = []
            rewards = []
        rewards_per_ep.append(total_reward)
        iters_per_ep.append(iters)
        # Have we graduated to the next curriculum?
        if env.current_length != env_length:
            sys.stderr.write("Leveled up! Current length = {} (ep={})\n".format(
                env.current_length, ep))
            env_length = env.current_length
            if env.current_length == 30 and EARLY_EXIT:
                break
    if MONITOR:
        env.monitor.close()
    if RENDER_EPISODES:
        for i in range(RENDER_EPISODES):
            episode(sess, env, pactions, nchars, actions_size, x, render=True)
            if i != RENDER_EPISODES-1:
                print
                print "*" * 80
                print
    window = 200 + N_EPISODES/100
    ravg = lambda vals: np.convolve(vals, np.ones((window,))/window, mode='valid')
    running_avg = np.convolve(rewards_per_ep, np.ones( (window,))/window, mode='valid')
    iters_running_avg = np.convolve(iters_per_ep, np.ones( (window,))/window, mode='valid')
    running_loss = ravg(loss_per_ep)
    running_successes = ravg(successes)
    plt.plot(running_avg)
    plt.plot(iters_running_avg, 'g--')
    plt.plot(running_successes, 'c:')
    #plt.plot(running_loss, 'r:')
    print "Final avg reward around {:.2f}".format(running_avg[-1])
    plt.text(len(running_avg), running_avg[-1], '{:.1f}'.format(running_avg[-1]))
    if len(sys.argv) > 1:
        fname = sys.argv[1]
        print "Saving plot to {}".format(fname)
        plt.savefig(fname)
    else:
        plt.show()