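# --- reinforce_with_baseline.py ---
# (filename inferred from the `import reinforce_with_baseline` in the second file)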
import gym
import tensorflow as tf
import numpy as np
import itertools
import tensorflow.contrib.layers as layers
from tqdm import trange
from gym.spaces import Discrete, Box


def get_traj(agent, env, max_episode_steps, render, deterministic_acts=False):
    '''
    Runs the agent-environment loop for one whole episode (trajectory).
    Returns a dictionary of results.
    '''
    steps = itertools.count() if max_episode_steps is None else xrange(max_episode_steps)
    obs = env.reset()
    actions = []
    rewards = []
    observations = []
    for step in steps:
        observations.append(obs)  # obs_i
        act = agent.act(obs, deterministic=deterministic_acts)
        obs, r, done, _ = env.step(act)
        actions.append(act)  # act_i
        rewards.append(r)  # r_i, the reward received after act_i in obs_i
        if done: break
        if render: env.render()
    return {'rewards': np.array(rewards),
            'actions': np.array(actions),
            'observations': np.array(observations)}
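
# Example usage (a sketch with hypothetical `agent` and `env` objects, shown only
# to illustrate the returned structure; not part of the original gist):
#   traj = get_traj(agent, env, max_episode_steps=500, render=False)
#   traj['rewards'] and traj['actions'] have shape (T,), and
#   traj['observations'] has shape (T,) + env.observation_space.shape,
#   where T is the number of steps actually taken in the episode.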


def total_discounted_returns(rewards, gamma):
    '''
    Given episode rewards, computes a vector y such that
    y_i = r_i + gamma*r_{i+1} + gamma^2 * r_{i+2} + ...
    '''
    n = len(rewards)
    result = np.zeros_like(rewards)
    next_rew = 0.
    for i in reversed(xrange(n)):
        result[i] = rewards[i] + gamma*next_rew
        next_rew = result[i]
    return result
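
# Worked example (not part of the original gist): for rewards [1., 1., 1.] and
# gamma = 0.5 this returns [1.75, 1.5, 1.], since y_2 = 1, y_1 = 1 + 0.5*1 = 1.5
# and y_0 = 1 + 0.5*1.5 = 1.75.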


def baseline_total_return(returns_per_traj):
    '''
    Computes a time-dependent baseline: for each timestep t, the mean of the
    discounted returns at t over all trajectories that are at least t+1 steps
    long. Shorter trajectories are padded with NaN and masked out of the mean.
    '''
    N = len(returns_per_traj)
    maxlen = max(len(ret) for ret in returns_per_traj)
    masked = np.full((N, maxlen), float('nan'))
    for i in xrange(N):
        masked[i, 0:len(returns_per_traj[i])] = returns_per_traj[i]
    masked = np.ma.array(masked, mask=np.isnan(masked))
    return masked.mean(axis=0)
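
# Worked example (not part of the original gist): for two trajectories with
# discounted returns [3., 2., 1.] and [4., 2.], the baseline is [3.5, 2., 1.];
# the third entry averages only the first trajectory, since the second one has
# already terminated by then.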


class REINFORCEAgent(object):
    '''
    REINFORCE with baselines.
    Based on John Schulman's lectures (https://youtu.be/aUrX-rP_ss4).
    '''
    def __init__(self, obs_space, act_space, **user_params):
        assert isinstance(act_space, Discrete), \
            'Agent works only with discrete action spaces'
        self.input_shape = (None, ) + obs_space.shape
        self.is_continuous = isinstance(act_space, Box)
        self.num_actions = act_space.n
        self.curr_sess = None

        self.config = dict(
            max_episode_steps=500,  # if None, continue each episode until a terminal state
            steps_per_batch=10000,
            n_iter=100,
            gamma=1.0,
            optim_config={'lr': 0.05, 'rho': 0.9, 'eps': 1e-9},
            num_hidden=20,
            dtype=tf.float32,
            scope_name='REINFORCE'
        )
        self._num_layers = 0
        self._check_config_args(user_params)
        self.config.update(user_params)

        dtype = self.config['dtype']
        with tf.variable_scope(self.config['scope_name']):
            # advs stands for advantage values: total discounted returns minus the baseline
            self.advs_pl = tf.placeholder(shape=[None, ], dtype=dtype, name='rets')
            self.acts_pl = tf.placeholder(shape=[None, ], dtype=tf.int32, name='acts')
            self.obs_pl = tf.placeholder(shape=self.input_shape, dtype=dtype, name='obs')

            flatten_obs = layers.flatten(self.obs_pl)
            fc1 = self._add_fc(flatten_obs, self.config['num_hidden'], activation=tf.nn.relu)
            fc2 = self._add_fc(fc1, self.num_actions, activation=None)
            self.action_probs = tf.nn.softmax(fc2)

            one_hot_acts = layers.one_hot_encoding(self.acts_pl, self.num_actions)
            selected_probs = tf.reduce_sum(self.action_probs * one_hot_acts, reduction_indices=1)
            # tf can't compute gradients of gather_nd :(
            # selected_probs = tf.gather_nd(self.action_probs, indices=self.id2acts_pl)

            # Conventional SGD-like updates minimize a loss function, so we negate
            # the objective to move the parameters in the direction of ascent on
            # the expected return:
            neg_logprob = -tf.log(selected_probs)
            N = tf.shape(self.advs_pl)[0]
            self.loss = tf.reduce_sum(tf.mul(neg_logprob, self.advs_pl)) / tf.to_float(N)

            # create optimizer:
            opt_cfg = self.config['optim_config']
            self.optimizer = tf.train.RMSPropOptimizer(opt_cfg['lr'],
                                                       opt_cfg['rho'],
                                                       0.0, opt_cfg['eps'])
            self.opt_step = self.optimizer.minimize(self.loss)
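            # In equation form (using A_t for the advantage fed through advs_pl
            # and pi_theta for the softmax policy; this notation is not in the
            # original gist), the quantity minimized above is
            #   L(theta) = -(1/N) * sum_t A_t * log(pi_theta(a_t | s_t)),
            # so each step of the optimizer ascends the usual policy-gradient
            # objective with a baseline.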

    def _check_config_args(self, user_args):
        unrecognized = set(user_args.keys()) - set(self.config.keys())
        if unrecognized:
            raise ValueError('Unrecognized config params: {0}'.format(unrecognized))

    def _add_fc(self, inputs, num_units, activation=None):
        return layers.fully_connected(
            inputs, num_units, activation_fn=activation,
            weights_initializer=self._xavier_init(),
            biases_initializer=self._const_init())

    def _xavier_init(self, factor=2.0):
        return layers.initializers.variance_scaling_initializer(
            factor=factor, mode='FAN_IN', dtype=self.config['dtype']
        )

    def _const_init(self, value=0.0):
        return tf.constant_initializer(value, dtype=self.config['dtype'])

    def act(self, observation, deterministic=False):
        observation = observation[np.newaxis, :]  # add a batch dimension in front
        probs = self.curr_sess.run(self.action_probs, feed_dict={self.obs_pl: observation})
        if not deterministic:
            return np.random.choice(self.num_actions, p=probs[0])
        else:
            return np.argmax(probs[0])

    def update(self, obs, acts, advs):
        feeds = {
            self.obs_pl: obs,
            self.acts_pl: acts,
            self.advs_pl: advs
        }
        self.curr_sess.run([self.opt_step], feed_dict=feeds)

    def learn(self, sess, env, verbose=1):
        config = self.config
        self.curr_sess = sess
        try:
            for it in xrange(config['n_iter']):
                # Collect trajectories until we get steps_per_batch total timesteps:
                trajs = []
                total_steps = 0
                while total_steps < config['steps_per_batch']:
                    traj = get_traj(self, env,
                                    config['max_episode_steps'], render=False)
                    trajs.append(traj)
                    total_steps += len(traj['actions'])

                # Compute advantages for all steps in all trajectories:
                gamma = config['gamma']
                ret_per_traj = [total_discounted_returns(tr['rewards'], gamma) for tr in trajs]
                baseline = np.array(baseline_total_return(ret_per_traj))
                all_advs = [ret - baseline[:len(ret)] for ret in ret_per_traj]
                all_advs = np.concatenate(all_advs)  # all_advs.shape = (sum of trajectory lengths, )
                all_obs = np.concatenate([traj['observations'] for traj in trajs])
                all_acts = np.concatenate([traj['actions'] for traj in trajs])

                # Update agent parameters:
                self.update(all_obs, all_acts, all_advs)

                # Print iteration stats:
                ep_data = [(tr['rewards'].sum(), len(tr['rewards'])) for tr in trajs]
                reward_per_ep, len_per_ep = zip(*ep_data)
                if verbose > 0:
                    report_iteration_stats(it, reward_per_ep, len_per_ep)
                if verbose > 1:
                    get_traj(self, env, config['max_episode_steps'], render=True)
        finally:
            if verbose > 1:
                env.render(close=True)
            self.curr_sess = None

    def test(self, sess, env, num_episodes, max_episode_steps,
             render=False, deterministic_acts=False):
        self.curr_sess = sess
        try:
            num_steps = []
            rewards = []
            episodes = trange(0, num_episodes, desc='Episodes completed')
            for ep in episodes:
                traj = get_traj(self, env, max_episode_steps,
                                render=render, deterministic_acts=deterministic_acts)
                rewards.append(sum(traj['rewards']))
                num_steps.append(len(traj['rewards']))
        finally:
            if render:
                env.render(close=True)
            self.curr_sess = None
        return {'num_steps': np.array(num_steps), 'rewards': np.array(rewards)}


def report_iteration_stats(it, reward_per_ep, len_per_ep):
    print '==' * 30
    print 'Iteration #{0}'.format(it)
    print 'Num episodes:', len(reward_per_ep)
    print 'Total timesteps:', sum(len_per_ep)
    print 'Max episode R:', max(reward_per_ep)
    print 'Mean ep reward:', np.mean(reward_per_ep), 'Std:', np.std(reward_per_ep)
    print 'Mean ep steps:', np.mean(len_per_ep), 'Std:', np.std(len_per_ep)
    print '==' * 30
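

# --- reproduce_gym_results.py ---
# (the driver script referenced in the author's comment at the bottom)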
import gym, os
import numpy as np
import tensorflow as tf
import reinforce_with_baseline as tf_reinforce
from gym import wrappers


def ensure_dir(d):
    """
    Checks whether the directories in path d exist and,
    if not, creates them.
    """
    if not os.path.exists(d):
        os.makedirs(d)


if __name__ == '__main__':
    env = gym.make('Acrobot-v1')
    user_params = {
        'max_episode_steps': 500,  # env.spec.timestep_limit,
        'optim_config': {'lr': 0.005, 'rho': 0.9, 'eps': 1e-9},
        'num_hidden': 40,
    }
    agent = tf_reinforce.REINFORCEAgent(env.observation_space, env.action_space, **user_params)

    with tf.Session() as sess:
        print '================ LEARNING ================='
        sess.run(tf.initialize_all_variables())
        agent.learn(sess, env, verbose=2)

        print '================ TESTING ================='
        results = agent.test(sess, env, num_episodes=100,
                             max_episode_steps=agent.config['max_episode_steps'],
                             render=True, deterministic_acts=True)
        print 'Mean episode reward:', np.mean(results['rewards'])
        print 'Mean episode length:', np.mean(results['num_steps'])
    env.close()
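
# Note: gym.wrappers and ensure_dir are imported/defined above but never used in
# __main__. A minimal sketch of how they could be combined to record the test
# episodes (hypothetical directory name, not part of the original script):
#
#   monitor_dir = './monitor/acrobot_reinforce'
#   ensure_dir(monitor_dir)
#   env = wrappers.Monitor(env, monitor_dir, force=True)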

griver commented Jan 5, 2017

Just run reproduce_gym_results.py
