# Last active January 5, 2017 21:13
import gym
import tensorflow as tf
import numpy as np
import itertools
import tensorflow.contrib.layers as layers
from tqdm import trange
from gym.spaces import Discrete, Box
def get_traj(agent, env, max_episode_steps, render, deterministic_acts=False):
    """Run the agent-environment loop for one whole episode (trajectory).

    Args:
        agent: object exposing ``act(obs, deterministic=...)``.
        env: environment exposing ``reset()``, ``render()`` and
            ``step(action)`` returning ``(obs, reward, done, info)``.
        max_episode_steps: upper bound on the number of steps; ``None``
            keeps stepping until the environment reports ``done``.
        render: when true, ``env.render()`` is called after every
            non-terminal step.
        deterministic_acts: forwarded to ``agent.act`` as ``deterministic``.

    Returns:
        dict with numpy arrays under the keys 'rewards', 'actions' and
        'observations'. Entry i holds the observation seen at step i, the
        action taken in it and the reward received for that action.
    """
    if max_episode_steps is None:
        steps = itertools.count()
    else:
        # ``range`` (not ``xrange``) so the loop works on Python 2 and 3.
        steps = range(max_episode_steps)

    obs = env.reset()
    actions = []
    rewards = []
    observations = []
    for _ in steps:
        observations.append(obs)  # obs_i
        act = agent.act(obs, deterministic=deterministic_acts)
        obs, r, done, _info = env.step(act)
        actions.append(act)  # act_i
        rewards.append(r)  # r_i: reward received after act_i in obs_i
        if done:
            break
        if render:
            env.render()

    return {
        'rewards': np.array(rewards),
        'actions': np.array(actions),
        'observations': np.array(observations),
    }
def total_discounted_returns(rewards, gamma):
    """Compute the discounted return-to-go for every step of an episode.

    Given episode rewards, computes a vector y such that
    y_i = r_i + gamma*r_{i+1} + gamma^2 * r_{i+2} + ...

    Args:
        rewards: 1-D sequence of per-step rewards.
        gamma: discount factor.

    Returns:
        1-D float numpy array with the same length as ``rewards``.
    """
    # Force a float dtype: with integer rewards ``np.zeros_like`` would give
    # an integer result and every discounted value would be truncated.
    rewards = np.asarray(rewards, dtype=float)
    result = np.zeros_like(rewards)
    running = 0.
    # Walk backwards so each return reuses the already-computed next one.
    for i in reversed(range(len(rewards))):
        running = rewards[i] + gamma * running
        result[i] = running
    return result
def baseline_total_return(returns_per_traj):
    """Average the returns of several trajectories per time step.

    Trajectories may have different lengths. They are padded with NaN to the
    longest one and the padding is masked out, so entry t of the result is
    the mean return at step t over the trajectories that reached step t.

    Args:
        returns_per_traj: sequence of 1-D return vectors, one per trajectory.

    Returns:
        1-D masked array whose length is that of the longest trajectory
        (length 0 when no trajectories are given).
    """
    N = len(returns_per_traj)
    if N == 0:
        # ``max()`` over an empty sequence would raise; nothing to average.
        return np.ma.array(np.zeros(0))
    maxlen = max(len(ret) for ret in returns_per_traj)
    padded = np.full((N, maxlen), float('nan'))
    for i, ret in enumerate(returns_per_traj):
        padded[i, 0:len(ret)] = ret
    # Mask the NaN padding so it does not take part in the mean.
    masked = np.ma.array(padded, mask=np.isnan(padded))
    return masked.mean(axis=0)
class REINFORCEAgent(object):
    """REINFORCE (policy gradient) agent with a baseline.

    Based on John Schulman's lectures.

    The policy is a network with one hidden ReLU layer and a softmax output
    over the discrete actions. ``learn`` and ``test`` bind a TensorFlow
    session for the duration of the call; ``act`` and ``update`` use that
    bound session.
    """

    def __init__(self, obs_space, act_space, **user_params):
        """Build the policy network, the loss and the optimisation step.

        Args:
            obs_space: observation space; only its ``shape`` is used.
            act_space: action space; must be ``Discrete``.
            **user_params: overrides for the entries of ``self.config``.

        Raises:
            ValueError: if ``user_params`` has a key not in ``self.config``.
        """
        assert isinstance(act_space, Discrete), \
            'Agent works only with discrete action spaces'
        self.input_shape = (None, ) + obs_space.shape
        # Always False given the assert above; kept because it is a public
        # attribute of the original class.
        self.is_continious = isinstance(act_space, Box)
        self.num_actions = act_space.n
        self.curr_sess = None

        # NOTE(review): only the 'max_episode_steps' and 'optim_config'
        # defaults are known. The other keys are read elsewhere in this
        # class, but their default values below are assumptions -- confirm.
        self.config = dict(
            n_iter=100,
            steps_per_batch=1000,
            gamma=0.99,
            max_episode_steps=500,  # if None continue episode until terminal condition
            num_hidden=20,
            optim_config={'lr': 0.05, 'rho': 0.9, 'eps': 1e-9},
            dtype=tf.float32,
            scope_name='reinforce',
        )
        self._check_config_args(user_params)
        self.config.update(user_params)

        self._num_layers = 0
        dtype = self.config['dtype']
        with tf.variable_scope(self.config['scope_name']):
            # advs stands for advantage value: total discounted returns - baseline
            self.advs_pl = tf.placeholder(shape=[None, ], dtype=dtype, name='rets')
            self.acts_pl = tf.placeholder(shape=[None, ], dtype=tf.int32, name='acts')
            self.obs_pl = tf.placeholder(shape=self.input_shape, dtype=dtype, name='obs')

            flatten_obs = layers.flatten(self.obs_pl)
            fc1 = self._add_fc(flatten_obs, self.config['num_hidden'],
                               activation=tf.nn.relu)
            fc2 = self._add_fc(fc1, self.num_actions, activation=None)
            self.action_probs = tf.nn.softmax(fc2)

            # Pick log pi(a_t | s_t) with a one-hot mask. log_softmax on the
            # logits avoids log(0) when a probability underflows.
            one_hot_acts = tf.cast(
                layers.one_hot_encoding(self.acts_pl, self.num_actions), dtype)
            log_probs = tf.nn.log_softmax(fc2)
            selected_logprobs = tf.reduce_sum(log_probs * one_hot_acts,
                                              reduction_indices=1)

            # Conventional SGD-like updates minimise a loss, so the
            # objective is negated to move the parameters in the direction
            # that increases the expected return.
            neg_logprob = -selected_logprobs
            self.loss = tf.reduce_mean(neg_logprob * self.advs_pl)

            opt_cfg = self.config['optim_config']
            # Keyword arguments on purpose: passed positionally, (lr, 0.0,
            # eps) lands in (learning_rate, decay, momentum), which leaves
            # 'rho' unused and turns eps into a momentum.
            self.optimizer = tf.train.RMSPropOptimizer(
                learning_rate=opt_cfg['lr'], decay=opt_cfg['rho'],
                momentum=0.0, epsilon=opt_cfg['eps'])
            self.opt_step = self.optimizer.minimize(self.loss)

    def _check_config_args(self, user_args):
        """Raise ValueError if ``user_args`` has keys unknown to the config."""
        unrecognized = set(user_args.keys()) - set(self.config.keys())
        if unrecognized:
            raise ValueError('Unrecognized config params: {0}'.format(unrecognized))

    def _add_fc(self, inputs, num_units, activation=None):
        """Add a fully connected layer with constant-initialised biases."""
        # NOTE(review): ``_xavier_init`` is never used; a weights initialiser
        # may have been intended here -- confirm.
        return layers.fully_connected(
            inputs, num_units, activation_fn=activation,
            biases_initializer=self._const_init())

    def _xavier_init(self, factor=2.0):
        """Return a FAN_IN variance-scaling initialiser with ``factor``."""
        return layers.initializers.variance_scaling_initializer(
            factor=factor, mode='FAN_IN', dtype=self.config['dtype'])

    def _const_init(self, value=0.0):
        """Return a constant initialiser in the configured dtype."""
        return tf.constant_initializer(value, dtype=self.config['dtype'])

    def act(self, observation, deterministic=False):
        """Choose an action for a single observation.

        Args:
            observation: one observation (without a batch dimension).
            deterministic: if true return the most probable action,
                otherwise sample from the policy distribution.

        Returns:
            Index of the chosen action.
        """
        # Prepend a batch dimension of size one.
        observation = np.asarray(observation)[np.newaxis]
        probs = self.curr_sess.run(self.action_probs,
                                   feed_dict={self.obs_pl: observation})
        action_probs = np.asarray(probs[0], dtype=np.float64)
        if deterministic:
            return np.argmax(action_probs)
        # Low-precision softmax output may not sum to 1 within the tolerance
        # np.random.choice requires, so renormalise in float64 first.
        action_probs = action_probs / action_probs.sum()
        return np.random.choice(self.num_actions, p=action_probs)

    def update(self, obs, acts, advs):
        """Run one optimiser step on a batch of (obs, action, advantage)."""
        feeds = {
            self.obs_pl: obs,
            self.acts_pl: acts,
            self.advs_pl: advs,
        }
        self.curr_sess.run([self.opt_step], feed_dict=feeds)

    def learn(self, sess, env, verbose=1):
        """Train the policy for ``config['n_iter']`` iterations.

        Args:
            sess: TensorFlow session used for all graph evaluations.
            env: environment to collect trajectories from.
            verbose: above 0 prints per-iteration statistics; above 1 also
                renders one extra episode after every iteration.
        """
        config = self.config
        self.curr_sess = sess
        try:
            for it in range(config['n_iter']):
                # Collect trajectories until we get steps_per_batch total timesteps:
                trajs = []
                total_steps = 0
                while total_steps < config['steps_per_batch']:
                    traj = get_traj(self, env,
                                    config['max_episode_steps'], render=False)
                    trajs.append(traj)
                    total_steps += len(traj['actions'])

                # compute advantages for all steps in all trajectories:
                gamma = config['gamma']
                ret_per_traj = [total_discounted_returns(tr['rewards'], gamma)
                                for tr in trajs]
                baseline = np.array(baseline_total_return(ret_per_traj))
                all_advs = [ret - baseline[:len(ret)] for ret in ret_per_traj]
                # all_advs.shape = (sum of trajectory lengths, )
                all_advs = np.concatenate(all_advs)
                all_obs = np.concatenate([tr['observations'] for tr in trajs])
                all_acts = np.concatenate([tr['actions'] for tr in trajs])

                # update agent parameters:
                self.update(all_obs, all_acts, all_advs)

                # print iteration stats:
                if verbose > 0:
                    ep_data = [(tr['rewards'].sum(), len(tr['rewards']))
                               for tr in trajs]
                    reward_per_ep, len_per_ep = zip(*ep_data)
                    report_iteration_stats(it, reward_per_ep, len_per_ep)
                if verbose > 1:
                    get_traj(self, env, config['max_episode_steps'], render=True)
            # NOTE(review): the original had an ``if verbose > 1:`` statement
            # here whose body is unknown (possibly closing the render
            # window); it is intentionally not guessed.
        finally:
            # Never keep a reference to a session we do not own.
            self.curr_sess = None

    def test(self, sess, env, num_episodes, max_episode_steps,
             render=False, deterministic_acts=False):
        """Run the current policy for ``num_episodes`` episodes.

        Args:
            sess: TensorFlow session used for all graph evaluations.
            env: environment to evaluate in.
            num_episodes: number of episodes to run.
            max_episode_steps: per-episode step limit (``None`` = no limit).
            render: whether to render the episodes.
            deterministic_acts: whether to act greedily instead of sampling.

        Returns:
            dict with numpy arrays 'num_steps' (episode lengths) and
            'rewards' (summed reward of each episode).
        """
        self.curr_sess = sess
        num_steps = []
        rewards = []
        try:
            episodes = trange(0, num_episodes, desc='Episodes completed')
            for _ in episodes:
                traj = get_traj(self, env, max_episode_steps,
                                render=render,
                                deterministic_acts=deterministic_acts)
                num_steps.append(len(traj['actions']))
                rewards.append(traj['rewards'].sum())
            # NOTE(review): the original had an ``if render:`` statement here
            # whose body is unknown; it is intentionally not guessed.
        finally:
            self.curr_sess = None
        return {'num_steps': np.array(num_steps), 'rewards': np.array(rewards)}
def report_iteration_stats(it, reward_per_ep, len_per_ep):
    """Print a summary of one training iteration to stdout.

    Args:
        it: iteration index.
        reward_per_ep: total reward of each episode in the batch.
        len_per_ep: number of steps of each episode in the batch.
    """
    separator = '==' * 30
    # Single-argument ``print(...)`` calls behave the same on Python 2 and 3.
    print(separator)
    print('Iteration #{0}'.format(it))
    print('Num episodes: {0}'.format(len(reward_per_ep)))
    print('Total timesteps: {0}'.format(sum(len_per_ep)))
    print('Max episode R: {0}'.format(max(reward_per_ep)))
    # The spread is a standard deviation (np.std), so it is labelled 'Std',
    # not 'Var'.
    print('Mean ep reward: {0} Std: {1}'.format(
        float(np.mean(reward_per_ep)), float(np.std(reward_per_ep))))
    print('Mean ep steps: {0} Std: {1}'.format(
        float(np.mean(len_per_ep)), float(np.std(len_per_ep))))
    print(separator)
import gym, os
import numpy as np
import tensorflow as tf
import reinforce_with_baseline as tf_reinforce
from gym import wrappers
def ensure_dir(d):
    """Make sure directory ``d`` exists, creating it (and parents) if needed.

    Args:
        d: directory path. An empty path is ignored.

    Raises:
        OSError: if the directory cannot be created.
    """
    if not d or os.path.isdir(d):
        return
    try:
        os.makedirs(d)
    except OSError:
        # Another process may have created it in the meantime; only fail if
        # the directory is still missing.
        if not os.path.isdir(d):
            raise
if __name__ == '__main__':
    # Train a REINFORCE agent on Acrobot, then evaluate it greedily.
    env = gym.make('Acrobot-v1')
    user_params = {
        'max_episode_steps': 500,  # env.spec.timestep_limit,
        'optim_config': {'lr': 0.005, 'rho': 0.9, 'eps': 1e-9},
    }
    agent = tf_reinforce.REINFORCEAgent(
        env.observation_space, env.action_space, **user_params)

    with tf.Session() as sess:
        # The agent only builds the graph; its variables have to be
        # initialised before the first run call.
        sess.run(tf.global_variables_initializer())

        print('================ LEARNING =================')
        agent.learn(sess, env, verbose=2)

        print('================ TESTING =================')
        # ``max_episode_steps`` is a required argument of ``agent.test``.
        results = agent.test(sess, env, num_episodes=100,
                             max_episode_steps=user_params['max_episode_steps'],
                             render=True, deterministic_acts=True)
        print('Mean episode reward: {0}'.format(np.mean(results['rewards'])))
        print('Mean episode length: {0}'.format(np.mean(results['num_steps'])))
# griver commented Jan 5, 2017
#
# Just run

