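# --- reinforce_with_baseline.py ---
# (filename inferred from the `import reinforce_with_baseline` in the second file)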
import gym
import tensorflow as tf
import numpy as np
import itertools
import tensorflow.contrib.layers as layers
from tqdm import trange
from gym.spaces import Discrete, Box


def get_traj(agent, env, max_episode_steps, render, deterministic_acts=False):
    '''
    Runs the agent-environment loop for one whole episode (trajectory).
    Returns a dictionary of results.
    '''
    steps = itertools.count() if max_episode_steps is None else xrange(max_episode_steps)
    obs = env.reset()
    actions = []
    rewards = []
    observations = []
    for step in steps:
        observations.append(obs)  # obs_i
        act = agent.act(obs, deterministic=deterministic_acts)
        obs, r, done, _ = env.step(act)
        actions.append(act)  # act_i
        rewards.append(r)  # r_i, the reward received after act_i in obs_i
        if done: break
        if render: env.render()
    return {'rewards': np.array(rewards),
            'actions': np.array(actions),
            'observations': np.array(observations)}
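
# Example usage (a sketch with hypothetical `agent` and `env` objects, shown only
# to illustrate the returned structure; not part of the original gist):
#   traj = get_traj(agent, env, max_episode_steps=500, render=False)
#   traj['rewards'] and traj['actions'] have shape (T,), and
#   traj['observations'] has shape (T,) + env.observation_space.shape,
#   where T is the number of steps actually taken in the episode.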


def total_discounted_returns(rewards, gamma):
    '''
    Given episode rewards, computes a vector y such that
    y_i = r_i + gamma*r_{i+1} + gamma^2 * r_{i+2} + ...
    '''
    n = len(rewards)
    result = np.zeros_like(rewards)
    next_rew = 0.
    for i in reversed(xrange(n)):
        result[i] = rewards[i] + gamma*next_rew
        next_rew = result[i]
    return result
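
# Worked example (not part of the original gist): for rewards [1., 1., 1.] and
# gamma = 0.5 this returns [1.75, 1.5, 1.], since y_2 = 1, y_1 = 1 + 0.5*1 = 1.5
# and y_0 = 1 + 0.5*1.5 = 1.75.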


def baseline_total_return(returns_per_traj):
    '''
    Computes a time-dependent baseline: for each timestep t, the mean of the
    discounted returns at t over all trajectories that are at least t+1 steps
    long. Shorter trajectories are padded with NaN and masked out of the mean.
    '''
    N = len(returns_per_traj)
    maxlen = max(len(ret) for ret in returns_per_traj)
    masked = np.full((N, maxlen), float('nan'))
    for i in xrange(N):
        masked[i, 0:len(returns_per_traj[i])] = returns_per_traj[i]
    masked = np.ma.array(masked, mask=np.isnan(masked))
    return masked.mean(axis=0)
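
# Worked example (not part of the original gist): for two trajectories with
# discounted returns [3., 2., 1.] and [4., 2.], the baseline is [3.5, 2., 1.];
# the third entry averages only the first trajectory, since the second one has
# already terminated by then.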


class REINFORCEAgent(object):
    '''
    REINFORCE with baselines.
    Based on John Schulman's lectures (https://youtu.be/aUrX-rP_ss4).
    '''
    def __init__(self, obs_space, act_space, **user_params):
        assert isinstance(act_space, Discrete), \
            'Agent works only with discrete action spaces'
        self.input_shape = (None, ) + obs_space.shape
        self.is_continuous = isinstance(act_space, Box)
        self.num_actions = act_space.n
        self.curr_sess = None

        self.config = dict(
            max_episode_steps=500,  # if None, continue each episode until a terminal state
            steps_per_batch=10000,
            n_iter=100,
            gamma=1.0,
            optim_config={'lr': 0.05, 'rho': 0.9, 'eps': 1e-9},
            num_hidden=20,
            dtype=tf.float32,
            scope_name='REINFORCE'
        )
        self._num_layers = 0
        self._check_config_args(user_params)
        self.config.update(user_params)

        dtype = self.config['dtype']
        with tf.variable_scope(self.config['scope_name']):
            # advs stands for advantage values: total discounted returns minus the baseline
            self.advs_pl = tf.placeholder(shape=[None, ], dtype=dtype, name='rets')
            self.acts_pl = tf.placeholder(shape=[None, ], dtype=tf.int32, name='acts')
            self.obs_pl = tf.placeholder(shape=self.input_shape, dtype=dtype, name='obs')

            flatten_obs = layers.flatten(self.obs_pl)
            fc1 = self._add_fc(flatten_obs, self.config['num_hidden'], activation=tf.nn.relu)
            fc2 = self._add_fc(fc1, self.num_actions, activation=None)
            self.action_probs = tf.nn.softmax(fc2)

            one_hot_acts = layers.one_hot_encoding(self.acts_pl, self.num_actions)
            selected_probs = tf.reduce_sum(self.action_probs * one_hot_acts, reduction_indices=1)
            # tf can't compute gradients of gather_nd :(
            # selected_probs = tf.gather_nd(self.action_probs, indices=self.id2acts_pl)

            # Conventional SGD-like updates minimize a loss function, so we negate
            # the objective to move the parameters in the direction of ascent on
            # the expected return:
            neg_logprob = -tf.log(selected_probs)
            N = tf.shape(self.advs_pl)[0]
            self.loss = tf.reduce_sum(tf.mul(neg_logprob, self.advs_pl)) / tf.to_float(N)

            # create optimizer:
            opt_cfg = self.config['optim_config']
            self.optimizer = tf.train.RMSPropOptimizer(opt_cfg['lr'],
                                                       opt_cfg['rho'],
                                                       0.0, opt_cfg['eps'])
            self.opt_step = self.optimizer.minimize(self.loss)
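            # In equation form (using A_t for the advantage fed through advs_pl
            # and pi_theta for the softmax policy; this notation is not in the
            # original gist), the quantity minimized above is
            #   L(theta) = -(1/N) * sum_t A_t * log(pi_theta(a_t | s_t)),
            # so each step of the optimizer ascends the usual policy-gradient
            # objective with a baseline.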

    def _check_config_args(self, user_args):
        unrecognized = set(user_args.keys()) - set(self.config.keys())
        if unrecognized:
            raise ValueError('Unrecognized config params: {0}'.format(unrecognized))

    def _add_fc(self, inputs, num_units, activation=None):
        return layers.fully_connected(
            inputs, num_units, activation_fn=activation,
            weights_initializer=self._xavier_init(),
            biases_initializer=self._const_init())

    def _xavier_init(self, factor=2.0):
        return layers.initializers.variance_scaling_initializer(
            factor=factor, mode='FAN_IN', dtype=self.config['dtype']
        )

    def _const_init(self, value=0.0):
        return tf.constant_initializer(value, dtype=self.config['dtype'])

    def act(self, observation, deterministic=False):
        observation = observation[np.newaxis, :]  # add a batch dimension in front
        probs = self.curr_sess.run(self.action_probs, feed_dict={self.obs_pl: observation})
        if not deterministic:
            return np.random.choice(self.num_actions, p=probs[0])
        else:
            return np.argmax(probs[0])

    def update(self, obs, acts, advs):
        feeds = {
            self.obs_pl: obs,
            self.acts_pl: acts,
            self.advs_pl: advs
        }
        self.curr_sess.run([self.opt_step], feed_dict=feeds)

    def learn(self, sess, env, verbose=1):
        config = self.config
        self.curr_sess = sess
        try:
            for it in xrange(config['n_iter']):
                # Collect trajectories until we get steps_per_batch total timesteps:
                trajs = []
                total_steps = 0
                while total_steps < config['steps_per_batch']:
                    traj = get_traj(self, env,
                                    config['max_episode_steps'], render=False)
                    trajs.append(traj)
                    total_steps += len(traj['actions'])

                # Compute advantages for all steps in all trajectories:
                gamma = config['gamma']
                ret_per_traj = [total_discounted_returns(tr['rewards'], gamma) for tr in trajs]
                baseline = np.array(baseline_total_return(ret_per_traj))
                all_advs = [ret - baseline[:len(ret)] for ret in ret_per_traj]
                all_advs = np.concatenate(all_advs)  # all_advs.shape = (sum of trajectory lengths, )
                all_obs = np.concatenate([traj['observations'] for traj in trajs])
                all_acts = np.concatenate([traj['actions'] for traj in trajs])

                # Update agent parameters:
                self.update(all_obs, all_acts, all_advs)

                # Print iteration stats:
                ep_data = [(tr['rewards'].sum(), len(tr['rewards'])) for tr in trajs]
                reward_per_ep, len_per_ep = zip(*ep_data)
                if verbose > 0:
                    report_iteration_stats(it, reward_per_ep, len_per_ep)
                if verbose > 1:
                    get_traj(self, env, config['max_episode_steps'], render=True)
        finally:
            if verbose > 1:
                env.render(close=True)
            self.curr_sess = None

    def test(self, sess, env, num_episodes, max_episode_steps,
             render=False, deterministic_acts=False):
        self.curr_sess = sess
        try:
            num_steps = []
            rewards = []
            episodes = trange(0, num_episodes, desc='Episodes completed')
            for ep in episodes:
                traj = get_traj(self, env, max_episode_steps,
                                render=render, deterministic_acts=deterministic_acts)
                rewards.append(sum(traj['rewards']))
                num_steps.append(len(traj['rewards']))
        finally:
            if render:
                env.render(close=True)
            self.curr_sess = None
        return {'num_steps': np.array(num_steps), 'rewards': np.array(rewards)}


def report_iteration_stats(it, reward_per_ep, len_per_ep):
    print '==' * 30
    print 'Iteration #{0}'.format(it)
    print 'Num episodes:', len(reward_per_ep)
    print 'Total timesteps:', sum(len_per_ep)
    print 'Max episode R:', max(reward_per_ep)
    print 'Mean ep reward:', np.mean(reward_per_ep), 'Std:', np.std(reward_per_ep)
    print 'Mean ep steps:', np.mean(len_per_ep), 'Std:', np.std(len_per_ep)
    print '==' * 30
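

# --- reproduce_gym_results.py ---
# (the driver script referenced in the author's comment at the bottom)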
import gym, os
import numpy as np
import tensorflow as tf
import reinforce_with_baseline as tf_reinforce
from gym import wrappers


def ensure_dir(d):
    """
    Checks whether the directories in path d exist and,
    if not, creates them.
    """
    if not os.path.exists(d):
        os.makedirs(d)


if __name__ == '__main__':
    env = gym.make('Acrobot-v1')
    user_params = {
        'max_episode_steps': 500,  # env.spec.timestep_limit,
        'optim_config': {'lr': 0.005, 'rho': 0.9, 'eps': 1e-9},
        'num_hidden': 40,
    }
    agent = tf_reinforce.REINFORCEAgent(env.observation_space, env.action_space, **user_params)

    with tf.Session() as sess:
        print '================ LEARNING ================='
        sess.run(tf.initialize_all_variables())
        agent.learn(sess, env, verbose=2)

        print '================ TESTING ================='
        results = agent.test(sess, env, num_episodes=100,
                             max_episode_steps=agent.config['max_episode_steps'],
                             render=True, deterministic_acts=True)
        print 'Mean episode reward:', np.mean(results['rewards'])
        print 'Mean episode length:', np.mean(results['num_steps'])
    env.close()
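
# Note: gym.wrappers and ensure_dir are imported/defined above but never used in
# __main__. A minimal sketch of how they could be combined to record the test
# episodes (hypothetical directory name, not part of the original script):
#
#   monitor_dir = './monitor/acrobot_reinforce'
#   ensure_dir(monitor_dir)
#   env = wrappers.Monitor(env, monitor_dir, force=True)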

griver commented Jan 5, 2017

Just run reproduce_gym_results.py
