Comparison: TRPO on Hopper-v1 with OpenAI Baselines (trpo_mpi) vs. Tensorforce
#!/usr/bin/env python
# noinspection PyUnresolvedReferences
import mujoco_py # Mujoco must come before other imports. https://openai.slack.com/archives/C1H6P3R7B/p1492828680631850
from mpi4py import MPI
from baselines.common import set_global_seeds
import os
import gym
import logging
from baselines import logger
from baselines.pposgd.mlp_policy import MlpPolicy
from baselines import bench
from baselines.trpo_mpi import trpo_mpi
num_cpu = 1


def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    logger.session().__enter__()
    sess = U.single_threaded_session()
    sess.__enter__()

    # Give every MPI worker its own seed; only rank 0 logs.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)

    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        # Two hidden layers of 64 units, matching the Tensorforce network below.
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=64, num_hid_layers=2)

    env = bench.Monitor(env, os.path.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=25000,
                   max_kl=0.01,
                   cg_iters=10,
                   cg_damping=0.1,
                   max_timesteps=1e3 * num_timesteps,
                   gamma=0.99,
                   lam=0.97,
                   vf_iters=5,
                   vf_stepsize=1e-3)
    env.close()


def main():
    # Force CPU-only execution and limit thread counts for a fair comparison.
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    os.environ['OMP_NUM_THREADS'] = '2'
    os.environ['MKL_NUM_THREADS'] = '2'
    train('Hopper-v1', num_timesteps=1e6, seed=0)


if __name__ == '__main__':
    main()
#!/usr/bin/env python
from __future__ import division, print_function
import os
import logging
from tensorforce.agents import TRPOAgent
from tensorforce.environments.openai_gym import OpenAIGym
from tensorforce.execution import Runner
from tensorforce.core.networks import layered_network_builder
from tensorforce import Configuration
def main():
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    environment = OpenAIGym('Hopper-v1')

    # Same architecture as the baselines MlpPolicy: two hidden layers of 64 tanh units.
    network_config = [{"type": "dense", "size": 64, "activation": "tanh"},
                      {"type": "dense", "size": 64, "activation": "tanh"}]
    network = layered_network_builder(network_config)

    print(environment.actions)

    # NOTE: discount/gae_lambda appear swapped relative to the baselines script
    # above (gamma=0.99, lam=0.97); kept here as in the original gist.
    agent_config = dict(states=environment.states,
                        actions=environment.actions,
                        network=network,
                        loglevel="info",
                        # Updates are triggered manually in the callback below,
                        # so the built-in batch size is effectively disabled.
                        batch_size=float('inf'),
                        discount=0.97,
                        generalized_advantage_estimation=True,
                        gae_lambda=0.99,
                        baseline={"type": "mlp",
                                  "size": 64,
                                  "repeat_update": 5},
                        learning_rate=1e-3,
                        normalize_advantage=True,
                        override_line_search=False,
                        cg_damping=0.1,
                        line_search_steps=20,
                        max_kl_divergence=0.01,
                        cg_iterations=10)
    agent = TRPOAgent(config=Configuration(allow_defaults=True, **agent_config))

    runner = Runner(agent=agent,
                    environment=environment)

    class Callback(object):
        """Trigger a TRPO update once at least 25000 timesteps have been collected,
        mirroring timesteps_per_batch in the baselines script."""

        def __init__(self):
            self.update_count = 0
            self.last_episode = 0

        def __call__(self, r):
            agent = r.agent
            if agent.batch_count >= 25000:
                logger.info('-' * 50)
                logger.info("Iteration {iteration}".format(iteration=self.update_count))
                episode_rewards = r.episode_rewards[self.last_episode:]
                logger.info("Average batch reward: {}".format(sum(episode_rewards) /
                                                              len(episode_rewards)))
                episode_lengths = r.episode_lengths[self.last_episode:]
                logger.info("Average episode length: {}".format(sum(episode_lengths) /
                                                                len(episode_lengths)))
                logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
                agent.model.update(agent.batch)
                agent.reset_batch()
                self.update_count += 1
                self.last_episode = r.episode
            return True

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment))
    runner.run(episodes=float('inf'), max_timesteps=1e6, episode_finished=Callback())
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode))
    environment.close()


if __name__ == '__main__':
    # CPU-only, limited thread counts, matching the baselines script.
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    os.environ['OMP_NUM_THREADS'] = '2'
    os.environ['MKL_NUM_THREADS'] = '2'
    main()
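For reference, a rough side-by-side of the hyperparameters the two scripts set. This is my reading of the gist, not an official mapping between the two libraries; the right-hand names are simply the Tensorforce config keys used above.

# Assumed correspondence between the baselines trpo_mpi.learn() arguments and the
# Tensorforce TRPOAgent config above; values are copied verbatim from the two scripts.
PARAM_MAP = {
    'max_kl=0.01':               'max_kl_divergence=0.01',
    'cg_iters=10':               'cg_iterations=10',
    'cg_damping=0.1':            'cg_damping=0.1',
    'timesteps_per_batch=25000': 'batches of >= 25000 timesteps, enforced by the callback',
    'gamma=0.99':                'discount=0.97   (values differ)',
    'lam=0.97':                  'gae_lambda=0.99 (values differ)',
    'vf_iters=5':                'baseline repeat_update=5',
    'vf_stepsize=1e-3':          'learning_rate=1e-3',
}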