@JKCooper2 · Last active May 6, 2016
# CARTPOLE MULTI AGENT
# Set up to allow for using a pool of agents
import logging

import gym
import gym.scoreboard.scoring
import gym.monitoring.monitor

from CrossEntropyMethod import CrossEntropyMethodPool


def main():
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    outdir = '/tmp/cem-results/'
    agents = CrossEntropyMethodPool('CartPole-v0', outdir, n_samples=5, top_n=.4)
    episodes = 30

    # Each generation: run one episode per agent, then refit the
    # sampling distribution to the top performers
    for i in range(episodes):
        agents.train()
        agents.update()
        print(gym.scoreboard.scoring.score_from_local(outdir))

    gym.monitoring.monitor.close_all_monitors()

    # Upload to the scoreboard
    upload = True  # Sets whether to upload to OpenAI
    if upload:
        logger.info("Complete. Uploading Results")
        gym.upload(outdir, algorithm_id="pool-cem", api_key="api_key")


if __name__ == '__main__':
    main()
# CrossEntropyMethod.py
import numpy as np

import gym
import gym.monitoring.monitor

np.random.seed(0)


class CrossEntropyMethodAgent(object):
    def __init__(self):
        self.name = "cem"
        self.mean = None  # Linear policy weights, one per observation dimension

    def choose_action(self, observation):
        # Linear threshold policy: return 1 if the weighted sum of the
        # observation is positive, else 0
        action_score = sum(observation[i] * self.mean[i] for i in range(len(observation)))
        return int(action_score > 0)

    def act(self, observation, reward, done):
        # On the first call, initialise the weights randomly, one per observed value
        if self.mean is None:
            self.mean = np.random.randn(len(observation))

        return self.choose_action(observation)

    def update(self, mean, variance):
        # Resample this agent's weights from the pool distribution.
        # Note that np.random.normal's scale parameter is a standard deviation,
        # so the variance passed in here is used directly as the noise scale.
        self.mean = np.random.normal(mean, variance)
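
# Worked example (illustrative values): with weights [0.5, -1.0] and
# observation [2.0, 0.5], the score is 0.5*2.0 - 1.0*0.5 = 0.5 > 0,
# so choose_action returns action 1.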
def capped_cubic_video_schedule_single(episode_id, monitor_id):
    # Only record video from the first monitor in the pool
    if monitor_id != 0:
        return False

    # Record perfect-cube episodes for the first 1000 episodes, then every 1000th
    if episode_id < 1000:
        return int(round(episode_id ** (1. / 3))) ** 3 == episode_id
    else:
        return episode_id % 1000 == 0
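
# For illustration: under this schedule, monitor 0 records episodes
# 0, 1, 8, 27, 64, 125, 216, 343, 512, 729, and thereafter every
# 1000th episode; the other monitors in the pool never record.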
class CrossEntropyMethodPool(object):
    def __init__(self, env, path, n_samples=10, top_n=0.2):
        self.name = "cem"
        self.env = env
        self.n_samples = n_samples
        self.path = path

        # One agent and one monitored environment per sample
        self.agent_pool = [CrossEntropyMethodAgent() for _ in range(n_samples)]
        self.env_pool = [gym.make(self.env) for _ in range(self.n_samples)]

        for monitored_env in self.env_pool:
            monitored_env.monitor.start(self.path, force=True, n_monitors=self.n_samples,
                                        video_callable=capped_cubic_video_schedule_single)

        self.top_n = int(top_n * n_samples)  # Number of elite agents kept each generation
        self.mean = None
        self.variance = None
        self.obs_length = None
        self.rewards = None

    def run_episode(self, env, agent):
        reward = 0
        ep_reward = 0
        done = False
        ob = env.reset()

        while not done:
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            ep_reward += reward

        return ep_reward

    # Runs one episode for every agent with its current weights
    def train(self):
        if self.obs_length is None:
            self.obs_length = self.env_pool[0].observation_space.shape[0]

        self.rewards = [self.run_episode(self.env_pool[i], self.agent_pool[i]) for i in range(self.n_samples)]

    # Selects the top_n agents and creates the next generation
    def update(self):
        # Get the ids of the top_n agents by episode reward
        top_id = np.array(self.rewards).argsort()[-self.top_n:][::-1]
        np_top = np.array([self.agent_pool[i].mean for i in top_id])

        # Refit the sampling distribution to the elite weights, flooring
        # the variance so exploration never collapses entirely
        self.mean = np.mean(np_top, axis=0)
        self.variance = np.var(np_top, axis=0)
        self.variance = np.maximum(self.variance, 0.0001)

        for agent in self.agent_pool:
            agent.update(self.mean, self.variance)
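
The update step above is the heart of the cross-entropy method: fit a diagonal Gaussian to the weight vectors of the best-scoring agents, then resample the whole pool from it. A minimal self-contained sketch of that step, using made-up rewards and weights purely for illustration:

import numpy as np

# Dummy pool: 5 agents with 4-dimensional weight vectors and their episode rewards
weights = np.random.randn(5, 4)
rewards = [20.0, 200.0, 35.0, 180.0, 60.0]
top_n = 2  # Keep the best 2 agents

# Indices of the top_n agents by reward (same selection as CrossEntropyMethodPool.update)
top_id = np.array(rewards).argsort()[-top_n:][::-1]
elite = weights[top_id]  # Here agents 1 and 3, with rewards 200 and 180

# Refit the diagonal Gaussian to the elite weights, flooring the variance
mean = np.mean(elite, axis=0)
variance = np.maximum(np.var(elite, axis=0), 0.0001)

# Resample every agent's weights for the next generation
new_weights = np.array([np.random.normal(mean, variance) for _ in range(5)])
print(new_weights.shape)  # (5, 4)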