@JKCooper2
Last active November 1, 2018 18:03
[Open AI] CartPole v0 - Simulated Annealing v0
Based on treeforms solution here: https://gym.openai.com/evaluations/eval_LjL2QnWCRlmh58dLCpMUTg
Changes:
- Moved the agent into its own class so it can be reused on other problems
- Each time the best score is hit, the best values are updated towards the mean of the best values found so far (required because the problem is bounded)
- Alpha decays when the best score is matched (required because the problem is bounded)
- Alpha increases if you neither match nor exceed the best score (helps get around initial 'wrong' choices by random improvement); a standalone sketch of these update rules follows the two lists below
Improvements from changes:
- Repeats no longer needed (but included in the class anyway in case it is useful for other problems)
- Time to find the first complete solution reduced
- Variation after finding the first complete solution reduced
- Time spent with incorrect values at the start from randomly moving in the 'wrong' direction reduced
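As a rough standalone sketch (not part of the original gist), the three end-of-episode update rules listed under "Changes" can be written as a pure function; the name update_rule and its arguments are illustrative only and mirror the fields of the agent class further down:

# Illustrative sketch of the end-of-episode update rules described above.
# Mirrors the logic in SimulatedAnnealingAgent.act(); names are made up for this sketch.
def update_rule(ep_score, best_score, best, test, best_count, alpha, decay):
    if ep_score == best_score:
        # Matched the best score: fold the test weights into an incremental mean
        # of every weight vector that has reached it, and shrink the search radius.
        best = [(b * best_count + t) / (best_count + 1) for b, t in zip(best, test)]
        best_count += 1
        alpha *= decay
    elif ep_score > best_score:
        # New best score: adopt the test weights outright and shrink the radius.
        best, best_score, best_count = test, ep_score, 0
        alpha *= decay
    else:
        # Neither matched nor exceeded: widen the radius to escape an unlucky early direction.
        alpha /= decay
    return best, best_score, best_count, alpha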
import logging

import gym

from SimulatedAnnealing import SimulatedAnnealingAgent


def main():
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    env = gym.make('CartPole-v0')
    agent = SimulatedAnnealingAgent(env.action_space, repeats=1, decay=0.9, spread=0.3)  # Initialise agent

    outdir = '/tmp/' + agent.name + '-results'
    env.monitor.start(outdir, force=True)

    episode_count = 200
    max_steps = 200
    reward = 0
    done = False

    for i in xrange(episode_count):
        ob = env.reset()

        for j in xrange(max_steps):
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            if done:
                break

    # Dump result info to disk
    env.monitor.close()


if __name__ == '__main__':
    main()
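The monitor calls above use the pre-0.9 gym API (env.monitor.start / env.monitor.close). As a hedged sketch only, under a later gym release that ships gym.wrappers.Monitor (roughly 0.9 through 0.21, an assumption about the installed version), the same recording setup might look like this Python 3 variant:

# Sketch only: the same experiment under a gym version that provides gym.wrappers.Monitor
# (assumed to be installed); this is not the API the original gist was written against.
import gym
from gym import wrappers

from SimulatedAnnealing import SimulatedAnnealingAgent


def main():
    env = gym.make('CartPole-v0')
    agent = SimulatedAnnealingAgent(env.action_space, repeats=1, decay=0.9, spread=0.3)
    env = wrappers.Monitor(env, '/tmp/' + agent.name + '-results', force=True)

    reward, done = 0, False
    for _ in range(200):          # episodes
        ob = env.reset()
        for _ in range(200):      # max steps per episode
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            if done:
                break

    env.close()  # Dump result info to disk


if __name__ == '__main__':
    main()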
import random

random.seed(0)  # So scores are (slightly) more consistent. Randomness in pole counteracts this a bit


class SimulatedAnnealingAgent(object):
    def __init__(self, action_space, repeats=10, alpha=1, decay=0.9, spread=0.5):
        self.name = 'SimAnn'  # Name to be submitted to OpenAI
        self.action_space = action_space  # Just for consistency with other agents, not used in this case
        self.alpha = alpha  # Learning rate
        self.decay = decay  # Decay in impact of alpha
        self.spread = spread  # Spread of randomness when selecting new values to test
        self.repeats = repeats  # Number of times to repeat testing a value

        self.obs_count = 0  # Number of observations returned (can probably get from the environment somehow)
        self.best = []  # Holds best values (set on first run of action)
        self.test = []  # Holds test values
        self.best_score = 0  # Current max score found
        self.best_count = 0  # Times hit max score (used for bounded problems)
        self.ep_score = 0  # Total score for episode
        self.repeat_count = 0  # Times repeated running test

    # Set the new test values at the start of the episode
    def set_test(self):
        # If fewer than the required repeats then just run again
        if self.repeat_count < self.repeats:
            self.repeat_count += 1
            return self.test

        # Else reset repeat count and set new values based on current best, spread and alpha
        self.repeat_count = 0
        return [self.best[i] + (random.random() - self.spread) * self.alpha for i in range(self.obs_count)]

    # Choose action based on observed values
    def choose_action(self, observation):
        if sum(observation[i] * self.test[i] for i in range(self.obs_count)) > 0:
            return 1

        return 0

    # If we get the same episode score then update best to the average of all values that have reached the best score
    def update_best(self):
        self.best = [(self.best[i] * self.best_count + self.test[i]) / (self.best_count + 1) for i in range(self.obs_count)]
        self.best_count += 1

    # What gets called
    def act(self, observation, reward, done):
        # Set initial values if first time agent is seeing observations
        if self.obs_count == 0:
            self.obs_count = len(observation)
            self.best = [0] * self.obs_count
            self.test = self.best

        # Set new test values for new episode
        if self.ep_score == 0:
            self.test = self.set_test()

        # Select action
        action = self.choose_action(observation)

        # Update episode score
        self.ep_score += reward

        if done:
            # If score is the same as best then the amount of variance in future choices goes down
            # Set the new best to be the average of all the best scores so far (using incremental mean)
            if self.ep_score == self.best_score:
                self.alpha *= self.decay
                self.update_best()

            # If new score is greater then set everything to that
            elif self.ep_score > self.best_score:
                self.best_score = self.ep_score
                self.best = self.test
                self.best_count = 0
                self.alpha *= self.decay

            # If new score isn't >= then increase the spread when selecting values
            # This helps get around issues making incorrect starting decisions but can probably be improved
            else:
                self.alpha /= self.decay

            self.ep_score = 0

        return action
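For clarity (not part of the original gist), the policy the agent searches over is just a linear threshold on CartPole's 4-dimensional observation, which is what choose_action computes; the weight and observation values in this snippet are made up:

# Illustration of what choose_action() computes; all numbers below are made up.
weights = [0.1, 0.4, 0.8, 0.6]            # a hypothetical trained self.best / self.test
observation = [0.02, -0.15, 0.03, 0.22]   # cart position, cart velocity, pole angle, pole tip velocity
score = sum(w * o for w, o in zip(weights, observation))
action = 1 if score > 0 else 0            # push right if the weighted sum is positive, else push left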
@neildhir

Is there a paper that explains the details of simulated annealing w.r.t. RL? A quick Google search did not yield anything much useful.

Thanks.

@thepup commented May 26, 2016