Skip to content

Instantly share code, notes, and snippets.

@flyman3046
Last active February 6, 2020 06:08
Show Gist options
  • Save flyman3046/d37680eeaac469a4030c690ae65b0419 to your computer and use it in GitHub Desktop.
Implementation of Evolution Strategies to Solve CartPole-v0
# https://gist.github.com/karpathy/77fbb6a8dac5395f1b73e7a89300318d
import gym
import numpy as np
def f(env, weight, num_run=100, max_steps=300):
    """Return the average episode reward of a linear policy on ``env``.

    The policy picks action 1 when ``np.dot(weight, observation) > 0`` and
    action 0 otherwise.  Every episode starts with ``env.reset()`` and runs
    until the environment reports ``done`` or ``max_steps`` steps were taken.

    This expects the Gym interface in which ``env.reset()`` returns the
    observation and ``env.step(action)`` returns the 4-tuple
    ``(observation, reward, done, info)``.

    Args:
        env: Environment object exposing ``reset()`` and ``step(action)``.
        weight: Policy weights; dotted with each observation.
        num_run: Number of episodes to average over.
        max_steps: Upper bound on the number of steps in one episode.

    Returns:
        The reward summed over all episodes, divided by ``num_run``.

    Raises:
        ValueError: If ``num_run`` is smaller than 1 (the average would
            otherwise be a division by zero).
    """
    if num_run < 1:
        raise ValueError('num_run must be at least 1, got {}'.format(num_run))

    total_reward = 0.0
    for _episode in range(num_run):
        observation = env.reset()
        for _step in range(max_steps):
            # Linear threshold policy: the sign of w . x selects the action.
            action = 1 if np.dot(weight, observation) > 0 else 0
            observation, reward, done, _info = env.step(action)
            total_reward += reward
            if done:
                break
    return total_reward / num_run
def evolution_strategy(env, npop=50, sigma=0.1, alpha=0.001, num_iter=50,
                       num_params=4):
    """Optimize a linear policy for ``env`` with a simple evolution strategy.

    Each iteration prints the fitness of the current weights, samples
    ``npop`` Gaussian perturbations of them, scores every perturbed weight
    vector with ``f``, and moves the weights along the noise directions
    weighted by their standardized rewards.

    Args:
        env: Environment passed through to ``f`` for fitness evaluation.
        npop: Population size (perturbations sampled per iteration).
        sigma: Standard deviation of the perturbation noise.
        alpha: Learning rate of the weight update.
        num_iter: Number of optimization iterations.
        num_params: Length of the weight vector.

    Returns:
        The weight vector after the final iteration.
    """
    # Our initial guess is random, drawn by np.random.rand.
    weight = np.random.rand(num_params)

    for i in range(num_iter):
        # Print the current fitness of the most likely parameter setting.
        print('iter {}. weight: {}, reward: {}'.format(i, str(weight), f(env, weight)))

        # One row of N(0, 1) noise per population member.
        noise = np.random.randn(npop, num_params)
        # Evaluate each jittered version of the weights.
        rewards = np.array([f(env, weight + sigma * row) for row in noise])

        reward_std = np.std(rewards)
        if reward_std == 0:
            # Every candidate scored the same, so the rewards carry no
            # direction information. Dividing by a zero std would yield NaN
            # and permanently corrupt the weights, so skip this update.
            continue

        # Standardize the rewards to zero mean and unit variance.
        advantages = (rewards - np.mean(rewards)) / reward_std

        # The matrix multiply sums the rows of the noise matrix, each row
        # noise[j] weighted by advantages[j].
        weight = weight + alpha / (npop * sigma) * np.dot(noise.T, advantages)

    return weight
# Script entry point: create the Gym 'CartPole-v0' environment and run the
# evolution-strategy optimizer on it.
env = gym.make('CartPole-v0')
evolution_strategy(env)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment