flyman3046/es-CartPole.py

## es-CartPole.py
# https://gist.github.com/karpathy/77fbb6a8dac5395f1b73e7a89300318d
import gym
import numpy as np

def f(env, weight, num_run=100, max_steps=300):
    """Return the mean per-episode reward of a linear threshold policy.

    At every step the policy picks action 1 when
    ``np.dot(weight, observation)`` is strictly positive and action 0
    otherwise.

    Args:
        env: Environment object providing ``reset()`` (returns the first
            observation) and ``step(action)`` (returns the 4-tuple
            ``(observation, reward, done, info)``).
        weight: Policy weights; dotted with each observation.
        num_run: Number of episodes to average over.
        max_steps: Upper bound on the number of steps taken per episode.

    Returns:
        The reward summed over all episodes, divided by ``num_run``.

    Raises:
        ValueError: If ``num_run`` is smaller than 1 (the average would
            otherwise be a division by zero).
    """
    if num_run < 1:
        raise ValueError('num_run must be at least 1')

    total_reward = 0.0
    for _episode in range(num_run):
        observation = env.reset()
        for _step in range(max_steps):
            action = 1 if np.dot(weight, observation) > 0 else 0

            observation, reward, done, _info = env.step(action)
            total_reward += reward
            if done:
                # The environment ended the episode before the step cap.
                break
    return total_reward / num_run

def evolution_strategy(env, npop=50, sigma=0.1, alpha=0.001, num_iter=50):
    """Optimise a 4-element linear policy with a simple evolution strategy.

    Each iteration prints the fitness of the current weights, samples
    ``npop`` Gaussian perturbations, scores every perturbed weight vector
    with ``f``, and moves the weights along the reward-weighted sum of the
    perturbations.

    Args:
        env: Environment passed through unchanged to ``f``.
        npop: Population size (number of perturbations per iteration).
        sigma: Standard deviation of the perturbation noise.
        alpha: Learning rate.
        num_iter: Number of update iterations.

    Returns:
        The weight vector after the last iteration (numpy array, length 4).

    Raises:
        ValueError: If ``npop`` is smaller than 1.
    """
    if npop < 1:
        raise ValueError('npop must be at least 1')

    # start the optimization
    weight = np.random.rand(4)  # our initial guess is random
    for i in range(num_iter):

        # print current fitness of the most likely parameter setting
        # (single-argument call form works on both Python 2 and Python 3)
        print('iter {}. weight: {}, reward: {}'.format(
            i, str(weight), f(env, weight)))

        # sample the population's noise, then score each jittered weight
        noise = np.random.randn(npop, 4)  # samples from N(0, 1)
        rewards = np.zeros(npop)
        for j in range(npop):
            w_try = weight + sigma * noise[j]  # jitter w using gaussian noise
            rewards[j] = f(env, w_try)  # evaluate the jittered version

        if rewards.max() == rewards.min():
            # All candidates scored identically, so the standard deviation
            # is zero: standardising would divide by zero and turn the
            # weights into NaN for every later iteration. There is no
            # preference signal in this batch, so leave the weights as-is.
            continue

        # standardize the rewards (zero mean, unit standard deviation)
        advantage = (rewards - np.mean(rewards)) / np.std(rewards)
        # perform the parameter update. The matrix multiply below
        # is just an efficient way to sum up all the rows of the noise
        # matrix, where each row noise[j] is weighted by advantage[j]
        weight = weight + alpha / (npop * sigma) * np.dot(noise.T, advantage)

    return weight

# Script entry: create the Gym 'CartPole-v0' environment and run the
# evolution-strategy search on it.
env = gym.make('CartPole-v0')
evolution_strategy(env)
	# https://gist.github.com/karpathy/77fbb6a8dac5395f1b73e7a89300318d
	import gym
	import numpy as np

	def f(env, weight, num_run=100, max_steps=300):
		"""Return the mean per-episode reward of a linear threshold policy.

		At every step the policy picks action 1 when
		``np.dot(weight, observation)`` is strictly positive and action 0
		otherwise.

		Args:
			env: Environment object providing ``reset()`` (returns the
				first observation) and ``step(action)`` (returns the
				4-tuple ``(observation, reward, done, info)``).
			weight: Policy weights; dotted with each observation.
			num_run: Number of episodes to average over.
			max_steps: Upper bound on the number of steps per episode.

		Returns:
			The reward summed over all episodes, divided by ``num_run``.

		Raises:
			ValueError: If ``num_run`` is smaller than 1 (the average
				would otherwise be a division by zero).
		"""
		if num_run < 1:
			raise ValueError('num_run must be at least 1')

		total_reward = 0.0
		for _episode in range(num_run):
			observation = env.reset()
			for _step in range(max_steps):
				action = 1 if np.dot(weight, observation) > 0 else 0

				observation, reward, done, _info = env.step(action)
				total_reward += reward
				if done:
					# The environment ended the episode before the step cap.
					break
		return total_reward / num_run

	def evolution_strategy(env, npop=50, sigma=0.1, alpha=0.001, num_iter=50):
		"""Optimise a 4-element linear policy with a simple evolution strategy.

		Each iteration prints the fitness of the current weights, samples
		``npop`` Gaussian perturbations, scores every perturbed weight
		vector with ``f``, and moves the weights along the reward-weighted
		sum of the perturbations.

		Args:
			env: Environment passed through unchanged to ``f``.
			npop: Population size (number of perturbations per iteration).
			sigma: Standard deviation of the perturbation noise.
			alpha: Learning rate.
			num_iter: Number of update iterations.

		Returns:
			The weight vector after the last iteration (numpy array,
			length 4).

		Raises:
			ValueError: If ``npop`` is smaller than 1.
		"""
		if npop < 1:
			raise ValueError('npop must be at least 1')

		# start the optimization
		weight = np.random.rand(4)  # our initial guess is random
		for i in range(num_iter):

			# print current fitness of the most likely parameter setting
			# (single-argument call form works on Python 2 and Python 3)
			print('iter {}. weight: {}, reward: {}'.format(
				i, str(weight), f(env, weight)))

			# sample the population's noise, then score each jittered weight
			noise = np.random.randn(npop, 4)  # samples from N(0, 1)
			rewards = np.zeros(npop)
			for j in range(npop):
				w_try = weight + sigma * noise[j]  # jitter w using gaussian noise
				rewards[j] = f(env, w_try)  # evaluate the jittered version

			if rewards.max() == rewards.min():
				# All candidates scored identically, so the standard
				# deviation is zero: standardising would divide by zero and
				# turn the weights into NaN for every later iteration.
				# There is no preference signal in this batch, so leave
				# the weights as-is.
				continue

			# standardize the rewards (zero mean, unit standard deviation)
			advantage = (rewards - np.mean(rewards)) / np.std(rewards)
			# perform the parameter update. The matrix multiply below
			# is just an efficient way to sum up all the rows of the noise
			# matrix, where each row noise[j] is weighted by advantage[j]
			weight = weight + alpha / (npop * sigma) * np.dot(noise.T, advantage)

		return weight

	# Script entry: create the Gym 'CartPole-v0' environment and run the
	# evolution-strategy search on it.
	env = gym.make('CartPole-v0')
	evolution_strategy(env)