@kimbring2
Created February 17, 2017 19:00
'''
Policy gradient algorithm: here, instead of choosing the action as a deterministic function of the sign of
the weighted sum, make it so that action is chosen randomly, but where the distribution over actions
(of which there are two) depends on the numerical output of the inner product.
Policy gradient prescribes a principled parameter update rule [1, 2].
Your goal is to implement this algorithm for the simple linear model, and see how long it takes to converge.
'''
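# A brief sketch of the update rule implemented below (the symbols w, x, a, p and R_t are
# used only in this comment, for illustration): the policy is Bernoulli with
#     p = P(action = 1 | x) = sigmoid(w . x),
# the gradient of log P(a | x) with respect to weight w_j is (a - p) * x_j, and the
# policy gradient (REINFORCE) ascent step scales it by the discounted return R_t of that step:
#     w_j <- w_j + learning_rate * (a - p) * x_j * R_t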
import gym
import random
import math
import numpy as np
env = gym.make('CartPole-v0')
# one weight per element of the 4-dimensional CartPole observation, initialised with small Gaussian noise
par1 = random.normalvariate(0, 0.1)
par2 = random.normalvariate(0, 0.1)
par3 = random.normalvariate(0, 0.1)
par4 = random.normalvariate(0, 0.1)
grad_buffer = []  # per-episode (gradient, observation) stacks, applied every batch_size episodes
rmsprop_cache = [0, 0, 0, 0]  # RMSProp running average of squared gradients, one entry per parameter
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
decay_rate = 0.99
gamma = 0.99
render = False

def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r
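
# Quick illustration (not part of the original gist): with gamma = 0.99,
#     discount_rewards(np.array([1.0, 1.0, 1.0]))  ->  [2.9701, 1.99, 1.0]
# i.e. each entry is that step's reward plus the discounted sum of all later rewards.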
observation_list = []
reward_list = []
gradient_list = []
episode_number = 0
reward_sum = 0
total_episodes = 100000
running_reward = None
observation = env.reset()
# training loop
while episode_number <= total_episodes:
    if render:
        env.render()
    observation_list.append([observation[0], observation[1], observation[2], observation[3]])

    # simple linear model: squash the weighted sum through a sigmoid so it is a valid probability of action 1
    logit = par1 * observation[0] + par2 * observation[1] + par3 * observation[2] + par4 * observation[3]
    probability = 1.0 / (1.0 + math.exp(-logit))
    # determine the action by sampling from the Bernoulli distribution defined by the policy
    if np.random.uniform() < probability:
        action = 1
    else:
        action = 0

    # "fake label" for the taken action; (input_y - probability) is the gradient of
    # log p(action | observation) with respect to the linear output
    input_y = 1 if action == 1 else 0
    gradient = input_y - probability
    gradient_list.append(gradient)
    # do the action and accumulate the reward
    observation, reward, done, info = env.step(action)
    reward_sum += reward
    reward_list.append(reward)

    if done:
        episode_number += 1
        reward_stack = np.vstack(reward_list)
        gradient_stack = np.vstack(gradient_list)
        observation_stack = np.vstack(observation_list)
        reward_list, gradient_list, observation_list = [], [], []

        # discount and standardise the rewards, then weight each step's log-prob gradient by its return
        discounted_reward = discount_rewards(reward_stack)
        discounted_reward -= np.mean(discounted_reward)
        discounted_reward /= np.std(discounted_reward)
        gradient_stack *= discounted_reward
        grad_buffer.append((gradient_stack, observation_stack))
        if episode_number % batch_size == 0:
            # RMSProp update over every time step collected during the last batch of episodes
            for gradient_stack, observation_stack in grad_buffer:
                for i in range(0, len(gradient_stack)):
                    grad = gradient_stack[i][0]
                    rmsprop_cache[0] = decay_rate * rmsprop_cache[0] + (1 - decay_rate) * (grad * observation_stack[i][0]) ** 2
                    rmsprop_cache[1] = decay_rate * rmsprop_cache[1] + (1 - decay_rate) * (grad * observation_stack[i][1]) ** 2
                    rmsprop_cache[2] = decay_rate * rmsprop_cache[2] + (1 - decay_rate) * (grad * observation_stack[i][2]) ** 2
                    rmsprop_cache[3] = decay_rate * rmsprop_cache[3] + (1 - decay_rate) * (grad * observation_stack[i][3]) ** 2
                    par1 += learning_rate * grad * observation_stack[i][0] / (np.sqrt(rmsprop_cache[0]) + 1e-5)
                    par2 += learning_rate * grad * observation_stack[i][1] / (np.sqrt(rmsprop_cache[1]) + 1e-5)
                    par3 += learning_rate * grad * observation_stack[i][2] / (np.sqrt(rmsprop_cache[2]) + 1e-5)
                    par4 += learning_rate * grad * observation_stack[i][3] / (np.sqrt(rmsprop_cache[3]) + 1e-5)
            grad_buffer = []

            # Give a summary of how well the policy is doing for each batch of episodes.
            running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
            print('Average reward per episode over this batch: %f. Running average: %f.' % (
                reward_sum / batch_size, running_reward / batch_size))
            if reward_sum / batch_size >= 200:  # CartPole-v0 caps each episode at 200 steps
                print('Task solved in %d episodes!' % episode_number)
                break
            reward_sum = 0

        observation = env.reset()  # reset the environment for the next episode
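
# A minimal evaluation sketch, not part of the original gist: after the training loop ends,
# roll out one episode that always takes the more probable action under the learned weights.
observation = env.reset()
done = False
total = 0
while not done:
    logit = par1 * observation[0] + par2 * observation[1] + par3 * observation[2] + par4 * observation[3]
    action = 1 if logit > 0 else 0  # sigmoid(logit) > 0.5 exactly when logit > 0
    observation, reward, done, info = env.step(action)
    total += reward
print('Greedy evaluation episode reward: %f' % total)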