'''
Policy gradient algorithm: here, instead of choosing the action as a deterministic function of the sign of
the weighted sum, make it so that the action is chosen randomly, but where the distribution over actions
(of which there are two) depends on the numerical output of the inner product.
Policy gradient prescribes a principled parameter update rule [1, 2].
Your goal is to implement this algorithm for the simple linear model, and see how long it takes to converge.
'''
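
# With a logistic policy pi(action=1 | x) = sigmoid(w . x), the score function
# d log pi(a | x) / d(w . x) equals y - pi, where y = 1 if action 1 was taken
# and 0 otherwise, so the per-weight gradient is (y - pi) * x_j. REINFORCE then
# weights this score by the (discounted, standardized) return of the episode.
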
import gym
import random
import numpy as np

# classic gym API (pre-0.26): env.reset() returns the observation and
# env.step() returns (observation, reward, done, info)
env = gym.make('CartPole-v0')

# model parameters: one weight per observation dimension, drawn from N(0, 0.1)
par1 = random.normalvariate(0, 0.1)
par2 = random.normalvariate(0, 0.1)
par3 = random.normalvariate(0, 0.1)
par4 = random.normalvariate(0, 0.1)

# training parameters
rmsprop_cache = [0.0, 0.0, 0.0, 0.0]  # per-parameter running average of squared gradients
batch_size = 10  # every how many episodes to do a param update?
learning_rate = 1e-4
decay_rate = 0.99  # decay factor for the RMSProp cache
gamma = 0.99  # discount factor for reward
render = False

def sigmoid(x):
    """ squash the raw inner product into a probability in (0, 1) """
    return 1.0 / (1.0 + np.exp(-x))
def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r, dtype=np.float64)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r
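
# For example, with gamma = 0.99 the reward sequence [1, 1, 1] becomes
# [2.9701, 1.99, 1.0]: each entry is that step's reward plus gamma times the
# discounted total of everything after it.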

observation_list = []
reward_list = []
gradient_list = []
episode_number = 0
reward_sum = 0
total_episodes = 100000
running_reward = None
observation = env.reset()

while episode_number <= total_episodes:
    if render:
        env.render()
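
    # Each iteration of this loop is one environment timestep; episode
    # boundaries are handled below via the `done` flag returned by env.step().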
    observation_list.append([observation[0], observation[1], observation[2], observation[3]])

    # simple linear model: squash the inner product into the probability of action 1
    probability = sigmoid(par1 * observation[0] + par2 * observation[1]
                          + par3 * observation[2] + par4 * observation[3])

    # sample the action from the Bernoulli distribution the policy defines
    if np.random.uniform() < probability:
        action = 1
    else:
        action = 0

    # score function: d log pi(action) / d logit = y - p (see the note at the top)
    input_y = 1 if action == 1 else 0
    gradient = input_y - probability
    gradient_list.append(gradient)

    # do action and get the reward
    observation, reward, done, info = env.step(action)
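
    # In CartPole the reward is +1 for every timestep the pole stays upright,
    # so an episode's return equals its length (capped at 200 in CartPole-v0).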

    # accumulate the reward
    reward_sum += reward
    reward_list.append(reward)

    if done:
        episode_number += 1

        # stack together the inputs, gradients, and rewards for this episode
        reward_stack = np.vstack(reward_list)
        gradient_stack = np.vstack(gradient_list)
        observation_stack = np.vstack(observation_list)
        reward_list, gradient_list, observation_list = [], [], []

        # compute the discounted reward backwards through time, then
        # standardize it to zero mean and unit variance
        discounted_reward = discount_rewards(reward_stack)
        discounted_reward -= np.mean(discounted_reward)
        discounted_reward /= np.std(discounted_reward)
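
        # Subtracting the mean return acts as a baseline: actions taken during
        # better-than-average stretches are reinforced and the rest discouraged.
        # Dividing by the standard deviation keeps the update magnitude roughly
        # constant across episodes of different lengths.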

        # modulate each step's gradient by its standardized return
        gradient_stack *= discounted_reward

        if episode_number % batch_size == 0:
            # update the parameters using the most recent episode's trajectory
            for i in range(0, len(gradient_stack)):
                # per-parameter gradient for this timestep
                g1 = gradient_stack[i, 0] * observation_stack[i, 0]
                g2 = gradient_stack[i, 0] * observation_stack[i, 1]
                g3 = gradient_stack[i, 0] * observation_stack[i, 2]
                g4 = gradient_stack[i, 0] * observation_stack[i, 3]
                # RMSProp: leaky running average of the squared gradient, one cache per parameter
                rmsprop_cache[0] = decay_rate * rmsprop_cache[0] + (1 - decay_rate) * g1 ** 2
                rmsprop_cache[1] = decay_rate * rmsprop_cache[1] + (1 - decay_rate) * g2 ** 2
                rmsprop_cache[2] = decay_rate * rmsprop_cache[2] + (1 - decay_rate) * g3 ** 2
                rmsprop_cache[3] = decay_rate * rmsprop_cache[3] + (1 - decay_rate) * g4 ** 2
                # gradient ascent step, scaled per parameter
                par1 += learning_rate * g1 / (np.sqrt(rmsprop_cache[0]) + 1e-5)
                par2 += learning_rate * g2 / (np.sqrt(rmsprop_cache[1]) + 1e-5)
                par3 += learning_rate * g3 / (np.sqrt(rmsprop_cache[2]) + 1e-5)
                par4 += learning_rate * g4 / (np.sqrt(rmsprop_cache[3]) + 1e-5)

            # Give a summary of how well the policy is doing for each batch of episodes.
            running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
            print('Average reward for batch %f. Running average reward %f.' % (
                reward_sum / batch_size, running_reward / batch_size))
            if reward_sum / batch_size >= 200:
                print('Task solved in %d episodes!' % episode_number)
                break
            reward_sum = 0

        observation = env.reset()  # reset env