CartPole Agent
A linear softmax policy trained with vanilla policy gradients (REINFORCE with a batch-average reward baseline) on OpenAI Gym's CartPole-v0.
import gym
import numpy as np
import random

# Use policy gradients to train a linear model.
# Achieves a good success rate after 500 trials.

def softmax(vec):
    divisor = np.sum(np.exp(vec))
    return np.exp(vec) / divisor

def trial(env, policy):
    obs = env.reset()
    gradient = np.zeros((2, 4))
    totalReward = 0
    while True:
        env.render()
        obsColumn = np.reshape(obs, (4, 1))
        policyOut = softmax(np.dot(policy, obsColumn).T[0])
        # Sample an action, and record the gradient of the sampled action's
        # log-probability with respect to the softmax inputs, which is
        # one_hot(action) - policyOut.
        action = 0
        softmaxUpstream = [policyOut[1], -policyOut[1]]
        if random.random() > policyOut[0]:
            softmaxUpstream = [-policyOut[0], policyOut[0]]
            action = 1
        # Accumulate the score function for the chosen action.
        gradient[0] += obs * softmaxUpstream[0]
        gradient[1] += obs * softmaxUpstream[1]
        obs, rew, done, info = env.step(action)
        totalReward += rew
        if done:
            return (gradient, totalReward)

def main():
    policy = np.zeros((2, 4))
    batchSize = 10
    stepSize = 0.01
    baseline = 10
    numTrials = 0
    env = gym.make('CartPole-v0')
    while True:
        stepSize *= 0.95  # decay the learning rate each batch
        totalGrad = np.zeros((2, 4))
        totalReward = 0
        for i in range(batchSize):
            gradient, reward = trial(env, policy)
            # REINFORCE update: weight each episode's log-probability
            # gradient by its reward relative to the baseline.
            totalGrad += gradient * (reward - baseline)
            totalReward += reward
            numTrials += 1
        baseline = totalReward / batchSize
        policy += totalGrad * (stepSize / batchSize)
        print('%d trials: reward=%f step=%f' % (numTrials, baseline, stepSize))

main()
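
For reference, the per-step quantity softmaxUpstream above is the gradient of the log-probability of the sampled action with respect to the softmax logits, i.e. one_hot(action) - policyOut. Below is a minimal sketch that checks this analytic gradient against central finite differences; it assumes only NumPy, and the log_prob helper and the max-subtracted softmax are illustrative additions, not part of the gist.

import numpy as np

def softmax(vec):
    # Subtracting the max is a standard stability trick; the gist's
    # softmax omits it, which is fine for small logits.
    e = np.exp(vec - np.max(vec))
    return e / np.sum(e)

def log_prob(policy, obs, action):
    # Hypothetical helper (not in the gist): log-probability of `action`
    # under the linear softmax policy.
    return np.log(softmax(np.dot(policy, obs))[action])

rng = np.random.default_rng(0)
policy = rng.normal(size=(2, 4))
obs = rng.normal(size=4)
action = 1

# Analytic gradient: outer(one_hot(action) - policyOut, obs),
# matching the gist's softmaxUpstream accumulation.
policyOut = softmax(np.dot(policy, obs))
analytic = np.outer(np.eye(2)[action] - policyOut, obs)

# Numerical gradient by central differences.
eps = 1e-6
numeric = np.zeros_like(policy)
for i in range(2):
    for j in range(4):
        plus, minus = policy.copy(), policy.copy()
        plus[i, j] += eps
        minus[i, j] -= eps
        numeric[i, j] = (log_prob(plus, obs, action) -
                         log_prob(minus, obs, action)) / (2 * eps)

print(np.max(np.abs(analytic - numeric)))  # tiny (~1e-9), so the two agree

Running the gist itself requires only gym with its 2017-era API, where env.reset() returns an observation and env.step() returns four values; dropping the env.render() call speeds up training considerably.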