Last active
February 20, 2017 17:00
-
-
Save unixpickle/7989056b3a6799595de46cd30cd0e2ad to your computer and use it in GitHub Desktop.
CartPole Agent
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gym | |
import numpy as np | |
import random | |
# Use policy gradients to train a linear model.
# Achieves a good success rate after 500 trials.
def softmax(vec):
    """Return the softmax of ``vec``, normalized over all of its elements.

    The largest score is subtracted before exponentiating. This does not
    change the mathematical result, but it keeps ``np.exp`` from
    overflowing to ``inf`` (which would turn the quotient into ``nan``)
    when the scores are large.

    Args:
        vec: Array-like of real-valued scores.

    Returns:
        A numpy array with the same shape as ``vec`` whose entries are
        non-negative and sum to 1. An empty input yields an empty array.
    """
    scores = np.asarray(vec)
    if scores.size == 0:
        # np.max raises on empty input; there is nothing to normalize.
        return np.exp(scores)
    exps = np.exp(scores - np.max(scores))
    return exps / np.sum(exps)
def trial(env, policy):
    """Run one rendered episode, sampling actions from a linear softmax policy.

    At every step the 4-element observation is multiplied by ``policy``
    and passed through ``softmax`` to get the probabilities of the two
    actions. An action is sampled, and the derivative of the log
    probability of that action with respect to ``policy`` is added to a
    running total.

    Args:
        env: Environment providing ``reset()``, ``render()`` and
            ``step(action)``, where ``step`` returns a 4-tuple of
            ``(observation, reward, done, info)``.
        policy: Weight matrix of shape ``(2, 4)``; row ``i`` scores action ``i``.

    Returns:
        A ``(gradient, total_reward)`` tuple: the ``(2, 4)`` accumulated
        log-probability gradient and the sum of rewards over the episode.
    """
    observation = env.reset()
    score_sum = np.zeros((2, 4))
    episode_return = 0
    finished = False
    while not finished:
        env.render()
        logits = np.dot(policy, np.reshape(observation, (4, 1))).T[0]
        probs = softmax(logits)
        # Pick action 1 with probability 1 - probs[0]. `upstream` is the
        # gradient of log(probs[action]) with respect to the two logits.
        if random.random() > probs[0]:
            action = 1
            upstream = (-probs[0], probs[0])
        else:
            action = 0
            upstream = (probs[1], -probs[1])
        # Chain rule through the linear layer: each logit's weight row
        # receives the observation scaled by that logit's upstream value.
        for row, weight in enumerate(upstream):
            score_sum[row] += observation * weight
        observation, reward, finished, _ = env.step(action)
        episode_return += reward
    return (score_sum, episode_return)
def main():
    """Train the linear CartPole policy with batched policy gradients.

    Loops forever. Each iteration decays the step size by a factor of
    0.95, runs a batch of 10 episodes via ``trial``, weights each
    episode's gradient by its reward minus the baseline, and applies the
    averaged update to the policy. The baseline starts at 10 and is then
    replaced by the mean reward of the batch that just finished. One
    progress line is printed per batch.
    """
    weights = np.zeros((2, 4))
    episodes_per_batch = 10
    learning_rate = 0.01
    reward_baseline = 10
    episodes_run = 0
    environment = gym.make('CartPole-v0')
    while True:
        # Decay happens before the batch, so the first update already
        # uses 0.01 * 0.95.
        learning_rate = learning_rate * 0.95
        batch_grad = np.zeros((2, 4))
        batch_reward = 0
        for _ in range(episodes_per_batch):
            episode_grad, episode_reward = trial(environment, weights)
            advantage = episode_reward - reward_baseline
            batch_grad += episode_grad * advantage
            batch_reward += episode_reward
            episodes_run += 1
        # Only the next batch sees this baseline; the episodes above were
        # scored against the previous one.
        reward_baseline = batch_reward / episodes_per_batch
        weights += batch_grad * (learning_rate / episodes_per_batch)
        print('%d trials: reward=%f step=%f'
              % (episodes_run, reward_baseline, learning_rate))
# Start training only when executed as a script, so that importing this
# module does not kick off the never-ending training loop.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment