Forked from karpathy/pg-pong.py
Training a neural network agent with Policy Gradients, adapted from the ATARI Pong example (karpathy/pg-pong.py) to CartPole-v0
""" Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym. """
import numpy as np
import cPickle as pickle
import gym
# hyperparameters
H = 200 # number of hidden layer neurons
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-1
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
resume = False # resume from previous checkpoint?
render = False
upload = True
episode_reward_sum = 0.
episode_reward_average = 0.
np.random.seed(5)
# model initialization
D = 4 # input dimensionality: 4-element CartPole observation (was 80x80 = 6400 for Pong)
if resume:
  model = pickle.load(open('save.p', 'rb'))
else:
  model = {}
  model['W1'] = np.random.randn(H, D) / np.sqrt(D) # "Xavier" initialization
  model['W2'] = np.random.randn(H) / np.sqrt(H)
grad_buffer = {k: np.zeros_like(v) for k, v in model.iteritems()} # update buffers that add up gradients over a batch
rmsprop_cache = {k: np.zeros_like(v) for k, v in model.iteritems()} # rmsprop memory
def sigmoid(x):
  return 1.0 / (1.0 + np.exp(-x)) # sigmoid "squashing" function to interval [0,1]
# def prepro(I):
# """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
# I = I[35:195] # crop
# I = I[::2,::2,0] # downsample by factor of 2
# I[I == 144] = 0 # erase background (background type 1)
# I[I == 109] = 0 # erase background (background type 2)
# I[I != 0] = 1 # everything else (paddles, ball) just set to 1
# return I.astype(np.float).ravel()
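# (prepro above is the Pong pixel preprocessing from the original script; it is left
# commented out because CartPole observations are already a small float vector.)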
def discount_rewards(r):
  """ take 1D float array of rewards and compute discounted reward """
  discounted_r = np.zeros_like(r)
  running_add = 0
  for t in reversed(xrange(0, r.size)):
    if r[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
    running_add = running_add * gamma + r[t]
    discounted_r[t] = running_add
  return discounted_r
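# Illustration (not part of the original script): with gamma = 0.99, an episode whose
# recorded rewards are [0, 0, 100] gets discounted returns [98.01, 99.0, 100.0] -- the
# single terminal reward (reward_sum, appended when the episode ends) is propagated
# backwards, so earlier timesteps receive slightly smaller credit.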
def policy_forward(x):
  h = np.dot(model['W1'], x)
  h[h < 0] = 0 # ReLU nonlinearity
  logp = np.dot(model['W2'], h)
  p = sigmoid(logp)
  return p, h # return the sigmoid action probability and the hidden state
def policy_backward(eph, epdlogp):
  """ backward pass. (eph is array of intermediate hidden states) """
  dW2 = np.dot(eph.T, epdlogp).ravel()
  dh = np.outer(epdlogp, model['W2'])
  dh[eph <= 0] = 0 # backprop relu
  dW1 = np.dot(dh.T, epx) # epx (stacked episode observations) is set in the training loop below
  return {'W1': dW1, 'W2': dW2}
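# Shapes, for reference: eph is (T, H), epdlogp is (T, 1), epx is (T, D), so
# dW2 has shape (H,) and dW1 has shape (H, D), matching model['W2'] and model['W1'].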
env = gym.make("CartPole-v0")
outdir = '/tmp/pg-cartpole-results'
env.monitor.start(outdir, force=True)
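# env.monitor (above) and gym.upload (at the end of the script) are the OpenAI Gym API
# as it existed in 2016; later gym releases replaced the built-in monitor with wrappers
# and removed uploading, so running this unmodified likely requires a gym version from
# around that time.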
observation = env.reset()
prev_x = None # used in computing the difference frame
xs, hs, dlogps, drs = [], [], [], []
running_reward = None
reward_sum = 0
episode_number = 0
reward_current = 0
train_finish = False
while True:
  if render: env.render()
  # preprocess the observation, set input to network to be difference image
  # cur_x = prepro(observation)
  # cur_x = observation
  # x = cur_x - prev_x if prev_x is not None else np.zeros(D)
  # prev_x = cur_x
  x = observation
  # forward the policy network and pick an action from the returned probability
  aprob, h = policy_forward(x)
  # action = 0 if np.random.uniform() < aprob else 1 # roll the dice! (stochastic version)
  action = 0 if 0.5 < aprob else 1 # deterministic: threshold the probability at 0.5 instead of sampling
  # record various intermediates (needed later for backprop)
  xs.append(x) # observation
  hs.append(h) # hidden state
  y = 1 if action == 1 else 0 # a "fake label"
  # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)
  dlogps.append(y - aprob)
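  # For a sigmoid output p = sigmoid(logit), the derivative of log p(y|x) with respect
  # to the logit is (y - p), which is exactly the value stored here; it is later scaled
  # by the (discounted, standardized) return to form the policy-gradient signal.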
  # step the environment and get new measurements
  observation, reward, done, info = env.step(action)
  reward_sum += reward
  if reward_sum >= 200: # CartPole-v0 caps episodes at 200 steps
    print('reward_sum >= 200!')
    train_finish = True
  if done is not True:
    reward_current = 0.
    # drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)
    # record reward (has to be done after we call step() to get reward for previous action)
    drs.append(reward_current)
  else: # an episode finished
    reward_current = reward_sum * 1.
    drs.append(reward_current)
    episode_number += 1
    if episode_number > 1000:
      break
    episode_reward_sum += reward_sum
    if train_finish is not True:
      print('train')
      # stack together all inputs, hidden states, action gradients, and rewards for this episode
      epx = np.vstack(xs)
      eph = np.vstack(hs)
      epdlogp = np.vstack(dlogps)
      epr = np.vstack(drs)
      xs, hs, dlogps, drs = [], [], [], [] # reset array memory
      # compute the discounted reward backwards through time
      discounted_epr = discount_rewards(epr)
      # standardize the rewards to be unit normal (helps control the gradient estimator variance)
      discounted_epr -= np.mean(discounted_epr)
      discounted_epr /= np.std(discounted_epr)
      epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
      grad = policy_backward(eph, epdlogp)
      for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch
      # perform rmsprop parameter update every batch_size episodes
      if episode_number % batch_size == 0:
        for k, v in model.iteritems():
          g = grad_buffer[k] # gradient
          rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g ** 2
          model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
          grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer
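      # Note the `+=` in the update above: this is gradient *ascent* on expected reward --
      # the weights move in the direction that makes rewarded actions more probable.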
    episode_reward_average = episode_reward_sum / episode_number
    if episode_reward_average >= 195: # CartPole-v0's "solved" threshold is an average reward of 195.0
      print('episode_reward_average reached 195! episode_reward_average=%f' % episode_reward_average)
      break
    # boring book-keeping
    running_reward = reward_current if running_reward is None else running_reward * 0.99 + reward_current * 0.01
    print('resetting env. episode = %d, reward_sum = %f, reward_current = %f, episode_reward_average = %f, running mean = %f' % (
      episode_number, reward_sum, reward_current, episode_reward_average, running_reward))
    if episode_number % 100 == 0: pickle.dump(model, open('save.p', 'wb'))
    reward_sum = 0
    observation = env.reset() # reset env
    # prev_x = None
  # if reward != 0: # Pong has either +1 or -1 reward exactly when game ends.
  #   print ('ep %d: game finished, reward: %f' % (episode_number, reward)) + ('' if reward == -1 else ' !!!!!!!!')
env.monitor.close()
if upload:
  gym.upload(outdir, api_key=open('../api_key.txt', 'r').readline())