An improved PG algorithm for Atari Pong: Karpathy's policy-gradient Pong agent extended with a value-network baseline (advantage = return - value), a ball-trajectory feature drawn onto the difference frame, and random dropping of 30% of each episode's timesteps to decorrelate consecutive states.
""" Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym. """ | |
import numpy as np | |
import cPickle as pickle | |
import gym | |
import copy | |
# hyperparameters
H = 200 # number of hidden layer neurons
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
resume = False # resume from previous checkpoint?
render = False

# model initialization
D = 80 * 80 # input dimensionality: 80x80 grid
DD = (80,80) # 2-D shape of the preprocessed frame
if resume:
  model = pickle.load(open('save.p', 'rb'))
else:
  model = {}
  model['W1'] = np.random.randn(H,D) / np.sqrt(D) # "Xavier" initialization
  model['W2'] = np.random.randn(H) / np.sqrt(H)
  model['theta1'] = np.random.randn(H,D) / np.sqrt(D) # value-network weights, same "Xavier" initialization
  model['theta2'] = np.random.randn(H) / np.sqrt(H)

grad_buffer = { k : np.zeros_like(v) for k,v in model.iteritems() } # update buffers that add up gradients over a batch
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.iteritems() } # rmsprop memory

def sigmoid(x):
  return 1.0 / (1.0 + np.exp(-x)) # sigmoid "squashing" function to interval [0,1]

def prepro(I):
  """ prepro 210x160x3 uint8 frame into an 80x80 float matrix (flattened later, after traj() draws on it) """
  I = I[35:195] # crop
  I = I[::2,::2,0] # downsample by factor of 2
  I[I == 144] = 0 # erase background (background type 1)
  I[I == 109] = 0 # erase background (background type 2)
  I[I != 0] = 1 # everything else (paddles, ball) just set to 1
  return I.astype(np.float)

def dealwith(iy):
  """ reflect a row index off the top/bottom walls so it stays inside [0, 79] """
  iy = abs(iy)            # bounce off the top wall (row 0)
  if iy >= 79:
    iy = 79 - (iy - 79)   # bounce off the bottom wall (row 79)
  if iy < 0:              # a bottom bounce can overshoot past the top; reflect again
    iy = dealwith(iy)
  return iy
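
# Worked example of the reflection above: dealwith(-5) -> 5 (mirror at the top wall),
# dealwith(92) -> 79 - (92 - 79) = 66 (mirror at the bottom wall), and
# dealwith(170) -> 79 - 91 = -12 -> recurse -> 12 (bounces off both walls).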

# plot the trajectory of the ball onto the difference frame
def traj(mat):
  mattmp = copy.deepcopy(mat)
  # zero out the paddle columns so only the ball pixels remain
  mattmp[:, 0:10] = 0
  mattmp[:, 70:80] = 0
  # locate the two ball pixels in the difference frame: the +1 pixel (ball in the
  # current frame, found via argmax) and the -1 pixel (ball in the previous frame,
  # found via argmin)
  [negY, negX] = [np.argmax(np.argmax(mattmp, axis=1)), np.max(np.argmax(mattmp, axis=1))]
  [posY, posX] = [np.argmax(np.argmin(mattmp, axis=1)), np.max(np.argmin(mattmp, axis=1))]
  if [posX, posY] != [0, 0] and [negX, negY] != [0, 0]: # both ball pixels were found
    if posX > negX:
      ascent = float(posY - negY) / float(posX - negX) # slope of the line through the two ball positions
      for i in xrange(negX + 1, 70): # fill in the line to the right of the +1 ball pixel, up to the right paddle columns
        if i == posX:
          continue
        iy = int(ascent * (i - posX) + posY)
        iy = dealwith(iy) # reflect the row index off the top/bottom walls
        mat[iy, i] = 0.8 # draw the trajectory
    elif posX < negX:
      ascent = float(posY - negY) / float(posX - negX)
      for i in xrange(10, negX): # fill in the line to the left of the +1 ball pixel, down to the left paddle columns
        if i == posX:
          continue
        iy = int(ascent * (i - posX) + posY)
        iy = dealwith(iy)
        mat[iy, i] = 0.8
  return np.ravel(mat) # flatten to the 6400-dim input vector
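
# The trajectory feature above is a simple linear extrapolation. With slope
#   ascent = (posY - negY) / (posX - negX)
# through the two ball pixels, every column i on the chosen side of the ball is
# assigned the row
#   iy = ascent * (i - posX) + posY,
# and dealwith() reflects iy back into [0, 79] to imitate bounces off the top and
# bottom walls. Those cells are marked with 0.8, a value distinct from the +1/-1
# difference pixels, presumably so the network can see the line the ball travels along.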

def discount_rewards(r):
  """ take 1D float array of rewards and compute discounted reward """
  discounted_r = np.zeros_like(r)
  running_add = 0
  for t in reversed(xrange(0, r.size)):
    if r[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (Pong specific: an episode contains at least 21 individual games)
    running_add = running_add * gamma + r[t]
    discounted_r[t] = running_add
  return discounted_r
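
# discount_rewards() computes, for every timestep t, the discounted return
#   G_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ...
# The running sum is reset at every nonzero reward because a point ends the rally in
# Pong, so returns never propagate across rally boundaries.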

def policy_forward(x):
  h = np.dot(model['W1'], x)
  h[h<0] = 0 # ReLU nonlinearity
  logp = np.dot(model['W2'], h)
  p = sigmoid(logp)
  return p, h # return probability of taking action 2, and hidden state
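
# The policy is a 2-layer fully connected network on the 6400-dim input:
#   p(action = 2 | x) = sigmoid( W2 . relu(W1 x) ),
# with W1 of shape (H, D) = (200, 6400) and W2 a length-H vector.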

def value_forward(x):
  h = np.dot(model['theta1'], x)
  h[h<0] = 0 # ReLU nonlinearity
  logv = np.dot(model['theta2'], h)
  v = sigmoid(logv)
  return v, h # return the estimated state value (squashed into (0,1)), and hidden state
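
# The value network mirrors the policy network (theta1/theta2 instead of W1/W2) and is
# used below as a learned baseline: its output v(x) is subtracted from the discounted
# return to form the advantage, and it is regressed toward the standardized return.
# Note that the sigmoid squashes v(x) into (0,1) even though the standardized targets
# can be negative.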

def policy_backward(eph, epdlogp):
  """ backward pass. (eph is array of intermediate hidden states) """
  dW2 = np.dot(eph.T, epdlogp).ravel()
  dh = np.outer(epdlogp, model['W2'])
  dh[eph <= 0] = 0 # backprop through the ReLU
  dW1 = np.dot(dh.T, epx)
  return {'W1':dW1, 'W2':dW2}
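
# Manual backprop through p = sigmoid(W2 . relu(W1 x)), with epdlogp holding the
# per-timestep gradient at the logit:
#   dW2 = sum_t h_t * dlogp_t            (eph.T dot epdlogp)
#   dh_t = dlogp_t * W2, gated by ReLU   (zeroed where h_t <= 0)
#   dW1 = sum_t dh_t outer x_t           (dh.T dot epx)
# Note that epx is read from the enclosing scope rather than passed in.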

def value_backward(eph, epdp):
  """ backward pass for the value network (same structure as policy_backward). """
  dtheta2 = np.dot(eph.T, epdp).ravel()
  dh = np.outer(epdp, model['theta2'])
  dh[eph <= 0] = 0 # backprop through the ReLU
  dtheta1 = np.dot(dh.T, epx)
  return {'theta1':dtheta1, 'theta2':dtheta2}

env = gym.make("Pong-v0")
observation = env.reset()
prev_x = None # used in computing the difference frame
xs,hs,hvs,dlogps,dvs,drs = [],[],[],[],[],[] # per-episode buffers: inputs, policy hidden states, value hidden states, dlogp grads, value predictions, rewards
running_reward = None
reward_sum = 0
episode_number = 0

while True:
  if render: env.render()

  # preprocess the observation, set input to network to be difference image
  cur_x = prepro(observation)
  subX = cur_x - prev_x if prev_x is not None else np.zeros(DD)
  prev_x = cur_x

  # draw the ball trajectory onto the difference frame, flatten it, then forward the
  # policy network and sample an action from the returned probability
  x = traj(subX)
  aprob, h = policy_forward(x)
  vs, vh = value_forward(x) # value estimate for the current state
  action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

  # record various intermediates (needed later for backprop)
  xs.append(x) # observation
  hs.append(h) # hidden state in policy
  hvs.append(vh) # hidden state in value
  y = 1 if action == 2 else 0 # a "fake label"
  dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)
  dvs.append(vs) # value prediction

  # step the environment and get new measurements
  observation, reward, done, info = env.step(action)
  reward_sum += reward
  drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

  if done: # an episode finished
    episode_number += 1

    # stack together all inputs, hidden states, action gradients, and rewards for this episode
    epx = np.vstack(xs)
    eph = np.vstack(hs)
    epvh = np.vstack(hvs)
    epdlogp = np.vstack(dlogps)
    epdv = np.vstack(dvs)
    epr = np.vstack(drs)
    xs,hs,hvs,dlogps,dvs,drs = [],[],[],[],[],[] # reset array memory

    # compute the discounted reward backwards through time
    discounted_epr = discount_rewards(epr)
    # standardize the rewards to be unit normal (helps control the gradient estimator variance)
    discounted_epr -= np.mean(discounted_epr)
    discounted_epr /= np.std(discounted_epr)
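
    # The standardized return is used twice below: as the regression target for the
    # value network and, with the value estimate subtracted, as the advantage that
    # weights the policy gradient.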

    # randomly drop a fraction of the timesteps (give up some states) to break the
    # correlation between consecutive steps
    dis = 0.3 # fraction of timesteps to drop
    deletecount = int(dis * epx.shape[0])
    deletea = np.arange(epx.shape[0]) # candidate timestep indices
    deletearraynum = np.random.choice(deletea, deletecount, replace=False)
    # np.delete returns a new array, so the results must be assigned back
    epx = np.delete(epx, deletearraynum, 0)
    eph = np.delete(eph, deletearraynum, 0)
    epvh = np.delete(epvh, deletearraynum, 0)
    epdlogp = np.delete(epdlogp, deletearraynum, 0)
    epdv = np.delete(epdv, deletearraynum, 0)
    epr = np.delete(epr, deletearraynum, 0)
    discounted_epr = np.delete(discounted_epr, deletearraynum, 0)
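
    # Policy-gradient-with-baseline math for the two lines below:
    #   advantage  A_t = G_t - v_t       (standardized return minus the value estimate)
    #   epdlogp_t  = (y_t - p_t) * A_t   i.e. d log pi(a_t|x_t)/d logit, scaled by A_t
    #   epdp_t     = (G_t - v_t) * v_t * (1 - v_t)
    # epdp_t is the gradient at the value logit of -0.5*(G_t - v_t)^2 (the v*(1-v)
    # factor is the sigmoid derivative), so the ascent update below pushes v_t toward G_t.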
    epdp = (discounted_epr - epdv) * epdv * (1 - epdv) # value-network gradient (see above)
    epdlogp = epdlogp * (discounted_epr - epdv) # modulate the gradient with the advantage (PG magic happens right here.)
    grad_policy = policy_backward(eph, epdlogp)
    grad_value = value_backward(epvh, epdp)
    grad = dict(grad_policy, **grad_value) # merge the two gradient dicts
    for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

    # perform rmsprop parameter update every batch_size episodes
    if episode_number % batch_size == 0:
      for k,v in model.iteritems():
        g = grad_buffer[k] # gradient
        rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
        model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
        grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer
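
    # The update above is RMSProp applied as gradient *ascent* (note the +=): the cache
    # keeps a leaky average of squared gradients and each parameter moves by
    #   lr * g / (sqrt(cache) + 1e-5)
    # in the direction that increases expected reward (and the value-fit objective).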

    # boring book-keeping
    running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
    print 'resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward)
    if episode_number % 100 == 0: pickle.dump(model, open('save.p', 'wb'))
    reward_sum = 0
    observation = env.reset() # reset env
    prev_x = None
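
# Note: the script is written for Python 2 (cPickle, dict.iteritems, xrange, print
# statements) and the 2017-era gym API, in which env.step() returns a 4-tuple and the
# environment id "Pong-v0" is available; running it on newer stacks would require the
# usual Python 3 / gym porting changes.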