Training a neural-network agent for the OpenAI Gym CartPole environment with policy gradients (REINFORCE)
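A quick check of the key identity the script relies on: for a softmax policy, the gradient of log p(action) with respect to the logits is one_hot(action) - softmax(logits), which is what the script stores as `y - aprob`. The snippet below is a minimal, standalone sketch of that fact (NumPy only); the names `log_prob`, `z`, `a` and `delta` are just for illustration and do not appear in the script.

import numpy as np

def log_prob(z, a):
    # log of the softmax probability of action a given logits z
    return z[a] - np.log(np.sum(np.exp(z)))

z = np.array([0.2, -0.5])             # example logits for two actions
a = 0                                 # chosen action
p = np.exp(z) / np.sum(np.exp(z))     # softmax(z)
analytic = np.eye(len(z))[a] - p      # one_hot(a) - softmax(z)

delta = 1e-6
numeric = np.array([(log_prob(z + delta * np.eye(len(z))[j], a)
                     - log_prob(z - delta * np.eye(len(z))[j], a)) / (2 * delta)
                    for j in range(len(z))])
print(np.allclose(analytic, numeric))  # expected: True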
import numpy as np
import gym

def softmax(z):
    exponent = np.exp(z)
    return exponent / np.sum(exponent)

def policy_forward(s):
    h1 = np.dot(model['W1'], s)
    h1[h1 < 0] = 0  # ReLU non-linearity on the hidden layer
    z = np.dot(model['W2'], h1)
    return softmax(z), h1

def policy_backward(epdlogp, eph, eps):
    dw2 = np.dot(epdlogp.T, eph)
    dh1 = np.matmul(epdlogp, model['W2'])
    dh1[eph <= 0] = 0  # backprop through the ReLU: no gradient where the activation was zero
    dw1 = np.dot(dh1.T, eps)
    return {'W1': dw1, 'W2': dw2}

def one_hot(int_):
    probability = np.zeros(num_actions)
    probability[int_] = 1
    return probability

def calculate_return(r):
    # Discounted return G_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...
    discounted_r = np.zeros_like(r)
    running_add = 0
    for i in reversed(range(len(r))):
        running_add = running_add * gamma + r[i]
        discounted_r[i] = running_add
    return discounted_r

# Hyperparameters
gamma = 0.99           # discount factor: how far into the future we look
update_frequency = 20  # batch size: number of episodes between weight updates
lr = 1e-3              # learning rate
h_nodes = 10           # number of hidden units

episode_number = 0
reward_sum = 0
running_reward = None
ss, rs, hs, dlogps = [], [], [], []

# Create the OpenAI Gym CartPole environment
env = gym.make('CartPole-v0')
s = env.reset()
obs_space, num_actions = env.observation_space.shape[0], env.action_space.n
render = True

# Initialise the model
model = {'W1': np.random.randn(h_nodes, obs_space) / np.sqrt(obs_space),
         'W2': np.random.randn(num_actions, h_nodes) / np.sqrt(h_nodes)}
grad_buffer = {k: np.zeros_like(v) for k, v in model.items()}

while True:
    if render: env.render()

    # Forward through the network to get a probability distribution over actions
    aprob, hidden = policy_forward(s)
    # Sample an action from that distribution
    action = np.random.choice(range(num_actions), p=aprob)
    y = one_hot(action)
    # Gradient of the log policy with respect to the logits going into the softmax layer
    dlogps.append(y - aprob)

    # Take one step in the environment
    s1, r, d, info = env.step(action)
    reward_sum += r
    rs.append(r)
    hs.append(hidden)
    ss.append(s)
    s = s1

    if d:
        episode_number += 1
        # Stack all the information recorded over one episode
        epr = np.vstack(rs)
        epdlogp = np.vstack(dlogps)
        eph = np.vstack(hs)
        eps = np.vstack(ss)
        # Calculate the discounted return for each step in the episode
        G = calculate_return(epr)
        # Put more probability mass on the actions that fetch more reward
        epdlogp *= G
        # Backward through the network for the complete episode
        grad = policy_backward(epdlogp, eph, eps)
        # Accumulate the gradients over a complete episode
        for k in model: grad_buffer[k] += grad[k]

        if episode_number % update_frequency == 0:
            # Update the weights
            for k, v in model.items():
                model[k] += lr * grad_buffer[k]  # gradient ascent, because we want to maximise the reward
                grad_buffer[k] = np.zeros_like(v)

        # Track a running mean of the episode reward; it should increase as the agent gains experience
        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
        print('resetting env. episode reward total was {}. running mean: {}'.format(reward_sum, running_reward))
        ss, rs, hs, dlogps = [], [], [], []
        s = env.reset()
        reward_sum = 0
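Once the running mean climbs toward the 200-step cap of CartPole-v0, the learned weights can be sanity-checked with a greedy rollout. The sketch below is illustrative rather than part of the training script: it reuses `env`, `model` and `policy_forward` from above, assumes the training loop has been interrupted (it runs forever as written), and picks the most probable action instead of sampling.

# Greedy evaluation rollout (sketch): reuse the trained weights via policy_forward
# and take the argmax action instead of sampling. Assumes the training loop above
# has been stopped (e.g. with Ctrl+C) and env/model are still in scope.
s = env.reset()
done, total = False, 0
while not done:
    env.render()
    aprob, _ = policy_forward(s)
    s, r, done, _ = env.step(int(np.argmax(aprob)))
    total += r
print('greedy episode reward: {}'.format(total))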