@brijml
Last active March 16, 2019 16:07
Training a neural network agent for the OpenAI Gym CartPole environment with Policy Gradients
import numpy as np
import gym
def softmax(z):
    # Subtract the max for numerical stability; the result is mathematically unchanged
    exponent = np.exp(z - np.max(z))
    return exponent/np.sum(exponent)
def policy_forward(s):
    h1 = np.dot(model['W1'], s)
    h1[h1 < 0] = 0  # ReLU nonlinearity on the hidden layer
    z = np.dot(model['W2'], h1)  # logits over the actions
    return softmax(z), h1
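# Quick sanity check (illustrative only, hypothetical numbers): for a softmax
# policy, the gradient of log pi(a|s) with respect to the logits z is
# one_hot(a) - softmax(z), which is exactly the quantity stored in dlogps in
# the training loop below. Verified here with a central finite difference.
_z, _a, _eps = np.array([0.5, -0.2]), 0, 1e-5
_analytic = np.array([1.0, 0.0]) - softmax(_z)
_numeric = np.array([(np.log(softmax(_z + _eps*np.eye(2)[i])[_a]) -
                      np.log(softmax(_z - _eps*np.eye(2)[i])[_a]))/(2*_eps)
                     for i in range(2)])
assert np.allclose(_analytic, _numeric, atol=1e-6)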
def policy_backward(epdlogp, eph, eps):
    dw2 = np.dot(epdlogp.T, eph)
    dh1 = np.matmul(epdlogp, model['W2'])
    dh1[eph <= 0] = 0  # backprop through the ReLU: no gradient where the hidden unit was inactive
    dw1 = np.dot(dh1.T, eps)
    return {'W1': dw1, 'W2': dw2}
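# Shapes, for reference (T = number of time steps in one episode):
# epdlogp is (T, num_actions), eph is (T, h_nodes), eps is (T, obs_space),
# so dw2 is (num_actions, h_nodes) and dw1 is (h_nodes, obs_space),
# matching model['W2'] and model['W1'] respectively.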
def one_hot(int_):
    probability = np.zeros(num_actions)
    probability[int_] = 1
    return probability
def calculate_return(r):
    discounted_r = np.zeros_like(r)
    running_add = 0
    for i in reversed(range(len(r))):
        running_add = running_add*gamma + r[i]  # G_t = r_t + gamma * G_{t+1}
        discounted_r[i] = running_add
    return discounted_r
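# Worked example (illustrative): with gamma = 0.99 and rewards [1, 1, 1], the
# discounted returns are [1 + 0.99*(1 + 0.99*1), 1 + 0.99*1, 1] = [2.9701, 1.99, 1.0].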
# Hyperparameters
gamma = 0.99           # discount factor: how far into the future we look
update_frequency = 20  # batch size: number of episodes between weight updates
lr = 1e-3              # learning rate
h_nodes = 10           # number of hidden units

episode_number = 0
reward_sum = 0
running_reward = None
ss, rs, hs, dlogps = [], [], [], []  # per-episode buffers for states, rewards, hidden activations and log-policy gradients
# Create the OpenAI Gym CartPole environment
env = gym.make('CartPole-v0')
s = env.reset()
obs_space,num_actions = env.observation_space.shape[0], env.action_space.n
render = True
# Initialise the model; random weights scaled by 1/sqrt(fan-in)
model = {'W1': np.random.randn(h_nodes, obs_space)/np.sqrt(obs_space),
         'W2': np.random.randn(num_actions, h_nodes)/np.sqrt(h_nodes)}
grad_buffer = {k: np.zeros_like(v) for k, v in model.items()}  # accumulates gradients over a batch of episodes
while True:
    if render: env.render()
    # Forward pass through the network to get a probability distribution over actions
    aprob, hidden = policy_forward(s)
    # Sample an action from that distribution
    action = np.random.choice(range(num_actions), p=aprob)
    y = one_hot(action)
    # Gradient of the log policy with respect to the logits fed into the softmax layer
    dlogps.append(y - aprob)
    # Take one step in the environment
    s1, r, d, info = env.step(action)
    reward_sum += r
    rs.append(r)
    hs.append(hidden)
    ss.append(s)
    s = s1
    if d:
        episode_number += 1
        # Stack everything recorded over this episode
        epr = np.vstack(rs)
        epdlogp = np.vstack(dlogps)
        eph = np.vstack(hs)
        eps = np.vstack(ss)
        # Calculate the discounted return for each step in the episode
        G = calculate_return(epr)
        # Weight the log-policy gradients by the return: put more probability mass
        # on the actions that fetched more reward
        epdlogp *= G
        # Backward pass through the network for the complete episode
        grad = policy_backward(epdlogp, eph, eps)
        # Accumulate the gradients over the episode into the batch buffer
        for k in model: grad_buffer[k] += grad[k]
        if episode_number % update_frequency == 0:
            # Update the weights with stochastic gradient ascent (we want to maximise the reward)
            for k, v in model.items():
                model[k] += lr*grad_buffer[k]
                grad_buffer[k] = np.zeros_like(v)
        # Display the running mean reward; this should increase as the agent gets more experienced
        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
        print('resetting env. episode reward total was {}. running mean: {}'.format(reward_sum, running_reward))
        ss, rs, hs, dlogps = [], [], [], []
        s = env.reset()
        reward_sum = 0
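Note: the script targets the classic gym API, where env.reset() returns just the observation and env.step() returns four values (observation, reward, done, info). Newer gym/gymnasium releases changed these signatures; a minimal adaptation (a sketch, assuming the gymnasium-style API) would be:

    s, info = env.reset()
    s1, r, terminated, truncated, info = env.step(action)
    d = terminated or truncated

with render_mode='human' passed to gym.make() if rendering is wanted.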