Training a neural-network agent for the OpenAI Gym CartPole environment with policy gradients (REINFORCE)
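A quick check of the key identity the script relies on: for a softmax policy, the gradient of log p(action) with respect to the logits is one_hot(action) - softmax(logits), which is what the script stores as `y - aprob`. The snippet below is a minimal, standalone sketch of that fact (NumPy only); the names `log_prob`, `z`, `a` and `delta` are just for illustration and do not appear in the script.

import numpy as np

def log_prob(z, a):
    # log of the softmax probability of action a given logits z
    return z[a] - np.log(np.sum(np.exp(z)))

z = np.array([0.2, -0.5])             # example logits for two actions
a = 0                                 # chosen action
p = np.exp(z) / np.sum(np.exp(z))     # softmax(z)
analytic = np.eye(len(z))[a] - p      # one_hot(a) - softmax(z)

delta = 1e-6
numeric = np.array([(log_prob(z + delta * np.eye(len(z))[j], a)
                     - log_prob(z - delta * np.eye(len(z))[j], a)) / (2 * delta)
                    for j in range(len(z))])
print(np.allclose(analytic, numeric))  # expected: True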
import numpy as np
import gym

def softmax(z):
    exponent = np.exp(z)
    return exponent / np.sum(exponent)

def policy_forward(s):
    h1 = np.dot(model['W1'], s)
    h1[h1 < 0] = 0  # ReLU non-linearity on the hidden layer
    z = np.dot(model['W2'], h1)
    return softmax(z), h1

def policy_backward(epdlogp, eph, eps):
    dw2 = np.dot(epdlogp.T, eph)
    dh1 = np.matmul(epdlogp, model['W2'])
    dh1[eph <= 0] = 0  # backprop through the ReLU: no gradient where the activation was zero
    dw1 = np.dot(dh1.T, eps)
    return {'W1': dw1, 'W2': dw2}

def one_hot(int_):
    probability = np.zeros(num_actions)
    probability[int_] = 1
    return probability

def calculate_return(r):
    # Discounted return G_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...
    discounted_r = np.zeros_like(r)
    running_add = 0
    for i in reversed(range(len(r))):
        running_add = running_add * gamma + r[i]
        discounted_r[i] = running_add
    return discounted_r

# Hyperparameters
gamma = 0.99           # discount factor: how far into the future we look
update_frequency = 20  # batch size: number of episodes between weight updates
lr = 1e-3              # learning rate
h_nodes = 10           # number of hidden units

episode_number = 0
reward_sum = 0
running_reward = None
ss, rs, hs, dlogps = [], [], [], []

# Create the OpenAI Gym CartPole environment
env = gym.make('CartPole-v0')
s = env.reset()
obs_space, num_actions = env.observation_space.shape[0], env.action_space.n
render = True

# Initialise the model
model = {'W1': np.random.randn(h_nodes, obs_space) / np.sqrt(obs_space),
         'W2': np.random.randn(num_actions, h_nodes) / np.sqrt(h_nodes)}
grad_buffer = {k: np.zeros_like(v) for k, v in model.items()}

while True:
    if render: env.render()

    # Forward through the network to get a probability distribution over actions
    aprob, hidden = policy_forward(s)
    # Sample an action from that distribution
    action = np.random.choice(range(num_actions), p=aprob)
    y = one_hot(action)
    # Gradient of the log policy with respect to the logits going into the softmax layer
    dlogps.append(y - aprob)

    # Take one step in the environment
    s1, r, d, info = env.step(action)
    reward_sum += r
    rs.append(r)
    hs.append(hidden)
    ss.append(s)
    s = s1

    if d:
        episode_number += 1
        # Stack all the information recorded over one episode
        epr = np.vstack(rs)
        epdlogp = np.vstack(dlogps)
        eph = np.vstack(hs)
        eps = np.vstack(ss)
        # Calculate the discounted return for each step in the episode
        G = calculate_return(epr)
        # Put more probability mass on the actions that fetch more reward
        epdlogp *= G
        # Backward through the network for the complete episode
        grad = policy_backward(epdlogp, eph, eps)
        # Accumulate the gradients over a complete episode
        for k in model: grad_buffer[k] += grad[k]

        if episode_number % update_frequency == 0:
            # Update the weights
            for k, v in model.items():
                model[k] += lr * grad_buffer[k]  # gradient ascent, because we want to maximise the reward
                grad_buffer[k] = np.zeros_like(v)

        # Track a running mean of the episode reward; it should increase as the agent gains experience
        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
        print('resetting env. episode reward total was {}. running mean: {}'.format(reward_sum, running_reward))
        ss, rs, hs, dlogps = [], [], [], []
        s = env.reset()
        reward_sum = 0
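Once the running mean climbs toward the 200-step cap of CartPole-v0, the learned weights can be sanity-checked with a greedy rollout. The sketch below is illustrative rather than part of the training script: it reuses `env`, `model` and `policy_forward` from above, assumes the training loop has been interrupted (it runs forever as written), and picks the most probable action instead of sampling.

# Greedy evaluation rollout (sketch): reuse the trained weights via policy_forward
# and take the argmax action instead of sampling. Assumes the training loop above
# has been stopped (e.g. with Ctrl+C) and env/model are still in scope.
s = env.reset()
done, total = False, 0
while not done:
    env.render()
    aprob, _ = policy_forward(s)
    s, r, done, _ = env.step(int(np.argmax(aprob)))
    total += r
print('greedy episode reward: {}'.format(total))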