# flyman3046/pg-MountainCar.py

## pg-MountainCar.py
# Original code from https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5
# Use it to solve MountainCar-v0

import numpy as np
import gym
import matplotlib.pyplot as plt

# hyperparameters
H = 10 # number of hidden layer neurons
batch_size = 1 # every how many episodes to do a param update?
learning_rate = 1e-2
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2

# model initialization
D = 2 # input dimensionality (length of one observation vector fed to policy_forward)
C = 3 # number of discrete actions (length of the probability vector the policy outputs)

# Two-layer policy network: hidden = ReLU(W1 . x), action logits = W2 . hidden
model = {}
model['W1'] = np.random.randn(H, D) / np.sqrt(D) # "Xavier" initialization, shape (H, D)
model['W2'] = np.random.randn(C, H) / np.sqrt(H) # shape (C, H)

# dict.items() instead of the Python-2-only iteritems(): behaves the same here and also runs on Python 3
grad_buffer = { k : np.zeros_like(v) for k,v in model.items() } # update buffers that add up gradients over a batch
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.items() } # rmsprop memory

def sigmoid(x):
	"""Return the softmax of ``x`` taken along its last axis.

	Despite the historical name this is a softmax, not a logistic sigmoid:
	every output lies in [0, 1] and the outputs along the last axis sum to 1.

	x: array-like of logits, shape (C,) for one sample or (..., C) for a batch.
	Returns an ndarray of probabilities with the same shape as ``x``.
	"""
	x = np.asarray(x)
	# Subtracting the maximum does not change the result but keeps exp() from overflowing.
	shifted = x - np.max(x, axis=-1, keepdims=True)
	exps = np.exp(shifted) # computed once and reused for numerator and denominator
	return exps / np.sum(exps, axis=-1, keepdims=True)

def discount_rewards(r, discount=None):
	"""Compute the discounted return-to-go for every step of an episode.

	out[t] = r[t] + discount * r[t+1] + discount**2 * r[t+2] + ...

	r: array-like of per-step rewards in time order, either 1-D or an
	   (N, 1) column; any other shape is walked in flattened (row-major) order.
	discount: per-step discount factor; when omitted, the module-level
	   ``gamma`` is used.
	Returns a floating-point ndarray with the same shape as ``r``.
	"""
	if discount is None:
		discount = gamma
	r = np.asarray(r)
	flat_r = r.reshape(-1)
	# Integer rewards must not force an integer result, or the discounted values get truncated.
	out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else np.float64
	flat_out = np.zeros(flat_r.shape, dtype=out_dtype)
	running_add = 0.0
	# Walk backwards so each step reuses the already-discounted sum of everything after it.
	for t in reversed(range(flat_r.size)):
		running_add = running_add * discount + flat_r[t]
		flat_out[t] = running_add
	return flat_out.reshape(r.shape)

def policy_forward(x):
	"""Run the policy network forward for a single observation.

	x: observation vector, shape (D,).
	Returns (p, h): p is the vector of action probabilities, shape (C,),
	obtained by passing the logits through sigmoid(); h is the hidden
	activation after the ReLU, shape (H,).
	"""
	pre_activation = np.dot(model['W1'], x) # shape (H,)
	hidden = np.where(pre_activation < 0, 0, pre_activation) # ReLU: negative entries become 0
	logits = np.dot(model['W2'], hidden) # shape (C,)
	return sigmoid(logits), hidden

def policy_backward(eph, epdlogp, ep_inputs=None, weights=None):
	"""Backpropagate one episode's logit gradients to both weight matrices.

	eph: hidden activations (after the ReLU) of every step, shape (Ns, H),
	   where Ns is the number of steps in the episode.
	epdlogp: gradient with respect to the output logits of every step, shape (Ns, C).
	ep_inputs: network inputs of every step, shape (Ns, D). When omitted, the
	   module-level ``epx`` is used, which keeps the original two-argument call working.
	weights: dict holding the output matrix under 'W2', shape (C, H). When
	   omitted, the module-level ``model`` is used.
	Returns {'W1': dW1, 'W2': dW2} with shapes (H, D) and (C, H).
	"""
	if ep_inputs is None:
		ep_inputs = epx # legacy path: episode inputs published as a module-level name
	if weights is None:
		weights = model
	dW2 = np.dot(epdlogp.T, eph) # shape (C, H)
	dh = np.dot(epdlogp, weights['W2']) # shape (Ns, H)
	dh[eph <= 0] = 0 # ReLU backward: no gradient flows through units that were clamped to 0
	dW1 = np.dot(dh.T, ep_inputs) # shape (H, D)

	return {'W1': dW1, 'W2': dW2}

def choose_action(prob):
	"""Sample an action index according to the given probabilities.

	prob: 1-D sequence with one probability per action.
	Returns the sampled index, an integer in [0, len(prob)).
	"""
	n_actions = len(prob)
	# An integer first argument makes np.random.choice draw from np.arange(n_actions).
	return np.random.choice(n_actions, p=prob)

env = gym.make("MountainCar-v0")

# Per-episode buffers, emptied after every episode:
# observations, hidden activations, logit gradients and rewards.
xs, hs, dlogps, drs = [], [], [], []
reward_sum = 0 # undiscounted reward collected in the current episode
reward_trend = [] # reward_sum of every finished episode, plotted at the end

# NOTE(review): assumes the classic gym API (reset() returns the observation,
# step() returns a 4-tuple) - confirm against the installed gym version.
for episode_number in range(2000):
	observation = env.reset()
	done = False

	# ---- roll out one episode, sampling actions from the current policy ----
	while not done:
		x = observation # shape (D,)

		# forward the policy network and sample an action from the returned distribution
		aprob, h = policy_forward(x)
		action = choose_action(aprob)

		# record various intermediates (needed later for backprop)
		xs.append(x) # observation
		hs.append(h) # hidden state

		# one-hot(action) - aprob: grad that encourages the action that was taken to be taken
		# (see http://cs231n.github.io/neural-networks-2/#losses if confused)
		y = np.zeros_like(aprob)
		y[action] = 1
		dlogps.append(y - aprob)

		# step the environment and get new measurements
		observation, reward, done, _ = env.step(action)
		reward_sum += reward
		drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

	# ---- the episode has finished: turn it into one gradient estimate ----
	if episode_number % 10 == 0:
		print("episode is done")
		print("reward_sum: {}".format(reward_sum))

	# stack together all inputs, hidden states, action gradients, and rewards for this episode
	epx = np.vstack(xs) # kept as a module-level name because policy_backward looks it up there
	eph = np.vstack(hs)
	epdlogp = np.vstack(dlogps)
	epr = np.vstack(drs)
	xs, hs, dlogps, drs = [], [], [], [] # reset array memory

	# compute the discounted reward backwards through time
	discounted_epr = discount_rewards(epr)
	# standardize the rewards to be unit normal (helps control the gradient estimator variance)
	discounted_epr -= np.mean(discounted_epr)
	reward_std = np.std(discounted_epr)
	if reward_std > 0: # a constant return vector would otherwise divide by zero and yield NaNs
		discounted_epr /= reward_std

	epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)

	grad = policy_backward(eph, epdlogp)
	for k in model:
		grad_buffer[k] += grad[k] # accumulate grad over batch

	# perform rmsprop parameter update every batch_size episodes;
	# "+ 1" because episode_number is zero-based: update after batch_size finished episodes
	if (episode_number + 1) % batch_size == 0:
		for k, v in model.items():
			g = grad_buffer[k] # gradient
			rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
			model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5) # "+=": ascent on expected reward
			grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

	reward_trend.append(reward_sum)
	reward_sum = 0
	# no env.reset() here: the next loop iteration resets the environment itself

plt.plot(reward_trend)
plt.ylim([-1000, 10])
plt.show()
	# Original code from https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5
	# Use it to solve MountainCar-v0

	import numpy as np
	import gym
	import matplotlib.pyplot as plt

	# hyperparameters
	H = 10 # number of hidden layer neurons
	batch_size = 1 # every how many episodes to do a param update?
	learning_rate = 1e-2
	gamma = 0.99 # discount factor for reward
	decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2

	# model initialization
	D = 2 # input dimensionality (length of one observation vector fed to policy_forward)
	C = 3 # number of discrete actions (length of the probability vector the policy outputs)

	# Two-layer policy network: hidden = ReLU(W1 . x), action logits = W2 . hidden
	model = {}
	model['W1'] = np.random.randn(H, D) / np.sqrt(D) # "Xavier" initialization, shape (H, D)
	model['W2'] = np.random.randn(C, H) / np.sqrt(H) # shape (C, H)

	# dict.items() instead of the Python-2-only iteritems(): behaves the same here and also runs on Python 3
	grad_buffer = { k : np.zeros_like(v) for k,v in model.items() } # update buffers that add up gradients over a batch
	rmsprop_cache = { k : np.zeros_like(v) for k,v in model.items() } # rmsprop memory

	def sigmoid(x):
		"""Return the softmax of ``x`` taken along its last axis.

		Despite the historical name this is a softmax, not a logistic sigmoid:
		every output lies in [0, 1] and the outputs along the last axis sum to 1.

		x: array-like of logits, shape (C,) for one sample or (..., C) for a batch.
		Returns an ndarray of probabilities with the same shape as ``x``.
		"""
		x = np.asarray(x)
		# Subtracting the maximum does not change the result but keeps exp() from overflowing.
		shifted = x - np.max(x, axis=-1, keepdims=True)
		exps = np.exp(shifted) # computed once and reused for numerator and denominator
		return exps / np.sum(exps, axis=-1, keepdims=True)

	def discount_rewards(r, discount=None):
		"""Compute the discounted return-to-go for every step of an episode.

		out[t] = r[t] + discount * r[t+1] + discount**2 * r[t+2] + ...

		r: array-like of per-step rewards in time order, either 1-D or an
		   (N, 1) column; any other shape is walked in flattened (row-major) order.
		discount: per-step discount factor; when omitted, the module-level
		   ``gamma`` is used.
		Returns a floating-point ndarray with the same shape as ``r``.
		"""
		if discount is None:
			discount = gamma
		r = np.asarray(r)
		flat_r = r.reshape(-1)
		# Integer rewards must not force an integer result, or the discounted values get truncated.
		out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else np.float64
		flat_out = np.zeros(flat_r.shape, dtype=out_dtype)
		running_add = 0.0
		# Walk backwards so each step reuses the already-discounted sum of everything after it.
		for t in reversed(range(flat_r.size)):
			running_add = running_add * discount + flat_r[t]
			flat_out[t] = running_add
		return flat_out.reshape(r.shape)

	def policy_forward(x):
		"""Run the policy network forward for a single observation.

		x: observation vector, shape (D,).
		Returns (p, h): p is the vector of action probabilities, shape (C,),
		obtained by passing the logits through sigmoid(); h is the hidden
		activation after the ReLU, shape (H,).
		"""
		h = np.dot(model['W1'], x) # shape (H,)
		h[h<0] = 0 # ReLU nonlinearity
		logp = np.dot(model['W2'], h) # shape (C,)
		p = sigmoid(logp) # shape (C,)
		return p, h

	def policy_backward(eph, epdlogp, ep_inputs=None, weights=None):
		"""Backpropagate one episode's logit gradients to both weight matrices.

		eph: hidden activations (after the ReLU) of every step, shape (Ns, H),
		   where Ns is the number of steps in the episode.
		epdlogp: gradient with respect to the output logits of every step, shape (Ns, C).
		ep_inputs: network inputs of every step, shape (Ns, D). When omitted, the
		   module-level ``epx`` is used, which keeps the original two-argument call working.
		weights: dict holding the output matrix under 'W2', shape (C, H). When
		   omitted, the module-level ``model`` is used.
		Returns {'W1': dW1, 'W2': dW2} with shapes (H, D) and (C, H).
		"""
		if ep_inputs is None:
			ep_inputs = epx # legacy path: episode inputs published as a module-level name
		if weights is None:
			weights = model
		dW2 = np.dot(epdlogp.T, eph) # shape (C, H)
		dh = np.dot(epdlogp, weights['W2']) # shape (Ns, H)
		dh[eph <= 0] = 0 # ReLU backward: no gradient flows through units that were clamped to 0
		dW1 = np.dot(dh.T, ep_inputs) # shape (H, D)

		return {'W1': dW1, 'W2': dW2}

	def choose_action(prob):
		"""Sample an action index according to the given probabilities.

		prob: 1-D sequence with one probability per action.
		Returns the sampled index, an integer in [0, len(prob)).
		"""
		action = np.random.choice(range(len(prob)), p=prob) # select action w.r.t the actions prob
		return action

	env = gym.make("MountainCar-v0")

	# Per-episode buffers, emptied after every episode:
	# observations, hidden activations, logit gradients and rewards.
	xs, hs, dlogps, drs = [], [], [], []
	reward_sum = 0 # undiscounted reward collected in the current episode
	reward_trend = [] # reward_sum of every finished episode, plotted at the end

	# NOTE(review): assumes the classic gym API (reset() returns the observation,
	# step() returns a 4-tuple) - confirm against the installed gym version.
	for episode_number in range(2000):
		observation = env.reset()
		done = False

		# ---- roll out one episode, sampling actions from the current policy ----
		while not done:
			x = observation # shape (D,)

			# forward the policy network and sample an action from the returned distribution
			aprob, h = policy_forward(x)
			action = choose_action(aprob)

			# record various intermediates (needed later for backprop)
			xs.append(x) # observation
			hs.append(h) # hidden state

			# one-hot(action) - aprob: grad that encourages the action that was taken to be taken
			# (see http://cs231n.github.io/neural-networks-2/#losses if confused)
			y = np.zeros_like(aprob)
			y[action] = 1
			dlogps.append(y - aprob)

			# step the environment and get new measurements
			observation, reward, done, _ = env.step(action)
			reward_sum += reward
			drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

		# ---- the episode has finished: turn it into one gradient estimate ----
		if episode_number % 10 == 0:
			print("episode is done")
			print("reward_sum: {}".format(reward_sum))

		# stack together all inputs, hidden states, action gradients, and rewards for this episode
		epx = np.vstack(xs) # kept under this name because policy_backward looks it up by name
		eph = np.vstack(hs)
		epdlogp = np.vstack(dlogps)
		epr = np.vstack(drs)
		xs, hs, dlogps, drs = [], [], [], [] # reset array memory

		# compute the discounted reward backwards through time
		discounted_epr = discount_rewards(epr)
		# standardize the rewards to be unit normal (helps control the gradient estimator variance)
		discounted_epr -= np.mean(discounted_epr)
		reward_std = np.std(discounted_epr)
		if reward_std > 0: # a constant return vector would otherwise divide by zero and yield NaNs
			discounted_epr /= reward_std

		epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)

		grad = policy_backward(eph, epdlogp)
		for k in model:
			grad_buffer[k] += grad[k] # accumulate grad over batch

		# perform rmsprop parameter update every batch_size episodes;
		# "+ 1" because episode_number is zero-based: update after batch_size finished episodes
		if (episode_number + 1) % batch_size == 0:
			for k, v in model.items():
				g = grad_buffer[k] # gradient
				rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
				model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5) # "+=": ascent on expected reward
				grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

		reward_trend.append(reward_sum)
		reward_sum = 0
		# no env.reset() here: the next loop iteration resets the environment itself

	plt.plot(reward_trend)
	plt.ylim([-1000, 10])
	plt.show()