Use the standard one-hot encoding for the taken action, with a softmax over the two CartPole actions instead of a single sigmoid output.
# Original code from https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5
# Use it to solve CartPole-v0
import numpy as np
import gym

# hyperparameters
H = 10 # number of hidden layer neurons
batch_size = 5 # every how many episodes to do a param update?
learning_rate = 1e-3
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2

# model initialization
D = 4 # input dimensionality (size of a CartPole observation)
C = 2 # number of action classes (push cart left or right)
model = {}
model['W1'] = np.random.randn(H, D) / np.sqrt(D) # "Xavier" initialization, shape (H, D)
model['W2'] = np.random.randn(C, H) / np.sqrt(C) # shape (C, H)
grad_buffer = { k : np.zeros_like(v) for k,v in model.iteritems() } # update buffers that add up gradients over a batch
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.iteritems() } # rmsprop memory
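
# In summary, the policy is a small two-layer network: h = relu(W1 . x), p = softmax(W2 . h),
# where p holds one probability per action (see policy_forward below).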

def softmax(x):
  x = x - max(x) # shift the logits for numerical stability
  return np.exp(x) / sum(np.exp(x)) # softmax: squashes the logits into probabilities that sum to 1
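
# For example, softmax(np.array([1.0, 2.0])) is roughly [0.269, 0.731]: both entries are
# positive and sum to 1, so they can be used directly as action probabilities.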

def discount_rewards(r):
  """ take 1D float array of rewards and compute discounted reward """
  discounted_r = np.zeros_like(r)
  running_add = 0
  for t in reversed(xrange(0, r.size)):
    running_add = running_add * gamma + r[t]
    discounted_r[t] = running_add
  return discounted_r
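
# For example, with gamma = 0.99 the reward sequence [1, 1, 1] becomes
# [1 + 0.99 * 1.99, 1 + 0.99 * 1, 1] = [2.9701, 1.99, 1.0]: each step is credited
# with the discounted rewards that arrive after it.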

def policy_forward(x):
  h = np.dot(model['W1'], x) # shape (H,)
  h[h<0] = 0 # ReLU nonlinearity
  logp = np.dot(model['W2'], h) # shape (C,)
  p = softmax(logp) # shape (C,)
  return p, h # return the probability of each action (entries of p sum to 1.0) and the hidden state

def policy_backward(eph, epdlogp):
  """ backward pass. (eph is array of intermediate hidden states) """
  # eph has shape (Ns, H), where Ns is the number of steps in this episode
  # epdlogp has shape (Ns, C)
  # epx has shape (Ns, D) and is read from the enclosing scope
  dW2 = np.dot(epdlogp.T, eph) # shape (C, H)
  dh = np.dot(epdlogp, model['W2']) # shape (Ns, H)
  dh[eph <= 0] = 0 # backprop through the ReLU nonlinearity
  dW1 = np.dot(dh.T, epx) # shape (H, D)
  return {'W1':dW1, 'W2':dW2}

env = gym.make("CartPole-v0")
xs,hs,dlogps,drs = [],[],[],[]
running_reward = None
reward_sum = 0
episode_number = 0
for episode_number in range(5000):
  observation = env.reset()
  while True:
    x = observation # shape (D,)

    # forward the policy network and sample an action from the returned probabilities
    aprob, h = policy_forward(x)
    action = 0 if np.random.uniform() < aprob[0] else 1 # sample action based on probability

    # record various intermediates (needed later for backprop)
    xs.append(x) # observation
    hs.append(h) # hidden state
    y = np.zeros_like(aprob)
    y[action] = 1 # one-hot encoding of the action that was taken
    dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)
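    # Because y is one-hot, (y - aprob) is the gradient of log p(action) under the softmax,
    # so scaling it by the discounted, standardized return below yields the policy gradient.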

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward
    drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

    if done or reward_sum >= 300: # an episode finished
      # stack together all inputs, hidden states, action gradients, and rewards for this episode
      epx = np.vstack(xs)
      eph = np.vstack(hs)
      epdlogp = np.vstack(dlogps)
      epr = np.vstack(drs)
      xs,hs,dlogps,drs = [],[],[],[] # reset array memory

      # compute the discounted reward backwards through time
      discounted_epr = discount_rewards(epr)
      # standardize the rewards to be unit normal (helps control the gradient estimator variance)
      discounted_epr -= np.mean(discounted_epr)
      discounted_epr /= np.std(discounted_epr)

      epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
      # print epdlogp
      grad = policy_backward(eph, epdlogp)
      for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

      # perform rmsprop parameter update every batch_size episodes
      if episode_number % batch_size == 0:
        for k,v in model.iteritems():
          g = grad_buffer[k] # gradient
          rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
          model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
          grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

      reward_sum = 0
      observation = env.reset() # reset env
      break

# test algorithm
test_number = 100
test_reward = 0
for i in range(test_number):
  iter = 0
  reward_sum = 0
  observation = env.reset() # Obtain an initial observation of the environment
  while True:
    # Run the policy network and get an action to take.
    aprob, _ = policy_forward(observation)
    action = 0 if np.random.uniform() < aprob[0] else 1

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward
    iter += 1
    if done or iter >= 300:
      test_reward += reward_sum
      iter = 0
      reward_sum = 0
      break

print "test average reward is {}".format(test_reward / test_number)