# Sometimes it works, sometimes it doesn't
import numpy as np
import gym


class Net(object):
    # Single linear layer mapping the 4-dimensional observation to one value per action
    def __init__(self, input, output):
        # W = 0.01 * np.random.randn(input, output)
        # b = np.zeros([1, output])  # Random initialization
        W = np.array([[-0.60944746, 0.45539405],
                      [0.53731965, -0.15138026],
                      [-2.38568372, 1.7827912],
                      [-5.02650308, 5.4449609]])
        b = np.array([[0.39728291, 2.86320634]])  # Optimized parameters
        self.params = [W, b]
        self.grad = [np.zeros_like(W), np.zeros_like(b)]
        self.lr = 0.005
        self.gamma = 0.95

    def forward(self, x):
        W, b = self.params
        out = x.dot(W) + b
        return out

    def update_params(self, x, dout):
        # SGD + Momentum
        W, b = self.params
        db = dout.sum(axis=0)
        dW = x.T.dot(dout)
        grad = [dW, db]
        for g, gc in zip(grad, self.grad):
            gc *= self.gamma
            gc += self.lr * g
        for w, d in zip(self.params, self.grad):
            w -= d
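

# Note (added, not in the original gist): update_params implements SGD with
# momentum by hand. self.grad stores a velocity per parameter; each call decays
# it by gamma (0.95), adds lr * gradient, and subtracts it from the weights.
# In the __main__ block below the call to update_params is commented out, so
# the hard-coded "optimized" W and b above are used unchanged.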


if __name__ == '__main__':
    mem = [[], [], [], [], []]  # State, Action, Result, Reward, Terminal
    times = []
    q = []
    actor = Net(4, 2)
    randp = 0.  # Probability of taking a random (exploratory) action
    env = gym.make('CartPole-v0')
    for i_episode in range(100):
        ob = env.reset()
        for t in range(200):  # Change to 500 for v1
            # Observing
            if np.random.rand() < randp:
                action = env.action_space.sample()
            else:
                # Whiten the observation with hard-coded statistics
                # (presumably the observation mean and scale)
                ob -= np.array([0.00076127, 0.01893811, 0.00292497, -0.01291666])
                ob /= np.array([0.09564344, 0.57818071, 0.10437309, 0.87035585])
                action = np.argmax(actor.forward(ob[None, :]), axis=1)[0]
            mem[0].append(ob[None, :])
            ac = np.zeros([1, 2])
            ac[:, action] = 1  # One-hot encoding of the chosen action
            mem[1].append(ac)
            ob, reward, done, info = env.step(action)
            ob -= np.array([0.00076127, 0.01893811, 0.00292497, -0.01291666])
            ob /= np.array([0.09564344, 0.57818071, 0.10437309, 0.87035585])
            mem[2].append(ob[None, :])
            mem[3].append(reward)
            mem[4].append(done)
            # Training
            if len(mem[0]) == 200:
                # Treat the last 200 transitions as a sliding replay window
                s, a, s_, r, term = [np.vstack(m) for m in mem]
                target = np.copy(r)
                Q_target = np.max(actor.forward(s_), axis=1)[:, None]
                target[term == False] += 0.9 * Q_target[term == False]  # Bootstrap non-terminal targets
                dout = (actor.forward(s) - target * a) / 200
                # actor.update_params(s, dout)
                randp *= 0.99
                for m in mem:
                    m.pop(0)
                q.append(Q_target.mean())
            if done:
                print(t + 1)
                times.append(t + 1)
                break
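
    # --- Added sketch, not part of the original gist ---
    # `times` collects per-episode lengths and `q` collects the mean bootstrapped
    # Q estimate at each training step, but neither is inspected above. A minimal
    # way to summarise a run after the episode loop finishes:
    if times:
        print('episodes: %d, mean length: %.1f, best: %d'
              % (len(times), np.mean(times), max(times)))
    if q:
        print('final mean max-Q estimate: %.3f' % q[-1])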