# Sometimes it works, sometimes it doesn't
import numpy as np
import gym

class Net(object):
    def __init__(self, n_in, n_out):
        # Random initialization (disabled):
        # W = 0.01 * np.random.randn(n_in, n_out)
        # b = np.zeros([1, n_out])
        W = np.array([[-0.60944746, 0.45539405],
                      [0.53731965, -0.15138026],
                      [-2.38568372, 1.7827912],
                      [-5.02650308, 5.4449609]])
        b = np.array([[0.39728291, 2.86320634]])  # Optimized parameters
        self.params = [W, b]
        self.grad = [np.zeros_like(W), np.zeros_like(b)]  # Momentum buffers
        self.lr = 0.005  # Learning rate
        self.gamma = 0.95  # Momentum coefficient

    def forward(self, x):
        # Linear layer: maps a batch of states to one Q-value per action
        W, b = self.params
        out = x.dot(W) + b
        return out

    def update_params(self, x, dout):
        # SGD + Momentum: v = gamma * v + lr * grad; w -= v
        db = dout.sum(axis=0)
        dW = x.T.dot(dout)
        grad = [dW, db]
        for g, gc in zip(grad, self.grad):
            gc *= self.gamma
            gc += self.lr * g
        for w, d in zip(self.params, self.grad):
            w -= d
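

# Minimal usage sketch: with the hard-coded "optimized" parameters, forward()
# maps a batch of 4-dimensional CartPole states to one Q-value per action, and
# update_params() applies one SGD-with-momentum step.
#
#     net = Net(4, 2)
#     net.forward(np.zeros([1, 4]))                          # -> shape (1, 2)
#     net.update_params(np.zeros([1, 4]), np.zeros([1, 2]))  # no-op update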


if __name__ == '__main__':
    mem = [[], [], [], [], []]  # State, Action, Next state, Reward, Terminal
    times = []
    q = []
    actor = Net(4, 2)
    randp = 0.  # Epsilon for epsilon-greedy exploration
    env = gym.make('CartPole-v0')
    # Hard-coded observation statistics used to normalize states
    ob_mean = np.array([0.00076127, 0.01893811, 0.00292497, -0.01291666])
    ob_std = np.array([0.09564344, 0.57818071, 0.10437309, 0.87035585])
    for i_episode in range(100):
        ob = env.reset()
        for t in range(200):  # Change to 500 for v1
            # Observing
            if np.random.rand() < randp:
                action = env.action_space.sample()
            else:
                # Note: after the first step, ob has already been normalized
                # below, so the greedy branch applies the normalization again
                ob -= ob_mean
                ob /= ob_std
                action = np.argmax(actor.forward(ob[None, :]), axis=1)[0]
            mem[0].append(ob[None, :])
            ac = np.zeros([1, 2])  # One-hot encoding of the chosen action
            ac[:, action] = 1
            mem[1].append(ac)
            ob, reward, done, info = env.step(action)
            ob -= ob_mean
            ob /= ob_std
            mem[2].append(ob[None, :])
            mem[3].append(reward)
            mem[4].append(done)
            # Training
            if len(mem[0]) == 200:
                s, a, s_, r, term = [np.vstack(m) for m in mem]
                target = np.copy(r)
                # Bootstrapped target: r + 0.9 * max_a' Q(s', a') for non-terminal steps
                Q_target = np.max(actor.forward(s_), axis=1)[:, None]
                target[~term] += 0.9 * Q_target[~term]
                # Upstream gradient for the (currently disabled) parameter update
                dout = (actor.forward(s) - target * a) / 200
                # actor.update_params(s, dout)
                randp *= 0.99  # Decay exploration rate
                for m in mem:
                    m.pop(0)  # Slide the 200-step replay window
                q.append(Q_target.mean())
            if done:
                print(t + 1)
                times.append(t + 1)
                break
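
    # `times` (episode lengths) and `q` (mean bootstrapped Q-values) are
    # collected but not used above. A minimal sketch for inspecting them,
    # assuming matplotlib is available:
    #
    #     import matplotlib.pyplot as plt
    #     plt.plot(times, label='episode length')
    #     plt.plot(q, label='mean max Q')
    #     plt.legend()
    #     plt.show()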