# Sometimes it works, sometimes it doesn't
import numpy as np
import gym


class Net(object):
    # Single linear layer mapping the 4-dimensional observation to one value per action
    def __init__(self, input, output):
        # W = 0.01 * np.random.randn(input, output)
        # b = np.zeros([1, output])  # Random initialization
        W = np.array([[-0.60944746, 0.45539405],
                      [0.53731965, -0.15138026],
                      [-2.38568372, 1.7827912],
                      [-5.02650308, 5.4449609]])
        b = np.array([[0.39728291, 2.86320634]])  # Optimized parameters
        self.params = [W, b]
        self.grad = [np.zeros_like(W), np.zeros_like(b)]
        self.lr = 0.005
        self.gamma = 0.95

    def forward(self, x):
        W, b = self.params
        out = x.dot(W) + b
        return out

    def update_params(self, x, dout):
        # SGD + Momentum
        W, b = self.params
        db = dout.sum(axis=0)
        dW = x.T.dot(dout)
        grad = [dW, db]
        for g, gc in zip(grad, self.grad):
            gc *= self.gamma
            gc += self.lr * g
        for w, d in zip(self.params, self.grad):
            w -= d
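

# Note (added, not in the original gist): update_params implements SGD with
# momentum by hand. self.grad stores a velocity per parameter; each call decays
# it by gamma (0.95), adds lr * gradient, and subtracts it from the weights.
# In the __main__ block below the call to update_params is commented out, so
# the hard-coded "optimized" W and b above are used unchanged.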


if __name__ == '__main__':
    mem = [[], [], [], [], []]  # State, Action, Result, Reward, Terminal
    times = []
    q = []
    actor = Net(4, 2)
    randp = 0.  # Probability of taking a random (exploratory) action
    env = gym.make('CartPole-v0')
    for i_episode in range(100):
        ob = env.reset()
        for t in range(200):  # Change to 500 for v1
            # Observing
            if np.random.rand() < randp:
                action = env.action_space.sample()
            else:
                # Whiten the observation with hard-coded statistics
                # (presumably the observation mean and scale)
                ob -= np.array([0.00076127, 0.01893811, 0.00292497, -0.01291666])
                ob /= np.array([0.09564344, 0.57818071, 0.10437309, 0.87035585])
                action = np.argmax(actor.forward(ob[None, :]), axis=1)[0]
            mem[0].append(ob[None, :])
            ac = np.zeros([1, 2])
            ac[:, action] = 1  # One-hot encoding of the chosen action
            mem[1].append(ac)
            ob, reward, done, info = env.step(action)
            ob -= np.array([0.00076127, 0.01893811, 0.00292497, -0.01291666])
            ob /= np.array([0.09564344, 0.57818071, 0.10437309, 0.87035585])
            mem[2].append(ob[None, :])
            mem[3].append(reward)
            mem[4].append(done)
            # Training
            if len(mem[0]) == 200:
                # Treat the last 200 transitions as a sliding replay window
                s, a, s_, r, term = [np.vstack(m) for m in mem]
                target = np.copy(r)
                Q_target = np.max(actor.forward(s_), axis=1)[:, None]
                target[term == False] += 0.9 * Q_target[term == False]  # Bootstrap non-terminal targets
                dout = (actor.forward(s) - target * a) / 200
                # actor.update_params(s, dout)
                randp *= 0.99
                for m in mem:
                    m.pop(0)
                q.append(Q_target.mean())
            if done:
                print(t + 1)
                times.append(t + 1)
                break
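
    # --- Added sketch, not part of the original gist ---
    # `times` collects per-episode lengths and `q` collects the mean bootstrapped
    # Q estimate at each training step, but neither is inspected above. A minimal
    # way to summarise a run after the episode loop finishes:
    if times:
        print('episodes: %d, mean length: %.1f, best: %d'
              % (len(times), np.mean(times), max(times)))
    if q:
        print('final mean max-Q estimate: %.3f' % q[-1])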