Last active
September 2, 2016 00:09
-
-
Save Smerity/127e4aa15e8746530ac309050cbe1d98 to your computer and use it in GitHub Desktop.
Buggy (but preserved for posterity) script for Cartpole using policy gradient via Chainer, two layer MLP, dropout, and vaguely rejection sampling of historical memories
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Quick script for Cartpole using policy gradient via Chainer, two layer MLP, dropout, and vaguely rejection sampling of historical memories """ | |
import gym | |
import numpy as np | |
import chainer | |
from chainer import optimizers | |
from chainer import ChainList, Variable | |
import chainer.functions as F | |
import chainer.links as L | |
env = gym.make('CartPole-v0') | |
env.monitor.start('./cartpole-experiment') | |
print('Action space:', env.action_space) | |
print('Observation space:', env.observation_space) | |
class PolicyNetwork(ChainList): | |
def __init__(self, input_size=4, hidden_size=32): | |
super(PolicyNetwork, self).__init__( | |
L.Linear(input_size, hidden_size, nobias=True), | |
L.Linear(hidden_size, 1, nobias=True), | |
) | |
def __call__(self, x, train=True, dp=0.5): | |
h = x | |
h = F.dropout(self[0](h), train=train, ratio=dp) | |
h = self[1](F.tanh(h)) | |
return F.sigmoid(h) | |
model = PolicyNetwork() | |
optimizer = optimizers.SGD(lr=0.8) | |
optimizer.setup(model) | |
env.reset() | |
episodes = [] | |
reward_history = [] | |
for iter in range(10000): | |
episode = [] | |
total_reward = 0 | |
state = env.reset() | |
for t in range(201): | |
#env.render() | |
raw_action = model(np.array([state], dtype=np.float32), train=False) | |
action = 1 if np.random.random() < raw_action.data else 0 | |
#if np.random.random() > 0.99: | |
# action = env.action_space.sample() | |
new_state, reward, done, info = env.step(action) | |
episode.append((state, action, reward)) | |
state = new_state | |
total_reward += reward | |
if done: | |
break | |
episodes.append((total_reward, episode)) | |
reward_history.append(total_reward) | |
if len(episodes) > 2: | |
gradW = [] | |
for _, episode in episodes: | |
R = [r for idx, (s, a, r) in enumerate(episode)] | |
accR = [sum(r * 0.99 ** i for i, r in enumerate(R[idx:])) for idx, (s, a, r) in enumerate(episode)] | |
pred_actions = [model(np.array([s], dtype=np.float32), train=True) for (s, a, r) in episode] | |
actions = [(pa - a) ** 2 for pa, (s, a, r) in zip(pred_actions, episode)] | |
for loss, r in zip(actions, accR): | |
model.zerograds() | |
loss.backward() | |
gradW.append(r * model[0].W.grad) | |
model[0].W.grad = np.mean(gradW, axis=0, dtype=np.float32) | |
optimizer.update() | |
episodes = [(r, ep) for (r, ep) in episodes if 200 * np.random.random() < r] | |
np.random.shuffle(episodes) | |
episodes = episodes[:16] | |
print('Episode {} finished after {} timesteps (avg for last 100 - {})'.format(iter, t, np.mean(reward_history[-100:]))) | |
env.monitor.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Gah, silly bug! Only one of the two layers are having their gradient correctly calculated. The weights for only the first layer (input => hidden) of the MLP are updated appropriately but the second layer (hidden => output) are only updated according to the gradient of the last action used to compute the correct gradient.
Still works though ... ¯_(ツ)_/¯ Darn gradient descent working out ways to hide my darn bugs. I'll post a secondary version with this fixed.