@Smerity
Last active September 2, 2016 00:09
Buggy (but preserved for posterity) script for Cartpole using policy gradient via Chainer, two layer MLP, dropout, and vaguely rejection sampling of historical memories
""" Quick script for Cartpole using policy gradient via Chainer, two layer MLP, dropout, and vaguely rejection sampling of historical memories """
import gym
import numpy as np
import chainer
from chainer import optimizers
from chainer import ChainList, Variable
import chainer.functions as F
import chainer.links as L
env = gym.make('CartPole-v0')
env.monitor.start('./cartpole-experiment')
print('Action space:', env.action_space)
print('Observation space:', env.observation_space)
class PolicyNetwork(ChainList):
    def __init__(self, input_size=4, hidden_size=32):
        super(PolicyNetwork, self).__init__(
            L.Linear(input_size, hidden_size, nobias=True),
            L.Linear(hidden_size, 1, nobias=True),
        )

    def __call__(self, x, train=True, dp=0.5):
        h = x
        h = F.dropout(self[0](h), train=train, ratio=dp)
        h = self[1](F.tanh(h))
        return F.sigmoid(h)
model = PolicyNetwork()
optimizer = optimizers.SGD(lr=0.8)
optimizer.setup(model)
env.reset()
episodes = []
reward_history = []
for iter in range(10000):
    episode = []
    total_reward = 0
    state = env.reset()
    for t in range(201):
        #env.render()
        raw_action = model(np.array([state], dtype=np.float32), train=False)
        action = 1 if np.random.random() < raw_action.data else 0
        #if np.random.random() > 0.99:
        #    action = env.action_space.sample()
        new_state, reward, done, info = env.step(action)
        episode.append((state, action, reward))
        state = new_state
        total_reward += reward
        if done:
            break
    episodes.append((total_reward, episode))
    reward_history.append(total_reward)
    if len(episodes) > 2:
        gradW = []
        for _, episode in episodes:
            R = [r for idx, (s, a, r) in enumerate(episode)]
            accR = [sum(r * 0.99 ** i for i, r in enumerate(R[idx:])) for idx, (s, a, r) in enumerate(episode)]
            pred_actions = [model(np.array([s], dtype=np.float32), train=True) for (s, a, r) in episode]
            actions = [(pa - a) ** 2 for pa, (s, a, r) in zip(pred_actions, episode)]
            for loss, r in zip(actions, accR):
                model.zerograds()
                loss.backward()
                gradW.append(r * model[0].W.grad)
        # Note: only the first layer's reward-weighted gradient is averaged here;
        # the second layer keeps the raw gradient from the final backward() call
        # (the bug discussed in the comment below).
        model[0].W.grad = np.mean(gradW, axis=0, dtype=np.float32)
        optimizer.update()
        # "Vaguely rejection sampling" of historical memories: keep an episode with
        # probability proportional to its total reward (max 200), then cap the kept set at 16.
        episodes = [(r, ep) for (r, ep) in episodes if 200 * np.random.random() < r]
        np.random.shuffle(episodes)
        episodes = episodes[:16]
    print('Episode {} finished after {} timesteps (avg for last 100 - {})'.format(iter, t, np.mean(reward_history[-100:])))
env.monitor.close()

Smerity commented Sep 1, 2016

Gah, silly bug! Only one of the two layers is having its gradient correctly calculated. The weights of the first layer (input => hidden) of the MLP are updated appropriately, but the second layer (hidden => output) is only updated according to the gradient of the last action that was backpropagated, rather than the reward-weighted average.

Still works though ... ¯\_(ツ)_/¯ Darn gradient descent working out ways to hide my darn bugs. I'll post a secondary version with this fixed.
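
For reference, a minimal sketch of what such a fix could look like (a hypothetical drop-in replacement for the gradient block above, not the promised follow-up version): accumulate the reward-weighted gradients for both Linear layers and average each before the SGD step, reusing the same model, optimizer and episodes variables from the script.

if len(episodes) > 2:
    gradW = [[], []]  # one gradient buffer per Linear layer
    for _, episode in episodes:
        R = [r for (s, a, r) in episode]
        accR = [sum(r * 0.99 ** i for i, r in enumerate(R[idx:])) for idx in range(len(episode))]
        pred_actions = [model(np.array([s], dtype=np.float32), train=True) for (s, a, r) in episode]
        losses = [(pa - a) ** 2 for pa, (s, a, r) in zip(pred_actions, episode)]
        for loss, r in zip(losses, accR):
            model.zerograds()
            loss.backward()
            gradW[0].append(r * model[0].W.grad)
            gradW[1].append(r * model[1].W.grad)
    # Average the reward-weighted gradients for *both* layers, not just the first
    model[0].W.grad = np.mean(gradW[0], axis=0, dtype=np.float32)
    model[1].W.grad = np.mean(gradW[1], axis=0, dtype=np.float32)
    optimizer.update()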
