@ebetica
Created January 18, 2017 23:12
import argparse
import gym
import numpy as np
from itertools import count
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
from torch.autograd import Variable
import torchvision.transforms as T
parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
parser.add_argument('--gamma', type=float, default=0.9, metavar='G',
                    help='discount factor (default: 0.9)')
parser.add_argument('--seed', type=int, default=1, metavar='N',
                    help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=50, metavar='N',
                    help='interval between training status logs (default: 50)')
args = parser.parse_args()
# torch.manual_seed(args.seed)
class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.affine1 = nn.Linear(4, 128)
        self.affine3 = nn.Linear(128, 2)

        # Per-episode buffers, filled by select_action() / the training loop
        # and consumed by finish_episode().
        self.sampled_probs = []
        self.sampled_actions = []
        self.rewards = []

    def forward(self, x):
        x = F.relu(self.affine1(x))
        x = F.relu(self.affine3(x))
        return F.softmax(x)

env = gym.make('CartPole-v0')
model = Policy()
optimizer = optim.RMSprop(model.parameters(), lr=1e-2, alpha=1, eps=1e-10)
def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs = model(Variable(state))
    # Sample an action index from the policy's output distribution.
    action = probs.multinomial(1, True).data.squeeze()[0]
    model.sampled_probs.append(probs)
    model.sampled_actions.append(action)
    return action

def finish_episode():
    # Compute discounted returns, working backwards from the end of the episode.
    R = 0
    sampled_actions = model.sampled_actions
    sampled_probs = model.sampled_probs
    rewards = []
    for action, r in zip(sampled_actions[::-1], model.rewards[::-1]):
        R = r + args.gamma * R
        rewards.insert(0, R)
    # Normalize returns to zero mean and unit variance.
    rewards = torch.Tensor(rewards)
    rewards = (rewards - rewards.mean()) / rewards.std()

    # One-hot targets for the actions that were actually sampled.
    ys = [torch.zeros(1, 2) for _ in sampled_probs]
    for i, action in enumerate(sampled_actions):
        ys[i][0][action] = 1

    # Per-step surrogate loss: half squared error between the one-hot action
    # and the policy's output probabilities.
    loss = [((Variable(y) - p)**2).sum() / 2 for p, y in zip(sampled_probs, ys)]

    # Accumulate per-parameter gradients, scaling each step's gradient by
    # that step's (normalized) return.
    grads = {}
    for i, l in enumerate(loss):
        optimizer.zero_grad()
        l.backward()
        for j, group in enumerate(optimizer.param_groups):
            saved_group = grads.get(j, {})
            '''
            x = group['params'][-1]
            print(x)
            print(rewards[i] * x)
            '''
            for k, param in enumerate(group['params']):
                cumsum = saved_group.get(k, torch.zeros(param.grad.size()))
                cumsum += rewards[i] * param.grad.data
                saved_group[k] = cumsum
                '''
                print(cumsum)
                import pdb; pdb.set_trace()
                '''
            grads[j] = saved_group

    # Overwrite .grad with the averaged, return-weighted gradients and step.
    for j, group in enumerate(optimizer.param_groups):
        for k, param in enumerate(group['params']):
            param.grad.data = grads[j][k] / len(loss)
    '''
    for j, group in enumerate(optimizer.param_groups):
        for k, param in enumerate(group['params']):
            print("grad: ", param.grad.data.abs().max())
            print("param: ", param.data.abs().max())
    '''

    optimizer.step()

    # Clear the per-episode buffers.
    del model.rewards[:]
    del model.sampled_actions[:]
    del model.sampled_probs[:]
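
# For reference only (not part of the original gist): the same REINFORCE
# update is more commonly written as a -log pi(a|s) * R surrogate loss and
# left entirely to autograd, instead of scaling per-step gradients by hand as
# finish_episode() does above. This is a minimal sketch under the same
# assumptions (the model, optimizer, args.gamma and per-episode buffers
# defined earlier); the name finish_episode_logprob is hypothetical and the
# function is never called below.
def finish_episode_logprob():
    # Discounted, normalized returns, exactly as in finish_episode().
    R = 0
    returns = []
    for r in model.rewards[::-1]:
        R = r + args.gamma * R
        returns.insert(0, R)
    returns = torch.Tensor(returns)
    returns = (returns - returns.mean()) / returns.std()

    # Policy-gradient surrogate: minimize -log pi(a_t | s_t) * R_t.
    losses = []
    for probs, action, R in zip(model.sampled_probs, model.sampled_actions, returns):
        # probs has shape (1, 2); take the log-probability of the sampled action.
        losses.append(-torch.log(probs[0, action]) * R)
    loss = sum(losses) / len(losses)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    del model.rewards[:]
    del model.sampled_actions[:]
    del model.sampled_probs[:]
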
running_reward = 10
for i_episode in count(1):
    reward_sum = 0
    state = env.reset()
    for t in count(1):
        action = select_action(state)
        state, reward, done, _ = env.step(action)
        model.rewards.append(reward)
        reward_sum += reward
        if done:
            break

    running_reward = running_reward * 0.99 + reward_sum * 0.01
    finish_episode()
    if i_episode % args.log_interval == 0:
        print('Episode {}\tLast length: {:5f}\tAverage length: {:.2f}'.format(
            i_episode, reward_sum, running_reward))