@parajain
Last active November 2, 2017 14:36
Trying RL on Pong-v0 by scaling gradients with rewards
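The core trick in the script below is to register a backward hook on the stacked policy outputs and multiply the incoming gradient by the per-step rewards, so that actions followed by larger rewards push harder on the parameters. A minimal, self-contained sketch of that idea (illustrative names, current PyTorch tensor API rather than the Variable API used in the script): scaling the gradient of -log pi(a|s) by a reward R gives the same parameter gradient as minimizing the weighted loss -R * log pi(a|s).

import torch
import torch.nn.functional as F

torch.manual_seed(0)
scores = torch.randn(1, 6, requires_grad=True)        # stand-in for policy outputs
log_probs = F.log_softmax(scores, dim=-1)
action = torch.multinomial(log_probs.exp(), num_samples=1)
reward = 2.5                                           # stand-in return

nll = F.nll_loss(log_probs, action.view(-1))           # -log pi(a|s)
handle = scores.register_hook(lambda g: g * reward)    # scale the gradient by the reward
nll.backward()
grad_via_hook = scores.grad.clone()
handle.remove()

# same gradient as directly minimizing the reward-weighted loss -R * log pi(a|s)
scores.grad = None
(reward * F.nll_loss(F.log_softmax(scores, dim=-1), action.view(-1))).backward()
assert torch.allclose(grad_via_hook, scores.grad)

The full script: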
import numpy as np
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.autograd import Variable

render = False
env = gym.make("Pong-v0")
print('Action space')
print(env.action_space)
observation = env.reset()
prev_x = None # used in computing the difference frame
hidden_size = 200
num_inputs = 80 * 80   # flattened 80x80 difference frame
num_outputs = 6        # size of Pong-v0's discrete action space


class Policy(nn.Module):
    """Single-hidden-layer policy network: difference frame -> action probabilities."""

    def __init__(self, hidden_size, num_inputs, num_outputs):
        super(Policy, self).__init__()
        self.num_outputs = num_outputs
        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_outputs)

    def forward(self, inputs):
        x = F.relu(self.linear1(inputs))
        action_scores = self.linear2(x)
        return F.softmax(action_scores)


def update_policy(rewards, actions, aprobs, optimizer):
    """Scale the gradient of a cross-entropy 'fake label' loss by the per-step rewards."""
    loss = nn.CrossEntropyLoss()
    # stack the per-step action distributions produced during the episode
    logits = torch.stack(aprobs)
    batch_size = len(actions)
    # broadcast each step's reward across the action dimension
    rewards = np.repeat(rewards, num_outputs).reshape(batch_size, num_outputs)
    rewards_tensor = Variable(torch.from_numpy(rewards).float())
    # the sampled actions act as the targets of the cross-entropy loss
    fake_target_outputs = Variable(torch.stack(actions)).squeeze(1)
    loss_value = loss(logits, fake_target_outputs)

    def update_grad(grad):
        # multiply the gradient flowing back through the stacked outputs by the rewards
        return torch.mul(grad, rewards_tensor)

    logits.register_hook(update_grad)
    optimizer.zero_grad()
    loss_value.backward()
    optimizer.step()


def prepro(I):
    """Preprocess a 210x160x3 uint8 frame into a 6400 (80x80) 1D float vector."""
    # taken from https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5
    I = I[35:195]        # crop the playing field
    I = I[::2, ::2, 0]   # downsample by a factor of 2
    I[I == 144] = 0      # erase background (background type 1)
    I[I == 109] = 0      # erase background (background type 2)
    I[I != 0] = 1        # everything else (paddles, ball) set to 1
    return I.astype(np.float).ravel()


rewards = []
actions = []
aprobs = []
episode_number = 0
policy = Policy(hidden_size, num_inputs, num_outputs)
optimizer = optim.RMSprop(policy.parameters())
iter = 0
while True:
    if render:
        env.render()
    iter = iter + 1
    # preprocess the observation, set input to network to be difference image
    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(num_inputs)
    prev_x = cur_x
    input = Variable(torch.from_numpy(x).float())
    aprob = policy(input)
    # sample an action from the predicted distribution
    action = aprob.multinomial().data
    observation, reward, done, info = env.step(action[0])
    rewards.append(reward)
    actions.append(action)
    aprobs.append(aprob)
    if done or iter > 500:
        # end of episode (or step cap reached): update the policy and reset the buffers
        iter = 0
        rewards = np.asarray(rewards)
        observation = env.reset()
        prev_x = None
        print('Rewards ', rewards.sum())
        update_policy(rewards, actions, aprobs, optimizer)
        rewards = []
        actions = []
        aprobs = []
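
For comparison, the same per-episode buffers could also feed a more conventional REINFORCE-style update that weights the log-probability of each sampled action by a discounted return rather than the raw per-step reward (which is zero on most Pong frames). A hedged sketch, not the method used above; it assumes the current tensor API, probability rows of length num_actions, a discount factor gamma, and the illustrative helper name reinforce_update:

import numpy as np
import torch

def reinforce_update(rewards, actions, aprobs, optimizer, gamma=0.99):
    # discounted returns, computed backwards through the episode
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    returns = torch.from_numpy(returns)

    probs = torch.stack(aprobs)                          # (T, num_actions) action probabilities
    taken = torch.stack(actions).view(-1, 1)             # (T, 1) sampled action indices
    log_p = torch.log(probs.gather(1, taken)).view(-1)   # log pi(a_t | s_t)

    loss = -(log_p * returns).sum()                      # REINFORCE objective
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()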