@reddragon
Created April 30, 2017 06:43
Predicting whether there will be a goal in the next 20 steps of the ATARI Pong game
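# Part 1: data collection. Plays Pong-v0 with random actions; each time a point
# is scored, the frame-difference images for the 20 frames leading up to it are
# stored, labelled with the number of steps remaining, and pickled to data.p.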
import gym
import logging
import sys
import numpy as np
from gym import wrappers
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import torch.optim as optim
import matplotlib.image as mpimg
import cPickle as pickle
from math import sqrt, ceil
from torch.autograd import Variable
def visualize_grid(Xs, ubound=255.0, padding=1):
    """
    Reshape a 4D tensor of image data to a grid for easy visualization.
    Inputs:
    - Xs: Data of shape (N, H, W, C)
    - ubound: Output grid will have values scaled to the range [0, ubound]
    - padding: The number of blank pixels between elements of the grid
    """
    (N, H, W, C) = Xs.shape
    grid_size = int(ceil(sqrt(N)))
    grid_height = H * grid_size + padding * (grid_size - 1)
    grid_width = W * grid_size + padding * (grid_size - 1)
    grid = np.zeros((grid_height, grid_width, C))
    next_idx = 0
    y0, y1 = 0, H
    for y in xrange(grid_size):
        x0, x1 = 0, W
        for x in xrange(grid_size):
            if next_idx < N:
                img = Xs[next_idx]
                low, high = np.min(img), np.max(img)
                grid[y0:y1, x0:x1] = ubound * (img - low) / (high - low)
                # grid[y0:y1, x0:x1] = Xs[next_idx]
                next_idx += 1
            x0 += W + padding
            x1 += W + padding
        y0 += H + padding
        y1 += H + padding
    # grid_max = np.max(grid)
    # grid_min = np.min(grid)
    # grid = ubound * (grid - grid_min) / (grid_max - grid_min)
    return grid
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 1, 20)
        self.fc2 = nn.Linear(1 * 191 * 141, 4)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = x.view(-1, 1 * 191 * 141)
        x = F.relu(self.fc2(x))
        return F.log_softmax(x)
def preprocess(I):
    I = I[35:195]       # crop
    I = I[::2, ::2, 0]  # downsample by factor of 2
    I[I == 144] = 0     # erase background (background type 1)
    I[I == 109] = 0     # erase background (background type 2)
    I[I != 0] = 1       # everything else (paddles, ball) just set to 1
    return I.astype(np.float).ravel()
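# preprocess() maps a raw (210, 160, 3) Pong frame to a flattened 6400-vector
# of 0/1 floats; the code below reshapes it back into an 80x80 binary screen.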
gym.undo_logger_setup()
logger = logging.getLogger()
formatter = logging.Formatter('[%(asctime)s] %(message)s')
handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(formatter)
logger.addHandler(handler)
# You can set the level to logging.DEBUG or logging.WARN if you
# want to change the amount of output.
logger.setLevel(logging.INFO)
outdir = 'rl-data'
env = gym.make('Pong-v0')
# env = wrappers.Monitor(env, directory=outdir, force=True)
env.seed(0)
iters = 0
total = 0
episodes = 10
# Shape of observation (210, 160, 3)
net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.5)
# zero the parameter gradients
optimizer.zero_grad()
out = []
labels = []
while episodes > 0:
    # Get the observation
    ob = env.reset()
    total = 0
    ob_screens = []
    episode_screens = []
    while True:
        iters += 1
        ob, reward, done, _ = env.step(env.action_space.sample())
        ob = preprocess(ob)
        # ob_screens.append(ob)
        episode_screens.append(ob.reshape(80, 80))
        total += reward
        if reward != 0:
            print 'Received reward %d in iter %d. Total: %d' % (reward, iters, total)
            # plt.imshow(ob.reshape(80, 80))
            # plt.show()
            step = 1.0 / iters
            cur_val = 0.0
            prev_screen = np.zeros((80, 80))
            # Label each of the 20 frames preceding the point with the number
            # of steps remaining until it is scored (20 down to 1).
            rem = 20
            for screen in episode_screens[-22:-2]:
                cur_val += step
                cur_screen = screen.reshape(80, 80)
                diff = cur_screen - prev_screen
                out.append(diff)
                # labels.append(cur_val)
                labels.append(rem)
                rem = rem - 1
                prev_screen = cur_screen
            done = True
            iters = 0
            episode_screens = []
        if done:
            print 'Done'
            break
    episodes = episodes - 1
d = [out, labels]
pickle.dump(d, open("data.p", "wb"), protocol=2)
env.close()
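# Part 2: visualization. Loads the trained checkpoint (next20_ckpt), replays
# random Pong episodes, and renders the 20 frames before each point with a
# coloured bar showing the predicted countdown, saving the result as
# pong-next20.gif.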
import gym
import logging
import sys
import numpy as np
from gym import wrappers
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import torch.optim as optim
import matplotlib.image as mpimg
import cPickle as pickle
import torch.utils.data
import os
import imageio
from math import sqrt, ceil
from torch.autograd import Variable
from PIL import Image, ImageDraw
def preprocess(I):
    I = I[35:195]       # crop
    I = I[::2, ::2, 0]  # downsample by factor of 2
    I[I == 144] = 0     # erase background (background type 1)
    I[I == 109] = 0     # erase background (background type 2)
    I[I != 0] = 1       # everything else (paddles, ball) just set to 1
    return I.astype(np.float).ravel()
def opimg(I, pct):
    pct = min(max(pct, 0), 1.0)
    I = np.kron(I, np.ones((4, 4)))
    # print I.shape
    I = np.append(I, np.ones((20, 320)), axis=0)
    img = Image.fromarray(np.uint8(I * 255))
    img = img.convert('RGB')
    d = ImageDraw.Draw(img)
    pad = 3
    maxwidth = 320
    width = int(maxwidth * pct)
    h = pct
    green = int(max(2 * (h - 0.5), 0) * 255)
    red = int(max(2 * (0.5 - h), 0) * 255)
    blue = int(max(4 * min(0.75 - h, h - 0.25), 0) * 255)
    # print("h: {}, green:{}, red: {}, blue: {}".format(h, green, red, blue))
    d.rectangle([(0, 320), (maxwidth, 340)], fill=(220, 220, 220))
    d.rectangle([(pad, 320 + pad), (maxwidth - pad, 340 - pad)], fill=(0, 0, 0))
    d.rectangle([(pad, 320 + pad), (width - pad, 340 - pad)], fill=(red, green, blue))
    return img
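# opimg() returns a 320x340 RGB image: the 4x-upsampled 80x80 screen on top and
# a horizontal bar at the bottom whose width and colour encode pct, ramping
# from red near 0 through blue to green near 1.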
CHECKPOINT_FILE_PATH = 'next20_ckpt'
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 1, 11)
        self.fc2 = nn.Linear(1 * 70 * 70, 1)
        self.fc1 = nn.Linear(1 * 80 * 80, 1)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = x.view(-1, 1 * 70 * 70)
        x = self.fc2(x)
        return x
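# Shape flow: (N, 1, 80, 80) input -> 11x11 conv -> (N, 1, 70, 70) -> flatten
# -> a single linear unit, i.e. one scalar "steps until goal" prediction per
# frame difference. fc1 is defined but unused.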
def save(net, optimizer, epoch):
    state = {
        'state_dict': net.state_dict(),
        'optimizer': optimizer.state_dict(),
        'epoch': epoch,
    }
    print("Saving checkpoint to file '{}'".format(CHECKPOINT_FILE_PATH))
    torch.save(state, CHECKPOINT_FILE_PATH)
# Returns net, optimizer, epoch
def load():
    net = Net()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.5)
    epoch = 0
    if os.path.isfile(CHECKPOINT_FILE_PATH):
        print("Loading checkpoint from file '{}'".format(CHECKPOINT_FILE_PATH))
        checkpoint = torch.load(CHECKPOINT_FILE_PATH)
        epoch = checkpoint['epoch']
        net.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
    return net, optimizer, epoch
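# save()/load() bundle the model weights, optimizer state and epoch so the
# training script (part 3 below) can resume from a checkpoint; this script only
# calls load() to get the trained weights.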
net, _, _ = load()
imgs = []
d = pickle.load(open('data2.p', 'rb'))
diffs = np.asarray(d[0])
exps = np.asarray(d[1])
prev = np.zeros((80,80))
gym.undo_logger_setup()
logger = logging.getLogger()
formatter = logging.Formatter('[%(asctime)s] %(message)s')
handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
outdir = 'rl-data'
env = gym.make('Pong-v0')
env.seed(0)
iters = 0
total = 0
episodes = 40
imgs = []
while episodes > 0:
    # Get the observation
    ob = env.reset()
    total = 0
    ob_screens = []
    episode_screens = []
    while True:
        iters += 1
        ob, reward, done, _ = env.step(env.action_space.sample())
        ob = preprocess(ob)
        # ob_screens.append(ob)
        episode_screens.append(ob.reshape(80, 80))
        total += reward
        if reward != 0:
            prev_screen = np.zeros((80, 80))
            for screen in episode_screens[-22:-2]:
                cur_screen = screen.reshape(80, 80)
                diff = cur_screen - prev_screen
                # Add batch and channel dimensions: (1, 1, 80, 80).
                diff = np.expand_dims(diff, axis=0)
                diff = np.expand_dims(diff, axis=0)
                dt = Variable(torch.Tensor(diff))
                op = net(dt).data.numpy().reshape(-1)
                # Turn the predicted "steps remaining" into a [0, 1] fraction.
                pct = op[0] * 1.0 / 20.0
                imgs.append(np.asarray(opimg(screen, pct)))
                prev_screen = cur_screen
            done = True
            iters = 0
            episode_screens = []
        if done:
            break
    episodes = episodes - 1
imageio.mimsave('pong-next20.gif', imgs, duration=0.15)
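# Part 3: training. Loads the pickled (frame-difference, steps-remaining) pairs
# from data.p and trains the small conv net to regress the countdown with a
# smooth L1 loss, checkpointing to next20_ckpt after every epoch.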
import gym
import logging
import sys
import numpy as np
from gym import wrappers
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import torch.optim as optim
import matplotlib.image as mpimg
import cPickle as pickle
import torch.utils.data
import os
from math import sqrt, ceil
from torch.autograd import Variable
CHECKPOINT_FILE_PATH = 'next20_ckpt'
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 1, 11)
        self.fc2 = nn.Linear(1 * 70 * 70, 1)
        self.fc1 = nn.Linear(1 * 80 * 80, 1)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = x.view(-1, 1 * 70 * 70)
        x = self.fc2(x)
        return x
def save(net, optimizer, epoch):
    state = {
        'state_dict': net.state_dict(),
        'optimizer': optimizer.state_dict(),
        'epoch': epoch,
    }
    print("Saving checkpoint to file '{}'".format(CHECKPOINT_FILE_PATH))
    torch.save(state, CHECKPOINT_FILE_PATH)
# Returns net, optimizer, epoch
def load():
    net = Net()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.5)
    epoch = 0
    if os.path.isfile(CHECKPOINT_FILE_PATH):
        print("Loading checkpoint from file '{}'".format(CHECKPOINT_FILE_PATH))
        checkpoint = torch.load(CHECKPOINT_FILE_PATH)
        epoch = checkpoint['epoch']
        net.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
    return net, optimizer, epoch
def visualize_grid(Xs, ubound=255.0, padding=1):
    """
    Reshape a 4D tensor of image data to a grid for easy visualization.
    Inputs:
    - Xs: Data of shape (N, H, W, C)
    - ubound: Output grid will have values scaled to the range [0, ubound]
    - padding: The number of blank pixels between elements of the grid
    """
    (N, H, W, C) = Xs.shape
    grid_size = int(ceil(sqrt(N)))
    grid_height = H * grid_size + padding * (grid_size - 1)
    grid_width = W * grid_size + padding * (grid_size - 1)
    grid = np.zeros((grid_height, grid_width, C))
    next_idx = 0
    y0, y1 = 0, H
    for y in xrange(grid_size):
        x0, x1 = 0, W
        for x in xrange(grid_size):
            if next_idx < N:
                img = Xs[next_idx]
                low, high = np.min(img), np.max(img)
                grid[y0:y1, x0:x1] = ubound * (img - low) / (high - low)
                # grid[y0:y1, x0:x1] = Xs[next_idx]
                next_idx += 1
            x0 += W + padding
            x1 += W + padding
        y0 += H + padding
        y1 += H + padding
    # grid_max = np.max(grid)
    # grid_min = np.min(grid)
    # grid = ubound * (grid - grid_min) / (grid_max - grid_min)
    return grid
d = pickle.load(open('data.p', 'rb'))
diffs = np.asarray(d[0])
diffs = diffs.reshape(-1, 1, 80, 80)
labels = np.asarray(d[1])
print diffs.shape
prev_img = np.zeros((80, 80))
screens = []
for idx in range(len(diffs)):
    cur_img = diffs[idx].reshape(80, 80) + prev_img
    # plt.imshow(cur_img)
    # plt.show()
    prev_img = cur_img
    screens.append(cur_img)
dt = torch.FloatTensor(diffs)
lt = torch.FloatTensor(labels)
print ("Loading data")
td = torch.utils.data.TensorDataset(data_tensor=dt, target_tensor=lt)
print ("Done with loading the file")
criterion = nn.SmoothL1Loss()
trainloader = torch.utils.data.DataLoader(td, batch_size=10,
                                          shuffle=True, num_workers=2)
net, optimizer, init_epoch = load()
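# The net regresses the countdown label (20 down to 1) directly from the frame
# difference using the smooth L1 (Huber) loss; part 2 divides the prediction by
# 20 to get the fraction shown on the bar.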
for epoch in range(init_epoch, 100):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs
        inputs, labels = data
        labels = labels.float()
        inputs, labels = Variable(inputs), Variable(labels)
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # print statistics
        running_loss += loss.data[0]
        if i % 5 == 4:  # print every 5 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 5))
            running_loss = 0.0
    save(net, optimizer, epoch)
# Sanity check: run one batch through the trained net and compare its outputs
# with the labels.
for i, data in enumerate(trainloader, 0):
    # get the inputs
    inputs, labels = data
    print np.count_nonzero(inputs.numpy())
    outputs = net(Variable(inputs))
    print labels
    print outputs
    break
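# A rough follow-up sketch: mean absolute error, in steps, of the countdown
# prediction over the training data, using only names already defined above.
abs_err = 0.0
count = 0
for inputs, labels in trainloader:
    preds = net(Variable(inputs)).data.numpy().reshape(-1)
    abs_err += np.abs(preds - labels.numpy()).sum()
    count += labels.size(0)
print 'Mean absolute error: %.3f steps' % (abs_err / count)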