import copy
import itertools, random
from collections import deque
from datetime import datetime
from pathlib import Path
import gym
from gym.wrappers import FrameStack
from gym.spaces import Box
import numpy as np
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torchsummary import summary
from torchvision import transforms as T
from matplotlib import pyplot as plt
from utils import *
def prep_observation_for_qnet(tensor):
    """ Transfer the tensor to the GPU and normalize it into the range [0, 1] """
    return tensor.to('cuda', dtype=torch.float32) / 255

class QNetwork(nn.Module):
    """ Nature-DQN convolutional network (Mnih et al., 2015): three conv layers followed by two fully connected layers """
    def __init__(self, depth, actions):
        super().__init__()
        self.main = nn.Sequential(
            nn.Conv2d(in_channels=depth, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(3136, 512),  # 3136 = 64 * 7 * 7 for 84x84 inputs
            nn.ReLU(),
            nn.Linear(512, actions),
        )

    def forward(self, x):
        return self.main(x)
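
# Quick way to sanity-check the architecture (a sketch, not part of the original script; it
# assumes 4 stacked 84x84 grayscale frames and an arbitrary action count of 4, and uses the
# torchsummary import above):
#   summary(QNetwork(depth=4, actions=4).cuda(), input_size=(4, 84, 84))
# The printed shapes should end in a 3136-feature flatten, matching the first Linear layer.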

class DQN:
    def __init__(self, state_dim, action_dim,
                 eps_initial=1, eps_min=0.1, eps_decay_over=1e6, buffer_size=1e6, batch_size=32, gamma=0.99,
                 burnin=5e4, train_every=4, sync_every=1e4, lr=0.00025):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.q_policy = QNetwork(state_dim[0], action_dim).cuda()
        self.q_target = copy.deepcopy(self.q_policy)

        self.eps_initial = eps_initial
        self.eps_min = eps_min
        self.eps_decay_over = eps_decay_over

        self.train_step = 0
        self.game_frame = 0
        self.memory = deque(maxlen=int(buffer_size))
        self.batch_size = batch_size
        self.gamma = gamma
        self.burnin = burnin  # min. experiences before training
        self.train_every = train_every  # no. of experiences between updates to q_policy
        self.sync_every = int(sync_every)  # no. of experiences between q_policy & q_target sync

        self.opt = torch.optim.RMSprop(self.q_policy.parameters(), lr=lr, alpha=0.95, momentum=0.95, eps=0.01)
        # self.opt = torch.optim.Adam(self.q_policy.parameters(), lr=lr)
        self.loss_fn = torch.nn.SmoothL1Loss()

    def act(self, state):
        """ Compute an epsilon-greedy action with respect to the current policy self.q_policy """
        # sync the q target every self.sync_every frames
        # see https://github.com/spragunr/deep_q_rl/blob/master/deep_q_rl/launcher.py#L155
        if self.game_frame % self.sync_every == 0:
            self.sync_Q_target()

        # compute epsilon (https://openai.com/blog/openai-baselines-dqn/)
        # 1. keep epsilon at eps_initial for the first ~50000 steps while the ER buffer is filling up
        # 2. then linearly decay from 1 to 0.1 during the first 1e6 steps
        # 3. then linearly decay from 0.1 to 0.01 during the next 24 million steps
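        # Worked example (illustrative, using the default hyperparameters burnin=5e4 and
        # eps_decay_over=1e6): at game_frame = 525,000 the agent is 475,000 steps past the
        # burn-in, so eps = 1 - (1 - 0.1) * 475,000 / 1,000,000 ≈ 0.57.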
        if self.game_frame < self.burnin:
            self.eps = self.eps_initial
        elif self.game_frame - self.burnin < self.eps_decay_over:
            self.eps = max(self.eps_initial - (self.eps_initial - self.eps_min) * (self.game_frame - self.burnin) / self.eps_decay_over, self.eps_min)
        else:
            self.eps = max(self.eps_min - (self.eps_min - 0.01) * (self.game_frame - self.burnin - self.eps_decay_over) / 24_000_000, 0.01)

        # return either random or best action
        if np.random.rand() < self.eps:
            return np.random.randint(self.action_dim)
        else:
            with torch.no_grad():
                state = torch.from_numpy(state.__array__()).unsqueeze(0)
                action_values = self.q_policy(prep_observation_for_qnet(state))
                return torch.argmax(action_values, dim=1).item()

    def cache(self, state, next_state, action, reward, done):
        """ Store the experience in the ER buffer """
        action = torch.tensor([action])
        reward = torch.tensor([reward])
        done = torch.tensor([done])
        self.memory.append((state, next_state, action, reward, done,))

    def sample(self):
        """ Sample a minibatch from the ER buffer (also converts the FrameStack'ed LazyFrames to contiguous tensors) """
        batch = random.sample(self.memory, self.batch_size)
        state, next_state, action, reward, done = zip(*batch)
        state = list(map(lambda x: torch.from_numpy(x.__array__()), state))
        next_state = list(map(lambda x: torch.from_numpy(x.__array__()), next_state))
        state, next_state, action, reward, done = map(torch.stack, [state, next_state, action, reward, done])
        return prep_observation_for_qnet(state), prep_observation_for_qnet(next_state), action.squeeze().cuda(), reward.squeeze().cuda(), done.squeeze().cuda()
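
    # For reference (an assumption based on the wrapper stack and defaults used below, not
    # something the gist states): sample() returns state/next_state as float32 CUDA tensors of
    # shape (32, 4, 84, 84) scaled to [0, 1], and action/reward/done as CUDA tensors of shape (32,).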

    def td_estimate(self, state, action):
        """ Compute the TD estimate: Q_policy(s, a) for the actions that were actually taken """
        return self.q_policy(state)[np.arange(self.batch_size), action]

    @torch.no_grad()
    def td_target(self, reward, next_state, done):
        """ Compute the TD target (Double-DQN): r + gamma * Q_target(s', argmax_a Q_policy(s', a)) """
        best_action = torch.argmax(self.q_policy(next_state), dim=1)
        next_Q = self.q_target(next_state)[np.arange(self.batch_size), best_action]
        return reward + self.gamma * next_Q * (1 - done.float())
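
    # Note on the Double-DQN target above: the *policy* network selects the next action
    # (argmax over q_policy) while the *target* network evaluates it, i.e.
    #     y = r + gamma * Q_target(s', argmax_a Q_policy(s', a)) * (1 - done)
    # which reduces the overestimation bias of the vanilla DQN target
    #     y = r + gamma * max_a Q_target(s', a) * (1 - done).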

    def update_q_policy(self, td_estimate, td_target):
        """ Perform one gradient update step on q_policy """
        loss = self.loss_fn(td_estimate, td_target)
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        return loss.item()

    def sync_Q_target(self):
        """ Sync the target network with the policy network """
        print('Synced q_target')
        self.q_target.load_state_dict(self.q_policy.state_dict())

    def learn(self):
        """ Update the online action-value (Q) function with a batch of experiences """
        if self.game_frame < self.burnin:
            print('.', end='')
            return None, None
        if self.game_frame % self.train_every != 0:
            return None, None

        # Sample from memory
        state, next_state, action, reward, done = self.sample()

        # Get TD estimate and TD target
        td_est = self.td_estimate(state, action)
        td_tgt = self.td_target(reward, next_state, done)

        # Backpropagate loss through q_policy
        loss = self.update_q_policy(td_est, td_tgt)
        self.train_step += 1
        return td_est.mean().item(), loss
env_name = 'BreakoutNoFrameskip-v4'
env = gym.make(env_name)
rec_wrapper1 = RecorderWrapper(env, fps=4*60) # record original frames
env = SkipFrame(rec_wrapper1, skip=4)
env = GrayScaleObservation(env)
env = ResizeObservation(env, shape=84)
rec_wrapper2 = RecorderWrapper(env, fps=4*60//4) # record preprocessed frames
env = FrameStack(rec_wrapper2, num_stack=4)
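
# Sanity check (an assumption about the custom wrappers imported from utils, not something the
# gist asserts): after SkipFrame(4), GrayScaleObservation, ResizeObservation(84) and FrameStack(4),
# env.reset() should return LazyFrames of shape (4, 84, 84), which is what state_dim below expects.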

config = dict(eps_initial=1, eps_min=0.1, eps_decay_over=1e6, buffer_size=1e6,
              batch_size=32, gamma=0.99, burnin=5e4, train_every=4, sync_every=1e4, lr=0.00025)
dqn = DQN(state_dim=(4, 84, 84), action_dim=env.action_space.n, **config)
episode_rewards = deque(maxlen=1000)
for episode in itertools.count(0, 1):
    done = False
    state = env.reset()
    ep_reward = 0
    while not done:
        action = dqn.act(state)
        next_state, reward, done, info = env.step(action)
        dqn.game_frame += 1
        ep_reward += reward

        # treat losing a life as the end of the episode (Breakout starts with 5 lives)
        if info['ale.lives'] < 5:
            done = True

        # clip rewards to {-1, 0, +1} via their sign
        clipped_reward = 0 if reward == 0 else reward / abs(reward)
        dqn.cache(state, next_state, action, clipped_reward, done)
        q, loss = dqn.learn()
        state = next_state

    # track the unclipped per-episode return
    episode_rewards.append(ep_reward)
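
# Optional monitoring sketch (not part of the original training loop; it assumes you interrupt
# the infinite loop above, e.g. in a notebook, before plotting) using the matplotlib import:
#   plt.plot(list(episode_rewards))
#   plt.xlabel('episode'); plt.ylabel('return'); plt.show()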