import copy
import itertools, random
from collections import deque
from datetime import datetime
from pathlib import Path

import gym
from gym.wrappers import FrameStack
from gym.spaces import Box
import numpy as np
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torchsummary import summary
from torchvision import transforms as T
from matplotlib import pyplot as plt

from utils import *
def prep_observation_for_qnet(tensor):
    """ Transfer the tensor to the GPU and normalize it into the range [0, 1] """
    return tensor.to('cuda', dtype=torch.float32) / 255
class QNetwork(nn.Module):
    def __init__(self, depth, actions):
        super().__init__()
        self.main = nn.Sequential(
            nn.Conv2d(in_channels=depth, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, actions),
        )

    def forward(self, x):
        return self.main(x)
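
# Shape sanity check (illustrative note, not part of the original gist): with
# 84x84 inputs the three conv layers above produce 64 feature maps of size 7x7,
# i.e. 64 * 7 * 7 = 3136 features after Flatten, which is why the first Linear
# layer expects 3136 inputs. Uncomment to verify:
# with torch.no_grad():
#     assert QNetwork(4, 4).main[:7](torch.zeros(1, 4, 84, 84)).shape == (1, 3136)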
class DQN:
    def __init__(self, state_dim, action_dim,
                 eps_initial=1, eps_min=0.1, eps_decay_over=1e6, buffer_size=1e6, batch_size=32, gamma=0.99,
                 burnin=5e4, train_every=4, sync_every=1e4, lr=0.00025):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.q_policy = QNetwork(state_dim[0], action_dim).cuda()
        self.q_target = copy.deepcopy(self.q_policy)

        self.eps_initial = eps_initial
        self.eps_min = eps_min
        self.eps_decay_over = eps_decay_over

        self.train_step = 0
        self.game_frame = 0

        self.memory = deque(maxlen=int(buffer_size))
        self.batch_size = batch_size
        self.gamma = gamma
        self.burnin = burnin  # min. experiences before training
        self.train_every = train_every  # no. of experiences between updates to q_policy
        self.sync_every = int(sync_every)  # no. of experiences between q_policy & q_target sync

        self.opt = torch.optim.RMSprop(self.q_policy.parameters(), lr=lr, alpha=0.95, momentum=0.95, eps=0.01)
        # self.opt = torch.optim.Adam(self.q_policy.parameters(), lr=lr)
        self.loss_fn = torch.nn.SmoothL1Loss()
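    # Note (added, not part of the original gist): these RMSprop settings
    # (lr=0.00025, alpha=0.95, momentum=0.95, eps=0.01) correspond to the
    # hyperparameters reported for the Nature DQN (Mnih et al., 2015);
    # the commented-out Adam line is a common alternative.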
    def act(self, state):
        """ Computes an epsilon-greedy step with respect to the current policy self.q_policy """
        # sync the q target every self.sync_every frames
        # see https://github.com/spragunr/deep_q_rl/blob/master/deep_q_rl/launcher.py#L155
        if self.game_frame % self.sync_every == 0:
            self.sync_Q_target()

        # compute epsilon (https://openai.com/blog/openai-baselines-dqn/)
        # 1. skip the first ~50000 steps while we are filling the ER-buffer
        # 2. then linearly decay from 1 to 0.1 during the first 1e6 steps
        # 3. then linearly decay from 0.1 to 0.01 during the next 24 million steps
        if self.game_frame < self.burnin:
            self.eps = self.eps_initial
        elif self.game_frame - self.burnin < self.eps_decay_over:
            self.eps = max(self.eps_initial - (self.eps_initial - self.eps_min) * (self.game_frame - self.burnin) / self.eps_decay_over, self.eps_min)
        else:
            self.eps = max(self.eps_min - (self.eps_min - 0.01) * (self.game_frame - self.burnin - self.eps_decay_over) / 24_000_000, 0.01)
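        # Worked example (added, not part of the original gist): with burnin=5e4 and
        # eps_decay_over=1e6, at game_frame = 550_000 we are 500_000 frames past
        # burn-in, so eps = 1 - (1 - 0.1) * 500_000 / 1_000_000 = 0.55.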
        # return either random or best action
        if np.random.rand() < self.eps:
            return np.random.randint(self.action_dim)
        else:
            with torch.no_grad():
                state = torch.from_numpy(state.__array__()).unsqueeze(0)
                action_values = self.q_policy(prep_observation_for_qnet(state))
                return torch.argmax(action_values, dim=1).item()
    def cache(self, state, next_state, action, reward, done):
        """ Store the experience in the ER buffer """
        action = torch.tensor([action])
        reward = torch.tensor([reward])
        done = torch.tensor([done])
        self.memory.append((state, next_state, action, reward, done,))

    def sample(self):
        """ Sample a minibatch from the ER buffer (also converts the FrameStack'ed LazyFrames to contiguous tensors) """
        batch = random.sample(self.memory, self.batch_size)
        state, next_state, action, reward, done = zip(*batch)
        state = list(map(lambda x: torch.from_numpy(x.__array__()), state))
        next_state = list(map(lambda x: torch.from_numpy(x.__array__()), next_state))
        state, next_state, action, reward, done = map(torch.stack, [state, next_state, action, reward, done])
        return prep_observation_for_qnet(state), prep_observation_for_qnet(next_state), action.squeeze().cuda(), reward.squeeze().cuda(), done.squeeze().cuda()
    def td_estimate(self, state, action):
        """ Compute the TD estimate Q_policy(s, a) for the actions that were actually taken """
        return self.q_policy(state)[torch.arange(self.batch_size), action]

    @torch.no_grad()
    def td_target(self, reward, next_state, done):
        """ Compute the TD target (Double-DQN) """
        best_action = torch.argmax(self.q_policy(next_state), dim=1)
        next_Q = self.q_target(next_state)[torch.arange(self.batch_size), best_action]
        return reward + self.gamma * next_Q * (1 - done.float())
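    # Note (added, not part of the original gist): the Double-DQN target is
    #     y = r + gamma * (1 - done) * Q_target(s', argmax_a Q_policy(s', a)),
    # i.e. q_policy selects the next action while q_target evaluates it, which
    # reduces the overestimation bias of the vanilla DQN target.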
    def update_q_policy(self, td_estimate, td_target):
        """ Perform update step """
        loss = self.loss_fn(td_estimate, td_target)
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        return loss.item()

    def sync_Q_target(self):
        """ Sync target network """
        print('Synced q_target')
        self.q_target.load_state_dict(self.q_policy.state_dict())
    def learn(self):
        """ Update online action value (Q) function with a batch of experiences """
        if self.game_frame < self.burnin:
            print('.', end='')
            return None, None

        if self.game_frame % self.train_every != 0:
            return None, None

        # Sample from memory
        state, next_state, action, reward, done = self.sample()

        # Get TD Estimate
        td_est = self.td_estimate(state, action)

        # Get TD Target
        td_tgt = self.td_target(reward, next_state, done)

        # Backpropagate loss through q_policy
        loss = self.update_q_policy(td_est, td_tgt)

        self.train_step += 1
        return (td_est.mean().item(), loss)
env_name = 'BreakoutNoFrameskip-v4'
env = gym.make(env_name)
rec_wrapper1 = RecorderWrapper(env, fps=4*60)  # record original frames
env = SkipFrame(rec_wrapper1, skip=4)
env = GrayScaleObservation(env)
env = ResizeObservation(env, shape=84)
rec_wrapper2 = RecorderWrapper(env, fps=4*60//4)  # record preprocessed frames
env = FrameStack(rec_wrapper2, num_stack=4)
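
# Note (added, not part of the original gist): RecorderWrapper, SkipFrame,
# GrayScaleObservation and ResizeObservation are assumed to come from the local
# `utils` module imported above. The wrapper stack (skip 4 frames, grayscale,
# resize to 84x84, stack 4 frames) yields LazyFrames observations of shape
# (4, 84, 84), which is why state_dim=(4, 84, 84) is passed to the DQN below.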
config = dict(eps_initial=1, eps_min=0.1, eps_decay_over=1e6, buffer_size=1e6,
              batch_size=32, gamma=0.99, burnin=5e4, train_every=4, sync_every=1e4, lr=0.00025)
dqn = DQN(state_dim=(4, 84, 84), action_dim=env.action_space.n, **config)

episode_rewards = deque(maxlen=1000)
for episode in itertools.count(0, 1):
    done = False
    state = env.reset()
    ep_reward = 0
    while not done:
        action = dqn.act(state)
        next_state, reward, done, info = env.step(action)
        dqn.game_frame += 1
        ep_reward += reward
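        # Note (added, not part of the original gist): Breakout starts with 5 lives,
        # so the check below treats the first lost life as the end of an episode,
        # and rewards are clipped to their sign (-1, 0, +1) before being cached.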
        if info['ale.lives'] < 5:
            done = True

        clipped_reward = 0 if reward == 0 else reward / abs(reward)
        dqn.cache(state, next_state, action, clipped_reward, done)
        q, loss = dqn.learn()
        state = next_state