import copy
import itertools, random
from collections import deque
from datetime import datetime
from pathlib import Path

import gym
from gym.wrappers import FrameStack
from gym.spaces import Box
import numpy as np
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torchsummary import summary
from torchvision import transforms as T
from matplotlib import pyplot as plt

from utils import *
def prep_observation_for_qnet(tensor):
    """ Transfer the tensor to the GPU and normalize it into the range [0, 1] """
    return tensor.to('cuda', dtype=torch.float32) / 255
class QNetwork(nn.Module):
    def __init__(self, depth, actions):
        super().__init__()
        self.main = nn.Sequential(
            nn.Conv2d(in_channels=depth, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, actions),
        )

    def forward(self, x):
        return self.main(x)
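
# Shape sanity check (illustrative note, not part of the original gist): with
# 84x84 inputs the three conv layers above produce 64 feature maps of size 7x7,
# i.e. 64 * 7 * 7 = 3136 features after Flatten, which is why the first Linear
# layer expects 3136 inputs. Uncomment to verify:
# with torch.no_grad():
#     assert QNetwork(4, 4).main[:7](torch.zeros(1, 4, 84, 84)).shape == (1, 3136)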
class DQN:
    def __init__(self, state_dim, action_dim,
                 eps_initial=1, eps_min=0.1, eps_decay_over=1e6, buffer_size=1e6, batch_size=32, gamma=0.99,
                 burnin=5e4, train_every=4, sync_every=1e4, lr=0.00025):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.q_policy = QNetwork(state_dim[0], action_dim).cuda()
        self.q_target = copy.deepcopy(self.q_policy)

        self.eps_initial = eps_initial
        self.eps_min = eps_min
        self.eps_decay_over = eps_decay_over

        self.train_step = 0
        self.game_frame = 0

        self.memory = deque(maxlen=int(buffer_size))
        self.batch_size = batch_size
        self.gamma = gamma
        self.burnin = burnin  # min. experiences before training
        self.train_every = train_every  # no. of experiences between updates to q_policy
        self.sync_every = int(sync_every)  # no. of experiences between q_policy & q_target sync

        self.opt = torch.optim.RMSprop(self.q_policy.parameters(), lr=lr, alpha=0.95, momentum=0.95, eps=0.01)
        # self.opt = torch.optim.Adam(self.q_policy.parameters(), lr=lr)
        self.loss_fn = torch.nn.SmoothL1Loss()
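    # Note (added, not part of the original gist): these RMSprop settings
    # (lr=0.00025, alpha=0.95, momentum=0.95, eps=0.01) correspond to the
    # hyperparameters reported for the Nature DQN (Mnih et al., 2015);
    # the commented-out Adam line is a common alternative.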
    def act(self, state):
        """ Computes an epsilon-greedy step with respect to the current policy self.q_policy """
        # sync the q target every self.sync_every frames
        # see https://github.com/spragunr/deep_q_rl/blob/master/deep_q_rl/launcher.py#L155
        if self.game_frame % self.sync_every == 0:
            self.sync_Q_target()

        # compute epsilon (https://openai.com/blog/openai-baselines-dqn/)
        # 1. skip the first ~50000 steps while we are filling the ER-buffer
        # 2. then linearly decay from 1 to 0.1 during the first 1e6 steps
        # 3. then linearly decay from 0.1 to 0.01 during the next 24 million steps
        if self.game_frame < self.burnin:
            self.eps = self.eps_initial
        elif self.game_frame - self.burnin < self.eps_decay_over:
            self.eps = max(self.eps_initial - (self.eps_initial - self.eps_min) * (self.game_frame - self.burnin) / self.eps_decay_over, self.eps_min)
        else:
            self.eps = max(self.eps_min - (self.eps_min - 0.01) * (self.game_frame - self.burnin - self.eps_decay_over) / 24_000_000, 0.01)
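        # Worked example (added, not part of the original gist): with burnin=5e4 and
        # eps_decay_over=1e6, at game_frame = 550_000 we are 500_000 frames past
        # burn-in, so eps = 1 - (1 - 0.1) * 500_000 / 1_000_000 = 0.55.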
        # return either random or best action
        if np.random.rand() < self.eps:
            return np.random.randint(self.action_dim)
        else:
            with torch.no_grad():
                state = torch.from_numpy(state.__array__()).unsqueeze(0)
                action_values = self.q_policy(prep_observation_for_qnet(state))
                return torch.argmax(action_values, dim=1).item()
    def cache(self, state, next_state, action, reward, done):
        """ Store the experience in the ER buffer """
        action = torch.tensor([action])
        reward = torch.tensor([reward])
        done = torch.tensor([done])
        self.memory.append((state, next_state, action, reward, done,))

    def sample(self):
        """ Sample a minibatch from the ER buffer (also converts the FrameStack'ed LazyFrames to contiguous tensors) """
        batch = random.sample(self.memory, self.batch_size)
        state, next_state, action, reward, done = zip(*batch)
        state = list(map(lambda x: torch.from_numpy(x.__array__()), state))
        next_state = list(map(lambda x: torch.from_numpy(x.__array__()), next_state))
        state, next_state, action, reward, done = map(torch.stack, [state, next_state, action, reward, done])
        return prep_observation_for_qnet(state), prep_observation_for_qnet(next_state), action.squeeze().cuda(), reward.squeeze().cuda(), done.squeeze().cuda()
    def td_estimate(self, state, action):
        """ Compute the TD estimate Q_policy(s, a) for the actions that were actually taken """
        return self.q_policy(state)[torch.arange(self.batch_size), action]

    @torch.no_grad()
    def td_target(self, reward, next_state, done):
        """ Compute the TD target (Double-DQN) """
        best_action = torch.argmax(self.q_policy(next_state), dim=1)
        next_Q = self.q_target(next_state)[torch.arange(self.batch_size), best_action]
        return reward + self.gamma * next_Q * (1 - done.float())
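    # Note (added, not part of the original gist): the Double-DQN target is
    #     y = r + gamma * (1 - done) * Q_target(s', argmax_a Q_policy(s', a)),
    # i.e. q_policy selects the next action while q_target evaluates it, which
    # reduces the overestimation bias of the vanilla DQN target.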
    def update_q_policy(self, td_estimate, td_target):
        """ Perform update step """
        loss = self.loss_fn(td_estimate, td_target)
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        return loss.item()

    def sync_Q_target(self):
        """ Sync target network """
        print('Synced q_target')
        self.q_target.load_state_dict(self.q_policy.state_dict())
    def learn(self):
        """ Update online action value (Q) function with a batch of experiences """
        if self.game_frame < self.burnin:
            print('.', end='')
            return None, None

        if self.game_frame % self.train_every != 0:
            return None, None

        # Sample from memory
        state, next_state, action, reward, done = self.sample()

        # Get TD Estimate
        td_est = self.td_estimate(state, action)

        # Get TD Target
        td_tgt = self.td_target(reward, next_state, done)

        # Backpropagate loss through q_policy
        loss = self.update_q_policy(td_est, td_tgt)

        self.train_step += 1
        return (td_est.mean().item(), loss)
env_name = 'BreakoutNoFrameskip-v4'
env = gym.make(env_name)
rec_wrapper1 = RecorderWrapper(env, fps=4*60)  # record original frames
env = SkipFrame(rec_wrapper1, skip=4)
env = GrayScaleObservation(env)
env = ResizeObservation(env, shape=84)
rec_wrapper2 = RecorderWrapper(env, fps=4*60//4)  # record preprocessed frames
env = FrameStack(rec_wrapper2, num_stack=4)
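
# Note (added, not part of the original gist): RecorderWrapper, SkipFrame,
# GrayScaleObservation and ResizeObservation are assumed to come from the local
# `utils` module imported above. The wrapper stack (skip 4 frames, grayscale,
# resize to 84x84, stack 4 frames) yields LazyFrames observations of shape
# (4, 84, 84), which is why state_dim=(4, 84, 84) is passed to the DQN below.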
config = dict(eps_initial=1, eps_min=0.1, eps_decay_over=1e6, buffer_size=1e6,
              batch_size=32, gamma=0.99, burnin=5e4, train_every=4, sync_every=1e4, lr=0.00025)
dqn = DQN(state_dim=(4, 84, 84), action_dim=env.action_space.n, **config)

episode_rewards = deque(maxlen=1000)
for episode in itertools.count(0, 1):
    done = False
    state = env.reset()
    ep_reward = 0
    while not done:
        action = dqn.act(state)
        next_state, reward, done, info = env.step(action)
        dqn.game_frame += 1
        ep_reward += reward
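        # Note (added, not part of the original gist): Breakout starts with 5 lives,
        # so the check below treats the first lost life as the end of an episode,
        # and rewards are clipped to their sign (-1, 0, +1) before being cached.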
        if info['ale.lives'] < 5:
            done = True

        clipped_reward = 0 if reward == 0 else reward / abs(reward)
        dqn.cache(state, next_state, action, clipped_reward, done)
        q, loss = dqn.learn()
        state = next_state