# -*- coding: utf-8 -*- #
"""*********************************************************************************************"""
# FileName [ atari_wrapper.py ]
# Synopsis [ environment wrapper for atari ]
# Author [ Ting-Wei Liu (Andi611) ]
# Copyright [ Copyleft(c), NTUEE, NTU, Taiwan ]
# Reference [ github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py ]
"""*********************************************************************************************"""
import cv2
import gym
import numpy as np
from gym import spaces
from collections import deque
class NoopResetEnv(gym.Wrapper):
def __init__(self, env, noop_max=30):
"""Sample initial states by taking random number of no-ops on reset.
No-op is assumed to be action 0.
"""
gym.Wrapper.__init__(self, env)
self.noop_max = noop_max
self.override_num_noops = None
if isinstance(env.action_space, gym.spaces.MultiBinary):
self.noop_action = np.zeros(self.env.action_space.n, dtype=np.int64)
else:
# used for atari environments
self.noop_action = 0
assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
def _reset(self, **kwargs):
""" Do no-op action for a number of steps in [1, noop_max]."""
self.env.reset(**kwargs)
if self.override_num_noops is not None:
noops = self.override_num_noops
else:
noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101
assert noops > 0
obs = None
for _ in range(noops):
obs, _, done, _ = self.env.step(self.noop_action)
if done:
obs = self.env.reset(**kwargs)
return obs
class FireResetEnv(gym.Wrapper):
def __init__(self, env):
"""Take action on reset for environments that are fixed until firing."""
gym.Wrapper.__init__(self, env)
assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
assert len(env.unwrapped.get_action_meanings()) >= 3
def _reset(self, **kwargs):
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(1)
if done:
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(2)
if done:
self.env.reset(**kwargs)
return obs
class EpisodicLifeEnv(gym.Wrapper):
def __init__(self, env):
"""Make end-of-life == end-of-episode, but only reset on true game over.
Done by DeepMind for the DQN and co. since it helps value estimation.
"""
gym.Wrapper.__init__(self, env)
self.lives = 0
self.was_real_done = True
def _step(self, action):
obs, reward, done, info = self.env.step(action)
self.was_real_done = done
# check current lives, make loss of life terminal,
# then update lives to handle bonus lives
lives = self.env.unwrapped.ale.lives()
if lives < self.lives and lives > 0:
# for Qbert we sometimes stay in the lives == 0 condition for a few frames,
# so it's important to keep lives > 0, so that we only reset once
# the environment advertises done.
done = True
self.lives = lives
return obs, reward, done, info
def _reset(self, **kwargs):
"""Reset only when lives are exhausted.
This way all states are still reachable even though lives are episodic,
and the learner need not know about any of this behind-the-scenes.
"""
if self.was_real_done:
obs = self.env.reset(**kwargs)
else:
# no-op step to advance from terminal/lost life state
obs, _, _, _ = self.env.step(0)
self.lives = self.env.unwrapped.ale.lives()
return obs
class MaxAndSkipEnv(gym.Wrapper):
def __init__(self, env, skip=4):
"""Return only every `skip`-th frame"""
gym.Wrapper.__init__(self, env)
# most recent raw observations (for max pooling across time steps)
self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype='uint8')
self._skip = skip
def _step(self, action):
"""Repeat action, sum reward, and max over last observations."""
total_reward = 0.0
done = None
for i in range(self._skip):
obs, reward, done, info = self.env.step(action)
if i == self._skip - 2: self._obs_buffer[0] = obs
if i == self._skip - 1: self._obs_buffer[1] = obs
total_reward += reward
if done:
break
# Note that the observation on the done=True frame
# doesn't matter
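# (Max-pooling over the last two raw frames removes the flicker that comes from
# the Atari 2600 drawing some sprites only on alternating frames, the same trick
# used in the DQN Nature paper.)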
max_frame = self._obs_buffer.max(axis=0)
return max_frame, total_reward, done, info
class ClipRewardEnv(gym.RewardWrapper):
def _reward(self, reward):
"""Bin reward to {+1, 0, -1} by its sign."""
return np.sign(reward)
class WarpFrame(gym.ObservationWrapper):
def __init__(self, env):
"""Warp frames to 84x84 as done in the Nature paper and later work."""
gym.ObservationWrapper.__init__(self, env)
self.width = 84
self.height = 84
self.observation_space = spaces.Box(low=0, high=255, shape=(self.height, self.width, 1))
def _observation(self, frame):
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
return frame[:, :, None]
class FrameStack(gym.Wrapper):
def __init__(self, env, k):
"""Stack k last frames.
Returns lazy array, which is much more memory efficient.
See Also
--------
baselines.common.atari_wrappers.LazyFrames
"""
gym.Wrapper.__init__(self, env)
self.k = k
self.frames = deque([], maxlen=k)
shp = env.observation_space.shape
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k))
def _reset(self):
ob = self.env.reset()
for _ in range(self.k):
self.frames.append(ob)
return self._get_ob()
def _step(self, action):
ob, reward, done, info = self.env.step(action)
self.frames.append(ob)
return self._get_ob(), reward, done, info
def _get_ob(self):
assert len(self.frames) == self.k
return LazyFrames(list(self.frames))
class ScaledFloatFrame(gym.ObservationWrapper):
def _observation(self, observation):
# careful! This undoes the memory optimization, use
# with smaller replay buffers only.
return np.array(observation).astype(np.float32) / 255.0
class LazyFrames(object):
def __init__(self, frames):
"""This object ensures that common frames between the observations are only stored once.
It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
buffers.
This object should only be converted to numpy array before being passed to the model.
You'd not believe how complex the previous solution was."""
self._frames = frames
def __array__(self, dtype=None):
out = np.concatenate(self._frames, axis=2)
if dtype is not None:
out = out.astype(dtype)
return out
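# Rough arithmetic behind this optimization (my estimate, not part of the original comment):
# storing 1M transitions of stacked uint8 84x84x4 observations densely costs about
# 2 * 1e6 * 84*84*4 bytes (s and s' per transition), i.e. roughly 56 GB, whereas sharing
# frame references keeps only about one new 84x84 frame per stored step.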
def make_atari(env_id):
env = gym.make(env_id)
assert 'NoFrameskip' in env.spec.id
env = NoopResetEnv(env, noop_max=30)
env = MaxAndSkipEnv(env, skip=4)
return env
def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False):
"""Configure environment for DeepMind-style Atari.
"""
if episode_life:
env = EpisodicLifeEnv(env)
if 'FIRE' in env.unwrapped.get_action_meanings():
env = FireResetEnv(env)
env = WarpFrame(env)
if scale:
env = ScaledFloatFrame(env)
if clip_rewards:
env = ClipRewardEnv(env)
if frame_stack:
env = FrameStack(env, 4)
return env
def make_wrap_atari(env_id='BreakoutNoFrameskip-v3', clip_rewards=True): # default must be a NoFrameskip variant to pass the assert in make_atari()
env = make_atari(env_id)
return wrap_deepmind(env, clip_rewards=clip_rewards, frame_stack=True, scale=True)
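# ----------------------------------------------------------------------------
# Minimal usage sketch (added for illustration, not part of the original gist).
# It assumes an older gym release (roughly < 0.10) with atari-py installed,
# since the wrappers above rely on the legacy _reset/_step/_observation/_reward hooks.
if __name__ == '__main__':
    env = make_wrap_atari('BreakoutNoFrameskip-v3')
    obs = env.reset()                # a LazyFrames object
    print(np.array(obs).shape)       # expected: (84, 84, 4), float32 values in [0, 1]
    obs, reward, done, info = env.step(env.action_space.sample())
    env.close()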
#!python3
# ---- second file of the gist: the DQN training script; it imports the wrapper module above as atari_wrapper ----
from __future__ import print_function
from atari_wrapper import make_wrap_atari
import gym
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
import tensorflow as tf
from tensorflow.python import debug as tf_debug
class StepInformation:
def __init__(self, observation, action, reward, done, obs_):
self.observation = observation
self.action = action
self.reward = reward
self.done = done
self.obs_ = obs_
class LearnBreakout:
def __init__(self):
self.render = True # Seeing is fun! :)
# This environment wrapper handles a lot of the preprocessing needed for this learning task:
# it grayscales and resizes every frame to 84x84 (and scales pixel values to [0, 1]),
# and it stacks the last 4 observations into an array, producing a final observation shape of (84, 84, 4).
self.env = make_wrap_atari('BreakoutNoFrameskip-v3')
self.max_steps = 1e7
self.global_step = 0
# Neural-net libraries natively support batched updates, so parametrize the batch size for easy control
self.batch_size = 32
# Convenient to have this here to know how large a single observation is.
self.state_size = 84 * 84 * 4
# the actions the agent chooses among: for Breakout these are the first three (NOOP, FIRE, RIGHT)
self.num_actions = 3
# the epsilon value for explore-exploit strategy
self.epsilon = 1.
self.epsilon_decay = (self.epsilon-0.07)/1e6 #Linear decay
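# With these numbers epsilon anneals linearly from 1.0 down to 0.07 over roughly
# the first 1e6 environment steps ((1.0 - 0.07) / 1e6 per step), after which the
# decay check in train() keeps it at 0.07.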
self.gamma = 0.99 # Discount value for future rewards
# Since we learn in batches, we need to collect at least batch_size samples and run them through the net
self.memory_size = self.batch_size
# The array that stores the observations.
# We can do this in many ways, but one simple way is to store all the info into an object.
self.memory = np.array([None] * self.memory_size)
# The main model tf variables
# We can run the model to compute self.q_predict by providing self.observation_placeholder
self.observation_placeholder, self.q_predict = self.createModel("eval")
# We can use the same Q Net to predict the target as well as our current prediction.
# However, this can be overridden below when we use a fixed target network.
self.target_observ_placeholder, self.q_target_predict = self.observation_placeholder, self.q_predict
# The session we'll run our TF Graph in.
self.tf_sess = tf.Session()
# For debugging
## self.tf_sess = tf_debug.LocalCLIDebugWrapperSession(self.tf_sess)
# Our objective is to learn the Q function Q(s,a) the value of taking an action a, in state s.
# The approach is to use our neural net to learn some weights such that given the state as a stack of four frames,
# we get an estimate of Q(s,a) for all possible a.
# We'll do this by trying to converge the prediction to what should be the "true" value.
# The prediction will be our neural net's answer.
# The true value is the reward for taking action a from state s in the environment, plus the highest possible
# Q value attainable from the successor state s', discounted by gamma.
# So Prediction: self.q_predict
# Target: reward + gamma * max_{a*} Q(s',a*)
# To converge these two, we will use the squared loss function and run them through the RMSProp optimizer to find the right weights.
# After this for any state s, we can compute Q(s,a) for all a, and pick the a that produces the max.
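# A concrete instance of the target with made-up numbers: if reward = 1, gamma = 0.99
# and max_{a*} Q(s', a*) = 2.5, then the target for the action taken is
# 1 + 0.99 * 2.5 = 3.475; for a terminal transition the target is just the reward.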
self.q_target = tf.placeholder(tf.float32, [None, self.num_actions], name='Q_target')
self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_predict, name='Q_dist'))
# The optimizer that we'll use to minimize the loss
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.99)
self.trainer = optimizer.minimize(self.loss)
## Metrics
self.save_dir = "data/"
self.rewards_path = os.path.join(self.save_dir, "rewards")
self.train_rewards = []
self.evaluate_rewards = []
self.evaluate_every_n_episodes = 100
self.save_rewards_every_n_episodes = self.evaluate_every_n_episodes * 5
## Model persistence
self.model_save_path = os.path.join(self.save_dir, "model", "model.chkpt")
self.saver = tf.train.Saver()
self.save_model_every_n_episodes = 300
## Enhancements
## Don't forget previous experiences, works with a large memory buffer.
self.use_experience_replay = True
if self.use_experience_replay:
self.memory_size = 10000
self.memory = np.array([None] * self.memory_size)
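# (The usual motivation: sampling uniformly from a large buffer breaks the temporal
# correlation between consecutive transitions and lets each experience be reused
# across many updates.)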
# Fixed Target Network
# To avoid chasing a moving target, keep the target predictor's weights fixed for a number of steps.
self.use_fixed_target_network = True
self.update_target_network_weights_every_n_steps = 1000
if self.use_fixed_target_network:
self.target_observ_placeholder, self.q_target_predict = self.createModel("target")
target_weights = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target")
eval_weights = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="eval")
with tf.variable_scope('update_weights'):
self.update_target_weights = [tf.assign(t, e) for t, e in zip(target_weights, eval_weights)]
# Whether to use Double Q-Learning. Note this has to be used in conjunction with the Fixed Target Network.
# The idea here is that the max operator both selects and evaluates an action using the same (noisy) estimates,
# which tends to bias the target predictions upward.
# Therefore, let the eval network, whose weights change constantly, pick the argmax action,
# while still keeping the target stable by taking the Q value for this chosen action from the
# target network.
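# In symbols, the target used in learn() when use_ddqn is on:
#   a*       = argmax_a Q_eval(s', a)
#   q_target = r + (1 - done) * gamma * Q_target(s', a*)
# whereas plain DQN uses max_a Q_target(s', a) for the second term.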
self.use_ddqn = self.use_fixed_target_network and True
def update_target_network_weights(self):
self.tf_sess.run(self.update_target_weights)
def createModel(self, tag):
# Shape is defined as follows:
# First dimension for the item in a batch: allows for batching
# The next 3 are the actual observation of shape 84x84x4
with tf.variable_scope(tag):
obs_pl = tf.placeholder(shape=(None,84,84,4), dtype=tf.float32, name="obs")
conv1 = tf.layers.conv2d(inputs=obs_pl, filters=32, kernel_size=(8, 8), strides=(4,4), activation='relu')
conv2 = tf.layers.conv2d(inputs=conv1, filters=64, kernel_size=(4, 4), strides=(2,2), activation='relu')
conv3 = tf.layers.conv2d(inputs=conv2, filters=32, kernel_size=(2, 2), strides=(1,1), activation='relu')
flat = tf.contrib.layers.flatten(conv3)
dense = tf.contrib.layers.fully_connected(flat, 512)
# The shape of this final layer is always (batch_size_input_to_layer, self.num_actions)
# This is because all the layers here actually take in an intrinsic batch dimension as the first one.
actions_pred = tf.contrib.layers.fully_connected(dense, self.num_actions)
return obs_pl, actions_pred
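# Shape trace for an (84, 84, 4) input with the tf.layers default 'valid' padding:
# conv1 8x8/4 -> 20x20x32, conv2 4x4/2 -> 9x9x64, conv3 2x2/1 -> 8x8x32,
# flatten -> 2048, dense -> 512, output -> num_actions.
# Note: tf.contrib.layers.fully_connected applies ReLU by default, so the Q-value head
# here is implicitly non-negative; passing activation_fn=None to the last layer would
# give the unconstrained linear output that is the more common choice for a Q head.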
def compute_action(self, observation, epsilon):
if np.random.uniform() > epsilon:
# Neural Net takes in [None, 84,84,4] but observation is only [84,84,4] so add dimension.
observation = np.expand_dims(observation, axis=0)
action = self.tf_sess.run(self.q_predict, feed_dict={self.observation_placeholder:observation})
return np.argmax(action)
else:
return np.random.randint(0, self.num_actions)
def store(self, observation, action, reward, done, obs_, step):
self.memory[(step % self.memory_size)] = StepInformation(observation, action, reward, done, obs_)
def get_from_memory(self, size_to_get):
# Sample from memory using a uniform distribution
# Get the indices from this distribution
sample_indices = np.random.choice(
np.minimum(self.global_step, self.memory_size),
size_to_get, replace=self.use_experience_replay)
sample = self.memory[sample_indices]
states = np.empty((size_to_get, 84,84,4))
rewards = np.empty(size_to_get)
actions = np.empty(size_to_get, dtype=np.int32)
done = np.empty(size_to_get)
states_ = np.empty((size_to_get, 84,84,4))
for idx, memory in enumerate(sample):
states[idx] = memory.observation
rewards[idx] = memory.reward
actions[idx] = memory.action
done[idx] = memory.done
states_[idx] = memory.obs_
return states, rewards, actions, done, states_
def learn(self):
# Get the samples from memory
# For each sample
# Compute max Q(s', *)
# Use the above to then compute the target: q_target
# Run the trainer using s and q_target to compute the loss and minimize it
states, rewards, actions, done, states_ = self.get_from_memory(self.batch_size)
# An array of indices to use in various functions to apply a function to each element in the batch
batch_indices = np.arange(self.batch_size, dtype=np.int32)
if self.use_ddqn:
# When using double DQN, we have to compute Q(s',a') using the eval net first for all a'
q_next_states_eval_net = self.tf_sess.run(self.q_predict,
feed_dict={self.observation_placeholder:states_})
# We then pick the index of the action that has the highest value
q_next_state_best_action = np.argmax(q_next_states_eval_net, axis=1)
# We then compute the target net's Q values Q(s',a') for all a', and pick the one for the best action chosen above
q_next_states_target_net = self.tf_sess.run(self.q_target_predict,
feed_dict={self.target_observ_placeholder:states_})
# Notice: there's no argmax here; we're just selecting a single action for each item in the batch.
q_next_max = q_next_states_target_net[batch_indices, q_next_state_best_action]
else:
# We predict the next states either based on the single Q network, or using a separate Fixed Network
# This is determined by what self.q_target_predict and self.target_observ_placeholder are set to in __init__
q_next_states = self.tf_sess.run(self.q_target_predict,
feed_dict={self.target_observ_placeholder:states_})
# q_next_states now contains for each sample in the batch, num_action values
# each num_action value corresponds to the Q value of taking that action
# For a given sample we only want to pick the maximum value for all action values
q_next_max = np.max(q_next_states, axis=1)
# Now we have to compute target, q_target for each of the samples in the batch.
# q_target = reward + (1-done) * gamma * q_next_max
q_target = rewards + ((1-done) * self.gamma * q_next_max) # 1-done: to handle end states, done = 1 for end states, 0 otherwise.
# q_target is now of shape (batch_size,), but since our neural network has num_actions final nodes,
# we need to reshape q_target to be of shape (batch_size, num_actions).
# But what values should we fill into the other num_actions - 1 entries? We only want to learn for the action that we have taken.
# Therefore, let's set all the other values to what was predicted by our network so that they don't contribute to the loss.
# NumPy is cool! We can express a for-loop over two variables in one line!
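# Illustration of that indexing with made-up numbers:
#   q = np.array([[0.1, 0.2], [0.3, 0.4]])
#   q[np.arange(2), [1, 0]]  ->  [0.2, 0.3]
# i.e. one (batch item, chosen action) entry is selected per sample.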
q_target_reshaped = self.tf_sess.run(self.q_predict, feed_dict={self.observation_placeholder:states})
q_target_reshaped[batch_indices, actions] = q_target
# At this point we have our target, but not the prediction.
# We can combine computing the prediction (feed-forward) with the loss minimization because of TF's awesomeness:
_, loss = self.tf_sess.run([self.trainer, self.loss], feed_dict={self.q_target:q_target_reshaped, self.observation_placeholder:states})
return loss
def train(self):
episode = 0
loss = 1000
# Initialize TF
self.tf_sess.run(tf.global_variables_initializer())
while self.global_step < self.max_steps:
done = False
observation = np.array(self.env.reset()) # initial observation
episode_reward = 0.0
while not done:
if self.epsilon - self.epsilon_decay >= 0.07:
self.epsilon -= self.epsilon_decay
action = self.compute_action(observation, self.epsilon)
obs_, reward, done, info = self.env.step(action)
episode_reward += reward
print('Step: %i, Episode: %i, Action: %i, Reward: %.0f, Epsilon: %.5f, Loss: %.5f' % (self.global_step, episode, action, reward, self.epsilon, loss), end='\r')
if self.render:
self.env.render()
obs_ = np.array(obs_) # since obs_ is a lazy frame
# store the transition in memory for later (batched) learning
self.store(observation, action, reward, int(done), obs_, self.global_step)
observation = obs_
self.global_step += 1
if self.global_step % self.batch_size == 0:
loss = self.learn()
episode += 1
print('Step: %i/%i, Episode: %i, Action: %i, Episode Reward: %.0f, Epsilon: %.2f, Loss: %.5f' % (self.global_step, self.max_steps, episode, action, episode_reward, self.epsilon, loss))
self.train_rewards.append(episode_reward)
if episode % self.evaluate_every_n_episodes == 0:
self.evaluate_rewards.append(self.evaluate())
if self.use_fixed_target_network and self.global_step % self.update_target_network_weights_every_n_steps == 0:
self.update_target_network_weights()
if episode % self.save_rewards_every_n_episodes == 0:
self.save_rewards()
if episode % self.save_model_every_n_episodes == 0:
self.saver.save(self.tf_sess, self.model_save_path)
def evaluate(self):
num_episodes = 10
episode_rewards = [0] * num_episodes
for i in range(num_episodes):
episode_reward = 0.0
obs = np.array(self.env.reset())
done = False
while not done:
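# epsilon=-1 forces the greedy branch in compute_action, since np.random.uniform() is always > -1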
action = self.compute_action(obs, epsilon=-1)
obs, reward, done, info = self.env.step(action)
episode_reward += reward
episode_rewards[i] = episode_reward
return sum(episode_rewards)/float(num_episodes)
def plot(self):
if np.sum(self.evaluate_rewards) == 0: self.evaluate_rewards = pickle.load(open(self.rewards_path + "/eval.pkl", 'rb'))
avg_rwd = []
for i in range(len(self.evaluate_rewards)):
if i < 30:
avg_rwd.append(np.mean(self.evaluate_rewards[:i]))
else:
avg_rwd.append(np.mean(self.evaluate_rewards[i-30:i]))
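# note: for i == 0 the slice evaluate_rewards[:i] is empty, so np.mean yields nan (with a
# RuntimeWarning); slicing with [:i+1] and [i-29:i+1] would avoid that.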
plt.plot(np.arange(len(avg_rwd)), avg_rwd)
plt.ylabel('Average Reward in Last 30 Episodes')
plt.xlabel('Number of Episodes')
plt.show()
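# note: save_rewards and saver.save assume the data/rewards and data/model directories
# already exist; nothing below creates them.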
def save_rewards(self):
pickle.dump(self.train_rewards, open(self.rewards_path + "/train.pkl", 'wb'), True)
pickle.dump(self.evaluate_rewards, open(self.rewards_path + "/eval.pkl", 'wb'), True)
learn = LearnBreakout()
learn.train()
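# After (or instead of) a training run, the saved evaluation rewards in data/rewards/eval.pkl
# can be plotted with learn.plot().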