# -*- coding: utf-8 -*- #
"""*********************************************************************************************"""
# FileName [ atari_wrapper.py ]
# Synopsis [ environment wrapper for atari ]
# Author [ Ting-Wei Liu (Andi611) ]
# Copyright [ Copyleft(c), NTUEE, NTU, Taiwan ]
# Reference [ github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py ]
"""*********************************************************************************************"""
import cv2
import gym
import numpy as np
from gym import spaces
from collections import deque
class NoopResetEnv(gym.Wrapper):
def __init__(self, env, noop_max=30):
"""Sample initial states by taking random number of no-ops on reset.
No-op is assumed to be action 0.
"""
gym.Wrapper.__init__(self, env)
self.noop_max = noop_max
self.override_num_noops = None
if isinstance(env.action_space, gym.spaces.MultiBinary):
self.noop_action = np.zeros(self.env.action_space.n, dtype=np.int64)
else:
# used for atari environments
self.noop_action = 0
assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
def _reset(self, **kwargs):
""" Do no-op action for a number of steps in [1, noop_max]."""
self.env.reset(**kwargs)
if self.override_num_noops is not None:
noops = self.override_num_noops
else:
noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101
assert noops > 0
obs = None
for _ in range(noops):
obs, _, done, _ = self.env.step(self.noop_action)
if done:
obs = self.env.reset(**kwargs)
return obs
class FireResetEnv(gym.Wrapper):
def __init__(self, env):
"""Take action on reset for environments that are fixed until firing."""
gym.Wrapper.__init__(self, env)
assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
assert len(env.unwrapped.get_action_meanings()) >= 3
def _reset(self, **kwargs):
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(1)
if done:
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(2)
if done:
self.env.reset(**kwargs)
return obs
class EpisodicLifeEnv(gym.Wrapper):
def __init__(self, env):
"""Make end-of-life == end-of-episode, but only reset on true game over.
Done by DeepMind for the DQN and co. since it helps value estimation.
"""
gym.Wrapper.__init__(self, env)
self.lives = 0
self.was_real_done = True
def _step(self, action):
obs, reward, done, info = self.env.step(action)
self.was_real_done = done
# check current lives, make loss of life terminal,
# then update lives to handle bonus lives
lives = self.env.unwrapped.ale.lives()
if lives < self.lives and lives > 0:
# for Qbert we sometimes stay in the lives == 0 condition for a few frames,
# so it's important to keep lives > 0, so that we only reset once
# the environment advertises done.
done = True
self.lives = lives
return obs, reward, done, info
def _reset(self, **kwargs):
"""Reset only when lives are exhausted.
This way all states are still reachable even though lives are episodic,
and the learner need not know about any of this behind-the-scenes.
"""
if self.was_real_done:
obs = self.env.reset(**kwargs)
else:
# no-op step to advance from terminal/lost life state
obs, _, _, _ = self.env.step(0)
self.lives = self.env.unwrapped.ale.lives()
return obs
class MaxAndSkipEnv(gym.Wrapper):
def __init__(self, env, skip=4):
"""Return only every `skip`-th frame"""
gym.Wrapper.__init__(self, env)
# most recent raw observations (for max pooling across time steps)
self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype='uint8')
self._skip = skip
def _step(self, action):
"""Repeat action, sum reward, and max over last observations."""
total_reward = 0.0
done = None
for i in range(self._skip):
obs, reward, done, info = self.env.step(action)
if i == self._skip - 2: self._obs_buffer[0] = obs
if i == self._skip - 1: self._obs_buffer[1] = obs
total_reward += reward
if done:
break
# Note that the observation on the done=True frame
# doesn't matter
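# (Max-pooling over the last two raw frames removes the flicker that comes from
# the Atari 2600 drawing some sprites only on alternating frames, the same trick
# used in the DQN Nature paper.)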
max_frame = self._obs_buffer.max(axis=0)
return max_frame, total_reward, done, info
class ClipRewardEnv(gym.RewardWrapper):
def _reward(self, reward):
"""Bin reward to {+1, 0, -1} by its sign."""
return np.sign(reward)
class WarpFrame(gym.ObservationWrapper):
def __init__(self, env):
"""Warp frames to 84x84 as done in the Nature paper and later work."""
gym.ObservationWrapper.__init__(self, env)
self.width = 84
self.height = 84
self.observation_space = spaces.Box(low=0, high=255, shape=(self.height, self.width, 1))
def _observation(self, frame):
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
return frame[:, :, None]
class FrameStack(gym.Wrapper):
def __init__(self, env, k):
"""Stack k last frames.
Returns lazy array, which is much more memory efficient.
See Also
--------
baselines.common.atari_wrappers.LazyFrames
"""
gym.Wrapper.__init__(self, env)
self.k = k
self.frames = deque([], maxlen=k)
shp = env.observation_space.shape
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k))
def _reset(self):
ob = self.env.reset()
for _ in range(self.k):
self.frames.append(ob)
return self._get_ob()
def _step(self, action):
ob, reward, done, info = self.env.step(action)
self.frames.append(ob)
return self._get_ob(), reward, done, info
def _get_ob(self):
assert len(self.frames) == self.k
return LazyFrames(list(self.frames))
class ScaledFloatFrame(gym.ObservationWrapper):
def _observation(self, observation):
# careful! This undoes the memory optimization, use
# with smaller replay buffers only.
return np.array(observation).astype(np.float32) / 255.0
class LazyFrames(object):
def __init__(self, frames):
"""This object ensures that common frames between the observations are only stored once.
It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
buffers.
This object should only be converted to numpy array before being passed to the model.
You'd not believe how complex the previous solution was."""
self._frames = frames
def __array__(self, dtype=None):
out = np.concatenate(self._frames, axis=2)
if dtype is not None:
out = out.astype(dtype)
return out
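# Rough arithmetic behind this optimization (my estimate, not part of the original comment):
# storing 1M transitions of stacked uint8 84x84x4 observations densely costs about
# 2 * 1e6 * 84*84*4 bytes (s and s' per transition), i.e. roughly 56 GB, whereas sharing
# frame references keeps only about one new 84x84 frame per stored step.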
def make_atari(env_id):
env = gym.make(env_id)
assert 'NoFrameskip' in env.spec.id
env = NoopResetEnv(env, noop_max=30)
env = MaxAndSkipEnv(env, skip=4)
return env
def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False):
"""Configure environment for DeepMind-style Atari.
"""
if episode_life:
env = EpisodicLifeEnv(env)
if 'FIRE' in env.unwrapped.get_action_meanings():
env = FireResetEnv(env)
env = WarpFrame(env)
if scale:
env = ScaledFloatFrame(env)
if clip_rewards:
env = ClipRewardEnv(env)
if frame_stack:
env = FrameStack(env, 4)
return env
def make_wrap_atari(env_id='BreakoutNoFrameskip-v3', clip_rewards=True): # default must be a NoFrameskip variant to pass the assert in make_atari()
env = make_atari(env_id)
return wrap_deepmind(env, clip_rewards=clip_rewards, frame_stack=True, scale=True)
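# ----------------------------------------------------------------------------
# Minimal usage sketch (added for illustration, not part of the original gist).
# It assumes an older gym release (roughly < 0.10) with atari-py installed,
# since the wrappers above rely on the legacy _reset/_step/_observation/_reward hooks.
if __name__ == '__main__':
    env = make_wrap_atari('BreakoutNoFrameskip-v3')
    obs = env.reset()                # a LazyFrames object
    print(np.array(obs).shape)       # expected: (84, 84, 4), float32 values in [0, 1]
    obs, reward, done, info = env.step(env.action_space.sample())
    env.close()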
#!python3
# ---- second file of the gist: the DQN training script; it imports the wrapper module above as atari_wrapper ----
from __future__ import print_function
from atari_wrapper import make_wrap_atari
import gym
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
import tensorflow as tf
from tensorflow.python import debug as tf_debug
class StepInformation:
def __init__(self, observation, action, reward, done, obs_):
self.observation = observation
self.action = action
self.reward = reward
self.done = done
self.obs_ = obs_
class LearnBreakout:
def __init__(self):
self.render = True # Seeing is fun! :)
# This environment wrapper handles a lot of the preprocessing needed for this learning task:
# it grayscales and resizes every frame to 84x84 (and scales pixel values to [0, 1]),
# and it stacks the last 4 observations into an array, producing a final observation shape of (84, 84, 4).
self.env = make_wrap_atari('BreakoutNoFrameskip-v3')
self.max_steps = 1e7
self.global_step = 0
# Neural-net libraries natively support batched updates, so parametrize the batch size for easy control
self.batch_size = 32
# Convenient to have this here to know how large a single observation is.
self.state_size = 84 * 84 * 4
# the actions the agent chooses among: for Breakout these are the first three (NOOP, FIRE, RIGHT)
self.num_actions = 3
# the epsilon value for explore-exploit strategy
self.epsilon = 1.
self.epsilon_decay = (self.epsilon-0.07)/1e6 #Linear decay
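# With these numbers epsilon anneals linearly from 1.0 down to 0.07 over roughly
# the first 1e6 environment steps ((1.0 - 0.07) / 1e6 per step), after which the
# decay check in train() keeps it at 0.07.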
self.gamma = 0.99 # Discount value for future rewards
# Since we learn in batches, we need to collect at least batch_size samples and run them through the net
self.memory_size = self.batch_size
# The array that stores the observations.
# We can do this in many ways, but one simple way is to store all the info into an object.
self.memory = np.array([None] * self.memory_size)
# The main model tf variables
# We can run the model to compute self.q_predict by providing self.observation_placeholder
self.observation_placeholder, self.q_predict = self.createModel("eval")
# We can use the same Q Net to predict the target as well as our current prediction.
# However, this can be overridden below when we use a fixed target network.
self.target_observ_placeholder, self.q_target_predict = self.observation_placeholder, self.q_predict
# The session we'll run our TF Graph in.
self.tf_sess = tf.Session()
# For debugging
## self.tf_sess = tf_debug.LocalCLIDebugWrapperSession(self.tf_sess)
# Our objective is to learn the Q function Q(s,a) the value of taking an action a, in state s.
# The approach is to use our neural net to learn some weights such that given the state as a stack of four frames,
# we get an estimate of Q(s,a) for all possible a.
# We'll do this by trying to converge the prediction to what should be the "true" value.
# The prediction will be our neural net's answer.
# The true value is the reward for taking action a from state s in the environment, plus the highest possible
# Q value attainable from the successor state s', discounted by gamma.
# So Prediction: self.q_predict
# Target: reward + gamma * max_{a*} Q(s',a*)
# To converge these two, we will use the squared loss function and run them through the RMSProp optimizer to find the right weights.
# After this for any state s, we can compute Q(s,a) for all a, and pick the a that produces the max.
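# A concrete instance of the target with made-up numbers: if reward = 1, gamma = 0.99
# and max_{a*} Q(s', a*) = 2.5, then the target for the action taken is
# 1 + 0.99 * 2.5 = 3.475; for a terminal transition the target is just the reward.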
self.q_target = tf.placeholder(tf.float32, [None, self.num_actions], name='Q_target')
self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_predict, name='Q_dist'))
# The optimizer that we'll use to minimize the loss
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.99)
self.trainer = optimizer.minimize(self.loss)
## Metrics
self.save_dir = "data/"
self.rewards_path = os.path.join(self.save_dir, "rewards")
self.train_rewards = []
self.evaluate_rewards = []
self.evaluate_every_n_episodes = 100
self.save_rewards_every_n_episodes = self.evaluate_every_n_episodes * 5
## Model persistence
self.model_save_path = os.path.join(self.save_dir, "model", "model.chkpt")
self.saver = tf.train.Saver()
self.save_model_every_n_episodes = 300
## Enhancements
## Don't forget previous experiences, works with a large memory buffer.
self.use_experience_replay = True
if self.use_experience_replay:
self.memory_size = 10000
self.memory = np.array([None] * self.memory_size)
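# (The usual motivation: sampling uniformly from a large buffer breaks the temporal
# correlation between consecutive transitions and lets each experience be reused
# across many updates.)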
# Fixed Target Network
# To avoid chasing a moving target, keep the target predictor's weights fixed for a number of steps.
self.use_fixed_target_network = True
self.update_target_network_weights_every_n_steps = 1000
if self.use_fixed_target_network:
self.target_observ_placeholder, self.q_target_predict = self.createModel("target")
target_weights = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target")
eval_weights = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="eval")
with tf.variable_scope('update_weights'):
self.update_target_weights = [tf.assign(t, e) for t, e in zip(target_weights, eval_weights)]
# Whether to use Double Q-Learning. Note this has to be used in conjunction with the Fixed Target Network.
# The idea here is that the max operator both selects and evaluates an action using the same (noisy) estimates,
# which tends to bias the target predictions upward.
# Therefore, let the eval network, whose weights change constantly, pick the argmax action,
# while still keeping the target stable by taking the Q value for this chosen action from the
# target network.
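# In symbols, the target used in learn() when use_ddqn is on:
#   a*       = argmax_a Q_eval(s', a)
#   q_target = r + (1 - done) * gamma * Q_target(s', a*)
# whereas plain DQN uses max_a Q_target(s', a) for the second term.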
self.use_ddqn = self.use_fixed_target_network and True
def update_target_network_weights(self):
self.tf_sess.run(self.update_target_weights)
def createModel(self, tag):
# Shape is defined as follows:
# First dimension for the item in a batch: allows for batching
# The next 3 are the actual observation of shape 84x84x4
with tf.variable_scope(tag):
obs_pl = tf.placeholder(shape=(None,84,84,4), dtype=tf.float32, name="obs")
conv1 = tf.layers.conv2d(inputs=obs_pl, filters=32, kernel_size=(8, 8), strides=(4,4), activation='relu')
conv2 = tf.layers.conv2d(inputs=conv1, filters=64, kernel_size=(4, 4), strides=(2,2), activation='relu')
conv3 = tf.layers.conv2d(inputs=conv2, filters=32, kernel_size=(2, 2), strides=(1,1), activation='relu')
flat = tf.contrib.layers.flatten(conv3)
dense = tf.contrib.layers.fully_connected(flat, 512)
# The shape of this final layer is always (batch_size_input_to_layer, self.num_actions)
# This is because all the layers here actually take in an intrinsic batch dimension as the first one.
actions_pred = tf.contrib.layers.fully_connected(dense, self.num_actions)
return obs_pl, actions_pred
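# Shape trace for an (84, 84, 4) input with the tf.layers default 'valid' padding:
# conv1 8x8/4 -> 20x20x32, conv2 4x4/2 -> 9x9x64, conv3 2x2/1 -> 8x8x32,
# flatten -> 2048, dense -> 512, output -> num_actions.
# Note: tf.contrib.layers.fully_connected applies ReLU by default, so the Q-value head
# here is implicitly non-negative; passing activation_fn=None to the last layer would
# give the unconstrained linear output that is the more common choice for a Q head.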
def compute_action(self, observation, epsilon):
if np.random.uniform() > epsilon:
# Neural Net takes in [None, 84,84,4] but observation is only [84,84,4] so add dimension.
observation = np.expand_dims(observation, axis=0)
action = self.tf_sess.run(self.q_predict, feed_dict={self.observation_placeholder:observation})
return np.argmax(action)
else:
return np.random.randint(0, self.num_actions)
def store(self, observation, action, reward, done, obs_, step):
self.memory[(step % self.memory_size)] = StepInformation(observation, action, reward, done, obs_)
def get_from_memory(self, size_to_get):
# Sample from memory using a uniform distribution
# Get the indices from this distribution
sample_indices = np.random.choice(
np.minimum(self.global_step, self.memory_size),
size_to_get, replace=self.use_experience_replay)
sample = self.memory[sample_indices]
states = np.empty((size_to_get, 84,84,4))
rewards = np.empty(size_to_get)
actions = np.empty(size_to_get, dtype=np.int32)
done = np.empty(size_to_get)
states_ = np.empty((size_to_get, 84,84,4))
for idx, memory in enumerate(sample):
states[idx] = memory.observation
rewards[idx] = memory.reward
actions[idx] = memory.action
done[idx] = memory.done
states_[idx] = memory.obs_
return states, rewards, actions, done, states_
def learn(self):
# Get the samples from memory
# For each sample
# Compute max Q(s', *)
# Use the above to then compute the target: q_target
# Run the trainer using s and q_target to compute the loss and minimize it
states, rewards, actions, done, states_ = self.get_from_memory(self.batch_size)
# An array of indices to use in various functions to apply a function to each element in the batch
batch_indices = np.arange(self.batch_size, dtype=np.int32)
if self.use_ddqn:
# When using double DQN, we have to compute Q(s',a') using the eval net first for all a'
q_next_states_eval_net = self.tf_sess.run(self.q_predict,
feed_dict={self.observation_placeholder:states_})
# We then pick the index of the action that has the highest value
q_next_state_best_action = np.argmax(q_next_states_eval_net, axis=1)
# We then compute the target net's Q values Q(s',a') for all a', and pick the one for the best action chosen above
q_next_states_target_net = self.tf_sess.run(self.q_target_predict,
feed_dict={self.target_observ_placeholder:states_})
# Notice: there's no argmax here; we're just selecting a single action for each item in the batch.
q_next_max = q_next_states_target_net[batch_indices, q_next_state_best_action]
else:
# We predict the next states either based on the single Q network, or using a separate Fixed Network
# This is determined by what self.q_target_predict and self.target_observ_placeholder are set to in __init__
q_next_states = self.tf_sess.run(self.q_target_predict,
feed_dict={self.target_observ_placeholder:states_})
# q_next_states now contains for each sample in the batch, num_action values
# each num_action value corresponds to the Q value of taking that action
# For a given sample we only want to pick the maximum value for all action values
q_next_max = np.max(q_next_states, axis=1)
# Now we have to compute target, q_target for each of the samples in the batch.
# q_target = reward + (1-done) * gamma * q_next_max
q_target = rewards + ((1-done) * self.gamma * q_next_max) # 1-done: to handle end states, done = 1 for end states, 0 otherwise.
# q_target is now of shape (batch_size,), but since our neural network has num_actions final nodes,
# we need to reshape q_target to be of shape (batch_size, num_actions).
# But what values should we fill into the other num_actions - 1 entries? We only want to learn for the action that we have taken.
# Therefore, let's set all the other values to what was predicted by our network so that they don't contribute to the loss.
# NumPy is cool! We can express a for-loop over two variables in one line!
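# Illustration of that indexing with made-up numbers:
#   q = np.array([[0.1, 0.2], [0.3, 0.4]])
#   q[np.arange(2), [1, 0]]  ->  [0.2, 0.3]
# i.e. one (batch item, chosen action) entry is selected per sample.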
q_target_reshaped = self.tf_sess.run(self.q_predict, feed_dict={self.observation_placeholder:states})
q_target_reshaped[batch_indices, actions] = q_target
# At this point we have our target, but not the prediction.
# We can combine computing the prediction (feed-forward) with the loss minimization because of TF's awesomeness:
_, loss = self.tf_sess.run([self.trainer, self.loss], feed_dict={self.q_target:q_target_reshaped, self.observation_placeholder:states})
return loss
def train(self):
episode = 0
loss = 1000
# Initialize TF
self.tf_sess.run(tf.global_variables_initializer())
while self.global_step < self.max_steps:
done = False
observation = np.array(self.env.reset()) # initial observation
episode_reward = 0.0
while not done:
if self.epsilon - self.epsilon_decay >= 0.07:
self.epsilon -= self.epsilon_decay
action = self.compute_action(observation, self.epsilon)
obs_, reward, done, info = self.env.step(action)
episode_reward += reward
print('Step: %i, Episode: %i, Action: %i, Reward: %.0f, Epsilon: %.5f, Loss: %.5f' % (self.global_step, episode, action, reward, self.epsilon, loss), end='\r')
if self.render:
self.env.render()
obs_ = np.array(obs_) # since obs_ is a lazy frame
# store the transition in memory for later (batched) learning
self.store(observation, action, reward, int(done), obs_, self.global_step)
observation = obs_
self.global_step += 1
if self.global_step % self.batch_size == 0:
loss = self.learn()
episode += 1
print('Step: %i/%i, Episode: %i, Action: %i, Episode Reward: %.0f, Epsilon: %.2f, Loss: %.5f' % (self.global_step, self.max_steps, episode, action, episode_reward, self.epsilon, loss))
self.train_rewards.append(episode_reward)
if episode % self.evaluate_every_n_episodes == 0:
self.evaluate_rewards.append(self.evaluate())
if self.use_fixed_target_network and self.global_step % self.update_target_network_weights_every_n_steps == 0:
self.update_target_network_weights()
if episode % self.save_rewards_every_n_episodes == 0:
self.save_rewards()
if episode % self.save_model_every_n_episodes == 0:
self.saver.save(self.tf_sess, self.model_save_path)
def evaluate(self):
num_episodes = 10
episode_rewards = [0] * num_episodes
for i in range(num_episodes):
episode_reward = 0.0
obs = np.array(self.env.reset())
done = False
while not done:
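# epsilon=-1 forces the greedy branch in compute_action, since np.random.uniform() is always > -1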
action = self.compute_action(obs, epsilon=-1)
obs, reward, done, info = self.env.step(action)
episode_reward += reward
episode_rewards[i] = episode_reward
return sum(episode_rewards)/float(num_episodes)
def plot(self):
if np.sum(self.evaluate_rewards) == 0: self.evaluate_rewards = pickle.load(open(self.rewards_path + "/eval.pkl", 'rb'))
avg_rwd = []
for i in range(len(self.evaluate_rewards)):
if i < 30:
avg_rwd.append(np.mean(self.evaluate_rewards[:i]))
else:
avg_rwd.append(np.mean(self.evaluate_rewards[i-30:i]))
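# note: for i == 0 the slice evaluate_rewards[:i] is empty, so np.mean yields nan (with a
# RuntimeWarning); slicing with [:i+1] and [i-29:i+1] would avoid that.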
plt.plot(np.arange(len(avg_rwd)), avg_rwd)
plt.ylabel('Average Reward in Last 30 Episodes')
plt.xlabel('Number of Episodes')
plt.show()
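# note: save_rewards and saver.save assume the data/rewards and data/model directories
# already exist; nothing below creates them.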
def save_rewards(self):
pickle.dump(self.train_rewards, open(self.rewards_path + "/train.pkl", 'wb'), True)
pickle.dump(self.evaluate_rewards, open(self.rewards_path + "/eval.pkl", 'wb'), True)
learn = LearnBreakout()
learn.train()
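# After (or instead of) a training run, the saved evaluation rewards in data/rewards/eval.pkl
# can be plotted with learn.plot().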