"""
The DQN improvement: Prioritized Experience Replay (based on https://arxiv.org/abs/1511.05952)
View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
Using:
Tensorflow: 1.0
"""
import numpy as np
import tensorflow as tf
np.random.seed(1)
tf.set_random_seed(1)
class SumTree(object):
"""
This SumTree code is a modified version; the original code is from:
https://github.com/jaara/AI-blog/blob/master/SumTree.py
Store the data with its priority in the tree and the data array.
"""
data_pointer = 0
def __init__(self, capacity):
self.capacity = capacity # for all priority values
self.tree = np.zeros(2 * capacity - 1)
# [--------------Parent nodes-------------][-------leaves to record priority-------]
#             size: capacity - 1                       size: capacity
self.data = np.zeros(capacity, dtype=object) # for all transitions
# [--------------data frame-------------]
# size: capacity
def add_new_priority(self, p, data):
leaf_idx = self.data_pointer + self.capacity - 1
self.data[self.data_pointer] = data # update data_frame
self.update(leaf_idx, p) # update tree_frame
self.data_pointer += 1
if self.data_pointer >= self.capacity: # replace when exceed the capacity
self.data_pointer = 0
def update(self, tree_idx, p):
change = p - self.tree[tree_idx]
self.tree[tree_idx] = p
self._propagate_change(tree_idx, change)
def _propagate_change(self, tree_idx, change):
"""change the sum of priority value in all parent nodes"""
parent_idx = (tree_idx - 1) // 2
self.tree[parent_idx] += change
if parent_idx != 0:
self._propagate_change(parent_idx, change)
def get_leaf(self, lower_bound):
leaf_idx = self._retrieve(lower_bound) # search the max leaf priority based on the lower_bound
data_idx = leaf_idx - self.capacity + 1
return [leaf_idx, self.tree[leaf_idx], self.data[data_idx]]
def _retrieve(self, lower_bound, parent_idx=0):
"""
Tree structure and array storage:
Tree index:
     0         -> storing priority sum
    / \
  1     2
 / \   / \
3   4 5   6    -> storing priority for transitions
Array type for storing:
[0,1,2,3,4,5,6]
"""
left_child_idx = 2 * parent_idx + 1
right_child_idx = left_child_idx + 1
if left_child_idx >= len(self.tree): # end search when no more child
return parent_idx
if self.tree[left_child_idx] == self.tree[right_child_idx]:
return self._retrieve(lower_bound, np.random.choice([left_child_idx, right_child_idx]))
if lower_bound <= self.tree[left_child_idx]: # downward search, always search for a higher priority node
return self._retrieve(lower_bound, left_child_idx)
else:
return self._retrieve(lower_bound - self.tree[left_child_idx], right_child_idx)
@property
def root_priority(self):
return self.tree[0] # the root
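# A minimal usage sketch (not part of the original gist) showing how the SumTree
# above maps a sampled value to a leaf. The helper name _sumtree_demo and the
# numbers are illustrative only.
def _sumtree_demo():
    tree = SumTree(capacity=4)
    for p, d in zip([3., 1., 4., 2.], ['a', 'b', 'c', 'd']):
        tree.add_new_priority(p, d)   # leaves hold [3, 1, 4, 2], so the root sums to 10
    assert tree.root_priority == 10.0
    # a value drawn uniformly from [0, 10) picks leaf i with probability p_i / 10;
    # lower_bound=5.0 walks right at the root (5 > 4), then left, ending at the
    # leaf holding priority 4 and data 'c'
    leaf_idx, priority, data = tree.get_leaf(lower_bound=5.0)
    return leaf_idx, priority, data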
class Memory(object): # stored as ( s, a, r, s_ ) in SumTree
"""
This Memory class is a modified version; the original code is from:
https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
"""
epsilon = 0.001 # small amount to avoid zero priority
alpha = 0.6 # [0~1] convert the importance of TD error to priority
beta = 0.4 # importance-sampling, from initial value increasing to 1
beta_increment_per_sampling = 1e-4 # annealing the bias
abs_err_upper = 1 # for stability refer to paper
def __init__(self, capacity):
self.tree = SumTree(capacity)
def store(self, error, transition):
p = self._get_priority(error)
self.tree.add_new_priority(p, transition)
def sample(self, n):
batch_idx, batch_memory, ISWeights = [], [], []
segment = self.tree.root_priority / n
self.beta = np.min([1, self.beta + self.beta_increment_per_sampling]) # max = 1
min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.root_priority
maxiwi = np.power(self.tree.capacity * min_prob, -self.beta) # for later normalizing ISWeights
for i in range(n):
a = segment * i
b = segment * (i + 1)
lower_bound = np.random.uniform(a, b)
idx, p, data = self.tree.get_leaf(lower_bound)
prob = p / self.tree.root_priority
ISWeights.append(self.tree.capacity * prob)
batch_idx.append(idx)
batch_memory.append(data)
ISWeights = np.vstack(ISWeights)
ISWeights = np.power(ISWeights, -self.beta) / maxiwi # normalize
return batch_idx, np.vstack(batch_memory), ISWeights
def update(self, idx, error):
p = self._get_priority(error)
self.tree.update(idx, p)
def _get_priority(self, error):
error += self.epsilon # avoid 0
clipped_error = np.clip(error, 0, self.abs_err_upper)
return np.power(clipped_error, self.alpha)
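# A minimal sketch (not part of the original gist) of the arithmetic the Memory
# class applies, following the PER paper: priority p_i = clip(|TD_i| + epsilon, 0, 1)^alpha
# and importance-sampling weight w_i = (N * P(i))^(-beta), normalized by the largest
# weight. (In sample() above, N is the tree capacity and the largest weight comes
# from the smallest leaf probability.) The numbers below are made up for illustration.
def _per_math_demo():
    abs_td_errors = np.array([0.5, 0.1, 2.0])
    p = np.clip(abs_td_errors + 0.001, 0, 1.) ** 0.6   # epsilon=0.001, abs_err_upper=1, alpha=0.6
    P = p / p.sum()                                     # sampling probabilities
    N, beta = len(p), 0.4
    w = (N * P) ** (-beta)
    w /= w.max()                                        # normalize so the largest weight is 1
    return P, w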
class DuelingDQNPrioritizedReplay:
def __init__(
self,
n_actions,
n_features,
learning_rate=0.005,
reward_decay=0.9,
e_greedy=0.9,
replace_target_iter=500,
memory_size=10000,
batch_size=32,
e_greedy_increment=None,
hidden=[100, 50],
output_graph=False,
sess=None,
):
self.n_actions = n_actions
self.n_features = n_features
self.lr = learning_rate
self.gamma = reward_decay
self.epsilon_max = e_greedy
self.replace_target_iter = replace_target_iter
self.memory_size = memory_size
self.batch_size = batch_size
self.hidden = hidden
self.epsilon_increment = e_greedy_increment
self.epsilon = 0.5 if e_greedy_increment is not None else self.epsilon_max
self.learn_step_counter = 0
self._build_net()
self.memory = Memory(capacity=memory_size)
if sess is None:
self.sess = tf.Session()
self.sess.run(tf.global_variables_initializer())
else:
self.sess = sess
if output_graph:
tf.summary.FileWriter("logs/", self.sess.graph)
self.cost_his = []
def _build_net(self):
def build_layers(s, c_names, w_initializer, b_initializer):
for i, h in enumerate(self.hidden):
if i == 0:
in_units, out_units, inputs = self.n_features, self.hidden[i], s
else:
in_units, out_units, inputs = self.hidden[i-1], self.hidden[i], l
with tf.variable_scope('l%i' % i):
w = tf.get_variable('w', [in_units, out_units], initializer=w_initializer, collections=c_names)
b = tf.get_variable('b', [1, out_units], initializer=b_initializer, collections=c_names)
l = tf.nn.relu(tf.matmul(inputs, w) + b)
with tf.variable_scope('Value'):
w = tf.get_variable('w', [self.hidden[-1], 1], initializer=w_initializer, collections=c_names)
b = tf.get_variable('b', [1, 1], initializer=b_initializer, collections=c_names)
self.V = tf.matmul(l, w) + b
with tf.variable_scope('Advantage'):
w = tf.get_variable('w', [self.hidden[-1], self.n_actions], initializer=w_initializer, collections=c_names)
b = tf.get_variable('b', [1, self.n_actions], initializer=b_initializer, collections=c_names)
self.A = tf.matmul(l, w) + b
with tf.variable_scope('Q'):
out = self.V + (self.A - tf.reduce_mean(self.A, axis=1, keep_dims=True)) # Q = V(s) + A(s,a)
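# illustrative numbers: V(s)=2 and A=[1, 3, -4] give mean(A)=0, so Q=[3, 5, -2];
# subtracting the mean advantage keeps V and A identifiable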
# with tf.variable_scope('out'):
# w = tf.get_variable('w', [self.hidden[-1], self.n_actions], initializer=w_initializer, collections=c_names)
# b = tf.get_variable('b', [1, self.n_actions], initializer=b_initializer, collections=c_names)
# out = tf.matmul(l, w) + b
return out
# ------------------ build evaluate_net ------------------
self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input
self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss
self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights')
with tf.variable_scope('eval_net'):
c_names, w_initializer, b_initializer = \
['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], \
tf.random_normal_initializer(0., 0.01), tf.constant_initializer(0.01) # config of layers
self.q_eval = build_layers(self.s, c_names, w_initializer, b_initializer)
with tf.variable_scope('loss'):
self.abs_errors = tf.abs(tf.reduce_sum(self.q_target - self.q_eval, axis=1)) # for updating Sumtree
self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.q_target, self.q_eval))
with tf.variable_scope('train'):
self._train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
# ------------------ build target_net ------------------
self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input
with tf.variable_scope('target_net'):
c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
self.q_next = build_layers(self.s_, c_names, w_initializer, b_initializer)
def store_transition(self, s, a, r, s_):
transition = np.hstack((s, [a, r], s_))
max_p = np.max(self.memory.tree.tree[-self.memory.tree.capacity:])
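# new transitions get the current maximum leaf priority, so they tend to be
# replayed soon, before their TD error has been computed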
self.memory.store(max_p, transition)
def choose_action(self, observation):
observation = observation[np.newaxis, :]
if np.random.uniform() < self.epsilon:
actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
action = np.argmax(actions_value)
else:
action = np.random.randint(0, self.n_actions)
return action
def _replace_target_params(self):
t_params = tf.get_collection('target_net_params')
e_params = tf.get_collection('eval_net_params')
self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
def learn(self):
if self.learn_step_counter % self.replace_target_iter == 0:
self._replace_target_params()
tree_idx, batch_memory, ISWeights = self.memory.sample(self.batch_size)
# double DQN
q_next, q_eval4next = self.sess.run(
[self.q_next, self.q_eval],
feed_dict={self.s_: batch_memory[:, -self.n_features:], # next observation
self.s: batch_memory[:, -self.n_features:]}) # next observation
q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]})
q_target = q_eval.copy()
batch_index = np.arange(self.batch_size, dtype=np.int32)
eval_act_index = batch_memory[:, self.n_features].astype(int)
reward = batch_memory[:, self.n_features + 1]
max_act4next = np.argmax(q_eval4next,
axis=1) # the action that brings the highest value is evaluated by q_eval
selected_q_next = q_next[batch_index, max_act4next] # Double DQN, select q_next depending on above actions
q_target[batch_index, eval_act_index] = reward + self.gamma * selected_q_next
# q_next, q_eval = self.sess.run(
# [self.q_next, self.q_eval],
# feed_dict={self.s_: batch_memory[:, -self.n_features:],
# self.s: batch_memory[:, :self.n_features]})
#
# q_target = q_eval.copy()
# batch_index = np.arange(self.batch_size, dtype=np.int32)
# eval_act_index = batch_memory[:, self.n_features].astype(int)
# reward = batch_memory[:, self.n_features + 1]
#
# q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
_, abs_errors, self.cost = self.sess.run([self._train_op, self.abs_errors, self.loss],
feed_dict={self.s: batch_memory[:, :self.n_features],
self.q_target: q_target,
self.ISWeights: ISWeights})
for i in range(len(tree_idx)): # update priority
idx = tree_idx[i]
self.memory.update(idx, abs_errors[i])
self.cost_his.append(self.cost)
self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
self.learn_step_counter += 1
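# A minimal numeric sketch (not part of the original gist) of the double-DQN target
# assembled in learn() above: the eval net chooses argmax_a Q_eval(s', a) and the
# target net supplies the value of that action. All numbers are invented.
def _double_dqn_target_demo():
    gamma = 0.9
    q_eval4next = np.array([[1.0, 3.0, 2.0]])   # eval net at s'  -> argmax is action 1
    q_next = np.array([[0.5, 1.5, 4.0]])        # target net at s'
    reward = np.array([1.0])
    batch_index = np.arange(1)
    max_act4next = np.argmax(q_eval4next, axis=1)
    target = reward + gamma * q_next[batch_index, max_act4next]   # 1 + 0.9 * 1.5 = 2.35
    return target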
"""
Deep Q network,
LunarLander-v2 example
Using:
Tensorflow: 1.0
gym: 0.8.0
"""
import gym
from gym import wrappers
from DuelingDQNPrioritizedReplay import DuelingDQNPrioritizedReplay
env = gym.make('LunarLander-v2')
# env = env.unwrapped
env.seed(1)
N_A = env.action_space.n
N_S = env.observation_space.shape[0]
MEMORY_CAPACITY = 50000
TARGET_REP_ITER = 2000
MAX_EPISODES = 900
E_GREEDY = 0.95
E_INCREMENT = 0.00001
GAMMA = 0.99
LR = 0.0001
BATCH_SIZE = 32
HIDDEN = [400, 400]
RENDER = True
RL = DuelingDQNPrioritizedReplay(
n_actions=N_A, n_features=N_S, learning_rate=LR, e_greedy=E_GREEDY, reward_decay=GAMMA,
hidden=HIDDEN, batch_size=BATCH_SIZE, replace_target_iter=TARGET_REP_ITER,
memory_size=MEMORY_CAPACITY, e_greedy_increment=E_INCREMENT,)
total_steps = 0
running_r = 0
r_scale = 100
for i_episode in range(MAX_EPISODES):
s = env.reset() # (coord_x, coord_y, vel_x, vel_y, angle, angular_vel, l_leg_on_ground, r_leg_on_ground)
ep_r = 0
while True:
if total_steps > MEMORY_CAPACITY: env.render()
a = RL.choose_action(s)
s_, r, done, _ = env.step(a)
if r == -100: r = -30
r /= r_scale
ep_r += r
RL.store_transition(s, a, r, s_)
if total_steps > MEMORY_CAPACITY:
RL.learn()
if done:
land = '| Landed' if r == 100/r_scale else '| ------'
running_r = 0.99 * running_r + 0.01 * ep_r
print('Epi: ', i_episode,
land,
'| Epi_R: ', round(ep_r, 2),
'| Running_R: ', round(running_r, 2),
'| Epsilon: ', round(RL.epsilon, 3))
break
s = s_
total_steps += 1
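# Optional, not part of the original gist: a quick look at the recorded training loss.
# This assumes matplotlib is installed; RL.cost_his is filled in learn() above.
import matplotlib.pyplot as plt
if RL.cost_his:   # only if learning has actually started
    plt.plot(RL.cost_his)
    plt.xlabel('training step')
    plt.ylabel('loss')
    plt.show()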