"""
The DQN improvement: Prioritized Experience Replay (based on https://arxiv.org/abs/1511.05952)
View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
Using:
Tensorflow: 1.0
"""
import numpy as np
import tensorflow as tf
np.random.seed(1)
tf.set_random_seed(1)
class SumTree(object):
"""
This SumTree code is a modified version; the original code is from:
https://github.com/jaara/AI-blog/blob/master/SumTree.py
Store the data with its priority in the tree and the data array.
"""
data_pointer = 0
def __init__(self, capacity):
self.capacity = capacity # for all priority values
self.tree = np.zeros(2 * capacity - 1)
# [--------------Parent nodes-------------][-------leaves to record priority-------]
#             size: capacity - 1                       size: capacity
self.data = np.zeros(capacity, dtype=object) # for all transitions
# [--------------data frame-------------]
# size: capacity
def add_new_priority(self, p, data):
leaf_idx = self.data_pointer + self.capacity - 1
self.data[self.data_pointer] = data # update data_frame
self.update(leaf_idx, p) # update tree_frame
self.data_pointer += 1
if self.data_pointer >= self.capacity: # replace when exceed the capacity
self.data_pointer = 0
def update(self, tree_idx, p):
change = p - self.tree[tree_idx]
self.tree[tree_idx] = p
self._propagate_change(tree_idx, change)
def _propagate_change(self, tree_idx, change):
"""change the sum of priority value in all parent nodes"""
parent_idx = (tree_idx - 1) // 2
self.tree[parent_idx] += change
if parent_idx != 0:
self._propagate_change(parent_idx, change)
def get_leaf(self, lower_bound):
leaf_idx = self._retrieve(lower_bound) # search the max leaf priority based on the lower_bound
data_idx = leaf_idx - self.capacity + 1
return [leaf_idx, self.tree[leaf_idx], self.data[data_idx]]
def _retrieve(self, lower_bound, parent_idx=0):
"""
Tree structure and array storage:
Tree index:
     0         -> storing priority sum
    / \
  1     2
 / \   / \
3   4 5   6    -> storing priority for transitions
Array type for storing:
[0,1,2,3,4,5,6]
"""
left_child_idx = 2 * parent_idx + 1
right_child_idx = left_child_idx + 1
if left_child_idx >= len(self.tree): # end search when no more child
return parent_idx
if self.tree[left_child_idx] == self.tree[right_child_idx]:
return self._retrieve(lower_bound, np.random.choice([left_child_idx, right_child_idx]))
if lower_bound <= self.tree[left_child_idx]: # downward search, always search for a higher priority node
return self._retrieve(lower_bound, left_child_idx)
else:
return self._retrieve(lower_bound - self.tree[left_child_idx], right_child_idx)
@property
def root_priority(self):
return self.tree[0] # the root
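# A minimal usage sketch (not part of the original gist) showing how the SumTree
# above maps a sampled value to a leaf. The helper name _sumtree_demo and the
# numbers are illustrative only.
def _sumtree_demo():
    tree = SumTree(capacity=4)
    for p, d in zip([3., 1., 4., 2.], ['a', 'b', 'c', 'd']):
        tree.add_new_priority(p, d)   # leaves hold [3, 1, 4, 2], so the root sums to 10
    assert tree.root_priority == 10.0
    # a value drawn uniformly from [0, 10) picks leaf i with probability p_i / 10;
    # lower_bound=5.0 walks right at the root (5 > 4), then left, ending at the
    # leaf holding priority 4 and data 'c'
    leaf_idx, priority, data = tree.get_leaf(lower_bound=5.0)
    return leaf_idx, priority, data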
class Memory(object): # stored as ( s, a, r, s_ ) in SumTree
"""
This Memory class is a modified version; the original code is from:
https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
"""
epsilon = 0.001 # small amount to avoid zero priority
alpha = 0.6 # [0~1] convert the importance of TD error to priority
beta = 0.4 # importance-sampling, from initial value increasing to 1
beta_increment_per_sampling = 1e-4 # annealing the bias
abs_err_upper = 1 # for stability refer to paper
def __init__(self, capacity):
self.tree = SumTree(capacity)
def store(self, error, transition):
p = self._get_priority(error)
self.tree.add_new_priority(p, transition)
def sample(self, n):
batch_idx, batch_memory, ISWeights = [], [], []
segment = self.tree.root_priority / n
self.beta = np.min([1, self.beta + self.beta_increment_per_sampling]) # max = 1
min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.root_priority
maxiwi = np.power(self.tree.capacity * min_prob, -self.beta) # for later normalizing ISWeights
for i in range(n):
a = segment * i
b = segment * (i + 1)
lower_bound = np.random.uniform(a, b)
idx, p, data = self.tree.get_leaf(lower_bound)
prob = p / self.tree.root_priority
ISWeights.append(self.tree.capacity * prob)
batch_idx.append(idx)
batch_memory.append(data)
ISWeights = np.vstack(ISWeights)
ISWeights = np.power(ISWeights, -self.beta) / maxiwi # normalize
return batch_idx, np.vstack(batch_memory), ISWeights
def update(self, idx, error):
p = self._get_priority(error)
self.tree.update(idx, p)
def _get_priority(self, error):
error += self.epsilon # avoid 0
clipped_error = np.clip(error, 0, self.abs_err_upper)
return np.power(clipped_error, self.alpha)
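# A minimal sketch (not part of the original gist) of the arithmetic the Memory
# class applies, following the PER paper: priority p_i = clip(|TD_i| + epsilon, 0, 1)^alpha
# and importance-sampling weight w_i = (N * P(i))^(-beta), normalized by the largest
# weight. (In sample() above, N is the tree capacity and the largest weight comes
# from the smallest leaf probability.) The numbers below are made up for illustration.
def _per_math_demo():
    abs_td_errors = np.array([0.5, 0.1, 2.0])
    p = np.clip(abs_td_errors + 0.001, 0, 1.) ** 0.6   # epsilon=0.001, abs_err_upper=1, alpha=0.6
    P = p / p.sum()                                     # sampling probabilities
    N, beta = len(p), 0.4
    w = (N * P) ** (-beta)
    w /= w.max()                                        # normalize so the largest weight is 1
    return P, w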
class DuelingDQNPrioritizedReplay:
def __init__(
self,
n_actions,
n_features,
learning_rate=0.005,
reward_decay=0.9,
e_greedy=0.9,
replace_target_iter=500,
memory_size=10000,
batch_size=32,
e_greedy_increment=None,
hidden=[100, 50],
output_graph=False,
sess=None,
):
self.n_actions = n_actions
self.n_features = n_features
self.lr = learning_rate
self.gamma = reward_decay
self.epsilon_max = e_greedy
self.replace_target_iter = replace_target_iter
self.memory_size = memory_size
self.batch_size = batch_size
self.hidden = hidden
self.epsilon_increment = e_greedy_increment
self.epsilon = 0.5 if e_greedy_increment is not None else self.epsilon_max
self.learn_step_counter = 0
self._build_net()
self.memory = Memory(capacity=memory_size)
if sess is None:
self.sess = tf.Session()
self.sess.run(tf.global_variables_initializer())
else:
self.sess = sess
if output_graph:
tf.summary.FileWriter("logs/", self.sess.graph)
self.cost_his = []
def _build_net(self):
def build_layers(s, c_names, w_initializer, b_initializer):
for i, h in enumerate(self.hidden):
if i == 0:
in_units, out_units, inputs = self.n_features, self.hidden[i], s
else:
in_units, out_units, inputs = self.hidden[i-1], self.hidden[i], l
with tf.variable_scope('l%i' % i):
w = tf.get_variable('w', [in_units, out_units], initializer=w_initializer, collections=c_names)
b = tf.get_variable('b', [1, out_units], initializer=b_initializer, collections=c_names)
l = tf.nn.relu(tf.matmul(inputs, w) + b)
with tf.variable_scope('Value'):
w = tf.get_variable('w', [self.hidden[-1], 1], initializer=w_initializer, collections=c_names)
b = tf.get_variable('b', [1, 1], initializer=b_initializer, collections=c_names)
self.V = tf.matmul(l, w) + b
with tf.variable_scope('Advantage'):
w = tf.get_variable('w', [self.hidden[-1], self.n_actions], initializer=w_initializer, collections=c_names)
b = tf.get_variable('b', [1, self.n_actions], initializer=b_initializer, collections=c_names)
self.A = tf.matmul(l, w) + b
with tf.variable_scope('Q'):
out = self.V + (self.A - tf.reduce_mean(self.A, axis=1, keep_dims=True)) # Q = V(s) + A(s,a)
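# illustrative numbers: V(s)=2 and A=[1, 3, -4] give mean(A)=0, so Q=[3, 5, -2];
# subtracting the mean advantage keeps V and A identifiable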
# with tf.variable_scope('out'):
# w = tf.get_variable('w', [self.hidden[-1], self.n_actions], initializer=w_initializer, collections=c_names)
# b = tf.get_variable('b', [1, self.n_actions], initializer=b_initializer, collections=c_names)
# out = tf.matmul(l, w) + b
return out
# ------------------ build evaluate_net ------------------
self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input
self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss
self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights')
with tf.variable_scope('eval_net'):
c_names, w_initializer, b_initializer = \
['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], \
tf.random_normal_initializer(0., 0.01), tf.constant_initializer(0.01) # config of layers
self.q_eval = build_layers(self.s, c_names, w_initializer, b_initializer)
with tf.variable_scope('loss'):
self.abs_errors = tf.abs(tf.reduce_sum(self.q_target - self.q_eval, axis=1)) # for updating Sumtree
self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.q_target, self.q_eval))
with tf.variable_scope('train'):
self._train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
# ------------------ build target_net ------------------
self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input
with tf.variable_scope('target_net'):
c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
self.q_next = build_layers(self.s_, c_names, w_initializer, b_initializer)
def store_transition(self, s, a, r, s_):
transition = np.hstack((s, [a, r], s_))
max_p = np.max(self.memory.tree.tree[-self.memory.tree.capacity:])
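# new transitions get the current maximum leaf priority, so they tend to be
# replayed soon, before their TD error has been computed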
self.memory.store(max_p, transition)
def choose_action(self, observation):
observation = observation[np.newaxis, :]
if np.random.uniform() < self.epsilon:
actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
action = np.argmax(actions_value)
else:
action = np.random.randint(0, self.n_actions)
return action
def _replace_target_params(self):
t_params = tf.get_collection('target_net_params')
e_params = tf.get_collection('eval_net_params')
self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
def learn(self):
if self.learn_step_counter % self.replace_target_iter == 0:
self._replace_target_params()
tree_idx, batch_memory, ISWeights = self.memory.sample(self.batch_size)
# double DQN
q_next, q_eval4next = self.sess.run(
[self.q_next, self.q_eval],
feed_dict={self.s_: batch_memory[:, -self.n_features:], # next observation
self.s: batch_memory[:, -self.n_features:]}) # next observation
q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]})
q_target = q_eval.copy()
batch_index = np.arange(self.batch_size, dtype=np.int32)
eval_act_index = batch_memory[:, self.n_features].astype(int)
reward = batch_memory[:, self.n_features + 1]
max_act4next = np.argmax(q_eval4next,
axis=1) # the action that brings the highest value is evaluated by q_eval
selected_q_next = q_next[batch_index, max_act4next] # Double DQN, select q_next depending on above actions
q_target[batch_index, eval_act_index] = reward + self.gamma * selected_q_next
# q_next, q_eval = self.sess.run(
# [self.q_next, self.q_eval],
# feed_dict={self.s_: batch_memory[:, -self.n_features:],
# self.s: batch_memory[:, :self.n_features]})
#
# q_target = q_eval.copy()
# batch_index = np.arange(self.batch_size, dtype=np.int32)
# eval_act_index = batch_memory[:, self.n_features].astype(int)
# reward = batch_memory[:, self.n_features + 1]
#
# q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
_, abs_errors, self.cost = self.sess.run([self._train_op, self.abs_errors, self.loss],
feed_dict={self.s: batch_memory[:, :self.n_features],
self.q_target: q_target,
self.ISWeights: ISWeights})
for i in range(len(tree_idx)): # update priority
idx = tree_idx[i]
self.memory.update(idx, abs_errors[i])
self.cost_his.append(self.cost)
self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
self.learn_step_counter += 1
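# A minimal numeric sketch (not part of the original gist) of the double-DQN target
# assembled in learn() above: the eval net chooses argmax_a Q_eval(s', a) and the
# target net supplies the value of that action. All numbers are invented.
def _double_dqn_target_demo():
    gamma = 0.9
    q_eval4next = np.array([[1.0, 3.0, 2.0]])   # eval net at s'  -> argmax is action 1
    q_next = np.array([[0.5, 1.5, 4.0]])        # target net at s'
    reward = np.array([1.0])
    batch_index = np.arange(1)
    max_act4next = np.argmax(q_eval4next, axis=1)
    target = reward + gamma * q_next[batch_index, max_act4next]   # 1 + 0.9 * 1.5 = 2.35
    return target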
"""
Deep Q network,
LunarLander-v2 example
Using:
Tensorflow: 1.0
gym: 0.8.0
"""
import gym
from gym import wrappers
from DuelingDQNPrioritizedReplay import DuelingDQNPrioritizedReplay
env = gym.make('LunarLander-v2')
# env = env.unwrapped
env.seed(1)
N_A = env.action_space.n
N_S = env.observation_space.shape[0]
MEMORY_CAPACITY = 50000
TARGET_REP_ITER = 2000
MAX_EPISODES = 900
E_GREEDY = 0.95
E_INCREMENT = 0.00001
GAMMA = 0.99
LR = 0.0001
BATCH_SIZE = 32
HIDDEN = [400, 400]
RENDER = True
RL = DuelingDQNPrioritizedReplay(
n_actions=N_A, n_features=N_S, learning_rate=LR, e_greedy=E_GREEDY, reward_decay=GAMMA,
hidden=HIDDEN, batch_size=BATCH_SIZE, replace_target_iter=TARGET_REP_ITER,
memory_size=MEMORY_CAPACITY, e_greedy_increment=E_INCREMENT,)
total_steps = 0
running_r = 0
r_scale = 100
for i_episode in range(MAX_EPISODES):
s = env.reset() # (coord_x, coord_y, vel_x, vel_y, angle, angular_vel, l_leg_on_ground, r_leg_on_ground)
ep_r = 0
while True:
if total_steps > MEMORY_CAPACITY: env.render()
a = RL.choose_action(s)
s_, r, done, _ = env.step(a)
if r == -100: r = -30
r /= r_scale
ep_r += r
RL.store_transition(s, a, r, s_)
if total_steps > MEMORY_CAPACITY:
RL.learn()
if done:
land = '| Landed' if r == 100/r_scale else '| ------'
running_r = 0.99 * running_r + 0.01 * ep_r
print('Epi: ', i_episode,
land,
'| Epi_R: ', round(ep_r, 2),
'| Running_R: ', round(running_r, 2),
'| Epsilon: ', round(RL.epsilon, 3))
break
s = s_
total_steps += 1
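# Optional, not part of the original gist: a quick look at the recorded training loss.
# This assumes matplotlib is installed; RL.cost_his is filled in learn() above.
import matplotlib.pyplot as plt
if RL.cost_his:   # only if learning has actually started
    plt.plot(RL.cost_his)
    plt.xlabel('training step')
    plt.ylabel('loss')
    plt.show()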