@avalcarce
Created March 3, 2017 17:02
Solving MountainCar-v0 with DQN and Prioritized Experience Replay

Synopsis

This is a Deep Reinforcement Learning solution to the MountainCar-v0 environment in OpenAI's Gym. This code uses Tensorflow to model a value function for a Reinforcement Learning agent. I've run it with Tensorflow 1.0 on Python 3.5 under Windows 7.

The algorithm is a Deep Q Network (DQN) with Prioritized Experience Replay (PER). Most hyperparameters were chosen by hand based on past experience; however, the learning rate, the prioritization exponent alpha and the initial importance-sampling exponent beta0 were obtained via Bayesian optimization with Scikit-Optimize. The role of alpha and beta0 is summarized right after the hyperparameter list below.
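
As an illustration of that tuning step, here is a minimal sketch of how such a search could be set up with Scikit-Optimize's gp_minimize, driving the ExperimentsManager defined further down. This is not the exact tuning script used for this gist: the search ranges, the number of calls and the choice of objective (the mean number of episodes needed to reach the target reward, as returned by run_experiments) are illustrative assumptions.

from skopt import gp_minimize
from skopt.space import Real

from openai_playground.gymhelpers import ExperimentsManager


def objective(params):
    # Unpack the three hyperparameters being tuned.
    learning_rate, per_alpha, per_beta0 = params
    expsman = ExperimentsManager(env_name="MountainCar-v0",
                                 agent_value_function_hidden_layers_size=[256, 512],
                                 discount=0.99, decay_eps=0.99, eps_min=1E-4,
                                 learning_rate=learning_rate, max_step=200,
                                 replay_memory_max_size=1000000, batch_size=64,
                                 target_params_update_period_steps=1000, replay_period_steps=4,
                                 min_avg_rwd=-110, per_proportional_prioritization=True,
                                 per_apply_importance_sampling=True,
                                 per_alpha=per_alpha, per_beta0=per_beta0)
    # run_experiments returns (final 100-episode average reward, mean number of episodes
    # needed to reach min_avg_rwd); gp_minimize minimizes, so return the latter.
    _, n_eps_to_solve = expsman.run_experiments(n_exps=1, n_ep=1500, plot_results=False)
    return n_eps_to_solve


search_space = [Real(1E-5, 1E-2, prior="log-uniform", name="learning_rate"),
                Real(0.1, 1.0, name="per_alpha"),
                Real(0.1, 1.0, name="per_beta0")]
result = gp_minimize(objective, search_space, n_calls=20, random_state=0)
print("Best (learning_rate, per_alpha, per_beta0):", result.x)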

The hyperparameters are:

  • Size of 1st fully connected layer: 256
  • Size of 2nd fully connected layer: 512
  • Period of the update of the target network parameters: 1000 steps
  • Discount factor: 0.99
  • Decay factor for epsilon in epsilon-greedy policy: 0.99
  • Minimum epsilon in epsilon-greedy policy: 1E-4
  • Learning rate: 6E-4
  • Size of replay memory: 1000000
  • Period of experience replay: 4 steps
  • PER alpha: 0.8
  • PER beta0: 0.8
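
For reference, alpha and beta0 are the two exponents of Prioritized Experience Replay (reference 3 below), and they enter the code as follows. Each transition i is stored with priority p_i = (|delta_i| + epsilon)^alpha, where delta_i is its TD error and epsilon = 1E-6 keeps priorities strictly positive (see error2priority below). Transitions are then sampled with probability P(i) = p_i / sum_k p_k, and the importance-sampling correction applied to the updates is w_i = (N * P(i))^(-beta), normalized by its maximum over the batch, where N is the number of transitions currently in memory. beta is annealed linearly from beta0 towards 1 over the episodes of an experiment (see anneal_per_importance_sampling below).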

References

  1. Deep Learning tutorial, David Silver, Google DeepMind.
  2. My code on GitHub
  3. Prioritized Experience Replay, T. Schaul, J. Quan and D. Silver. Feb 2016.
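
# ----- agents.py: the epsilon-greedy agent (module name inferred from the "from .agents import AgentEpsGreedy" import further down) -----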
import numpy as np


class AgentEpsGreedy:
    def __init__(self, n_actions, value_function_model, eps=1., summaries_path_current=None):
        self.n_actions = n_actions
        self.value_func = value_function_model
        self.eps = eps
        self.summaries_path_current = summaries_path_current
        self.current_value = None  # Current value of the value function (i.e. expected discounted return)

    def act(self, state):
        action_values = self.value_func.predict([state])[0]
        policy = np.ones(self.n_actions) * self.eps / self.n_actions
        a_max = np.argmax(action_values)
        policy[a_max] += 1. - self.eps
        a = np.random.choice(self.n_actions, p=policy)
        self.current_value = action_values[a]
        return a

    def train(self, states, targets, w=None):
        loss, errors = self.value_func.train(states, targets, w=w)
        return loss, errors

    def predict_q_values(self, states, use_old_params=False):
        return self.value_func.predict(states, use_old_params)
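
# ----- datastructures.py: replay memories (module name inferred from the "from .datastructures import DoubleEndedQueue, SumTree" import further down) -----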
import numpy as np
from collections import deque


class DoubleEndedQueue:
    def __init__(self, max_size=128):
        self.max_size = max_size
        self.n_entries = 0
        self.memory = deque(maxlen=max_size)

    def sample(self, batch_size):
        batch_size = min(len(self.memory), batch_size)
        idxs = np.random.choice(len(self.memory), batch_size)
        return [self.memory[idx] for idx in idxs]

    def add(self, item):
        self.memory.append(item)
        if self.n_entries < self.max_size:
            self.n_entries += 1


class SumTree:
    """
    Version 5aa9f0b on Nov 7, 2016 from:
    https://github.com/jaara/AI-blog/blob/master/SumTree.py
    """
    def __init__(self, capacity=100000):
        self.capacity = capacity
        self.tree = np.zeros(2*capacity - 1)
        self.data = np.zeros(capacity, dtype=object)
        self.write = 0
        self.n_entries = 0
        self.tree_len = len(self.tree)

    def _propagate(self, idx, change):
        parent = (idx - 1) // 2
        self.tree[parent] += change
        if parent != 0:
            self._propagate(parent, change)

    def _retrieve(self, idx, s):
        left = 2 * idx + 1
        if left >= self.tree_len:
            return idx
        if s <= self.tree[left]:
            return self._retrieve(left, s)
        else:
            right = left + 1
            return self._retrieve(right, s - self.tree[left])

    def total(self):
        return self.tree[0]

    def add(self, p, data):
        idx = self.write + self.capacity - 1
        self.data[self.write] = data
        self.update(idx, p)
        self.write += 1
        if self.write >= self.capacity:
            self.write = 0
        if self.n_entries < self.capacity:
            self.n_entries += 1

    def update(self, idx, p):
        change = p - self.tree[idx]
        self.tree[idx] = p
        self._propagate(idx, change)

    def get(self, s):
        idx = self._retrieve(0, s)
        data_idx = idx - self.capacity + 1
        return idx, self.tree[idx], self.data[data_idx]

    def sample(self, batch_size):
        batch_idx = [None] * batch_size
        batch_priorities = [None] * batch_size
        batch = [None] * batch_size
        segment = self.total() / batch_size
        a = [segment * i for i in range(batch_size)]
        b = [segment * (i + 1) for i in range(batch_size)]
        s = np.random.uniform(a, b)
        for i in range(batch_size):
            (batch_idx[i], batch_priorities[i], batch[i]) = self.get(s[i])
        return batch_idx, batch_priorities, batch
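
# ----- gymhelpers.py: experiment management (module name inferred from the run script's "from openai_playground.gymhelpers import ExperimentsManager"); np and os are pulled in via "from .utils import *" -----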
from .utils import *
from .agents import AgentEpsGreedy
from .valuefunctions import ValueFunctionDQN
from .datastructures import DoubleEndedQueue, SumTree
import gym
from gym import wrappers
import copy
import time
from sys import platform
from textwrap import wrap

if platform == "linux" or platform == "linux2":
    import matplotlib
    matplotlib.use('Agg')  # This is to generate images without having a window appear.
import matplotlib.pyplot as plt


class ExperimentsManager:
    def __init__(self, env_name, agent_value_function_hidden_layers_size, results_dir_prefix=None, summaries_path=None,
                 figures_dir=None, discount=0.99, decay_eps=0.995, eps_min=0.0001, learning_rate=1E-4, decay_lr=False,
                 max_step=10000, replay_memory_max_size=100000, ep_verbose=False, exp_verbose=True, batch_size=64,
                 upload_last_exp=False, double_dqn=False, target_params_update_period_steps=1, gym_api_key="",
                 gym_algorithm_id=None, checkpoints_dir=None, min_avg_rwd=-110, replay_period_steps=1,
                 per_proportional_prioritization=False, per_apply_importance_sampling=False, per_alpha=0.6,
                 per_beta0=0.4):
        self.env_name = env_name
        self.results_dir_prefix = results_dir_prefix
        self.gym_stats_dir = None
        self.summaries_path = summaries_path
        self.summaries_path_current = summaries_path
        self.figures_dir = figures_dir
        self.discount = discount
        self.decay_eps = decay_eps
        self.eps_min = eps_min
        self.learning_rate = learning_rate
        self.decay_lr = decay_lr
        self.max_step = max_step
        self.n_ep = None
        self.replay_period_steps = replay_period_steps
        self.replay_memory_max_size = replay_memory_max_size
        self.ep_verbose = ep_verbose    # Whether or not to print progress during episodes
        self.exp_verbose = exp_verbose  # Whether or not to print progress during experiments
        self.upload_last_exp = upload_last_exp
        assert target_params_update_period_steps > 0, "The period for updating the target parameters must be positive."
        self.target_params_update_period_steps = target_params_update_period_steps
        self.gym_api_key = gym_api_key
        self.gym_algorithm_id = gym_algorithm_id
        self.checkpoints_dir = checkpoints_dir
        self.checkpoints_dir_current = checkpoints_dir

        # Prioritized Experience Replay parameters. See https://arxiv.org/pdf/1511.05952.pdf
        self.per_proportional_prioritization = per_proportional_prioritization  # Flavour of Prioritized Experience Rep.
        self.per_apply_importance_sampling = per_apply_importance_sampling
        self.prio_max = 0
        self.per_epsilon = 1E-6
        self.per_alpha = per_alpha
        self.per_beta0 = per_beta0
        self.per_beta = self.per_beta0

        self.agent = None
        self.memory = None  # Experience replay memory
        self.batch_size = batch_size
        self.agent_value_function_hidden_layers_size = agent_value_function_hidden_layers_size
        self.double_dqn = double_dqn

        self.global_step = 0  # Current step over all episodes
        self.step = 0         # Current step per episode
        self.ep = 0
        self.exp = 0
        self.step_durations_s = np.zeros(shape=self.max_step, dtype=float)

        self.min_avg_rwd = min_avg_rwd  # Minimum average reward to consider the problem as solved
        self.n_avg_ep = 100             # Number of consecutive episodes to calculate the average reward

        self.conf_msg = "\nEXECUTING EXPERIMENT {} OF {} IN ENVIRONMENT {}."
        self.episode_progress_msg = "Step {:5d}/{:5d}. Avg step duration: {:3.6f} ms." + \
                                    " Loss = {:3.2e}."
        self.exp_progress_msg = "Exp {:3d}. Ep {:5d}, Rwd={:4.0f} (mean={:4.0f} over {:3d} episodes)." + \
                                " {} exceeded in {:4d} eps. Loss={:1.2e} (avg={:1.2e}). Agent epsilon={:3.2f} %." + \
                                " Average step duration: {:2.6f} ms."
        self.exps_conf_str = ""

        # Memory pre-allocation
        self.Rwd_per_ep_v = np.zeros((1, 5000))
        self.Loss_per_ep_v = np.zeros((1, 5000))
        self.Avg_Rwd_per_ep = np.zeros((1, 5000))
        self.Avg_Loss_per_ep = np.zeros((1, 5000))
        self.n_eps_to_reach_min_avg_rwd = np.zeros(1, dtype=float)
        self.Agent_Epsilon_per_ep = np.zeros((1, 5000))
        self.agent_value_function = np.zeros((1, 1, self.max_step))
        self.rwd_exps_avg = np.mean(self.Rwd_per_ep_v, axis=0)  # Rwd averaged over all experiments
        self.rwd_exps_avg_ma = np.zeros(self.rwd_exps_avg.shape[0])
        self.rwd_exps_avg_movstd = np.zeros(self.rwd_exps_avg.shape[0])
        self.rwd_exps_avg_percentile5 = np.zeros(self.rwd_exps_avg.shape[0])
        self.rwd_exps_avg_percentile95 = np.zeros(self.rwd_exps_avg.shape[0])

    def __print_episode_progress(self, loss_v):
        if self.ep_verbose:
            if self.step > 0 and (self.step+1) % 20 == 0:
                print(self.episode_progress_msg.format(self.step, self.max_step,
                                                       np.mean(self.step_durations_s[self.ep, 0:self.step]) * 1000,
                                                       loss_v))

    def __retrieve_experience(self):
        idx = None
        priorities = None
        w = None

        # Extract a batch of random transitions from the replay memory
        if self.per_proportional_prioritization:
            idx, priorities, experience = self.memory.sample(self.batch_size)
            if self.per_apply_importance_sampling:
                sampling_probabilities = priorities / self.memory.total()
                w = np.power(self.memory.n_entries * sampling_probabilities, -self.per_beta)
                w = w / w.max()
        else:
            experience = self.memory.sample(self.batch_size)
        return idx, priorities, w, experience

    @staticmethod
    def __format_experience(experience):
        states_b, actions_b, rewards_b, states_n_b, done_b = zip(*experience)
        states_b = np.array(states_b)
        actions_b = np.array(actions_b)
        rewards_b = np.array(rewards_b)
        states_n_b = np.array(states_n_b)
        done_b = np.array(done_b).astype(int)
        return states_b, actions_b, rewards_b, states_n_b, done_b

    def __train_on_experience(self):
        loss_v = 0
        if self.memory.n_entries >= self.batch_size:
            idx, priorities, w, experience = self.__retrieve_experience()
            states_b, actions_b, rewards_b, states_n_b, done_b = self.__format_experience(experience)

            if self.double_dqn:
                q_n_b = self.agent.predict_q_values(states_n_b)  # Action values on the arriving state
                best_a = np.argmax(q_n_b, axis=1)
                q_n_target_b = self.agent.predict_q_values(states_n_b, use_old_params=True)
                targets_b = rewards_b + (1. - done_b) * self.discount * q_n_target_b[np.arange(self.batch_size), best_a]
            else:
                q_n_b = self.agent.predict_q_values(states_n_b, use_old_params=True)  # Action values on the next state
                targets_b = rewards_b + (1. - done_b) * self.discount * np.amax(q_n_b, axis=1)

            targets = self.agent.predict_q_values(states_b)
            for j, action in enumerate(actions_b):
                targets[j, action] = targets_b[j]

            if self.per_apply_importance_sampling:
                loss_v, errors = self.agent.train(states_b, targets, w=w)
            else:
                loss_v, errors = self.agent.train(states_b, targets)
            errors = errors[np.arange(len(errors)), actions_b]

            if self.per_proportional_prioritization:  # Update transition priorities
                priorities = self.error2priority(errors)
                for i in range(self.batch_size):
                    self.memory.update(idx[i], priorities[i])
                self.prio_max = max(priorities.max(), self.prio_max)
        return loss_v

    def error2priority(self, errors):
        return np.power(np.abs(errors) + self.per_epsilon, self.per_alpha)

    def __print_experiment_progress(self):
        if self.exp_verbose:
            rwd = self.Rwd_per_ep_v[self.exp, self.ep]
            avg_rwd = self.Avg_Rwd_per_ep[self.exp, self.ep]
            loss = self.Loss_per_ep_v[self.exp, self.ep]
            avg_loss = self.Avg_Loss_per_ep[self.exp, self.ep]

            avg_rwds = self.Avg_Rwd_per_ep[self.exp, 0:self.ep+1]
            i_last_low_rwd = np.max(np.where(avg_rwds < self.min_avg_rwd))
            n_solved_eps = self.ep - i_last_low_rwd

            duration_ms = 0
            if self.ep > 0:
                duration_ms = np.mean(self.step_durations_s[0:self.ep, :]) * 1000
            print(
                self.exp_progress_msg.format(self.exp, self.ep, rwd, avg_rwd, self.n_avg_ep, self.min_avg_rwd,
                                             n_solved_eps, loss, avg_loss, self.agent.eps*100, duration_ms))

    def anneal_per_importance_sampling(self):
        if self.per_proportional_prioritization and self.per_apply_importance_sampling:
            self.per_beta = self.per_beta0 + self.ep*(1-self.per_beta0)/self.n_ep

    def run_episode(self, env, train=True):
        state = env.reset()
        done = False
        total_reward = 0
        loss_v = 0
        for self.step in range(self.max_step):
            # Maybe update the target estimator
            if self.global_step % self.target_params_update_period_steps == 0:
                self.agent.value_func.update_old_params()
                if self.ep_verbose:
                    print("Copied model parameters to target network.")

            self.anneal_per_importance_sampling()

            t = time.time()
            self.__print_episode_progress(loss_v)

            if done:
                break
            action = self.agent.act(state)
            self.agent_value_function[self.exp, self.ep, self.step] = self.agent.current_value
            self.global_step += 1
            state_next, reward, done, info = env.step(action)
            total_reward += reward

            if self.memory is not None:
                experience = (state, action, reward, state_next, done)
                if self.per_proportional_prioritization:
                    self.memory.add(max(self.prio_max, self.per_epsilon), experience)
                else:
                    self.memory.add(experience)
                if train:
                    if self.global_step % self.replay_period_steps == 0:
                        loss_v = self.__train_on_experience()
            else:
                raise NotImplementedError("Please provide an Experience Replay memory")

            state = copy.copy(state_next)
            self.step_durations_s[self.ep, self.step] = time.time() - t  # Time elapsed during this step
        return loss_v, total_reward

    def run_experiment(self, env, n_ep, stop_training_min_avg_rwd=None):
        self.n_ep = n_ep
        self.global_step = 0
        train = True

        # One experiment is composed of n_ep sequential episodes
        for self.ep in range(n_ep):
            loss_v, total_reward = self.run_episode(env, train)

            # Collect episode results
            self.Rwd_per_ep_v[self.exp, self.ep] = total_reward
            self.Loss_per_ep_v[self.exp, self.ep] = loss_v

            # Calculate episode statistics
            last_rwds = self.Rwd_per_ep_v[self.exp, np.maximum(self.ep - (self.n_avg_ep - 1), 0):self.ep+1]
            last_losses = self.Loss_per_ep_v[self.exp, np.maximum(self.ep - (self.n_avg_ep - 1), 0):self.ep+1]
            self.Avg_Rwd_per_ep[self.exp, self.ep] = np.mean(last_rwds)
            self.Avg_Loss_per_ep[self.exp, self.ep] = np.mean(last_losses)
            self.Agent_Epsilon_per_ep[self.exp, self.ep] = self.agent.eps

            if stop_training_min_avg_rwd is not None:
                if train and self.Avg_Rwd_per_ep[self.exp, self.ep] >= stop_training_min_avg_rwd:
                    train = False
                    print("Minimum average reward reached. Stopping training.")

            if self.Avg_Rwd_per_ep[self.exp, self.ep] >= self.min_avg_rwd:
                self.n_eps_to_reach_min_avg_rwd[self.exp] = np.minimum(self.ep,
                                                                       self.n_eps_to_reach_min_avg_rwd[self.exp])

            if self.agent.eps > self.eps_min:
                self.agent.eps *= self.decay_eps

            self.__print_experiment_progress()

    def __create_gym_stats_directory(self, env):
        if self.results_dir_prefix is None:
            raise ValueError("A prefix for the Gym results directory must be provided.")
        if not os.path.exists(self.results_dir_prefix):
            os.makedirs(self.results_dir_prefix)
        t = get_last_folder_id(self.results_dir_prefix) + 1  # Calculate next test id
        self.gym_stats_dir = os.path.join(self.results_dir_prefix, str(t).zfill(4))
        if not os.path.exists(self.gym_stats_dir):
            os.makedirs(self.gym_stats_dir)
        else:
            raise FileExistsError(self.gym_stats_dir)
        return wrappers.Monitor(env, self.gym_stats_dir)

    def __build_experiments_conf_str(self, n_exps, n_ep, n_actions, state_dim):
        layers_size = str(state_dim)
        for s in self.agent_value_function_hidden_layers_size:
            layers_size += "-" + str(s)
        layers_size += "-" + str(n_actions)

        exp_conf_str = "{}_{}_D{:1.2f}_DE{:1.2e}_Em{:1.2e}_LR{:1.2e}_DL{}_MS{}_" + \
                       "DDQN{}_N{:1.1e}_BS{}_NEx{}_NEp{}_C{}_K{}_PER{}_IS{}_a{:1.2f}_b0{:1.2f}"
        self.exps_conf_str = exp_conf_str.format(time.strftime("%Y_%m_%d_%H_%M_%S"), layers_size, self.discount,
                                                 self.decay_eps, self.eps_min, self.learning_rate,
                                                 1 if self.decay_lr else 0, self.max_step, 1 if self.double_dqn else 0,
                                                 self.replay_memory_max_size, self.batch_size, n_exps, n_ep,
                                                 self.target_params_update_period_steps, self.replay_period_steps,
                                                 1 if self.per_proportional_prioritization else 0,
                                                 1 if self.per_apply_importance_sampling else 0,
                                                 self.per_alpha, self.per_beta0)

    def __create_figures_directory(self):
        if self.figures_dir is not None:
            self.figures_dir = os.path.join(self.figures_dir, self.env_name, self.exps_conf_str)
            if not os.path.exists(self.figures_dir):
                os.makedirs(self.figures_dir)
            else:
                for dirpath, dirnames, files in os.walk(self.figures_dir):
                    if files:
                        raise FileExistsError("The figures directory exists and has files: {}".format(self.figures_dir))
                    else:
                        break

    def get_environment_actions(self, env):
        if isinstance(env.action_space, gym.spaces.Box):
            raise NotImplementedError("Continuous action spaces are not supported yet.")
        elif isinstance(env.action_space, gym.spaces.Discrete):
            n_actions = env.action_space.n
        else:
            raise NotImplementedError("{} action spaces are not supported yet.".format(type(env.action_space)))
        return n_actions

    def run_experiments(self, n_exps, n_ep, stop_training_min_avg_rwd=None, plot_results=True, figures_format=None):
        self.Rwd_per_ep_v = np.zeros((n_exps, n_ep))
        self.Loss_per_ep_v = np.zeros((n_exps, n_ep))
        self.Avg_Rwd_per_ep = np.zeros((n_exps, n_ep))
        self.n_eps_to_reach_min_avg_rwd = np.zeros(n_exps, dtype=float)
        self.n_eps_to_reach_min_avg_rwd.fill(n_ep)
        self.Avg_Loss_per_ep = np.zeros((n_exps, n_ep))
        self.Agent_Epsilon_per_ep = np.zeros((n_exps, n_ep))
        self.agent_value_function = np.zeros((n_exps, n_ep, self.max_step))
        self.step_durations_s = np.zeros(shape=(n_ep, self.max_step), dtype=float)

        # Create environment
        env = gym.make(self.env_name)
        n_actions = self.get_environment_actions(env)
        state_dim = env.observation_space.high.shape[0]

        self.__build_experiments_conf_str(n_exps, n_ep, n_actions, state_dim)
        self.__create_figures_directory()

        for self.exp in range(n_exps):
            print(self.conf_msg.format(self.exp, n_exps, self.env_name))
            print(self.exps_conf_str)

            env = gym.make(self.env_name)  # Create new environment
            assert state_dim == env.observation_space.high.shape[0]
            if self.upload_last_exp and self.exp == n_exps-1:
                env = self.__create_gym_stats_directory(env)

            if self.summaries_path is not None:
                self.summaries_path_current = os.path.join(self.summaries_path,
                                                           self.env_name,
                                                           self.exps_conf_str + "_Exp" + str(self.exp))
            if self.checkpoints_dir is not None:
                self.checkpoints_dir_current = os.path.join(self.checkpoints_dir,
                                                            self.env_name,
                                                            self.exps_conf_str + "_Exp" + str(self.exp))
                if not os.path.exists(self.checkpoints_dir_current):
                    os.makedirs(self.checkpoints_dir_current)

            # Create agent
            value_function = ValueFunctionDQN(scope="q", state_dim=state_dim, n_actions=n_actions,
                                              train_batch_size=self.batch_size, learning_rate=self.learning_rate,
                                              hidden_layers_size=self.agent_value_function_hidden_layers_size,
                                              decay_lr=self.decay_lr, huber_loss=False,
                                              summaries_path=self.summaries_path_current,
                                              reset_default_graph=True,
                                              checkpoints_dir=self.checkpoints_dir_current,
                                              apply_wis=self.per_apply_importance_sampling)
            self.agent = AgentEpsGreedy(n_actions=n_actions, value_function_model=value_function, eps=0.9,
                                        summaries_path_current=self.summaries_path_current)
            if self.per_proportional_prioritization:
                self.memory = SumTree(self.replay_memory_max_size)
            else:
                self.memory = DoubleEndedQueue(max_size=self.replay_memory_max_size)

            self.run_experiment(env, n_ep, stop_training_min_avg_rwd)  # This is where the action happens

            value_function.close_summary_file()
            env.close()
            if self.upload_last_exp and self.exp == n_exps - 1:
                print("Trying to upload results to the scoreboard.")
                gym.upload(self.gym_stats_dir, api_key=self.gym_api_key, algorithm_id=self.gym_algorithm_id)

            # Plot results
            self.plot_rwd_loss(figures_format=figures_format)
            self.plot_value_function(figures_format=figures_format)
            self.print_experiment_summary()

        self.calculate_avg_rwd()
        self.plot_rwd_averages(n_exps, figures_format=figures_format)
        self.print_summary()

        if plot_results:
            plt.show()

        # Return the final Rwd averaged over all experiments AND the mean number of episodes needed to reach the min Rwd
        return self.rwd_exps_avg_ma[-1], np.mean(self.n_eps_to_reach_min_avg_rwd)

    def print_experiment_summary(self):
        duration_ms = np.mean(self.step_durations_s) * 1000
        print("Average step duration: {:2.6f} ms".format(duration_ms))

    def calculate_avg_rwd(self):
        self.rwd_exps_avg = np.mean(self.Rwd_per_ep_v, axis=0)  # Rwd averaged over all experiments
        self.rwd_exps_avg_ma = np.zeros(self.rwd_exps_avg.shape[0])
        self.rwd_exps_avg_movstd = np.zeros(self.rwd_exps_avg.shape[0])
        self.rwd_exps_avg_percentile5 = np.zeros(self.rwd_exps_avg.shape[0])
        self.rwd_exps_avg_percentile95 = np.zeros(self.rwd_exps_avg.shape[0])

        for s in range(self.rwd_exps_avg.shape[0]):
            self.rwd_exps_avg_ma[s] = np.mean(self.rwd_exps_avg[max(0, s - 99):s + 1])
            self.rwd_exps_avg_movstd[s] = np.std(self.rwd_exps_avg[max(0, s - 99):s + 1])
            self.rwd_exps_avg_percentile5[s] = np.percentile(self.rwd_exps_avg[max(0, s - 99):s + 1], 5)
            self.rwd_exps_avg_percentile95[s] = np.percentile(self.rwd_exps_avg[max(0, s - 99):s + 1], 95)

    def plot_rwd_averages(self, n_exps, figures_format=None):
        n_ep = self.Rwd_per_ep_v.shape[1]
        eps = range(n_ep)

        if self.figures_dir is not None:
            # PLOT ALL EXPERIMENTS
            fig = plt.figure()
            for i in range(n_exps):
                plt.plot(eps, self.Avg_Rwd_per_ep[i, :], label="Exp {}".format(i))
            # plt.ylim([-self.max_step - 10, -70])
            plt.xlabel("Episode number")
            plt.ylabel("Reward")
            plt.grid(True)
            plt.legend(loc='upper left')
            ttl = "Average reward. " + self.exps_conf_str
            plt.title("\n".join(wrap(ttl, 60)))
            if self.figures_dir is not None:
                fig_savepath = os.path.join(self.figures_dir, "RwdsComparisonsAcrossExps.png")
                plt.savefig(fig_savepath)
                if figures_format is not None:
                    try:
                        fig_savepath = os.path.join(self.figures_dir,
                                                    "RwdsComparisonsAcrossExps.{}".format(figures_format))
                        plt.savefig(fig_savepath, format=figures_format)
                    except:
                        print("Error while saving figure in {} format.".format(figures_format))
            plt.close(fig)

            # PLOT AVERAGE OVER ALL EXPERIMENTS
            fig = plt.figure()
            plt.subplot(211)
            plt.plot(eps, self.rwd_exps_avg, label="Average over {:3d} experiments".format(n_exps))
            # plt.ylim([-self.max_step - 10, -70])
            plt.ylabel("Reward per episode")
            plt.grid(True)
            plt.plot(eps, self.rwd_exps_avg_percentile95, label="95th percentile over 100 episodes")
            plt.plot(eps, self.rwd_exps_avg_ma, label="100-episode moving average")
            plt.plot(eps, self.rwd_exps_avg_percentile5, label="5th percentile over 100 episodes")
            plt.legend(loc='lower right')
            plt.title("Final average reward: {:3.2f} (std={:3.2f})".format(self.rwd_exps_avg_ma[-1],
                                                                           self.rwd_exps_avg_movstd[-1]))

            loss_exps_avg = np.mean(self.Loss_per_ep_v, axis=0)
            plt.subplot(212)
            plt.semilogy(eps, loss_exps_avg, label="Average over {:3d} experiments".format(n_exps))
            plt.xlabel("Episode number")
            plt.ylabel("Loss per episode")
            plt.grid(True)
            loss_exps_avg_ma = np.zeros(loss_exps_avg.shape[0])
            for s in range(loss_exps_avg.shape[0]):
                loss_exps_avg_ma[s] = np.mean(loss_exps_avg[max(0, s - 100):s + 1])
            plt.plot(eps, loss_exps_avg_ma, label="100-episode moving average")
            plt.legend(loc='lower right')

            plt.suptitle("\n".join(wrap(self.exps_conf_str, 60)))
            plt.tight_layout()
            plt.subplots_adjust(top=0.85)
            if self.figures_dir is not None:
                fig_savepath = os.path.join(self.figures_dir, "ExpsAverage.png")
                plt.savefig(fig_savepath)
                if figures_format is not None:
                    try:
                        fig_savepath = os.path.join(self.figures_dir, "ExpsAverage.{}".format(figures_format))
                        plt.savefig(fig_savepath, format=figures_format)
                    except:
                        print("Error while saving figure in {} format.".format(figures_format))
            plt.close(fig)

    def print_summary(self):
        n_eps = np.argmax(self.rwd_exps_avg_ma >= self.min_avg_rwd)
        print("Average final reward: {:3.2f} (std={:3.2f}).\n".format(self.rwd_exps_avg_ma[-1],
                                                                      self.rwd_exps_avg_movstd[-1]))
        if n_eps is None:
            print("The 100-episode moving average never reached {}.".format(self.min_avg_rwd))
        else:
            print("The 100-episode moving average reached {} after {} episodes.".format(self.min_avg_rwd, n_eps))

    def plot_value_function(self, figures_format=None):
        if self.figures_dir is not None:
            n_ep = self.Rwd_per_ep_v.shape[1]
            fig = plt.figure()
            for ep in draw_equispaced_items_from_sequence(7, n_ep):
                plt.plot(self.agent_value_function[self.exp, ep, :], label="Episode {:4d}".format(ep))
            plt.xlabel("Steps")
            plt.ylabel("Value")
            plt.grid(True)
            plt.legend(loc='lower right')
            plt.title("Value functions for experiment {:2d}".format(self.exp))
            if self.figures_dir is not None:
                fig_savepath = os.path.join(self.figures_dir, "Exp{}_ValueFuncs.png".format(self.exp))
                plt.savefig(fig_savepath)
                if figures_format is not None:
                    try:
                        fig_savepath = os.path.join(self.figures_dir,
                                                    "Exp{}_ValueFuncs.{}".format(self.exp, figures_format))
                        plt.savefig(fig_savepath, format=figures_format)
                    except:
                        print("Error while saving figure in {} format.".format(figures_format))
            plt.close(fig)

    def plot_rwd_loss(self, figures_format=None):
        if self.figures_dir is not None:
            n_ep = self.Rwd_per_ep_v.shape[1]
            eps = range(n_ep)
            fig = plt.figure()
            ax1 = plt.subplot(211)
            plt.plot(eps, self.Rwd_per_ep_v[self.exp, :], label="Instantaneous")
            plt.plot(eps, self.Avg_Rwd_per_ep[self.exp, :], label="Mean over {} eps".format(self.n_avg_ep))
            # plt.ylim([-self.max_step - 10, -70])
            plt.xlabel("Episode number")
            plt.ylabel("Reward per episode")
            ax2 = ax1.twinx()
            plt.plot(eps, self.Agent_Epsilon_per_ep[self.exp, :], label="Agent epsilon", color='r')
            ax2.set_ylabel(r'Agent $\varepsilon$', color='r')
            ax2.tick_params('y', colors='r')
            plt.grid(True)
            ttl = "Final average reward: {:3.2f} (SD={:3.2f})"
            plt.title(ttl.format(self.Avg_Rwd_per_ep[self.exp, -1],
                                 np.std(self.Rwd_per_ep_v[self.exp, n_ep-100:n_ep-1])))
            plt.legend(loc='lower right')

            rwd_per_ep_exp_avg = np.mean(self.Rwd_per_ep_v[0:self.exp+1, n_ep-100:n_ep-1], axis=1)
            print("Final mean reward, averaged over {} experiment{}: {} (std = {}).".format(self.exp+1,
                                                                                            's' if self.exp > 0 else '',
                                                                                            np.mean(rwd_per_ep_exp_avg),
                                                                                            np.std(rwd_per_ep_exp_avg)))

            plt.subplot(212)
            plt.semilogy(eps, self.Loss_per_ep_v[self.exp, :], label="Instantaneous")
            plt.semilogy(eps, self.Avg_Loss_per_ep[self.exp, :], label="Mean over {} eps".format(self.n_avg_ep))
            plt.xlabel("Episode number")
            plt.ylabel("Loss per episode")
            plt.grid(True)
            plt.title("Value function loss")
            plt.legend(loc='lower right')

            sttl = self.exps_conf_str + ". Experiment {}".format(self.exp)
            plt.suptitle("\n".join(wrap(sttl, 60)))
            plt.tight_layout()
            plt.subplots_adjust(top=0.85)
            if self.figures_dir is not None:
                fig_savepath = os.path.join(self.figures_dir, "Experiment{}_Rwd_Loss.png".format(self.exp))
                plt.savefig(fig_savepath)
                if figures_format is not None:
                    try:
                        fig_savepath = os.path.join(self.figures_dir, "Experiment{}_Rwd_Loss.{}".format(self.exp,
                                                                                                        figures_format))
                        plt.savefig(fig_savepath, format=figures_format)
                    except:
                        print("Error while saving figure in {} format.".format(figures_format))
            plt.close(fig)
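
# ----- Run script (file name not shown in the gist); it drives ExperimentsManager from the openai_playground package -----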
import os

from openai_playground.gymhelpers import ExperimentsManager

env_name = "MountainCar-v0"
gym_stats_dir_prefix = os.path.join('Gym_stats', env_name)
figures_dir = 'Figures'
api_key = '###'

n_ep = 1500
n_exps = 1

expsman = ExperimentsManager(env_name=env_name, agent_value_function_hidden_layers_size=[256, 512],
                             figures_dir=figures_dir, discount=0.99, decay_eps=0.99, eps_min=1E-4, learning_rate=6E-4,
                             decay_lr=False, max_step=200, replay_memory_max_size=1000000, ep_verbose=False,
                             exp_verbose=True, batch_size=64, upload_last_exp=True, double_dqn=False,
                             target_params_update_period_steps=1000, replay_period_steps=4, min_avg_rwd=-110,
                             per_proportional_prioritization=True, per_apply_importance_sampling=True, per_alpha=0.8,
                             per_beta0=0.8, gym_api_key=api_key, results_dir_prefix=gym_stats_dir_prefix)
expsman.run_experiments(n_exps=n_exps, n_ep=n_ep, stop_training_min_avg_rwd=-100, plot_results=False)

input("Press Enter to terminate.")
import os

import numpy as np


def get_last_folder_id(folder_path):
    t = 0
    for fn in os.listdir(folder_path):
        t = max(t, int(fn))
    return t


def movingaverage(values, window):
    weights = np.repeat(1.0, window) / window
    sma = np.convolve(values, weights, 'valid')
    return sma


def draw_equispaced_items_from_sequence(m, n):
    """
    draw_equispaced_items_from_sequence(m, n)

    Args:
        m (int): How many items to draw.
        n (int): Length of sequence to draw from.
    """
    return [i * n // m + n // (2 * m) for i in range(m)]
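
# ----- valuefunctions.py: the DQN value-function model in Tensorflow (module name inferred from the "from .valuefunctions import ValueFunctionDQN" import in gymhelpers) -----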
import os

import tensorflow as tf
import numpy as np


class ValueFunctionDQN:
    def __init__(self, scope="MyValueFunctionEstimator", state_dim=2, n_actions=3, train_batch_size=64,
                 learning_rate=1e-4, hidden_layers_size=None, decay_lr=False, huber_loss=False, summaries_path=None,
                 reset_default_graph=False, checkpoints_dir=None, apply_wis=False):
        # Input check
        if hidden_layers_size is None:
            hidden_layers_size = [128, 64]  # Default ANN architecture
        assert len(hidden_layers_size) >= 1, "At least one hidden layer must be specified."

        # Support variables
        self.scope = scope
        self.layers_size = [state_dim] + hidden_layers_size + [n_actions]  # Size of all layers (including in & out)
        self.weights = []
        self.biases = []
        self.weights_old = []
        self.biases_old = []
        self.learning_rate = learning_rate
        self.train_batch_size = train_batch_size
        self.n_train_epochs = 0
        self.summaries_path = summaries_path
        self.train_writer = None
        self.checkpoints_dir = checkpoints_dir

        # Apply Weighted Importance Sampling. See "Weighted importance sampling for off-policy learning with linear
        # function approximation". In Advances in Neural Information Processing Systems, pp. 3014-3022, 2014
        # https://pdfs.semanticscholar.org/f8ef/8d1c31ae97c8acdd2d758dd2c0fe4e4bd6d7.pdf
        self.apply_wis = apply_wis

        if reset_default_graph:
            tf.reset_default_graph()

        # Build Tensorflow graph
        with tf.variable_scope(self.scope):
            # Inputs, weights, biases and targets of the ANN
            self.x = tf.placeholder(tf.float32, shape=(None, state_dim), name="x")
            self.train_targets = tf.placeholder(tf.float32, shape=(None, n_actions), name="train_targets")
            for l in range(len(self.layers_size) - 1):
                self.weights.append(tf.get_variable(name="w" + str(l), shape=[self.layers_size[l],
                                                                              self.layers_size[l + 1]],
                                                    initializer=tf.contrib.layers.xavier_initializer()))
                self.biases.append(tf.get_variable(name="b" + str(l), shape=[self.layers_size[l + 1]],
                                                   initializer=tf.constant_initializer(0.0)))
                self.weights_old.append(tf.get_variable(name="w-" + str(l),
                                                        initializer=self.weights[l].initialized_value()))
                self.biases_old.append(tf.get_variable(name="b-" + str(l),
                                                       initializer=self.biases[l].initialized_value()))

            if summaries_path is not None:
                with tf.name_scope('params_summaries'):
                    for l in range(len(self.layers_size) - 1):
                        self.variable_summaries(self.weights[l], "w" + str(l), histogram=True)
                        self.variable_summaries(self.biases[l], "b" + str(l), histogram=True)

            # Interconnection of the various ANN nodes
            self.prediction = self.model(self.x)
            self.prediction_with_old_params = self.model(self.x, use_old_params=True)

            # Training calculations
            if huber_loss:
                self.loss = self.huber_loss(self.train_targets, self.prediction)
            else:
                self.E = tf.subtract(self.train_targets, self.prediction, name="Error")
                self.SE = tf.square(self.E, name="SquaredError")
                if self.apply_wis:
                    self.rho = tf.placeholder(tf.float32, shape=(train_batch_size, n_actions), name="wis_weights")
                    self.loss = tf.reduce_mean(tf.multiply(self.rho, self.SE), name="loss")
                else:
                    self.loss = tf.reduce_mean(self.SE, name="loss")

            self.global_step = tf.Variable(0, trainable=False)
            if decay_lr:
                self.learning_rate = tf.train.exponential_decay(1e-4, self.global_step, 3000 * 200, 1e-5 / 1e-4)
            self.opt_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            self.train_op = self.opt_op.minimize(self.loss, global_step=self.global_step)
            self.init_op = tf.global_variables_initializer()

            # Operations to update the target Q network
            self.update_ops = []
            for l in range(len(self.layers_size) - 1):
                self.update_ops.append(self.weights_old[l].assign(self.weights[l]))
                self.update_ops.append(self.biases_old[l].assign(self.biases[l]))

            if self.summaries_path is not None:
                self.variable_summaries(self.loss, "loss", scalar_only=True)
                self.variable_summaries(self.learning_rate, "learning_rate", scalar_only=True)

        if self.checkpoints_dir is not None:
            var_list = []
            for l in range(len(self.layers_size) - 1):
                var_list.append(self.weights[l])
                var_list.append(self.biases[l])
            self.saver = tf.train.Saver(var_list, pad_step_number=True)

        if self.summaries_path is not None:
            self.merged_summaries = tf.summary.merge_all()
            self.summaries_path += "_{}".format(self.scope)
            if not os.path.exists(self.summaries_path):
                os.makedirs(self.summaries_path)
            self.train_writer = tf.summary.FileWriter(self.summaries_path, graph=tf.get_default_graph())
        else:
            self.merged_summaries = None

        self.session = None

    def model(self, x, use_old_params=False):
        z = []
        hidden = [x]
        for l in range(len(self.layers_size) - 2):
            if use_old_params:
                z.append(tf.matmul(hidden[l], self.weights_old[l]) + self.biases_old[l])
            else:
                z.append(tf.matmul(hidden[l], self.weights[l]) + self.biases[l])
            hidden.append(tf.nn.relu(z[l], name="hidden_" + str(l + 1)))
        if use_old_params:
            z.append(tf.matmul(hidden[-1], self.weights_old[-1]) + self.biases_old[-1])
        else:
            z.append(tf.matmul(hidden[-1], self.weights[-1]) + self.biases[-1])

        if not use_old_params:
            if self.summaries_path is not None:
                with tf.name_scope('layers_summaries'):
                    for l in range(len(self.layers_size) - 1):
                        self.variable_summaries(z[l], "z" + str(l))
                        self.variable_summaries(hidden[l], "hidden" + str(l))
        return z[-1]  # Output layer has Identity units.

    @staticmethod
    def huber_loss(targets, predictions):
        error = targets - predictions
        fn_choice_maker1 = (tf.to_int32(tf.sign(error + 1)) + 1) / 2
        fn_choice_maker2 = (tf.to_int32(tf.sign(-error + 1)) + 1) / 2
        choice_maker_sqr = tf.to_float(tf.multiply(fn_choice_maker1, fn_choice_maker2))
        sqr_contrib = tf.multiply(choice_maker_sqr, tf.square(error)*0.5)
        abs_contrib = tf.abs(error)-0.5 - tf.multiply(choice_maker_sqr, tf.abs(error)-0.5)
        loss = tf.reduce_mean(sqr_contrib + abs_contrib)
        return loss

    def init_tf_session(self):
        if self.session is None:
            self.session = tf.Session()
            self.session.run(self.init_op)  # Global Variables Initializer (init op)

    def predict(self, states, use_old_params=False):
        self.init_tf_session()  # Make sure the Tensorflow session exists

        feed_dict = {self.x: states}
        if use_old_params:
            q = self.session.run(self.prediction_with_old_params, feed_dict=feed_dict)
        else:
            q = self.session.run(self.prediction, feed_dict=feed_dict)
        return q

    def train(self, states, targets, w=None):
        self.init_tf_session()  # Make sure the Tensorflow session exists

        feed_dict = {self.x: states, self.train_targets: targets}
        if self.apply_wis:
            feed_dict[self.rho] = np.transpose(np.tile(w, (self.layers_size[-1], 1)))

        if self.summaries_path is not None and self.n_train_epochs % 2000 == 0:
            fetches = [self.loss, self.train_op, self.E, self.merged_summaries]
        else:
            fetches = [self.loss, self.train_op, self.E]
        values = self.session.run(fetches, feed_dict=feed_dict)

        if self.summaries_path is not None and self.n_train_epochs % 2000 == 0:
            self.train_writer.add_summary(values[3], global_step=self.n_train_epochs)
        if self.checkpoints_dir is not None and self.n_train_epochs % 40000 == 0:
            self.saver.save(self.session, self.checkpoints_dir, global_step=self.global_step)

        self.n_train_epochs += 1
        return values[0], values[2]

    @staticmethod
    def variable_summaries(var, name, histogram=False, scalar_only=False):
        """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
        if scalar_only:
            tf.summary.scalar(name, var)
        else:
            mean = tf.reduce_mean(var)
            tf.summary.scalar(name + '_mean', mean)
            with tf.name_scope('stddev'):
                stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
            tf.summary.scalar(name + '_stddev', stddev)
            tf.summary.scalar(name + '_max', tf.reduce_max(var))
            tf.summary.scalar(name + '_min', tf.reduce_min(var))
            if histogram:
                tf.summary.histogram(name + '_histogram', var)

    def update_old_params(self):
        self.init_tf_session()  # Make sure the Tensorflow session exists
        self.session.run(self.update_ops)

    def close_summary_file(self):
        if self.summaries_path is not None:
            self.train_writer.close()