@avalcarce
Created February 27, 2017 09:27
Solving MountainCar-v0 with DQN in the least possible number of learning episodes for a minimum average reward of -110.

Synopsis

This is a Deep Reinforcement Learning solution to some classic control problems. I've used it to solve the MountainCar-v0, CartPole-v0 and CartPole-v1 (https://gym.openai.com/envs/CartPole-v1) problems in OpenAI's Gym. The code uses Tensorflow to model a value function for a Reinforcement Learning agent. I've run it with Tensorflow 1.0 on Python 3.5 under Windows 7.
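
The value network is trained on one-step bootstrapped DQN targets: y = r + discount * max_a Q(s', a) for non-terminal transitions, and y = r at episode ends (this is exactly what ExperimentsManager.__train_on_experience does in the code below). The following is only a minimal NumPy illustration of that target rule; the function and array names are hypothetical and not part of the gist.

import numpy as np

def dqn_targets(rewards, q_next, done, discount=0.99):
    """One-step DQN targets: r + discount * max_a Q(s', a), zeroed out at episode ends."""
    return rewards + (1.0 - done.astype(float)) * discount * np.max(q_next, axis=1)

# Example: a batch of 2 transitions with 3 actions (as in MountainCar-v0)
rewards = np.array([-1.0, -1.0])
q_next = np.array([[0.5, 0.2, 0.1],
                   [0.0, 0.3, 0.4]])
done = np.array([False, True])
print(dqn_targets(rewards, q_next, done))  # [-0.505, -1.0]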

Some of the hyperparameters used in the main.py script to solve MountainCar-v0 have been obtained via Bayesian optimization with Scikit-Optimize (a minimal sketch of such a search follows the list below). The optimized hyperparameters and their values are:

  • Size of 1st fully connected layer: 47
  • Size of 2nd fully connected layer: 197
  • Epsilon (as in epsilon-greedy exploration) decay factor: 0.8513032459
  • Minimum epsilon: 1.872686e-05
  • Learning rate: 4.561407e-04
  • Period (in steps) for the update of the target network parameters as per the DQN algorithm: 1161
  • Discount factor: 0.99
  • Whether to use Double DQN: False
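
The gist does not include the optimization script itself, so the following is only a rough sketch of what such a search could look like with skopt.gp_minimize. The search space, bounds, budget (n_calls) and the choice of objective (the episode count returned by run_experiments) are assumptions for illustration, not the author's actual setup.

from skopt import gp_minimize
from skopt.space import Integer, Real

from openai_playground.gymhelpers import ExperimentsManager

# Hypothetical search space over the hyperparameters listed above.
space = [Integer(16, 256, name='fc1_size'),
         Integer(16, 256, name='fc2_size'),
         Real(0.80, 0.999, name='decay_eps'),
         Real(1e-6, 1e-1, prior='log-uniform', name='eps_min'),
         Real(1e-5, 1e-2, prior='log-uniform', name='learning_rate'),
         Integer(1, 2000, name='target_update_period')]

def objective(params):
    fc1, fc2, decay_eps, eps_min, lr, target_period = params
    expsman = ExperimentsManager(env_name="MountainCar-v0",
                                 agent_value_function_hidden_layers_size=[fc1, fc2],
                                 discount=0.99, decay_eps=decay_eps, eps_min=eps_min,
                                 learning_rate=lr, max_step=200,
                                 target_params_update_period_steps=target_period,
                                 min_avg_rwd=-110)
    # run_experiments returns (final average reward, mean episodes needed to reach min_avg_rwd)
    _, n_eps = expsman.run_experiments(n_exps=1, n_ep=2000, plot_results=False)
    return n_eps  # minimize the number of learning episodes needed to solve the task

result = gp_minimize(objective, space, n_calls=30, random_state=0)
print(result.x, result.fun)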

References

  1. Deep Learning tutorial, David Silver, Google DeepMind.
  2. My code on Github
# agents.py: epsilon-greedy agent acting on a value function model.
import numpy as np


class AgentEpsGreedy:
    def __init__(self, n_actions, value_function_model, eps=1., summaries_path_current=None):
        self.n_actions = n_actions
        self.value_func = value_function_model
        self.eps = eps
        self.summaries_path_current = summaries_path_current
        self.current_value = None  # Current value of the value function (i.e. expected discounted return)

    def act(self, state):
        action_values = self.value_func.predict([state])[0]
        policy = np.ones(self.n_actions) * self.eps / self.n_actions
        a_max = np.argmax(action_values)
        policy[a_max] += 1. - self.eps
        a = np.random.choice(self.n_actions, p=policy)
        self.current_value = action_values[a]
        return a

    def train(self, states, targets):
        return self.value_func.train(states, targets)

    def predict_q_values(self, states, use_old_params=False):
        return self.value_func.predict(states, use_old_params)

# gymhelpers.py: defines the ExperimentsManager class (imported below as openai_playground.gymhelpers).
import gym
from gym import wrappers
import copy
import time
from sys import platform
from textwrap import wrap

# os and numpy are used directly in this module, so they are imported explicitly here
# rather than relying on the names re-exported by "from .utils import *".
import os
import numpy as np

if platform == "linux" or platform == "linux2":
    import matplotlib
    matplotlib.use('Agg')  # This is to generate images without having a window appear.
import matplotlib.pyplot as plt

from .utils import *
from .agents import AgentEpsGreedy
from .valuefunctions import ValueFunctionDQN
from .ReplayMemory import ReplayMemory

class ExperimentsManager:
    def __init__(self, env_name, agent_value_function_hidden_layers_size, results_dir_prefix=None, summaries_path=None,
                 figures_dir=None, discount=0.99, decay_eps=0.995, eps_min=0.0001, learning_rate=1E-4, decay_lr=False,
                 max_step=10000, replay_memory_max_size=100000, ep_verbose=False, exp_verbose=True, batch_size=64,
                 upload_last_exp=False, double_dqn=False, target_params_update_period_steps=1, gym_api_key="",
                 gym_algorithm_id=None, checkpoints_dir=None, min_avg_rwd=-110):
        self.env_name = env_name
        self.results_dir_prefix = results_dir_prefix
        self.gym_stats_dir = None
        self.summaries_path = summaries_path
        self.summaries_path_current = summaries_path
        self.figures_dir = figures_dir
        self.discount = discount
        self.decay_eps = decay_eps
        self.eps_min = eps_min
        self.learning_rate = learning_rate
        self.decay_lr = decay_lr
        self.max_step = max_step
        self.replay_memory_max_size = replay_memory_max_size
        self.ep_verbose = ep_verbose    # Whether or not to print progress during episodes
        self.exp_verbose = exp_verbose  # Whether or not to print progress during experiments
        self.upload_last_exp = upload_last_exp
        assert target_params_update_period_steps > 0, "The period for updating the target parameters must be positive."
        self.target_params_update_period_steps = target_params_update_period_steps
        self.gym_api_key = gym_api_key
        self.gym_algorithm_id = gym_algorithm_id
        self.checkpoints_dir = checkpoints_dir
        self.checkpoints_dir_current = checkpoints_dir

        self.agent = None
        self.memory = None  # Experience replay memory
        self.batch_size = batch_size
        self.agent_value_function_hidden_layers_size = agent_value_function_hidden_layers_size
        self.double_dqn = double_dqn

        self.global_step = 0  # Current step over all episodes
        self.step = 0         # Current step per episode
        self.ep = 0
        self.exp = 0
        self.step_durations_s = np.zeros(shape=self.max_step, dtype=float)

        self.min_avg_rwd = min_avg_rwd  # Minimum average reward to consider the problem as solved
        self.n_avg_ep = 100             # Number of consecutive episodes to calculate the average reward

        self.conf_msg = "\nEXECUTING EXPERIMENT {} OF {} IN ENVIRONMENT {}."
        self.episode_progress_msg = "Step {:5d}/{:5d}. Avg step duration: {:3.6f} ms." + \
                                    " Loss = {:3.2e}."
        self.exp_progress_msg = "Exp {:3d}. Ep {:5d}, Rwd={:4.0f} (mean={:4.0f} over {:3d} episodes)." + \
                                " {} exceeded in {:4d} eps. Loss={:1.2e} (avg={:1.2e}). Agent epsilon={:3.2f} %." + \
                                " Average step duration: {:2.6f} ms."
        self.exps_conf_str = ""

        # Memory pre-allocation
        self.Rwd_per_ep_v = np.zeros((1, 5000))
        self.Loss_per_ep_v = np.zeros((1, 5000))
        self.Avg_Rwd_per_ep = np.zeros((1, 5000))
        self.Avg_Loss_per_ep = np.zeros((1, 5000))
        self.n_eps_to_reach_min_avg_rwd = np.zeros(1, dtype=float)
        self.Agent_Epsilon_per_ep = np.zeros((1, 5000))
        self.agent_value_function = np.zeros((1, 1, self.max_step))
        self.rwd_exps_avg = np.mean(self.Rwd_per_ep_v, axis=0)  # Rwd averaged over all experiments
        self.rwd_exps_avg_ma = np.zeros(self.rwd_exps_avg.shape[0])
        self.rwd_exps_avg_movstd = np.zeros(self.rwd_exps_avg.shape[0])
        self.rwd_exps_avg_percentile5 = np.zeros(self.rwd_exps_avg.shape[0])
        self.rwd_exps_avg_percentile95 = np.zeros(self.rwd_exps_avg.shape[0])

    def __print_episode_progress(self, loss_v):
        if self.ep_verbose:
            if self.step > 0 and (self.step+1) % 20 == 0:
                print(self.episode_progress_msg.format(self.step, self.max_step,
                                                       np.mean(self.step_durations_s[self.ep, 0:self.step]) * 1000,
                                                       loss_v))

    def __double_dqn_train(self):
        # DQN Experience Replay
        loss_v = 0
        if len(self.memory.memory) > self.batch_size:
            # Extract a batch of random transitions from the replay memory
            states_b, actions_b, rewards_b, states_n_b, done_b = zip(*self.memory.sample(self.batch_size))
            states_b = np.array(states_b)
            actions_b = np.array(actions_b)
            rewards_b = np.array(rewards_b)
            states_n_b = np.array(states_n_b)
            done_b = np.array(done_b).astype(int)

            q_n_b = self.agent.predict_q_values(states_n_b)  # Action values on the arriving state
            best_a = np.argmax(q_n_b, axis=1)
            q_n_target_b = self.agent.predict_q_values(states_n_b, use_old_params=True)
            targets_b = rewards_b + (1. - done_b) * self.discount * q_n_target_b[np.arange(self.batch_size), best_a]
            targets = self.agent.predict_q_values(states_b)
            for j, action in enumerate(actions_b):
                targets[j, action] = targets_b[j]
            loss_v = self.agent.train(states_b, targets)
        return loss_v

    def __train_on_experience(self):
        # DQN Experience Replay
        loss_v = 0
        if len(self.memory.memory) > self.batch_size:
            # Extract a batch of random transitions from the replay memory
            states_b, actions_b, rewards_b, states_n_b, done_b = zip(*self.memory.sample(self.batch_size))
            states_b = np.array(states_b)
            actions_b = np.array(actions_b)
            rewards_b = np.array(rewards_b)
            states_n_b = np.array(states_n_b)
            done_b = np.array(done_b).astype(int)

            if self.target_params_update_period_steps == 1:  # This is to avoid having to copy the old params every step
                q_n_b = self.agent.predict_q_values(states_n_b)  # Action values on the next state
            else:
                q_n_b = self.agent.predict_q_values(states_n_b, use_old_params=True)  # Action values on the next state
            targets_b = rewards_b + (1. - done_b) * self.discount * np.amax(q_n_b, axis=1)
            targets = self.agent.predict_q_values(states_b)
            for j, action in enumerate(actions_b):
                targets[j, action] = targets_b[j]
            loss_v = self.agent.train(states_b, targets)
        return loss_v

    def __print_experiment_progress(self):
        if self.exp_verbose:
            rwd = self.Rwd_per_ep_v[self.exp, self.ep]
            avg_rwd = self.Avg_Rwd_per_ep[self.exp, self.ep]
            loss = self.Loss_per_ep_v[self.exp, self.ep]
            avg_loss = self.Avg_Loss_per_ep[self.exp, self.ep]

            avg_rwds = self.Avg_Rwd_per_ep[self.exp, 0:self.ep+1]
            i_last_low_rwd = np.max(np.where(avg_rwds < self.min_avg_rwd))
            n_solved_eps = self.ep - i_last_low_rwd

            duration_ms = 0
            if self.ep > 0:
                duration_ms = np.mean(self.step_durations_s[0:self.ep, :]) * 1000
            print(
                self.exp_progress_msg.format(self.exp, self.ep, rwd, avg_rwd, self.n_avg_ep, self.min_avg_rwd,
                                             n_solved_eps, loss, avg_loss, self.agent.eps*100, duration_ms))

    def run_episode(self, env, train=True):
        state = env.reset()
        done = False
        total_reward = 0
        loss_v = 0
        for self.step in range(self.max_step):
            # Maybe update the target estimator
            if self.target_params_update_period_steps > 1:
                if self.global_step % self.target_params_update_period_steps == 0:
                    self.agent.value_func.update_old_params()
                    if self.ep_verbose:
                        print("Copied model parameters to target network.")

            t = time.time()
            self.__print_episode_progress(loss_v)

            if done:
                break
            action = self.agent.act(state)
            self.agent_value_function[self.exp, self.ep, self.step] = self.agent.current_value
            self.global_step += 1
            state_next, reward, done, info = env.step(action)
            total_reward += reward
            if self.memory is not None:
                self.memory.add((state, action, reward, state_next, done))
                if train:
                    if self.double_dqn:
                        loss_v = self.__double_dqn_train()
                    else:
                        loss_v = self.__train_on_experience()
            else:
                raise NotImplementedError("Please provide an Experience Replay memory")
            state = copy.copy(state_next)
            self.step_durations_s[self.ep, self.step] = time.time() - t  # Time elapsed during this step
        return loss_v, total_reward

    def run_experiment(self, env, n_ep, stop_training_min_avg_rwd=None):
        self.global_step = 0
        train = True
        # One experiment is composed of n_ep sequential episodes
        for self.ep in range(n_ep):
            loss_v, total_reward = self.run_episode(env, train)

            # Collect episode results
            self.Rwd_per_ep_v[self.exp, self.ep] = total_reward
            self.Loss_per_ep_v[self.exp, self.ep] = loss_v

            # Calculate episode statistics
            last_rwds = self.Rwd_per_ep_v[self.exp, np.maximum(self.ep - (self.n_avg_ep - 1), 0):self.ep+1]
            last_losses = self.Loss_per_ep_v[self.exp, np.maximum(self.ep - (self.n_avg_ep - 1), 0):self.ep+1]
            self.Avg_Rwd_per_ep[self.exp, self.ep] = np.mean(last_rwds)
            self.Avg_Loss_per_ep[self.exp, self.ep] = np.mean(last_losses)
            self.Agent_Epsilon_per_ep[self.exp, self.ep] = self.agent.eps

            if stop_training_min_avg_rwd is not None:
                if train and self.Avg_Rwd_per_ep[self.exp, self.ep] >= stop_training_min_avg_rwd:
                    train = False
                    print("Minimum average reward reached. Stopping training.")

            if self.Avg_Rwd_per_ep[self.exp, self.ep] >= self.min_avg_rwd:
                self.n_eps_to_reach_min_avg_rwd[self.exp] = np.minimum(self.ep,
                                                                       self.n_eps_to_reach_min_avg_rwd[self.exp])

            if self.agent.eps > self.eps_min:
                self.agent.eps *= self.decay_eps

            self.__print_experiment_progress()

    def __create_gym_stats_directory(self, env):
        if self.results_dir_prefix is None:
            raise ValueError("A prefix for the Gym results directory must be provided.")
        if not os.path.exists(self.results_dir_prefix):
            os.makedirs(self.results_dir_prefix)
        t = get_last_folder_id(self.results_dir_prefix) + 1  # Calculate next test id
        self.gym_stats_dir = os.path.join(self.results_dir_prefix, str(t).zfill(4))
        if not os.path.exists(self.gym_stats_dir):
            os.makedirs(self.gym_stats_dir)
        else:
            raise FileExistsError(self.gym_stats_dir)
        return wrappers.Monitor(env, self.gym_stats_dir)

    def __build_experiments_conf_str(self, n_exps, n_ep, n_actions, state_dim):
        layers_size = str(state_dim)
        for s in self.agent_value_function_hidden_layers_size:
            layers_size += "-" + str(s)
        layers_size += "-" + str(n_actions)

        exp_conf_str = "{}_{}_Disc{:1.3e}_DecE{:1.2e}_EMin{:1.2e}_LR{:1.2e}_DecLR{}_MaxStp{}_" +\
                       "DDQN{}_RepMm{:1.1e}_BS{}_NEx{}_NEp{}_PmsUp{}"
        self.exps_conf_str = exp_conf_str.format(time.strftime("%Y_%m_%d__%H_%M_%S"), layers_size, self.discount,
                                                 self.decay_eps, self.eps_min, self.learning_rate,
                                                 1 if self.decay_lr else 0, self.max_step, 1 if self.double_dqn else 0,
                                                 self.replay_memory_max_size, self.batch_size, n_exps, n_ep,
                                                 self.target_params_update_period_steps)

    def __create_figures_directory(self):
        if self.figures_dir is not None:
            self.figures_dir = os.path.join(self.figures_dir, self.env_name, self.exps_conf_str)
            if not os.path.exists(self.figures_dir):
                os.makedirs(self.figures_dir)
            else:
                for dirpath, dirnames, files in os.walk(self.figures_dir):
                    if files:
                        raise FileExistsError("The figures directory exists and has files: {}".format(self.figures_dir))
                    else:
                        break

    def run_experiments(self, n_exps, n_ep, stop_training_min_avg_rwd=None, plot_results=True, figures_format=None):
        self.Rwd_per_ep_v = np.zeros((n_exps, n_ep))
        self.Loss_per_ep_v = np.zeros((n_exps, n_ep))
        self.Avg_Rwd_per_ep = np.zeros((n_exps, n_ep))
        self.n_eps_to_reach_min_avg_rwd = np.zeros(n_exps, dtype=float)
        self.n_eps_to_reach_min_avg_rwd.fill(n_ep)
        self.Avg_Loss_per_ep = np.zeros((n_exps, n_ep))
        self.Agent_Epsilon_per_ep = np.zeros((n_exps, n_ep))
        self.agent_value_function = np.zeros((n_exps, n_ep, self.max_step))
        self.step_durations_s = np.zeros(shape=(n_ep, self.max_step), dtype=float)

        # Create environment
        env = gym.make(self.env_name)
        n_actions = env.action_space.n
        state_dim = env.observation_space.high.shape[0]

        self.__build_experiments_conf_str(n_exps, n_ep, n_actions, state_dim)
        self.__create_figures_directory()

        for self.exp in range(n_exps):
            print(self.conf_msg.format(self.exp, n_exps, self.env_name))
            print(self.exps_conf_str)

            env = gym.make(self.env_name)  # Create new environment
            assert n_actions == env.action_space.n
            assert state_dim == env.observation_space.high.shape[0]
            if self.upload_last_exp and self.exp == n_exps-1:
                env = self.__create_gym_stats_directory(env)
            if self.summaries_path is not None:
                self.summaries_path_current = os.path.join(self.summaries_path,
                                                           self.env_name,
                                                           self.exps_conf_str + "_Exp" + str(self.exp))
            if self.checkpoints_dir is not None:
                self.checkpoints_dir_current = os.path.join(self.checkpoints_dir,
                                                            self.env_name,
                                                            self.exps_conf_str + "_Exp" + str(self.exp))
                if not os.path.exists(self.checkpoints_dir_current):
                    os.makedirs(self.checkpoints_dir_current)

            # Create agent
            value_function = ValueFunctionDQN(scope="q", state_dim=state_dim, n_actions=n_actions,
                                              train_batch_size=self.batch_size, learning_rate=self.learning_rate,
                                              hidden_layers_size=self.agent_value_function_hidden_layers_size,
                                              decay_lr=self.decay_lr, huber_loss=False,
                                              summaries_path=self.summaries_path_current,
                                              reset_default_graph=True,
                                              checkpoints_dir=self.checkpoints_dir_current)
            self.agent = AgentEpsGreedy(n_actions=n_actions, value_function_model=value_function, eps=0.9,
                                        summaries_path_current=self.summaries_path_current)
            self.memory = ReplayMemory(max_size=self.replay_memory_max_size)

            self.run_experiment(env, n_ep, stop_training_min_avg_rwd)  # This is where the action happens

            value_function.close_summary_file()
            env.close()
            if self.upload_last_exp and self.exp == n_exps - 1:
                print("Trying to upload results to the scoreboard.")
                gym.upload(self.gym_stats_dir, api_key=self.gym_api_key, algorithm_id=self.gym_algorithm_id)

            # Plot results
            self.plot_rwd_loss(figures_format=figures_format)
            self.plot_value_function(figures_format=figures_format)
            self.print_experiment_summary()

        self.calculate_avg_rwd()
        self.plot_rwd_averages(n_exps, figures_format=figures_format)
        if plot_results:
            plt.show()

        # Return the final Rwd averaged over all experiments AND the mean number of episodes needed to reach the min Rwd
        return self.rwd_exps_avg_ma[-1], np.mean(self.n_eps_to_reach_min_avg_rwd)

    def print_experiment_summary(self):
        duration_ms = np.mean(self.step_durations_s) * 1000
        print("Average step duration: {:2.6f} ms".format(duration_ms))

    def calculate_avg_rwd(self):
        self.rwd_exps_avg = np.mean(self.Rwd_per_ep_v, axis=0)  # Rwd averaged over all experiments
        self.rwd_exps_avg_ma = np.zeros(self.rwd_exps_avg.shape[0])
        self.rwd_exps_avg_movstd = np.zeros(self.rwd_exps_avg.shape[0])
        self.rwd_exps_avg_percentile5 = np.zeros(self.rwd_exps_avg.shape[0])
        self.rwd_exps_avg_percentile95 = np.zeros(self.rwd_exps_avg.shape[0])

        for s in range(self.rwd_exps_avg.shape[0]):
            self.rwd_exps_avg_ma[s] = np.mean(self.rwd_exps_avg[max(0, s - 99):s + 1])
            self.rwd_exps_avg_movstd[s] = np.std(self.rwd_exps_avg[max(0, s - 99):s + 1])
            self.rwd_exps_avg_percentile5[s] = np.percentile(self.rwd_exps_avg[max(0, s - 99):s + 1], 5)
            self.rwd_exps_avg_percentile95[s] = np.percentile(self.rwd_exps_avg[max(0, s - 99):s + 1], 95)

    def plot_rwd_averages(self, n_exps, figures_format=None):
        n_ep = self.Rwd_per_ep_v.shape[1]
        eps = range(n_ep)

        if self.figures_dir is not None:
            # PLOT ALL EXPERIMENTS
            fig = plt.figure()
            for i in range(n_exps):
                plt.plot(eps, self.Avg_Rwd_per_ep[i, :], label="Exp {}".format(i))
            # plt.ylim([-self.max_step - 10, -70])
            plt.xlabel("Episode number")
            plt.ylabel("Reward")
            plt.grid(True)
            plt.legend(loc='upper left')
            ttl = "Average reward. " + self.exps_conf_str
            plt.title("\n".join(wrap(ttl, 60)))
            if self.figures_dir is not None:
                fig_savepath = os.path.join(self.figures_dir, "RwdsComparisonsAcrossExps.png")
                plt.savefig(fig_savepath)
                if figures_format is not None:
                    try:
                        fig_savepath = os.path.join(self.figures_dir,
                                                    "RwdsComparisonsAcrossExps.{}".format(figures_format))
                        plt.savefig(fig_savepath, format=figures_format)
                    except:
                        print("Error while saving figure in {} format.".format(figures_format))
            plt.close(fig)

            # PLOT AVERAGE OVER ALL EXPERIMENTS
            fig = plt.figure()
            plt.subplot(211)
            plt.plot(eps, self.rwd_exps_avg, label="Average over {:3d} experiments".format(n_exps))
            # plt.ylim([-self.max_step - 10, -70])
            plt.ylabel("Reward per episode")
            plt.grid(True)
            plt.plot(eps, self.rwd_exps_avg_percentile95, label="95th percentile over 100 episodes")
            plt.plot(eps, self.rwd_exps_avg_ma, label="100-episode moving average")
            plt.plot(eps, self.rwd_exps_avg_percentile5, label="5th percentile over 100 episodes")
            plt.legend(loc='lower right')
            print("Average final reward: {:3.2f} (std={:3.2f}).\n".format(self.rwd_exps_avg_ma[-1],
                                                                          self.rwd_exps_avg_movstd[-1]))
            plt.title("Final average reward: {:3.2f} (std={:3.2f})".format(self.rwd_exps_avg_ma[-1],
                                                                           self.rwd_exps_avg_movstd[-1]))

            loss_exps_avg = np.mean(self.Loss_per_ep_v, axis=0)
            plt.subplot(212)
            plt.semilogy(eps, loss_exps_avg, label="Average over {:3d} experiments".format(n_exps))
            plt.xlabel("Episode number")
            plt.ylabel("Loss per episode")
            plt.grid(True)
            loss_exps_avg_ma = np.zeros(loss_exps_avg.shape[0])
            for s in range(loss_exps_avg.shape[0]):
                loss_exps_avg_ma[s] = np.mean(loss_exps_avg[max(0, s - 100):s + 1])
            plt.plot(eps, loss_exps_avg_ma, label="100-episode moving average")
            plt.legend(loc='lower right')
            plt.suptitle("\n".join(wrap(self.exps_conf_str, 60)))
            plt.tight_layout()
            plt.subplots_adjust(top=0.85)
            if self.figures_dir is not None:
                fig_savepath = os.path.join(self.figures_dir, "ExpsAverage.png")
                plt.savefig(fig_savepath)
                if figures_format is not None:
                    try:
                        fig_savepath = os.path.join(self.figures_dir, "ExpsAverage.{}".format(figures_format))
                        plt.savefig(fig_savepath, format=figures_format)
                    except:
                        print("Error while saving figure in {} format.".format(figures_format))
            plt.close(fig)

    def plot_value_function(self, figures_format=None):
        if self.figures_dir is not None:
            n_ep = self.Rwd_per_ep_v.shape[1]
            fig = plt.figure()
            for ep in draw_equispaced_items_from_sequence(7, n_ep):
                plt.plot(self.agent_value_function[self.exp, ep, :], label="Episode {:4d}".format(ep))
            plt.xlabel("Steps")
            plt.ylabel("Value")
            plt.grid(True)
            plt.legend(loc='lower right')
            plt.title("Value functions for experiment {:2d}".format(self.exp))
            if self.figures_dir is not None:
                fig_savepath = os.path.join(self.figures_dir, "Experiment{}_ValueFunctions.png".format(self.exp))
                plt.savefig(fig_savepath)
                if figures_format is not None:
                    try:
                        fig_savepath = os.path.join(self.figures_dir,
                                                    "Experiment{}_ValueFunctions.{}".format(self.exp, figures_format))
                        plt.savefig(fig_savepath, format=figures_format)
                    except:
                        print("Error while saving figure in {} format.".format(figures_format))
            plt.close(fig)

    def plot_rwd_loss(self, figures_format=None):
        if self.figures_dir is not None:
            n_ep = self.Rwd_per_ep_v.shape[1]
            eps = range(n_ep)

            fig = plt.figure()
            ax1 = plt.subplot(211)
            plt.plot(eps, self.Rwd_per_ep_v[self.exp, :], label="Instantaneous")
            plt.plot(eps, self.Avg_Rwd_per_ep[self.exp, :], label="Mean over {} eps".format(self.n_avg_ep))
            # plt.ylim([-self.max_step - 10, -70])
            plt.xlabel("Episode number")
            plt.ylabel("Reward per episode")
            ax2 = ax1.twinx()
            plt.plot(eps, self.Agent_Epsilon_per_ep[self.exp, :], label="Agent epsilon", color='r')
            ax2.set_ylabel(r'Agent $\varepsilon$', color='r')
            ax2.tick_params('y', colors='r')
            plt.grid(True)
            ttl = "Final average reward: {:3.2f} (SD={:3.2f})"
            plt.title(ttl.format(self.Avg_Rwd_per_ep[self.exp, -1], np.std(self.Rwd_per_ep_v[self.exp, n_ep-100:n_ep-1])))
            plt.legend(loc='lower right')

            rwd_per_ep_exp_avg = np.mean(self.Rwd_per_ep_v[0:self.exp+1, n_ep-100:n_ep-1], axis=1)
            print("Final mean reward, averaged over {} experiment{}: {} (std = {}).".format(self.exp+1,
                                                                                            's' if self.exp > 0 else '',
                                                                                            np.mean(rwd_per_ep_exp_avg),
                                                                                            np.std(rwd_per_ep_exp_avg)))

            plt.subplot(212)
            plt.semilogy(eps, self.Loss_per_ep_v[self.exp, :], label="Instantaneous")
            plt.semilogy(eps, self.Avg_Loss_per_ep[self.exp, :], label="Mean over {} eps".format(self.n_avg_ep))
            plt.xlabel("Episode number")
            plt.ylabel("Loss per episode")
            plt.grid(True)
            plt.title("Value function loss")
            plt.legend(loc='lower right')
            sttl = self.exps_conf_str + ". Experiment {}".format(self.exp)
            plt.suptitle("\n".join(wrap(sttl, 60)))
            plt.tight_layout()
            plt.subplots_adjust(top=0.85)
            if self.figures_dir is not None:
                fig_savepath = os.path.join(self.figures_dir, "Experiment{}_Rwd_Loss.png".format(self.exp))
                plt.savefig(fig_savepath)
                if figures_format is not None:
                    try:
                        fig_savepath = os.path.join(self.figures_dir, "Experiment{}_Rwd_Loss.{}".format(self.exp,
                                                                                                        figures_format))
                        plt.savefig(fig_savepath, format=figures_format)
                    except:
                        print("Error while saving figure in {} format.".format(figures_format))
            plt.close(fig)

# main.py: solve MountainCar-v0 with the hyperparameters listed in the Synopsis above.
import os

from openai_playground.gymhelpers import ExperimentsManager

env_name = "MountainCar-v0"
gym_stats_dir_prefix = os.path.join('Gym_stats', env_name)
figures_dir = 'Figures'
api_key = '###'  # Placeholder; a real OpenAI Gym API key is needed to upload results.

n_ep = 2000
n_exps = 1
hidden_layers_size = [47, 197]

expsman = ExperimentsManager(env_name=env_name, results_dir_prefix=gym_stats_dir_prefix,
                             agent_value_function_hidden_layers_size=hidden_layers_size, figures_dir=figures_dir,
                             discount=0.99, decay_eps=0.8513032459, eps_min=1.872686e-05, learning_rate=4.561407e-04,
                             decay_lr=False, max_step=200, replay_memory_max_size=100000, ep_verbose=False,
                             exp_verbose=True, batch_size=64, upload_last_exp=True, double_dqn=False,
                             target_params_update_period_steps=1161, gym_api_key=api_key, min_avg_rwd=-110)

expsman.run_experiments(n_exps=n_exps, n_ep=n_ep, stop_training_min_avg_rwd=-98, plot_results=False)

input("Press Enter to terminate.")

# ReplayMemory.py: a simple experience replay buffer backed by a deque.
from collections import deque

import numpy as np


class ReplayMemory:
    def __init__(self, max_size=128):
        self.memory = deque(maxlen=max_size)

    def sample(self, batch_size):
        batch_size = min(len(self.memory), batch_size)
        idxs = np.random.choice(len(self.memory), batch_size)
        return [self.memory[idx] for idx in idxs]

    def add(self, item):
        self.memory.append(item)

# utils.py: helper functions used by gymhelpers.py.
import os

import numpy as np


def get_last_folder_id(folder_path):
    t = 0
    for fn in os.listdir(folder_path):
        t = max(t, int(fn))
    return t


def movingaverage(values, window):
    weights = np.repeat(1.0, window) / window
    sma = np.convolve(values, weights, 'valid')
    return sma


def draw_equispaced_items_from_sequence(m, n):
    """
    draw_equispaced_items_from_sequence(m, n)

    Args:
        m (int): How many items to draw.
        n (int): Length of sequence to draw from.
    """
    return [i * n // m + n // (2 * m) for i in range(m)]

# valuefunctions.py: the DQN value function model, built with TensorFlow 1.x.
import os

import tensorflow as tf


class ValueFunctionDQN:
    def __init__(self, scope="MyValueFunctionEstimator", state_dim=2, n_actions=3, train_batch_size=64,
                 learning_rate=1e-4, hidden_layers_size=None, decay_lr=False, huber_loss=False, summaries_path=None,
                 reset_default_graph=False, checkpoints_dir=None):
        # Input check
        if hidden_layers_size is None:
            hidden_layers_size = [128, 64]  # Default ANN architecture
        assert len(hidden_layers_size) >= 1, "At least one hidden layer must be specified."

        # Support variables
        self.scope = scope
        self.layers_size = [state_dim] + hidden_layers_size + [n_actions]  # Size of all layers (including in & out)
        self.weights = []
        self.biases = []
        self.weights_old = []
        self.biases_old = []
        self.learning_rate = learning_rate
        self.train_batch_size = train_batch_size
        self.n_train_epochs = 0
        self.summaries_path = summaries_path
        self.train_writer = None
        self.checkpoints_dir = checkpoints_dir

        if reset_default_graph:
            tf.reset_default_graph()

        # Build Tensorflow graph
        with tf.variable_scope(self.scope):
            # Inputs, weights, biases and targets of the ANN
            self.x = tf.placeholder(tf.float32, shape=(None, state_dim), name="x")
            self.train_targets = tf.placeholder(tf.float32, shape=(None, n_actions), name="train_targets")
            for l in range(len(self.layers_size) - 1):
                self.weights.append(tf.get_variable(name="w" + str(l), shape=[self.layers_size[l],
                                                                              self.layers_size[l + 1]],
                                                    initializer=tf.contrib.layers.xavier_initializer()))
                self.biases.append(tf.get_variable(name="b" + str(l), shape=[self.layers_size[l + 1]],
                                                   initializer=tf.constant_initializer(0.0)))
                self.weights_old.append(tf.get_variable(name="w-" + str(l), shape=[self.layers_size[l],
                                                                                   self.layers_size[l + 1]],
                                                        initializer=tf.contrib.layers.xavier_initializer()))
                self.biases_old.append(tf.get_variable(name="b-" + str(l), shape=[self.layers_size[l + 1]],
                                                       initializer=tf.constant_initializer(0.0)))

            if summaries_path is not None:
                with tf.name_scope('params_summaries'):
                    for l in range(len(self.layers_size) - 1):
                        self.variable_summaries(self.weights[l], "w" + str(l), histogram=True)
                        self.variable_summaries(self.biases[l], "b" + str(l), histogram=True)

            # Interconnection of the various ANN nodes
            self.prediction = self.model(self.x)
            self.prediction_with_old_params = self.model(self.x, use_old_params=True)

            # Training calculations
            if huber_loss:
                self.loss = self.huber_loss(self.train_targets, self.prediction)
            else:
                self.SE = tf.squared_difference(self.train_targets, self.prediction, name="SquaredError")
                self.loss = tf.reduce_mean(self.SE, name="loss")

            self.global_step = tf.Variable(0, trainable=False)
            if decay_lr:
                self.learning_rate = tf.train.exponential_decay(1e-4, self.global_step, 3000 * 200, 1e-5 / 1e-4)

            self.opt_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            self.train_op = self.opt_op.minimize(self.loss, global_step=self.global_step)
            self.init_op = tf.global_variables_initializer()

            if self.summaries_path is not None:
                self.variable_summaries(self.loss, "loss", scalar_only=True)
                self.variable_summaries(self.learning_rate, "learning_rate", scalar_only=True)

            if self.checkpoints_dir is not None:
                var_list = []
                for l in range(len(self.layers_size) - 1):
                    var_list.append(self.weights[l])
                    var_list.append(self.biases[l])
                self.saver = tf.train.Saver(var_list, pad_step_number=True)

        if self.summaries_path is not None:
            self.merged_summaries = tf.summary.merge_all()
            self.summaries_path += "_{}".format(self.scope)
            if not os.path.exists(self.summaries_path):
                os.makedirs(self.summaries_path)
            self.train_writer = tf.summary.FileWriter(self.summaries_path, graph=tf.get_default_graph())
        else:
            self.merged_summaries = None

        self.session = None

    def model(self, x, use_old_params=False):
        z = []
        hidden = [x]
        for l in range(len(self.layers_size) - 2):
            if use_old_params:
                z.append(tf.matmul(hidden[l], self.weights_old[l]) + self.biases_old[l])
            else:
                z.append(tf.matmul(hidden[l], self.weights[l]) + self.biases[l])
            hidden.append(tf.nn.relu(z[l], name="hidden_" + str(l + 1)))
        if use_old_params:
            z.append(tf.matmul(hidden[-1], self.weights_old[-1]) + self.biases_old[-1])
        else:
            z.append(tf.matmul(hidden[-1], self.weights[-1]) + self.biases[-1])
        if not use_old_params:
            if self.summaries_path is not None:
                with tf.name_scope('layers_summaries'):
                    for l in range(len(self.layers_size) - 1):
                        self.variable_summaries(z[l], "z" + str(l))
                        self.variable_summaries(hidden[l], "hidden" + str(l))
        return z[-1]  # Output layer has Identity units.

    @staticmethod
    def huber_loss(targets, predictions):
        error = targets - predictions
        fn_choice_maker1 = (tf.to_int32(tf.sign(error + 1)) + 1) / 2
        fn_choice_maker2 = (tf.to_int32(tf.sign(-error + 1)) + 1) / 2
        choice_maker_sqr = tf.to_float(tf.multiply(fn_choice_maker1, fn_choice_maker2))
        sqr_contrib = tf.multiply(choice_maker_sqr, tf.square(error)*0.5)
        abs_contrib = tf.abs(error)-0.5 - tf.multiply(choice_maker_sqr, tf.abs(error)-0.5)
        loss = tf.reduce_mean(sqr_contrib + abs_contrib)
        return loss

    def init_tf_session(self):
        if self.session is None:
            self.session = tf.Session()
            self.session.run(self.init_op)  # Global Variables Initializer (init op)

    def predict(self, states, use_old_params=False):
        self.init_tf_session()  # Make sure the Tensorflow session exists
        feed_dict = {self.x: states}
        if use_old_params:
            q = self.session.run(self.prediction_with_old_params, feed_dict=feed_dict)
        else:
            q = self.session.run(self.prediction, feed_dict=feed_dict)
        return q

    def train(self, states, targets):
        self.init_tf_session()  # Make sure the Tensorflow session exists
        feed_dict = {self.x: states, self.train_targets: targets}
        if self.summaries_path is not None and self.n_train_epochs % 2000 == 0:
            fetches = [self.loss, self.train_op, self.merged_summaries]
        else:
            fetches = [self.loss, self.train_op]
        values = self.session.run(fetches, feed_dict=feed_dict)
        if self.summaries_path is not None and self.n_train_epochs % 2000 == 0:
            self.train_writer.add_summary(values[2], global_step=self.n_train_epochs)
        if self.checkpoints_dir is not None and self.n_train_epochs % 40000 == 0:
            self.saver.save(self.session, self.checkpoints_dir, global_step=self.global_step)
        self.n_train_epochs += 1
        return values[0]

    @staticmethod
    def variable_summaries(var, name, histogram=False, scalar_only=False):
        """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
        if scalar_only:
            tf.summary.scalar(name, var)
        else:
            mean = tf.reduce_mean(var)
            tf.summary.scalar(name + '_mean', mean)
            with tf.name_scope('stddev'):
                stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
                tf.summary.scalar(name + '_stddev', stddev)
            tf.summary.scalar(name + '_max', tf.reduce_max(var))
            tf.summary.scalar(name + '_min', tf.reduce_min(var))
            if histogram:
                tf.summary.histogram(name + '_histogram', var)

    def update_old_params(self):
        self.init_tf_session()  # Make sure the Tensorflow session exists
        update_ops = []
        for l in range(len(self.layers_size) - 1):
            update_ops.append(self.weights_old[l].assign(self.weights[l]))
            update_ops.append(self.biases_old[l].assign(self.biases[l]))
        self.session.run(update_ops)

    def close_summary_file(self):
        if self.summaries_path is not None:
            self.train_writer.close()