@pocokhc
Created June 23, 2019 02:06
Code I used to compare DQN hyperparameters.
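The script below defines the Gym processors, an R2D2-style distributed setup (one learner process plus several actor processes), the replay memories, the exploration policies, and a main() that trains and evaluates one named configuration; executed as a script it runs every configuration listed under __main__ and plots the results. As a minimal sketch, a single configuration could be run like this, assuming the file is saved as r2d2_compare.py (the module name and this import-style usage are my assumptions, not part of the gist):

import r2d2_compare as rc  # hypothetical module name for this file

rc.main("normal")      # train and evaluate the baseline configuration for ENV_NAME
rc.plot(["normal"])    # plot the training log that main() wrote under log/<ENV_NAME>/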
import gym
import pickle
import os
import numpy as np
import random
import time
import traceback
import math
import tensorflow as tf
from keras.optimizers import Adam
from keras.models import Model
from keras.layers import *
from keras import backend as K
import rl.core
import multiprocessing as mp
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
#--- GPU settings
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto(
gpu_options=tf.GPUOptions(
#visible_device_list="0", # specify GPU number
allow_growth=True,
per_process_gpu_memory_fraction=1,
)
)
set_session(tf.Session(config=config))
#---------------------------------------------------
# PendulumProcessor
#---------------------------------------------------
class PendulumProcessor(rl.core.Processor):
def __init__(self):
self.mode = "train"
def process_action(self, action):
ACT_ID_TO_VALUE = {
0: [-2.0],
1: [-1.0],
2: [0.0],
3: [+1.0],
4: [+2.0],
}
return ACT_ID_TO_VALUE[action]
def process_reward(self, reward):
if self.mode == "test": # return the raw reward when testing
return reward
# min-max normalize the reward from [-16.5, 0] to [m, M] = [-0.5, 1]
self.max = 0
self.min = -16.5
if (self.max - self.min) == 0:
return 0
M = 1
m = -0.5
return ((reward - self.min) / (self.max - self.min))*(M - m) + m
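# e.g. with min=-16.5, max=0, m=-0.5, M=1: a raw reward of -16.5 maps to -0.5 and a raw reward of 0 maps to 1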
#---------------------------------------------------
# CartPoleProcessor
#---------------------------------------------------
class CartPoleProcessor(rl.core.Processor):
def __init__(self):
self.mode = ""
self.step = 0
def process_step(self, observation, reward, done, info):
observation = self.process_observation(observation)
reward = self.process_reward(reward)
info = self.process_info(info)
if self.mode == "test":
return observation, reward, done, info
self.step += 1
if done :
reward = -1
self.step = 0
else:
reward = 0.01
return observation, reward, done, info
#---------------------------------------------------
# AcrobotProcessor
#---------------------------------------------------
class AcrobotProcessor(rl.core.Processor):
def __init__(self):
self.mode = ""
self.step = 0
def process_step(self, observation, reward, done, info):
observation = self.process_observation(observation)
reward = self.process_reward(reward)
info = self.process_info(info)
if self.mode == "test":
return observation, reward, done, info
self.step += 1
if done :
reward = 500-self.step
self.step = 0
else:
reward = 0
return observation, reward, done, info
#---------------------------------------------------
# manager
#---------------------------------------------------
class R2D2Manager():
def __init__(self,
actor_func,
args,
create_processor_func,
create_optimizer_func,
):
# 引数整形
args["save_weights_path"] = args["save_weights_path"] if ("save_weights_path" in args) else ""
args["load_weights_path"] = args["load_weights_path"] if ("load_weights_path" in args) else ""
# type チェック
lstm_types = [
"",
"lstm",
"lstm_ful",
]
if args["lstm_type"] not in lstm_types:
raise ValueError('lstm_type is ["","lstm","lstm_ful"]')
# lstm_ful のみburnin有効
if args["lstm_type"] != "lstm_ful":
args["burnin_length"] = 0
self.actor_func = actor_func
self.num_actors = args["num_actors"]
self.args = args
# build_compile_model 関数用の引数
model_args = {
"input_shape": self.args["input_shape"],
"enable_image_layer": self.args["enable_image_layer"],
"nb_actions": self.args["nb_actions"],
"input_sequence": self.args["input_sequence"],
"enable_dueling_network": self.args["enable_dueling_network"],
"dueling_network_type": self.args["dueling_network_type"],
"enable_noisynet": self.args["enable_noisynet"],
"dense_units_num": self.args["dense_units_num"],
"lstm_type": self.args["lstm_type"],
"lstm_units_num": self.args["lstm_units_num"],
"metrics": self.args["metrics"],
"create_optimizer_func": create_optimizer_func,
}
self._create_process(model_args, create_processor_func, args)
def _create_process(self, model_args, create_processor_func, args_org):
# 各Queueを作成
experience_q = mp.Queue()
model_sync_q = [[mp.Queue(), mp.Queue(), mp.Queue()] for _ in range(self.num_actors)]
self.learner_end_q = [mp.Queue(), mp.Queue()]
self.actors_end_q = [[mp.Queue(), mp.Queue()] for _ in range(self.num_actors)]
self.learner_logger_q = mp.Queue()
self.actors_logger_q = mp.Queue()
# learner ps を作成
args = (
model_args,
self.args,
experience_q,
model_sync_q,
self.learner_end_q,
self.learner_logger_q,
)
if args_org["enable_GPU"]:
self.learner_ps = mp.Process(target=learner_run_gpu, args=args)
else:
self.learner_ps = mp.Process(target=learner_run, args=args)
# actor ps を作成
self.actors_ps = []
for i in range(self.num_actors):
args = (
i,
self.actor_func,
model_args,
self.args,
create_processor_func,
experience_q,
model_sync_q[i],
self.actors_logger_q,
self.actors_end_q[i],
)
if args_org["enable_GPU"]:
self.actors_ps.append(mp.Process(target=actor_run_cpu, args=args))
else:
self.actors_ps.append(mp.Process(target=actor_run, args=args))
# test用 Actor は子 Process では作らないのでselfにする。
self.model_args = model_args
self.create_processor_func = create_processor_func
def __del__(self):
self.learner_ps.terminate()
for p in self.actors_ps:
p.terminate()
def train(self):
learner_logs = []
actors_logs = {}
# プロセスを動かす
try:
self.learner_ps.start()
for p in self.actors_ps:
p.start()
# 終了を待つ
while True:
time.sleep(1) # polling time
# 定期的にログを吸出し
while not self.learner_logger_q.empty():
learner_logs.append(self.learner_logger_q.get(timeout=1))
while not self.actors_logger_q.empty():
log = self.actors_logger_q.get(timeout=1)
if log["name"] not in actors_logs:
actors_logs[log["name"]] = []
actors_logs[log["name"]].append(log)
# learner 修了確認
if not self.learner_end_q[1].empty():
# actorに終了を投げる
for i, q in enumerate(self.actors_end_q):
if q[1].empty():
print("Send actor{} end signal.".format(i))
q[1].put(1)
# actor 終了確認
f = True
for q in self.actors_end_q:
if q[0].empty():
f = False
break
if f:
break
except KeyboardInterrupt:
pass
except Exception:
print(traceback.format_exc())
# 最後のログ吸出し
while not self.learner_logger_q.empty():
learner_logs.append(self.learner_logger_q.get(timeout=1))
while not self.actors_logger_q.empty():
log = self.actors_logger_q.get(timeout=1)
if log["name"] not in actors_logs:
actors_logs[log["name"]] = []
actors_logs[log["name"]].append(log)
# learner に終了を投げる
self.learner_end_q[0].put(1)
# learner から最後の状態を取得
print("Last Learner weights waiting...")
weights = self.learner_end_q[1].get(timeout=60)
# test用の Actor を作成
test_actor = Actor(
-1,
self.model_args,
self.args,
None,
None,
processor=self.create_processor_func()
)
test_actor.model.set_weights(weights)
# kill
self.learner_ps.terminate()
for p in self.actors_ps:
p.terminate()
return test_actor, learner_logs, actors_logs
#---------------------------------------------------
# network
#---------------------------------------------------
def clipped_error_loss(y_true, y_pred):
err = y_true - y_pred # エラー
L2 = 0.5 * K.square(err)
L1 = K.abs(err) - 0.5
# エラーが[-1,1]区間ならL2、それ以外ならL1を選択する。
loss = tf.where((K.abs(err) < 1.0), L2, L1) # Keras does not cover where function in tensorflow :-(
return K.mean(loss)
def rescaling(x, epsilon=0.001):
n = math.sqrt(abs(x)+1) - 1
return np.sign(x)*n + epsilon*x
def rescaling_inverse(x):
return np.sign(x)*( (x+np.sign(x) ) ** 2 - 1)
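# Note: rescaling() is the value rescaling h(x) = sign(x)*(sqrt(|x|+1) - 1) + epsilon*x used in R2D2,
# and rescaling_inverse() is the exact inverse of the epsilon = 0 case
# (the epsilon term is dropped when inverting, which is an approximation in this code).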
def build_compile_model(
input_shape, # 入力shape
enable_image_layer, # image_layerを入れるか
input_sequence, # input_sequence
nb_actions, # アクション数
enable_dueling_network, # dueling_network を有効にするか
dueling_network_type,
enable_noisynet,
dense_units_num, # Dense層のユニット数
lstm_type, # 使用するLSTMアルゴリズム
lstm_units_num, # LSTMのユニット数
create_optimizer_func,
metrics, # compile に渡す metrics
):
if lstm_type == "lstm_ful":
# (batch_size, timesteps, width, height)
c = input_ = Input(batch_shape=(1, 1) + input_shape)
else:
# 入力層(input_sequence, width, height)
c = input_ = Input(shape=(input_sequence,) + input_shape)
if enable_image_layer:
if lstm_type == "":
c = Permute((2, 3, 1))(c) # (window,w,h) -> (w,h,window)
c = Conv2D(32, (8, 8), strides=(4, 4), padding="same", name="c1")(c)
c = Activation("relu")(c)
c = Conv2D(64, (4, 4), strides=(2, 2), padding="same", name="c2")(c)
c = Activation("relu")(c)
c = Conv2D(64, (3, 3), strides=(1, 1), padding="same", name="c3")(c)
c = Activation("relu")(c)
c = Flatten()(c)
else:
# (time steps, w, h) -> (time steps, w, h, ch)
if lstm_type == "lstm_ful":
c = Reshape((1, ) + input_shape + (1,) )(c)
else:
c = Reshape((input_sequence, ) + input_shape + (1,) )(c)
# https://keras.io/layers/wrappers/
c = TimeDistributed(Conv2D(32, (8, 8), strides=(4, 4), padding="same"), name="c1")(c)
c = Activation("relu")(c)
c = TimeDistributed(Conv2D(64, (4, 4), strides=(2, 2), padding="same"), name="c2")(c)
c = Activation("relu")(c)
c = TimeDistributed(Conv2D(64, (3, 3), strides=(1, 1), padding="same"), name="c3")(c)
c = Activation("relu")(c)
c = TimeDistributed(Flatten())(c)
elif lstm_type == "":
c = Flatten()(c)
if lstm_type == "lstm":
c = LSTM(lstm_units_num, name="lstm")(c)
elif lstm_type == "lstm_ful":
c = LSTM(lstm_units_num, stateful=True, name="lstm")(c)
if enable_dueling_network:
# value
v = Dense(dense_units_num, activation="relu")(c)
if enable_noisynet:
v = NoisyDense(1, name="v")(v)
else:
v = Dense(1, name="v")(v)
# advance
adv = Dense(dense_units_num, activation='relu')(c)
if enable_noisynet:
adv = NoisyDense(nb_actions, name="adv")(adv)
else:
adv = Dense(nb_actions, name="adv")(adv)
# 連結で結合
c = Concatenate()([v,adv])
if dueling_network_type == "ave":
c = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.mean(a[:, 1:], axis=1, keepdims=True), output_shape=(nb_actions,))(c)
elif dueling_network_type == "max":
c = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.max(a[:, 1:], axis=1, keepdims=True), output_shape=(nb_actions,))(c)
elif dueling_network_type == "naive":
c = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:], output_shape=(nb_actions,))(c)
else:
raise ValueError('dueling_network_type is ["ave","max","naive"]')
else:
c = Dense(dense_units_num, activation="relu")(c)
if enable_noisynet:
c = NoisyDense(nb_actions, activation="linear", name="adv")(c)
else:
c = Dense(nb_actions, activation="linear", name="adv")(c)
model = Model(input_, c)
# compile
model.compile(
loss=clipped_error_loss,
optimizer=create_optimizer_func(),
metrics=metrics)
return model
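# Note: in the dueling network above, "ave" combines the streams as Q(s,a) = V(s) + A(s,a) - mean_a' A(s,a'),
# "max" subtracts max_a' A(s,a') instead, and "naive" adds V and A with no subtraction.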
#---------------------------------------------------
# learner
#---------------------------------------------------
def learner_run_gpu(
model_args,
args,
experience_q,
model_sync_q,
learner_end_q,
logger_q,
):
with tf.device("/device:GPU:0"):
learner_run(
model_args,
args,
experience_q,
model_sync_q,
learner_end_q,
logger_q)
def learner_run(
model_args,
args,
experience_q,
model_sync_q,
learner_end_q,
logger_q,
):
learner = Learner(
model_args=model_args,
args=args,
experience_q=experience_q,
model_sync_q=model_sync_q,
)
try:
# model load
if os.path.isfile(args["load_weights_path"]):
learner.model.load_weights(args["load_weights_path"])
learner.target_model.load_weights(args["load_weights_path"])
# logger用
t0 = time.time()
# learner はひたすら学習する
print("Learner Starts!")
while True:
learner.train()
# logger
if time.time() - t0 > args["logger_interval"]:
t0 = time.time()
logger_q.put({
"name": "learner",
"train_num": learner.train_num,
})
# 終了判定
if not learner_end_q[0].empty():
break
# 終了判定
if args["limit_train_count"] > 0:
if learner.train_num > args["limit_train_count"]:
break
except KeyboardInterrupt:
pass
except Exception:
print(traceback.format_exc())
finally:
print("Learning End. Train Count:{}".format(learner.train_num))
# model save
if args["save_weights_path"] != "":
print("save:" + args["save_weights_path"])
learner.model.save_weights(args["save_weights_path"], args["save_overwrite"])
# 最後の状態を manager に投げる
print("Last Learner weights sending...")
learner_end_q[1].put(learner.model.get_weights())
class Learner():
def __init__(self,
model_args,
args,
experience_q,
model_sync_q
):
self.experience_q = experience_q
self.model_sync_q = model_sync_q
# memory
memory_type = args["remote_memory_type"]
memory_args = args["remote_memory_args"]
if memory_type == "replay":
self.memory = ReplayMemory(**memory_args)
elif memory_type == "per_greedy":
self.memory = PERGreedyMemory(**memory_args)
elif memory_type == "per_proportional":
self.memory = PERProportionalMemory(**memory_args)
elif memory_type == "per_rankbase":
self.memory = PERRankBaseMemory(**memory_args)
else:
raise ValueError('memory_type is ["replay","per_greedy","per_proportional","per_rankbase"]')
self.memory_warmup_size = args["remote_memory_warmup_size"]
self.gamma = args["gamma"]
self.batch_size = args["batch_size"]
self.enable_double_dqn = args["enable_double_dqn"]
self.target_model_update = args["target_model_update"]
self.multireward_steps = args["multireward_steps"]
self.input_sequence = args["input_sequence"]
self.lstm_type = args["lstm_type"]
self.enable_rescaling_priority = args["enable_rescaling_priority"]
self.enable_rescaling_train = args["enable_rescaling_train"]
self.rescaling_epsilon = args["rescaling_epsilon"]
self.burnin_length = args["burnin_length"]
self.priority_exponent = args["priority_exponent"]
assert memory_args["capacity"] > self.batch_size, "Memory capacity must be larger than the batch size."
assert self.memory_warmup_size > self.batch_size, "remote_memory_warmup_size must be larger than the batch size."
# local
self.train_num = 0
# model create
self.model = build_compile_model(**model_args)
self.target_model = build_compile_model(**model_args)
# lstm ful では lstmレイヤーを使う
if self.lstm_type == "lstm_ful":
self.lstm = self.model.get_layer("lstm")
self.target_lstm = self.target_model.get_layer("lstm")
def train(self):
# Actor から要求があれば weights を渡す
for q in self.model_sync_q:
if not q[0].empty():
# 空にする(念のため)
while not q[0].empty():
q[0].get(timeout=1)
# 送る
q[1].put(self.model.get_weights())
# experience があれば RemoteMemory に追加
while not self.experience_q.empty():
exps = self.experience_q.get(timeout=1)
for exp in exps:
# both storage formats keep the priority in exp[4]
self.memory.add(exp, exp[4])
# do not start training until the RemoteMemory holds enough samples
if len(self.memory) <= self.memory_warmup_size:
time.sleep(1) # back off briefly while waiting for more experiences
return
(indexes, batchs, weights) = self.memory.sample(self.batch_size, self.train_num)
# 学習(長いので関数化)
if self.lstm_type == "lstm_ful":
self.train_model_ful(indexes, batchs, weights)
else:
self.train_model(indexes, batchs, weights)
self.train_num += 1
# target networkの更新
if self.train_num % self.target_model_update == 0:
self.target_model.set_weights(self.model.get_weights())
# ノーマルの学習
def train_model(self, indexes, batchs, weights):
state0_batch = []
action_batch = []
reward_batch = []
state1_batch = []
for batch in batchs:
state0_batch.append(batch[0])
action_batch.append(batch[1])
reward_batch.append(batch[2])
state1_batch.append(batch[3])
# 更新用に現在のQネットワークを出力(Q network)
outputs = self.model.predict(np.asarray(state0_batch), self.batch_size)
if self.enable_double_dqn:
# TargetNetworkとQNetworkのQ値を出す
state1_model_qvals_batch = self.model.predict(np.asarray(state1_batch), self.batch_size)
state1_target_qvals_batch = self.target_model.predict(np.asarray(state1_batch), self.batch_size)
else:
# 次の状態のQ値を取得(target_network)
target_qvals = self.target_model.predict(np.asarray(state1_batch), self.batch_size)
for i in range(self.batch_size):
if self.enable_double_dqn:
action = np.argmax(state1_model_qvals_batch[i]) # modelからアクションを出す
maxq = state1_target_qvals_batch[i][action] # Q値はtarget_modelを使って出す
else:
maxq = np.max(target_qvals[i])
# priority計算
if self.enable_rescaling_priority:
tmp = rescaling_inverse(maxq)
else:
tmp = maxq
tmp = reward_batch[i] + (self.gamma ** self.multireward_steps) * tmp
tmp *= weights[i]
if self.enable_rescaling_priority:
tmp = rescaling(tmp, self.rescaling_epsilon)
priority = abs(tmp - outputs[i][action_batch[i]])
# Q値の更新
if self.enable_rescaling_train:
maxq = rescaling_inverse(maxq)
td_error = reward_batch[i] + (self.gamma ** self.multireward_steps) * maxq
td_error *= weights[i]
if self.enable_rescaling_train:
td_error = rescaling(td_error, self.rescaling_epsilon)
outputs[i][action_batch[i]] = td_error
# update the priority of the sampled experience
self.memory.update(indexes[i], batchs[i], priority)
# 学習
self.model.train_on_batch(np.asarray(state0_batch), np.asarray(outputs))
#self.model.fit(np.asarray(state0_batch), np.asarray(outputs), batch_size=self.batch_size, epochs=1, verbose=0)
# ステートフルLSTMの学習
def train_model_ful(self, indexes, batchs, weights):
# 各経験毎に処理を実施
for batch_i, batch in enumerate(batchs):
states = batch[0]
action = batch[1]
reward = batch[2]
hidden_state = batch[3]
prioritys = []
# burn-in
self.lstm.reset_states(hidden_state)
for i in range(self.burnin_length):
self.model.predict(np.asarray([[states[i]]]), 1)
# burn-in 後の結果を保存
hidden_state = [K.get_value(self.lstm.states[0]), K.get_value(self.lstm.states[1])]
# 以降は1sequenceずつ更新させる
for i in range(self.input_sequence):
state0 = [states[self.burnin_length + i]]
state1 = [states[self.burnin_length + i + self.multireward_steps]]
# 現在のQネットワークを出力
self.lstm.reset_states(hidden_state)
output = self.model.predict(np.asarray([state0]), 1)[0]
# TargetネットワークとQネットワークの値を出力
if self.enable_double_dqn:
self.lstm.reset_states(hidden_state)
self.target_lstm.reset_states(hidden_state)
state1_model_qvals = self.model.predict(np.asarray([state1]), 1)[0]
state1_target_qvals = self.target_model.predict(np.asarray([state1]), 1)[0]
action_q = np.argmax(state1_model_qvals)
maxq = state1_target_qvals[action_q]
else:
self.target_lstm.reset_states(hidden_state)
target_qvals = self.target_model.predict(np.asarray([state1]), 1)[0]
maxq = np.max(target_qvals)
# priority計算
if self.enable_rescaling_priority:
tmp = rescaling_inverse(maxq)
else:
tmp = maxq
tmp = reward[i] + (self.gamma ** self.multireward_steps) * tmp
tmp *= weights[batch_i]
if self.enable_rescaling_priority:
tmp = rescaling(tmp)
priority = abs(tmp - output[action[i]])
prioritys.append(priority)
# Q値 update用
if self.enable_rescaling_train:
maxq = rescaling_inverse(maxq)
td_error = reward[i] + (self.gamma ** self.multireward_steps) * maxq
td_error *= weights[batch_i]
if self.enable_rescaling_train:
td_error = rescaling(td_error, self.rescaling_epsilon)
output[action[i]] = td_error
# 学習
self.lstm.reset_states(hidden_state)
self.model.fit(
np.asarray([state0]),
np.asarray([output]),
batch_size=1,
epochs=1,
verbose=0,
shuffle=False
)
# 次の学習用に hidden state を保存
hidden_state = [K.get_value(self.lstm.states[0]), K.get_value(self.lstm.states[1])]
# 今回使用したsamplingのpriorityを更新
priority = self.priority_exponent * np.max(prioritys) + (1-self.priority_exponent) * np.average(prioritys)
self.memory.update(indexes[batch_i], batch, priority)
#---------------------------------------------------
# actor
#---------------------------------------------------
class ActorLogger(rl.callbacks.Callback):
def __init__(self, index, logger_q, interval):
self.index = index
self.interval = interval
self.logger_q = logger_q
def on_train_begin(self, logs):
self.t0 = time.time()
self.reward_min = None
self.reward_max = None
def on_episode_end(self, episode, logs):
if self.reward_min is None:
self.reward_min = logs["episode_reward"]
self.reward_max = logs["episode_reward"]
else:
if self.reward_min > logs["episode_reward"]:
self.reward_min = logs["episode_reward"]
if self.reward_max < logs["episode_reward"]:
self.reward_max = logs["episode_reward"]
if time.time() - self.t0 > self.interval:
self.t0 = time.time()
self.logger_q.put({
"name": "actor" + str(self.index),
"reward_min": self.reward_min,
"reward_max": self.reward_max,
"nb_steps": logs["nb_steps"],
})
self.reward_min = None
self.reward_max = None
class ActorStop(rl.callbacks.Callback):
def __init__(self, end_q, index):
self.end_q = end_q
self.index = index
def on_step_end(self, episode, logs):
if not self.end_q.empty():
raise KeyboardInterrupt()
def actor_run_cpu(
actor_index,
actor_func,
model_args,
args,
create_processor_func,
experience_q,
model_sync_q,
logger_q,
actors_end_q,
):
with tf.device("/device:CPU:0"):
actor_run(
actor_index,
actor_func,
model_args,
args,
create_processor_func,
experience_q,
model_sync_q,
logger_q,
actors_end_q)
def actor_run(
actor_index,
actor_func,
model_args,
args,
create_processor_func,
experience_q,
model_sync_q,
logger_q,
actors_end_q,
):
print("Actor{} Starts!".format(actor_index))
try:
actor = Actor(
actor_index,
model_args,
args,
experience_q,
model_sync_q,
processor=create_processor_func()
)
# model load
if os.path.isfile( args["load_weights_path"] ):
actor.model.load_weights(args["load_weights_path"])
actor.load_weights(args["load_weights_path"])
# logger用
callbacks = [
ActorLogger(actor_index, logger_q, args["logger_interval"]),
ActorStop(actors_end_q[1], actor_index),
]
# run
actor_func(actor_index, actor, callbacks=callbacks)
except KeyboardInterrupt:
pass
except Exception:
print(traceback.format_exc())
finally:
if args["save_weights_path"] != "":
actor.save_weights(args["save_weights_path"])
print("Actor{} End!".format(actor_index))
actors_end_q[0].put(1)
from collections import deque
class Actor(rl.core.Agent):
def __init__(self,
actor_index,
model_args,
args,
experience_q,
model_sync_q,
**kwargs):
super(Actor, self).__init__(**kwargs)
self.actor_index = actor_index
self.experience_q = experience_q
self.model_sync_q = model_sync_q
# 探索ポリシー
policy = args["action_policies"][actor_index]
policies = {
"greedy": EpsilonGreedy,
"greedy_actor": EpsilonGreedyActor,
"annieling_greedy": AnnealingEpsilonGreedy,
"softmax": SoftmaxPolicy,
"ucb1": UCB1,
"ucb1_tuned": UCB1_Tuned,
"ucbv": UCBv,
"kl_ucb": KL_UCB,
"ts_beta": ThompsonSamplingBeta,
"ts_gaussian": ThompsonSamplingGaussian,
}
if policy["type"] not in policies.keys():
raise ValueError('action_policy_type is [{}]'.format(",".join(policies.keys())))
if policy["type"] == "greedy_actor":
policy["args"]["index"] = actor_index
policy["args"]["num_actors"] = args["num_actors"]
self.action_policy = policies[policy["type"]](**policy["args"])
self.action_policy.compile(build_compile_model, model_args)
self.nb_actions = args["nb_actions"]
self.input_shape = args["input_shape"]
self.input_sequence = args["input_sequence"]
self.actor_model_sync_interval = args["actor_model_sync_interval"]
self.gamma = args["gamma"]
self.multireward_steps = args["multireward_steps"]
self.action_interval = args["action_interval"]
self.burnin_length = args["burnin_length"]
self.lstm_type = args["lstm_type"]
self.enable_dueling_network = args["enable_dueling_network"]
self.enable_noisynet = args["enable_noisynet"]
self.enable_rescaling_priority = args["enable_rescaling_priority"]
self.rescaling_epsilon = args["rescaling_epsilon"]
self.priority_exponent = args["priority_exponent"]
# local memory
self.local_memory = deque()
self.local_memory_update_size = args["local_memory_update_size"]
self.learner_train_num = 0
# model
self.model = build_compile_model(**model_args)
# lstm ful では lstmレイヤーを使う
if self.lstm_type == "lstm_ful":
self.lstm = self.model.get_layer("lstm")
self.compiled = True
def reset_states(self):
self.repeated_action = 0
self.recent_action = [ 0 for _ in range(self.input_sequence)]
self.recent_reward = [ 0 for _ in range(self.input_sequence + self.multireward_steps - 1)]
obs_length = self.burnin_length + self.input_sequence + self.multireward_steps
self.recent_observations = [np.zeros(self.input_shape) for _ in range(obs_length)]
if self.lstm_type == "lstm_ful":
self.model.reset_states()
self.recent_hidden_state = [
[K.get_value(self.lstm.states[0]), K.get_value(self.lstm.states[1])]
for _ in range(self.burnin_length + self.input_sequence)
]
def get_prev_observation(self):
if self.lstm_type == "lstm_ful":
return np.asarray([[self.recent_observations[-1]]])
else:
return np.asarray([self.recent_observations[-self.input_sequence:]])
def get_prev_action(self):
return self.recent_action[-1]
def get_prev_reward(self):
return self.recent_reward[-1]
def compile(self, optimizer, metrics=[]):
self.compiled = True
def save_weights(self, filepath, overwrite=False):
filepath += "_{}.h5".format(self.actor_index)
if overwrite or not os.path.isfile(filepath):
d = {}
self.action_policy.save_weights(d)
with open(filepath, 'wb') as f:
pickle.dump(d, f)
def load_weights(self, filepath):
filepath += "_{}.h5".format(self.actor_index)
with open(filepath, 'rb') as f:
d = pickle.load(f)
self.action_policy.load_weights(d)
def forward(self, observation):
self.recent_observations.append(observation) # 最後に追加
self.recent_observations.pop(0) # 先頭を削除
if self.training:
#--- 経験を送る
if self.lstm_type == "lstm_ful":
# priorityを計算
prioritys = []
rewards = []
for i in range(self.input_sequence):
state0 = [self.recent_observations[self.burnin_length + i]]
state1 = [self.recent_observations[self.burnin_length + i + self.multireward_steps]]
hidden_state = self.recent_hidden_state[i]
action = self.recent_action[i]
# Multi-Step learning
reward = 0
for j in range(self.multireward_steps):
reward += self.recent_reward[i+j] * (self.gamma ** j)
rewards.append(reward)
# 現在のQネットワークを出力
self.lstm.reset_states(hidden_state)
state0_qvals = self.model.predict(np.asarray([state0]), 1)[0]
self.lstm.reset_states(hidden_state)
state1_qvals = self.model.predict(np.asarray([state1]), 1)[0]
maxq = np.max(state1_qvals)
# priority計算
if self.enable_rescaling_priority:
maxq = rescaling_inverse(maxq)
td_error = reward + (self.gamma ** self.multireward_steps) * maxq
if self.enable_rescaling_priority:
td_error = rescaling(td_error, self.rescaling_epsilon)
priority = abs(td_error - state0_qvals[action])
prioritys.append(priority)
# 今回使用したsamplingのpriorityを更新
priority = self.priority_exponent * np.max(prioritys) + (1-self.priority_exponent) * np.average(prioritys)
# local memoryに追加
self.local_memory.append((
self.recent_observations[:],
self.recent_action[:],
rewards,
self.recent_hidden_state[0],
priority,
))
else:
# Multi-Step learning
reward = 0
for i, r in enumerate(reversed(self.recent_reward)):
reward += r * (self.gamma ** i)
state0 = self.recent_observations[self.burnin_length:self.burnin_length+self.input_sequence]
state1 = self.recent_observations[-self.input_sequence:]
action = self.recent_action[-1]
# priority のために TD-error をだす。
state0_qvals = self.model.predict(np.asarray([state0]), 1)[0]
state1_qvals = self.model.predict(np.asarray([state1]), 1)[0]
maxq = np.max(state1_qvals)
# priority計算
if self.enable_rescaling_priority:
maxq = rescaling_inverse(maxq)
td_error = reward + (self.gamma ** self.multireward_steps) * maxq
if self.enable_rescaling_priority:
td_error = rescaling(td_error, self.rescaling_epsilon)
priority = abs(td_error - state0_qvals[action])
# local memoryに追加
self.local_memory.append((
state0,
action,
reward,
state1,
priority,
))
# フレームスキップ(action_interval毎に行動を選択する)
action = self.repeated_action
if self.step % self.action_interval == 0:
if self.lstm_type == "lstm_ful":
# 状態を復元
self.lstm.reset_states(self.recent_hidden_state[-1])
# 行動を決定
if self.training and not self.enable_noisynet:
# action policyに従う
state = np.asarray([self.recent_observations[-self.input_sequence:]])
action = self.action_policy.select_action(self, state)
else:
action = self._get_qmax_action()
if self.lstm_type == "lstm_ful":
# 状態を保存
self.recent_hidden_state.append([K.get_value(self.lstm.states[0]), K.get_value(self.lstm.states[1])])
self.recent_hidden_state.pop(0)
# リピート用
self.repeated_action = action
# 学習用
self.recent_action.append(action) # 最後に追加
self.recent_action.pop(0) # 先頭を削除
return action
def _get_qmax_action(self):
state = self.get_prev_observation()
q_values = self.model.predict(state, batch_size=1)[0]
return np.argmax(q_values)
def backward(self, reward, terminal):
self.recent_reward.append(reward) # 最後に追加
self.recent_reward.pop(0) # 先頭を削除
if self.training:
# 一定間隔で model を learner からsyncさせる
if self.step % self.actor_model_sync_interval == 0:
# 要求を送る
self.model_sync_q[0].put(1) # 要求
# weightが届いていれば更新
if not self.model_sync_q[1].empty():
weights = self.model_sync_q[1].get(timeout=1)
# 空にする(念のため)
while not self.model_sync_q[1].empty():
self.model_sync_q[1].get(timeout=1)
self.model.set_weights(weights)
# localメモリが一定量超えていれば RemoteMemory に送信
if len(self.local_memory) > self.local_memory_update_size:
# 共有qに送るのは重いので配列化
data = []
while len(self.local_memory) > 0:
data.append(self.local_memory.pop())
self.experience_q.put(data)
return []
@property
def layers(self):
return self.model.layers[:]
#--------------------------------------------------------
class ReplayMemory():
def __init__(self, capacity):
self.capacity= capacity
self.index = 0
self.buffer = []
def add(self, experience, priority):
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.index] = experience
self.index = (self.index + 1) % self.capacity
def update(self, idx, experience, priority):
pass
def sample(self, batch_size, steps):
batchs = random.sample(self.buffer, batch_size)
indexes = np.empty(batch_size, dtype='float32')
weights = [ 1 for _ in range(batch_size)]
return (indexes, batchs, weights)
def __len__(self):
return len(self.buffer)
#--------------------------------------------------------
class _bisect_wrapper2():
def __init__(self, data):
self.d = data
self.priority = 0
def __lt__(self, o): # a<b
return self.priority < o.priority
class PERGreedyMemory():
def __init__(self, capacity):
self.buffer = []
self.capacity = capacity
def add(self, experience, priority):
if self.capacity <= len(self.buffer):
# 上限より多い場合は要素を削除
self.buffer.pop(0)
# priority は最初は最大を選択
experience = _bisect_wrapper2(experience)
experience.priority = priority
bisect.insort(self.buffer, experience)
def update(self, idx, experience, priority):
experience = _bisect_wrapper2(experience)
experience.priority = priority
bisect.insort(self.buffer, experience)
def sample(self, batch_size, step):
# 取り出す(学習後に再度追加)
batchs = [self.buffer.pop().d for _ in range(batch_size)]
indexes = np.empty(batch_size, dtype='float32')
weights = [ 1 for _ in range(batch_size)]
return (indexes, batchs, weights)
def __len__(self):
return len(self.buffer)
#---------------------------------------------------
#copy from https://github.com/jaromiru/AI-blog/blob/5aa9f0b/SumTree.py
import numpy
class SumTree:
write = 0
def __init__(self, capacity):
self.capacity = capacity
self.tree = numpy.zeros( 2*capacity - 1 )
self.data = numpy.zeros( capacity, dtype=object )
def _propagate(self, idx, change):
parent = (idx - 1) // 2
self.tree[parent] += change
if parent != 0:
self._propagate(parent, change)
def _retrieve(self, idx, s):
left = 2 * idx + 1
right = left + 1
if left >= len(self.tree):
return idx
if s <= self.tree[left]:
return self._retrieve(left, s)
else:
return self._retrieve(right, s-self.tree[left])
def total(self):
return self.tree[0]
def add(self, p, data):
idx = self.write + self.capacity - 1
self.data[self.write] = data
self.update(idx, p)
self.write += 1
if self.write >= self.capacity:
self.write = 0
def update(self, idx, p):
change = p - self.tree[idx]
self.tree[idx] = p
self._propagate(idx, change)
def get(self, s):
idx = self._retrieve(0, s)
dataIdx = idx - self.capacity + 1
return (idx, self.tree[idx], self.data[dataIdx])
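# Note: the sum tree stores priority sums in its internal nodes, so drawing s ~ U(0, total())
# and calling get(s) returns an experience with probability proportional to its priority.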
class PERProportionalMemory():
def __init__(self, capacity, alpha, beta_initial, beta_steps, enable_is):
self.capacity = capacity
self.tree = SumTree(capacity)
self.beta_initial = beta_initial
self.beta_steps = beta_steps
self.enable_is = enable_is
self.alpha = alpha
self.size = 0
def add(self, experience, priority):
priority = priority ** self.alpha
self.tree.add(priority, experience)
self.size += 1
if self.size > self.capacity:
self.size = self.capacity
def update(self, index, experience, priority):
priority = priority ** self.alpha
self.tree.update(index, priority)
def sample(self, batch_size, step):
indexes = []
batchs = []
weights = np.empty(batch_size, dtype='float32')
if self.enable_is:
# βは最初は低く、学習終わりに1にする
beta = self.beta_initial + (1 - self.beta_initial) * step / self.beta_steps
total = self.tree.total()
for i in range(batch_size):
# indexesにないものを追加
loop_over = True
for _ in range(100): # for safety
r = random.random()*total
(idx, priority, experience) = self.tree.get(r)
if idx not in indexes:
loop_over = False
break
assert not loop_over
indexes.append(idx)
batchs.append(experience)
if self.enable_is:
# 重要度サンプリングを計算
weights[i] = (self.size * priority / total) ** (-beta)
else:
weights[i] = 1 # 無効なら1
if self.enable_is:
# 安定性の理由から最大値で正規化
weights = weights / weights.max()
return (indexes ,batchs, weights)
def __len__(self):
return self.size
#------------------------------------
import bisect
class _bisect_wrapper():
def __init__(self, data):
self.d = data
self.priority = 0
self.p = 0
def __lt__(self, o): # a<b
return self.priority < o.priority
def rank_sum(k, a):
return k*( 2+(k-1)*a )/2
def rank_sum_inverse(k, a):
if a == 0:
return k
t = a-2 + math.sqrt((2-a)**2 + 8*a*k)
return t/(2*a)
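# rank_sum(k, a) = sum_{i=1..k} (1 + (i-1)*a) is the cumulative sampling mass of the first k ranks,
# and rank_sum_inverse(r, a) solves rank_sum(k, a) = r for k, mapping a uniform draw r in [0, total) back to a rank index.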
class PERRankBaseMemory():
def __init__(self, capacity, alpha, beta_initial, beta_steps, enable_is):
self.capacity = capacity
self.buffer = []
self.alpha = alpha
self.beta_initial = beta_initial
self.beta_steps = beta_steps
self.enable_is = enable_is
def add(self, experience, priority):
if self.capacity <= len(self.buffer):
# 上限より多い場合は最後の要素を削除
self.buffer.pop(0)
experience = _bisect_wrapper(experience)
experience.priority = priority
bisect.insort(self.buffer, experience)
def update(self, index, experience, priority):
experience = _bisect_wrapper(experience)
experience.priority = priority
bisect.insort(self.buffer, experience)
def sample(self, batch_size, step):
indexes = []
batchs = []
weights = np.empty(batch_size, dtype='float32')
if self.enable_is:
# βは最初は低く、学習終わりに1にする。
beta = self.beta_initial + (1 - self.beta_initial) * step / self.beta_steps
# 合計値をだす
total = rank_sum(len(self.buffer), self.alpha)
# index_lst
index_lst = []
for _ in range(batch_size):
# index_lstにないものを追加
for _ in range(100): # for safety
r = random.random()*total
index = rank_sum_inverse(r, self.alpha)
index = int(index) # 整数にする(切り捨て)
if index not in index_lst:
index_lst.append(index)
break
assert len(index_lst) == batch_size
index_lst.sort()
buffer_size = len(self.buffer)
for i, index in enumerate(reversed(index_lst)):
o = self.buffer.pop(index) # 後ろから取得するのでindexに変化なし
batchs.append(o.d)
indexes.append(index)
if self.enable_is:
# 重要度サンプリングを計算
priority = (rank_sum(index+1, self.alpha) - rank_sum(index, self.alpha)) / total
weights[i] = (buffer_size * priority) ** (-beta)
else:
weights[i] = 1 # 無効なら1
if self.enable_is:
# 安定性の理由から最大値で正規化
weights = weights / weights.max()
return (indexes, batchs, weights)
def __len__(self):
return len(self.buffer)
#----------------------------------------
#-------------------------------------------
class EpsilonGreedy():
def __init__(self, epsilon):
self.epsilon = epsilon
def compile(self, build_compile_model_func, model_args):
pass
def save_weights(self, d):
pass
def load_weights(self, d):
pass
def select_action(self, agent, state):
if self.epsilon > np.random.uniform(0, 1):
# アクションをランダムに選択
action = np.random.randint(0, agent.nb_actions)
else:
# 評価が最大のアクションを選択
action = agent._get_qmax_action()
return action
class EpsilonGreedyActor():
def __init__(self, num_actors, index, epsilon, alpha):
if num_actors <= 1:
tmp = epsilon ** (1 + alpha)
else:
tmp = epsilon ** (1 + index/(num_actors-1)*alpha)
print("Actor{} Epsilon:{}".format(index, tmp))
self.epsilon = tmp
def compile(self, build_compile_model_func, model_args):
pass
def save_weights(self, d):
pass
def load_weights(self, d):
pass
def select_action(self, agent, state):
if self.epsilon > np.random.uniform(0, 1):
# アクションをランダムに選択
action = np.random.randint(0, agent.nb_actions)
else:
# 評価が最大のアクションを選択
action = agent._get_qmax_action()
return action
class AnnealingEpsilonGreedy():
def __init__(self,
initial_epsilon, # 初期ε
final_epsilon, # 最終状態でのε
exploration_steps # 初期~最終状態になるまでのステップ数
):
self.epsilon_step = (initial_epsilon - final_epsilon) / exploration_steps
self.initial_epsilon = initial_epsilon
self.final_epsilon = final_epsilon
self.step = 0
def compile(self, build_compile_model_func, model_args):
pass
def save_weights(self, d):
d["step"] = self.step
def load_weights(self, d):
self.step = d["step"]
def select_action(self, agent, state):
# epsilon の計算
epsilon = self.initial_epsilon - self.step * self.epsilon_step
if epsilon < self.final_epsilon:
epsilon = self.final_epsilon
self.step += 1
if epsilon > np.random.uniform(0, 1):
# アクションをランダムに選択
action = np.random.randint(0, agent.nb_actions)
else:
# 評価が最大のアクションを選択
action = agent._get_qmax_action()
return action
class SoftmaxPolicy():
def __init__(self):
pass
def compile(self, build_compile_model_func, model_args):
pass
def save_weights(self, d):
pass
def load_weights(self, d):
pass
def select_action(self, agent, state):
qvals = agent.model.predict(state, batch_size=1)[0]
exp_x = np.exp(qvals)
total = sum(exp_x)
vals = []
for i in range(agent.nb_actions):
softmax = exp_x[i] / total
# softmax 値以下の乱数を生成
vals.append( random.uniform(0, softmax) )
# 乱数の結果一番大きいアクションを選択
action = np.argmax(vals)
return action
class UCB1():
def __init__(self):
pass
def compile(self, build_compile_model_func, model_args):
# 選択回数を数える為の変数を追加
self.ucb_count = build_compile_model_func(**model_args)
def save_weights(self, d):
d["ucb_count"] = self.ucb_count.get_weights()
def load_weights(self, d):
self.ucb_count.set_weights(d["ucb_count"])
def select_action(self, agent, state):
counts = self.ucb_count.predict(state, 1)[0]
qvals = agent.model.predict(state, 1)[0]
# counts は1以上にする
for i in range(len(counts)):
if counts[i] < 1:
counts[i] = 1
total = sum(counts)
total = math.log(total) # 重そうな計算なので for の外で
ucbs = []
for i in range(agent.nb_actions):
count = counts[i]
ave = qvals[i] / count
tmp = ave + math.sqrt(2 * total / count)
ucbs.append(tmp)
# ucbが最大値となるアクションを選択
action = np.argmax(ucbs)
# 選択したアクションの選択回数を増やす
counts[action] += 1
self.ucb_count.train_on_batch(state, np.asarray([counts]))
return action
class UCB1_Tuned():
def __init__(self):
pass
def compile(self, build_compile_model_func, model_args):
# 選択回数を数える為の変数を追加
self.ucb_count = build_compile_model_func(**model_args)
self.ucb_var = build_compile_model_func(**model_args)
def save_weights(self, d):
d["ucb_count"] = self.ucb_count.get_weights()
d["ucb_var"] = self.ucb_var.get_weights()
def load_weights(self, d):
self.ucb_count.set_weights(d["ucb_count"])
self.ucb_var.set_weights(d["ucb_var"])
def select_action(self, agent, state):
# 前回の報酬を計算
state0 = agent.get_prev_observation() # 前回の状態
action = agent.get_prev_action() # 前回のaction
reward = agent.get_prev_reward() # 前回の報酬
counts = self.ucb_count.predict(state0, 1)[0]
qvals = agent.model.predict(state0, 1)[0]
ucb_vars = self.ucb_var.predict(state0, 1)[0]
# counts は1以上にする
for i in range(len(counts)):
if counts[i] < 1:
counts[i] = 1
# 分散を更新
prev_count = counts[action]
prev_ave = qvals[action] / prev_count
var = ucb_vars[action]
var += ((reward - prev_ave) ** 2) / prev_count
ucb_vars[action] = var
# 更新
self.ucb_var.train_on_batch(state0, np.asarray([ucb_vars]))
counts = self.ucb_count.predict(state, 1)[0]
qvals = agent.model.predict(state, 1)[0]
ucb_vars = self.ucb_var.predict(state, 1)[0]
# counts は1以上にする
for i in range(len(counts)):
if counts[i] < 1:
counts[i] = 1
# 分散がマイナスは0以上にする
for i in range(len(ucb_vars)):
if ucb_vars[i] < 0:
ucb_vars[i] = 0
# 合計を出す(数式ではN)
total = sum(counts)
total = math.log(total) # 重そうな計算なので for の外で
# 各アクションのUCB値を計算
ucbs = []
for i in range(agent.nb_actions):
count = counts[i]
# 平均
ave = qvals[i] / count
# 分散
var = ucb_vars[i]
# 数式を計算
v = var + math.sqrt(2 * total / count)
if 1/4 < v:
v = 1/4
tmp = ave + math.sqrt( (total / count) * v )
ucbs.append(tmp)
# ucbが最大値となるアクションを選択
action = np.argmax(ucbs)
# 選択したアクションの選択回数を増やす
counts[action] += 1
self.ucb_count.train_on_batch(state, np.asarray([counts]))
return action
class UCBv():
def __init__(self):
pass
def compile(self, build_compile_model_func, model_args):
# 選択回数を数える為の変数を追加
self.ucb_count = build_compile_model_func(**model_args)
self.ucb_var = build_compile_model_func(**model_args)
def save_weights(self, d):
d["ucb_count"] = self.ucb_count.get_weights()
d["ucb_var"] = self.ucb_var.get_weights()
def load_weights(self, d):
self.ucb_count.set_weights(d["ucb_count"])
self.ucb_var.set_weights(d["ucb_var"])
def select_action(self, agent, state):
# 前回の報酬を計算
state0 = agent.get_prev_observation() # 前回の状態
action = agent.get_prev_action() # 前回のaction
reward = agent.get_prev_reward() # 前回の報酬
counts = self.ucb_count.predict(state0, 1)[0]
qvals = agent.model.predict(state0, 1)[0]
ucb_vars = self.ucb_var.predict(state0, 1)[0]
# counts は1以上にする
for i in range(len(counts)):
if counts[i] < 1:
counts[i] = 1
# 分散を更新
prev_count = counts[action]
prev_ave = qvals[action] / prev_count
var = ucb_vars[action]
var += ((reward - prev_ave) ** 2) / prev_count
ucb_vars[action] = var
# 更新
self.ucb_var.train_on_batch(state0, np.asarray([ucb_vars]))
counts = self.ucb_count.predict(state, 1)[0]
qvals = agent.model.predict(state, 1)[0]
ucb_vars = self.ucb_var.predict(state, 1)[0]
# counts は1以上にする
for i in range(len(counts)):
if counts[i] < 1:
counts[i] = 1
# 分散がマイナスは0以上にする
for i in range(len(ucb_vars)):
if ucb_vars[i] < 0:
ucb_vars[i] = 0
# 合計を出す(数式ではN)
total = sum(counts)
# 各アクションのUCB値を計算
zeta = 1.2
c = 1
b = 1
e = zeta*math.log(total)
ucbs = []
for i in range(agent.nb_actions):
count = counts[i]
# 平均
ave = qvals[i] / count
# 分散
var = ucb_vars[i]
tmp = ave + math.sqrt( (2*var*e)/count ) + c* (3*b*e)/count
ucbs.append(tmp)
# ucbが最大値となるアクションを選択
action = np.argmax(ucbs)
# 選択したアクションの選択回数を増やす
counts[action] += 1
self.ucb_count.train_on_batch(state, np.asarray([counts]))
return action
class KL_UCB():
def __init__(self, C=0, delta=1e-8, eps=1e-12):
self.C = C
self.delta = delta # 探索幅
self.eps = eps # 探索の許容誤差
def compile(self, build_compile_model_func, model_args):
# 選択回数を数える為の変数を追加
self.ucb_count = build_compile_model_func(**model_args)
def save_weights(self, d):
d["ucb_count"] = self.ucb_count.get_weights()
def load_weights(self, d):
self.ucb_count.set_weights(d["ucb_count"])
def select_action(self, agent, state):
counts = self.ucb_count.predict(state, 1)[0]
qvals = agent.model.predict(state, 1)[0]
# counts は1以上にする
for i in range(len(counts)):
if counts[i] < 1:
counts[i] = 1
# 合計を出す(数式ではN)
total = sum(counts)
# 右辺をだしておく
logndn = math.log(total) + self.C * math.log(math.log(total))
# 各アクションのUCB値を計算
ucbs = []
for i in range(agent.nb_actions):
count = counts[i]
p = qvals[i] / count
# 例外処理:p は 0~1
if p >= 1:
ucbs.append(1)
continue
if p <= 0:
p = self.delta
# 最大値を探索する
q = p + self.delta
converged = False # debug
for _ in range(10):
# kl-divergence
try:
kl = p * math.log(p/q) + (1-p) * math.log((1-p)/(1-q))
except ValueError:
break
f = logndn - kl
df = -(q-p)/(q*(1.0-q))
if f*f < self.eps:
converged = True
break
q = min([1-self.delta, max([q-f/df, p+self.delta])])
# debug
#assert converged, "WARNING:KL-UCB did not converge!! p={} logndn={} q={}".format(p, logndn, q)
ucbs.append(q)
# ucbが最大値となるアクションを選択
action = np.argmax(ucbs)
# 選択したアクションの選択回数を増やす
counts[action] += 1
self.ucb_count.train_on_batch(state, np.asarray([counts]))
return action
class ThompsonSamplingBeta():
def __init__(self):
pass
def compile(self, build_compile_model_func, model_args):
# 選択回数を数える為の変数を追加
self.reward_alpha = build_compile_model_func(**model_args)
self.reward_beta = build_compile_model_func(**model_args)
def save_weights(self, d):
d["reward_alpha"] = self.reward_alpha.get_weights()
d["reward_beta"] = self.reward_beta.get_weights()
def load_weights(self, d):
self.reward_alpha.set_weights(d["reward_alpha"])
self.reward_beta.set_weights(d["reward_beta"])
def select_action(self, agent, state):
# 前回の報酬を計算
state0 = agent.get_prev_observation() # 前回の状態
action = agent.get_prev_action() # 前回のaction
reward = agent.get_prev_reward() # 前回の報酬
# 更新
if reward > 0:
v = self.reward_alpha.predict(state0, 1)[0]
for i in range(len(v)):
if v[i] < 1:
v[i] = 1
v[action] += 1
self.reward_alpha.train_on_batch(state0, np.asarray([v]))
else:
v = self.reward_beta.predict(state0, 1)[0]
for i in range(len(v)):
if v[i] < 1:
v[i] = 1
v[action] += 1
self.reward_beta.train_on_batch(state0, np.asarray([v]))
alphas = self.reward_alpha.predict(state, 1)[0]
betas = self.reward_beta.predict(state, 1)[0]
# alpha,beta は1以上にする
for i in range(len(alphas)):
if alphas[i] < 1:
alphas[i] = 1
if betas[i] < 1:
betas[i] = 1
# アクションを計算
vals = []
for i in range(agent.nb_actions):
# ベータ分布に従って乱数を生成
v = np.random.beta(alphas[i], betas[i])
vals.append(v)
# 乱数が最大値となるアクションを選択
action = np.argmax(vals)
return action
class ThompsonSamplingGaussian():
def __init__(self, dispersion=1):
# 既知の分散
self.dispersion = dispersion
assert self.dispersion != 0
def compile(self, build_compile_model_func, model_args):
# 現在の分散と平均
self.recent_sigma = build_compile_model_func(**model_args)
self.recent_mu = build_compile_model_func(**model_args)
def save_weights(self, d):
d["recent_sigma"] = self.recent_sigma.get_weights()
d["recent_mu"] = self.recent_mu.get_weights()
def load_weights(self, d):
self.recent_sigma.set_weights(d["recent_sigma"])
self.recent_mu.set_weights(d["recent_mu"])
def select_action(self, agent, state):
# 前回の報酬を加算
state0 = agent.get_prev_observation() # 前回の状態
action = agent.get_prev_action() # 前回のaction
reward = agent.get_prev_reward() # 前回の報酬
mu = self.recent_mu.predict(state0, 1)[0]
sigma = self.recent_sigma.predict(state0, 1)[0]
# 分散は1以上
for i in range(len(sigma)):
if sigma[i] < 1:
sigma[i] = 1
# 更新(平均)
tmp1 = reward/self.dispersion + mu[action]/sigma[action]
tmp2 = 1/self.dispersion + 1/sigma[action]
mu[action] = tmp1/tmp2
self.recent_mu.train_on_batch(state0, np.asarray([mu]))
# 更新(分散)
sigma[action] = 1/( (1/self.dispersion) + (1/sigma[action]) )
self.recent_sigma.train_on_batch(state0, np.asarray([sigma]))
mu = self.recent_mu.predict(state, 1)[0]
sigma = self.recent_sigma.predict(state, 1)[0]
# 分散は1以上
for i in range(len(sigma)):
if sigma[i] < 1:
sigma[i] = 1
# アクションを計算
vals = []
for i in range(agent.nb_actions):
# 正規分布に従い乱数を生成
v = np.random.normal(mu[i], sigma[i])
vals.append(v)
# 乱数が最大値となるアクションを選択
action = np.argmax(vals)
return action
#--------------------------------------------------
# from : https://github.com/LuEE-C/Noisy-A3C-Keras/blob/master/NoisyDense.py
# from : https://github.com/keiohta/tf2rl/blob/atari/tf2rl/networks/noisy_dense.py
class NoisyDense(Layer):
def __init__(self, units,
sigma_init=0.02,
activation=None,
use_bias=True,
kernel_initializer='glorot_uniform',
bias_initializer='zeros',
kernel_regularizer=None,
bias_regularizer=None,
activity_regularizer=None,
kernel_constraint=None,
bias_constraint=None,
**kwargs):
if 'input_shape' not in kwargs and 'input_dim' in kwargs:
kwargs['input_shape'] = (kwargs.pop('input_dim'),)
super(NoisyDense, self).__init__(**kwargs)
self.units = units
self.sigma_init = sigma_init
self.activation = activations.get(activation)
self.use_bias = use_bias
self.kernel_initializer = initializers.get(kernel_initializer)
self.bias_initializer = initializers.get(bias_initializer)
self.kernel_regularizer = regularizers.get(kernel_regularizer)
self.bias_regularizer = regularizers.get(bias_regularizer)
self.activity_regularizer = regularizers.get(activity_regularizer)
self.kernel_constraint = constraints.get(kernel_constraint)
self.bias_constraint = constraints.get(bias_constraint)
def build(self, input_shape):
assert len(input_shape) >= 2
self.input_dim = input_shape[-1]
self.kernel_shape = tf.constant((self.input_dim, self.units))
self.bias_shape = tf.constant((self.units,))
self.kernel = self.add_weight(shape=(self.input_dim, self.units),
initializer=self.kernel_initializer,
name='kernel',
regularizer=self.kernel_regularizer,
constraint=self.kernel_constraint)
self.sigma_kernel = self.add_weight(shape=(self.input_dim, self.units),
initializer=initializers.Constant(value=self.sigma_init),
name='sigma_kernel'
)
if self.use_bias:
self.bias = self.add_weight(shape=(self.units,),
initializer=self.bias_initializer,
name='bias',
regularizer=self.bias_regularizer,
constraint=self.bias_constraint)
self.sigma_bias = self.add_weight(shape=(self.units,),
initializer=initializers.Constant(value=self.sigma_init),
name='sigma_bias')
else:
self.bias = None
self.epsilon_bias = None
self.epsilon_kernel = K.zeros(shape=(self.input_dim, self.units))
self.epsilon_bias = K.zeros(shape=(self.units,))
self.sample_noise()
super(NoisyDense, self).build(input_shape)
def call(self, X):
#perturbation = self.sigma_kernel * self.epsilon_kernel
#perturbed_kernel = self.kernel + perturbation
perturbed_kernel = self.kernel + self.sigma_kernel * K.random_uniform(shape=self.kernel_shape)
output = K.dot(X, perturbed_kernel)
if self.use_bias:
#bias_perturbation = self.sigma_bias * self.epsilon_bias
#perturbed_bias = self.bias + bias_perturbation
perturbed_bias = self.bias + self.sigma_bias * K.random_uniform(shape=self.bias_shape)
output = K.bias_add(output, perturbed_bias)
if self.activation is not None:
output = self.activation(output)
return output
def compute_output_shape(self, input_shape):
assert input_shape and len(input_shape) >= 2
assert input_shape[-1]
output_shape = list(input_shape)
output_shape[-1] = self.units
return tuple(output_shape)
def sample_noise(self):
K.set_value(self.epsilon_kernel, np.random.normal(0, 1, (self.input_dim, self.units)))
K.set_value(self.epsilon_bias, np.random.normal(0, 1, (self.units,)))
def remove_noise(self):
K.set_value(self.epsilon_kernel, np.zeros(shape=(self.input_dim, self.units)))
K.set_value(self.epsilon_bias, np.zeros(shape=self.units,))
#---------------------------------------------------
def plot(names):
data = []
for name in names:
path = os.path.join(BASE_DIR, name + ".txt")
with open(path, "r") as f:
name = f.readline()
t = float(f.readline())
train = float(f.readline())
test_reward = float(f.readline())
actor0 = []
for line in f:
actor0.append(float(line))
data.append({
"name": name,
"time": t,
"train": train,
"test_reward": test_reward,
"actor0": actor0,
})
# train 情報の描画
plt.figure(figsize=(14, 5), dpi=100)
plt.ylabel("Train Reward")
for d in data:
num = 100 # 移動平均の個数
b = np.ones(int(num))/num
y = np.convolve(d["actor0"], b, mode='same') # 移動平均
plt.plot(y[:-num], label=d["name"])
plt.legend(loc="upper left")
plt.show()
# train 情報の描画
plt.figure(figsize=(14, 5), dpi=100)
plt.ylabel("Train Time(s)")
plt.bar([d["name"] for d in data ], [d["time"] for d in data ])
plt.show()
# result 情報の描画
plt.figure(figsize=(14, 5), dpi=100)
plt.ylabel("Test Reward(100 ave)")
plt.bar([d["name"] for d in data ], [d["test_reward"] for d in data ])
plt.show()
#-----------------------------------------------------------
# function
#-----------------------------------------------------------
def create_processor_pendulum():
return PendulumProcessor()
def create_processor_cartpole():
return CartPoleProcessor()
def create_processor_acrobot():
return AcrobotProcessor()
def create_optimizer():
return Adam(lr=0.0005)
def actor_func(index, actor, callbacks):
env = gym.make(ENV_NAME)
verbose = 0
actor.fit(env, nb_steps=999_999_999, visualize=False, verbose=verbose, callbacks=callbacks)
#--------------------------------------
def main(name):
print("---------- start {} {}".format(ENV_NAME, name))
env = gym.make(ENV_NAME)
if ENV_NAME == "Pendulum-v0":
processor = create_processor_pendulum
input_shape = env.observation_space.shape
nb_actions = 5
elif ENV_NAME == "CartPole-v0":
processor = create_processor_cartpole
input_shape = env.observation_space.shape
nb_actions = env.action_space.n
elif ENV_NAME == "Acrobot-v1":
processor = create_processor_acrobot
input_shape = env.observation_space.shape
nb_actions = env.action_space.n
# 引数
args = {
# model関係
"input_shape": input_shape,
"enable_image_layer": False,
"nb_actions": nb_actions,
"input_sequence": 8, # 入力フレーム数
"dense_units_num": 64, # Dense層のユニット数
"metrics": [], # optimizer用
"enable_dueling_network": True, # dueling_network有効フラグ
"dueling_network_type": "ave", # dueling_networkのアルゴリズム
"enable_noisynet": False, # NoisyNet有効フラグ
"lstm_type": "lstm", # LSTMのアルゴリズム
"lstm_units_num": 64, # LSTM層のユニット数
# learner 関係
"remote_memory_warmup_size": 100, # 初期のメモリー確保用step数(学習しない)
"batch_size": 16, # batch_size
"target_model_update": 2000, # target networkのupdate間隔
"enable_double_dqn": True, # DDQN有効フラグ
"enable_rescaling_priority": False, # rescalingを有効にするか(priotrity)
"enable_rescaling_train": False, # rescalingを有効にするか(train)
"rescaling_epsilon": 0.001, # rescalingの定数
"burnin_length": 20, # burn-in期間
"priority_exponent": 0.9, # priority優先度
# memory 関係
"remote_memory_type": "per_proportional",
"remote_memory_args": {
"capacity": 500_000, # メモリサイズ
"alpha": 0.8, # PERの確率反映率
"beta_initial": 0.0, # IS反映率の初期値
"beta_steps": 100_000, # IS反映率の上昇step数
"enable_is": False, # ISを有効にするかどうか
},
# actor 関係
"local_memory_update_size": 50, # LocalMemoryからRemoteMemoryへ投げるサイズ
"actor_model_sync_interval": 500, # learner から model を同期する間隔
"gamma": 0.99, # Q学習の割引率
"multireward_steps": 1, # multistep reward
"action_interval": 1, # アクションを実行する間隔
# その他
"num_actors": 2, # actor の数
"enable_GPU": True, # GPUを使うか
"limit_train_count": 50_000, # 最大学習回数(0で制限なし)
"load_weights_path": "", # 保存ファイル名
"save_weights_path": "", # 保存ファイル名
"save_overwrite": True, # 上書き保存するか
"logger_interval": 1, # ログ取得間隔(秒)
}
act_type = "greedy_actor"
act_args = {"epsilon": 0.4, "alpha": 2 }
#-------------
if name == "No_DDQN":
args["enable_double_dqn"] = False
elif name == "ReplayMemory":
args["remote_memory_type"] = "replay"
args["remote_memory_args"] = {"capacity": 500_000}
elif name == "PER_Greedy":
args["remote_memory_type"] = "per_greedy"
args["remote_memory_args"] = {"capacity": 500_000}
elif name == "PER_Rank":
args["remote_memory_type"] = "per_rankbase"
args["remote_memory_args"] = {
"capacity": 500_000, # メモリサイズ
"alpha": 0.8, # PERの確率反映率
"beta_initial": 0.0, # IS反映率の初期値
"beta_steps": 100_000, # IS反映率の上昇step数
"enable_is": False, # ISを有効にするかどうか
}
elif name == "PER_IS":
args["remote_memory_type"] = "per_proportional"
args["remote_memory_args"] = {
"capacity": 500_000, # メモリサイズ
"alpha": 0.8, # PERの確率反映率
"beta_initial": 0.0, # IS反映率の初期値
"beta_steps": 100_000, # IS反映率の上昇step数
"enable_is": True, # ISを有効にするかどうか
}
elif name == "No_DuelNet":
args["enable_dueling_network"] = False
elif name == "DuelNetMax":
args["dueling_network_type"] = "max"
elif name == "DuelNetNaive":
args["enable_dueling_network"] = "naive"
elif name == "MultiReward3":
args["multireward_steps"] = 3
elif name == "NoisyNet":
args["enable_noisynet"] = False
elif name == "NoLSTM":
args["lstm_type"] = ""
elif name == "LSTMful_noburnin":
args["lstm_type"] = "lstm_ful"
args["batch_size"] = 1
args["burnin_length"] = 0
elif name == "LSTMful_burnin":
args["lstm_type"] = "lstm_ful"
args["batch_size"] = 1
args["burnin_length"] = 4
elif name == "rescaling":
args["enable_rescaling_priority"] = True
args["enable_rescaling_train"] = True
elif name == "Greedy":
act_type = "greedy"
act_args = {"epsilon": 0.1}
elif name == "AnnealingGreedy":
act_type = "annieling_greedy"
act_args = {"initial_epsilon": 1.0, "final_epsilon": 0.01, "exploration_steps": 100_000}
elif name == "Softmax":
act_type = "softmax"
act_args = {}
elif name == "UCB1":
act_type = "ucb1"
act_args = {}
elif name == "UCB1_Tuned":
act_type = "ucb1_tuned"
act_args = {}
elif name == "UCBv":
act_type = "ucbv"
act_args = {}
elif name == "KL_UCB":
act_type = "kl_ucb"
act_args = {}
elif name == "TS_Beta":
act_type = "ts_beta"
act_args = {}
elif name == "TS_Gaussian":
act_type = "ts_gaussian"
act_args = {}
#------------------------------
# 探索ポリシー関係、配列の数はactor数と同数
action_policies = []
for _ in range(args["num_actors"]):
action_policies.append({
"type": act_type,
"args": act_args
})
args["action_policies"] = action_policies
#--- R2D2
manager = R2D2Manager(
actor_func=actor_func,
args=args,
create_processor_func=processor,
create_optimizer_func=create_optimizer,
)
t0 = time.time()
agent, learner_logs, actors_logs = manager.train()
t1 = time.time() - t0
# 訓練結果を見る
agent.processor.mode = "test"
agent.test(env, nb_episodes=5, visualize=False, verbose=1)
# 学習結果を計算
test_num = 100
history = agent.test(env, nb_episodes=test_num, visualize=False, verbose=0)
test_reward = sum([ n for n in history.history["episode_reward"]]) / test_num
# log書き込み
with open(os.path.join(BASE_DIR, name + ".txt"), "w") as f:
f.write(name + "\n") # 名前
f.write(str(t1) + "\n") # 時間
f.write(str(learner_logs[-1]["train_num"]) + "\n") # lernaer の学習回数
f.write(str(test_reward) + "\n") # テスト100の平均報酬
# actor0の平均報酬
for actor in actors_logs["actor0"]:
reward = (actor["reward_min"] + actor["reward_max"])/2
f.write(str(reward) + "\n")
#-----------------------------------------------------------
# switch the environment by commenting/uncommenting
#ENV_NAME = "Pendulum-v0"
ENV_NAME = "CartPole-v0"
#ENV_NAME = "Acrobot-v1"
# log
BASE_DIR = "log/" + ENV_NAME
os.makedirs(BASE_DIR, exist_ok=True)
if __name__ == '__main__':
# GPU確認
#from tensorflow.python.client import device_lib
#print(device_lib.list_local_devices())
# ゲーム情報
env = gym.make(ENV_NAME)
print("action_space : " + str(env.action_space))
print("observation_space : " + str(env.observation_space))
print("reward_range : " + str(env.reward_range))
if True:
main("normal")
main("No_DDQN")
main("ReplayMemory")
main("PER_Greedy")
main("PER_Rank")
main("PER_IS")
main("No_DuelNet")
main("DuelNetMax")
main("DuelNetNaive")
main("MultiReward3")
main("MultiReward10")
main("NoisyNet")
main("NoLSTM")
main("LSTMful_noburnin")
main("LSTMful_burnin")
main("rescaling")
main("Greedy")
main("AnnealingGreedy")
main("Softmax")
main("UCB1")
main("UCB1_Tuned")
main("UCBv")
main("KL_UCB")
main("TS_Beta")
main("TS_Gaussian")
#--- グラフ
plot([
"normal",
"No_DDQN",
"No_DuelNet",
"DuelNetMax",
"DuelNetNaive",
"MultiReward3",
"MultiReward10",
"NoisyNet",
])
plot([
"normal",
"ReplayMemory",
"PER_Greedy",
"PER_Rank",
"PER_IS",
])
plot([
"normal",
"NoLSTM",
"LSTMful_noburnin",
"LSTMful_burnin",
"rescaling",
])
plot([
"normal",
"Greedy",
"AnnealingGreedy",
"Softmax",
"UCB1",
"UCB1_Tuned",
"UCBv",
"KL_UCB",
"TS_Beta",
"TS_Gaussian",
])