Code implementing an R2D2 Agent for keras-rl.
import gym
import pickle
import os
import numpy as np
import random
import time
import traceback
import math
import tensorflow as tf
from keras.optimizers import Adam
from keras.models import Model
from keras.layers import *
from keras import backend as K
import rl.core
import multiprocessing as mp
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
#--- GPU settings
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(
        # visible_device_list="0",  # specify GPU number
        allow_growth=True,
        per_process_gpu_memory_fraction=1,
    )
)
set_session(tf.Session(config=config))
class PendulumProcessorForDQN(rl.core.Processor):
def __init__(self, enable_image=False, image_size=84):
self.image_size = image_size
self.enable_image = enable_image
self.mode = "train"
    def process_observation(self, observation):
        if not self.enable_image:
            return observation
        return self._get_rgb_state(observation)  # return without reshaping
def process_action(self, action):
ACT_ID_TO_VALUE = {
0: [-2.0],
1: [-1.0],
2: [0.0],
3: [+1.0],
4: [+2.0],
}
return ACT_ID_TO_VALUE[action]
    def process_reward(self, reward):
        if self.mode == "test":  # return the raw reward during test
            return reward
        # return np.clip(reward, -1., 1.)
        # normalize -16.5~0 to -1~1
        self.max = 0
        self.min = -16.5
        # min-max normalization
        if (self.max - self.min) == 0:
            return 0
        M = 1
        m = -0.5
        return ((reward - self.min) / (self.max - self.min)) * (M - m) + m
    # draw the image corresponding to the state (x, y coordinates)
    def _get_rgb_state(self, state):
        img_size = self.image_size
        h_size = img_size / 1.1
        img = Image.new("RGB", (img_size, img_size), (255, 255, 255))
        dr = ImageDraw.Draw(img)
        # length of the pole
        l = img_size / 2.0 * 3.0 / 2.0
        # draw the pole line
        dr.line(((h_size - l * state[1], h_size - l * state[0]), (h_size, h_size)), (0, 0, 0), 1)
        # draw a circle at the pole's pivot (just for looks)
        buff = img_size / 32.0
        dr.ellipse(((h_size - buff, h_size - buff), (h_size + buff, h_size + buff)),
                   outline=(0, 0, 0), fill=(255, 0, 0))
        # flatten to grayscale and convert to an array
        pilImg = img.convert("L")
        img_arr = np.asarray(pilImg)
        # normalize the image
        img_arr = img_arr / 255.0
        return img_arr
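# A quick sanity check of the reward normalization above (a hedged aside, not part of
# the original gist): with min=-16.5, max=0, M=1, m=-0.5 the mapping sends -16.5 to -0.5
# and 0 to 1.0, so the training reward actually lands in [-0.5, 1].
def _check_reward_normalization():
    p = PendulumProcessorForDQN(enable_image=False)
    assert abs(p.process_reward(-16.5) - (-0.5)) < 1e-6
    assert abs(p.process_reward(0.0) - 1.0) < 1e-6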
#---------------------------------------------------
# manager
#---------------------------------------------------
class R2D2Manager():
def __init__(self,
actor_func,
num_actors,
args,
create_processor_func,
create_optimizer_func,
):
        # fill in default arguments
args["save_weights_path"] = args["save_weights_path"] if ("save_weights_path" in args) else ""
args["load_weights_path"] = args["load_weights_path"] if ("load_weights_path" in args) else ""
        # type check
lstm_types = [
"",
"lstm",
"lstm_ful",
]
if args["lstm_type"] not in lstm_types:
raise ValueError('lstm_type is ["","lstm","lstm_ful"]')
        # burn-in is only used with lstm_ful
if args["lstm_type"] != "lstm_ful":
args["burnin_length"] = 0
self.actor_func = actor_func
self.num_actors = num_actors
self.args = args
        # arguments for the build_compile_model function
model_args = {
"input_shape": self.args["input_shape"],
"enable_image_layer": self.args["enable_image_layer"],
"nb_actions": self.args["nb_actions"],
"input_sequence": self.args["input_sequence"],
"enable_dueling_network": self.args["enable_dueling_network"],
"dueling_network_type": self.args["dueling_network_type"],
"enable_noisynet": self.args["enable_noisynet"],
"dense_units_num": self.args["dense_units_num"],
"lstm_type": self.args["lstm_type"],
"lstm_units_num": self.args["lstm_units_num"],
"metrics": self.args["metrics"],
"create_optimizer_func": create_optimizer_func,
}
self._create_process(model_args, create_processor_func, args)
def _create_process(self, model_args, create_processor_func, args_org):
        # create the queues
experience_q = mp.Queue()
model_sync_q = [[mp.Queue(), mp.Queue(), mp.Queue()] for _ in range(self.num_actors)]
self.learner_end_q = [mp.Queue(), mp.Queue()]
self.actors_end_q = [mp.Queue() for _ in range(self.num_actors)]
self.learner_logger_q = mp.Queue()
self.actors_logger_q = mp.Queue()
        # create the learner process
args = (
model_args,
self.args,
experience_q,
model_sync_q,
self.learner_end_q,
self.learner_logger_q,
)
if args_org["enable_GPU"]:
self.learner_ps = mp.Process(target=learner_run_gpu, args=args)
else:
self.learner_ps = mp.Process(target=learner_run, args=args)
        # create the actor processes
self.actors_ps = []
epsilon = self.args["epsilon"]
epsilon_alpha = self.args["epsilon_alpha"]
for i in range(self.num_actors):
if self.num_actors <= 1:
epsilon_i = epsilon ** (1 + epsilon_alpha)
else:
epsilon_i = epsilon ** (1 + i/(self.num_actors-1)*epsilon_alpha)
print("Actor{} Epsilon:{}".format(i,epsilon_i))
self.args["epsilon"] = epsilon_i
args = (
i,
self.actor_func,
model_args,
self.args,
create_processor_func,
experience_q,
model_sync_q[i],
self.actors_logger_q,
self.actors_end_q[i],
)
if args_org["enable_GPU"]:
self.actors_ps.append(mp.Process(target=actor_run_cpu, args=args))
else:
self.actors_ps.append(mp.Process(target=actor_run, args=args))
        # the test Actor is not created in a child process, so keep what it needs on self.
self.model_args = model_args
self.create_processor_func = create_processor_func
def __del__(self):
self.learner_ps.terminate()
for p in self.actors_ps:
p.terminate()
def train(self):
learner_logs = []
actors_logs = {}
        # start the processes
try:
self.learner_ps.start()
for p in self.actors_ps:
p.start()
            # wait for them to finish
while True:
time.sleep(1) # polling time
                # periodically drain the logs
while not self.learner_logger_q.empty():
learner_logs.append(self.learner_logger_q.get(timeout=1))
while not self.actors_logger_q.empty():
log = self.actors_logger_q.get(timeout=1)
if log["name"] not in actors_logs:
actors_logs[log["name"]] = []
actors_logs[log["name"]].append(log)
                # check for termination
f = True
for q in self.actors_end_q:
if q.empty():
f = False
break
if f:
break
except KeyboardInterrupt:
pass
except Exception:
print(traceback.format_exc())
        # drain any remaining logs
while not self.learner_logger_q.empty():
learner_logs.append(self.learner_logger_q.get(timeout=1))
while not self.actors_logger_q.empty():
log = self.actors_logger_q.get(timeout=1)
if log["name"] not in actors_logs:
actors_logs[log["name"]] = []
actors_logs[log["name"]].append(log)
        # tell the learner to stop
self.learner_end_q[0].put(1)
        # get the final weights from the learner
print("Last Learner weights waiting...")
weights = self.learner_end_q[1].get(timeout=60)
        # create the Actor used for testing
test_actor = Actor(
-1,
self.model_args,
self.args,
None,
None,
processor=self.create_processor_func()
)
test_actor.model.set_weights(weights)
# kill
self.learner_ps.terminate()
for p in self.actors_ps:
p.terminate()
return test_actor, learner_logs, actors_logs
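# A hedged sketch (not from the original gist) of the per-actor epsilon schedule used in
# _create_process above: actor i gets epsilon ** (1 + i/(N-1) * epsilon_alpha), so the
# first actor explores the most and the last actor the least.
def _actor_epsilons(epsilon=0.4, epsilon_alpha=2, num_actors=2):
    if num_actors <= 1:
        return [epsilon ** (1 + epsilon_alpha)]
    return [epsilon ** (1 + i / (num_actors - 1) * epsilon_alpha) for i in range(num_actors)]
# e.g. _actor_epsilons(0.4, 2, 2) -> [0.4, 0.064]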
#---------------------------------------------------
# network
#---------------------------------------------------
def clipped_error_loss(y_true, y_pred):
    err = y_true - y_pred  # error
    L2 = 0.5 * K.square(err)
    L1 = K.abs(err) - 0.5
    # use L2 while the error is within [-1, 1], otherwise L1 (Huber loss)
    loss = tf.where((K.abs(err) < 1.0), L2, L1)  # Keras does not cover the where function in tensorflow :-(
    return K.mean(loss)
def rescaling(x, epsilon=0.001):
n = math.sqrt(abs(x)+1) - 1
return np.sign(x)*n + epsilon*x
def rescaling_inverse(x):
return np.sign(x)*( (x+np.sign(x) ) ** 2 - 1)
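# A small check of the rescaling pair above (a hedged aside, not in the original gist):
# rescaling(x) = sign(x) * (sqrt(|x|+1) - 1) + epsilon * x, and with epsilon=0 the two
# functions are exact inverses of each other.
def _check_rescaling_roundtrip():
    for x in [-10.0, -1.0, -0.5, 0.0, 0.5, 1.0, 10.0]:
        y = rescaling(x, epsilon=0)
        assert abs(rescaling_inverse(y) - x) < 1e-6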
def build_compile_model(
    input_shape,             # input shape
    enable_image_layer,      # whether to include the image (conv) layers
    input_sequence,          # input sequence length
    nb_actions,              # number of actions
    enable_dueling_network,  # whether to enable the dueling network
    dueling_network_type,
    enable_noisynet,
    dense_units_num,         # number of units in the Dense layers
    lstm_type,               # which LSTM variant to use
    lstm_units_num,          # number of LSTM units
    create_optimizer_func,
    metrics,                 # metrics passed to compile
):
if lstm_type == "lstm_ful":
# (batch_size, timesteps, width, height)
c = input_ = Input(batch_shape=(1, 1) + input_shape)
else:
        # input layer (input_sequence, width, height)
c = input_ = Input(shape=(input_sequence,) + input_shape)
if enable_image_layer:
if lstm_type == "":
c = Permute((2, 3, 1))(c) # (window,w,h) -> (w,h,window)
c = Conv2D(32, (8, 8), strides=(4, 4), padding="same", name="c1")(c)
c = Activation("relu")(c)
c = Conv2D(64, (4, 4), strides=(2, 2), padding="same", name="c2")(c)
c = Activation("relu")(c)
c = Conv2D(64, (3, 3), strides=(1, 1), padding="same", name="c3")(c)
c = Activation("relu")(c)
c = Flatten()(c)
else:
# (time steps, w, h) -> (time steps, w, h, ch)
if lstm_type == "lstm_ful":
c = Reshape((1, ) + input_shape + (1,) )(c)
else:
c = Reshape((input_sequence, ) + input_shape + (1,) )(c)
# https://keras.io/layers/wrappers/
c = TimeDistributed(Conv2D(32, (8, 8), strides=(4, 4), padding="same"), name="c1")(c)
c = Activation("relu")(c)
c = TimeDistributed(Conv2D(64, (4, 4), strides=(2, 2), padding="same"), name="c2")(c)
c = Activation("relu")(c)
c = TimeDistributed(Conv2D(64, (3, 3), strides=(1, 1), padding="same"), name="c3")(c)
c = Activation("relu")(c)
c = TimeDistributed(Flatten())(c)
elif lstm_type == "":
c = Flatten()(c)
if lstm_type == "lstm":
c = LSTM(lstm_units_num, name="lstm")(c)
elif lstm_type == "lstm_ful":
c = LSTM(lstm_units_num, stateful=True, name="lstm")(c)
if enable_dueling_network:
# value
v = Dense(dense_units_num, activation="relu")(c)
if enable_noisynet:
v = NoisyDense(1, name="v")(v)
else:
v = Dense(1, name="v")(v)
        # advantage
adv = Dense(dense_units_num, activation='relu')(c)
if enable_noisynet:
adv = NoisyDense(nb_actions, name="adv")(adv)
else:
adv = Dense(nb_actions, name="adv")(adv)
        # combine by concatenation
c = Concatenate()([v,adv])
if dueling_network_type == "ave":
c = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.mean(a[:, 1:], axis=1, keepdims=True), output_shape=(nb_actions,))(c)
elif dueling_network_type == "max":
c = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.max(a[:, 1:], axis=1, keepdims=True), output_shape=(nb_actions,))(c)
elif dueling_network_type == "naive":
c = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:], output_shape=(nb_actions,))(c)
else:
raise ValueError('dueling_network_type is ["ave","max","naive"]')
else:
c = Dense(dense_units_num, activation="relu")(c)
if enable_noisynet:
c = NoisyDense(nb_actions, activation="linear", name="adv")(c)
else:
c = Dense(nb_actions, activation="linear", name="adv")(c)
model = Model(input_, c)
# compile
model.compile(
loss=clipped_error_loss,
optimizer=create_optimizer_func(),
metrics=metrics)
return model
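# A hedged usage sketch (not part of the original gist): building the network with the same
# settings main() below uses for the image + stateless-LSTM case. The values are illustrative.
def _build_example_model():
    return build_compile_model(
        input_shape=(84, 84),
        enable_image_layer=True,
        input_sequence=4,
        nb_actions=5,
        enable_dueling_network=True,
        dueling_network_type="ave",
        enable_noisynet=False,
        dense_units_num=64,
        lstm_type="lstm",
        lstm_units_num=64,
        create_optimizer_func=lambda: Adam(lr=0.00025),
        metrics=[],
    )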
#---------------------------------------------------
# learner
#---------------------------------------------------
def learner_run_gpu(
model_args,
args,
experience_q,
model_sync_q,
learner_end_q,
logger_q,
):
with tf.device("/device:GPU:0"):
learner_run(
model_args,
args,
experience_q,
model_sync_q,
learner_end_q,
logger_q)
def learner_run(
model_args,
args,
experience_q,
model_sync_q,
learner_end_q,
logger_q,
):
learner = Learner(
model_args=model_args,
args=args,
experience_q=experience_q,
model_sync_q=model_sync_q,
)
try:
# model load
if os.path.isfile(args["load_weights_path"]):
learner.model.load_weights(args["load_weights_path"])
learner.target_model.load_weights(args["load_weights_path"])
        # for the logger
t0 = time.time()
        # the learner just keeps training
print("Learner Starts!")
while True:
learner.train()
# logger
if time.time() - t0 > args["logger_interval"]:
t0 = time.time()
logger_q.put({
"name": "learner",
"train_num": learner.train_num,
})
            # check for termination
if not learner_end_q[0].empty():
break
except KeyboardInterrupt:
pass
except Exception:
print(traceback.format_exc())
finally:
print("Learning End. Train Count:{}".format(learner.train_num))
# model save
if args["save_weights_path"] != "":
print("save:" + args["save_weights_path"])
learner.model.save_weights(args["save_weights_path"], args["save_overwrite"])
        # send the final weights to the manager
print("Last Learner weights sending...")
learner_end_q[1].put(learner.model.get_weights())
class Learner():
def __init__(self,
model_args,
args,
experience_q,
model_sync_q
):
self.experience_q = experience_q
self.model_sync_q = model_sync_q
self.memory_warmup_size = args["remote_memory_warmup_size"]
self.per_alpha = args["per_alpha"]
per_beta_initial = args["per_beta_initial"]
per_beta_steps = args["per_beta_steps"]
per_enable_is = args["per_enable_is"]
memory_capacity = args["remote_memory_capacity"]
memory_type = args["remote_memory_type"]
if memory_type == "replay":
self.memory = ReplayMemory(memory_capacity)
elif memory_type == "per_greedy":
self.memory = PERGreedyMemory(memory_capacity)
elif memory_type == "per_proportional":
self.memory = PERProportionalMemory(memory_capacity, per_beta_initial, per_beta_steps, per_enable_is)
elif memory_type == "per_rankbase":
self.memory = PERRankBaseMemory(memory_capacity, self.per_alpha, per_beta_initial, per_beta_steps, per_enable_is)
else:
            raise ValueError('memory_type is ["replay","per_greedy","per_proportional","per_rankbase"]')
self.gamma = args["gamma"]
self.batch_size = args["batch_size"]
self.enable_double_dqn = args["enable_double_dqn"]
self.target_model_update = args["target_model_update"]
self.multireward_steps = args["multireward_steps"]
self.input_sequence = args["input_sequence"]
self.lstm_type = args["lstm_type"]
self.enable_rescaling_priority = args["enable_rescaling_priority"]
self.enable_rescaling_train = args["enable_rescaling_train"]
self.rescaling_epsilon = args["rescaling_epsilon"]
self.burnin_length = args["burnin_length"]
self.priority_exponent = args["priority_exponent"]
assert memory_capacity > self.batch_size, "Memory capacity is small.(Larger than batch size)"
assert self.memory_warmup_size > self.batch_size, "Warmup steps is few.(Larger than batch size)"
# local
self.train_num = 0
# model create
self.model = build_compile_model(**model_args)
self.target_model = build_compile_model(**model_args)
        # with lstm_ful, keep handles to the LSTM layers
if self.lstm_type == "lstm_ful":
self.lstm = self.model.get_layer("lstm")
self.target_lstm = self.target_model.get_layer("lstm")
    def train(self):
        # send the weights to any Actor that requested them
        for q in self.model_sync_q:
            if not q[0].empty():
                # drain the queue (just in case)
                while not q[0].empty():
                    q[0].get(timeout=1)
                # send
                q[1].put(self.model.get_weights())
        # add any received experiences to the RemoteMemory
        while not self.experience_q.empty():
            exps = self.experience_q.get(timeout=1)
            for exp in exps:
                self.memory.add(exp, exp[4])
        # do not train until the RemoteMemory holds enough samples
        if len(self.memory) <= self.memory_warmup_size:
            time.sleep(1)  # arbitrary wait
            return
        (indexes, batchs, weights) = self.memory.sample(self.batch_size, self.train_num)
        # training (split into separate functions since it is long)
        if self.lstm_type == "lstm_ful":
            self.train_model_ful(indexes, batchs, weights)
        else:
            self.train_model(indexes, batchs, weights)
        self.train_num += 1
        # update the target network
        if self.train_num % self.target_model_update == 0:
            self.target_model.set_weights(self.model.get_weights())
    # standard (non-stateful) training
    def train_model(self, indexes, batchs, weights):
        state0_batch = []
        action_batch = []
        reward_batch = []
        state1_batch = []
        for batch in batchs:
            state0_batch.append(batch[0])
            action_batch.append(batch[1])
            reward_batch.append(batch[2])
            state1_batch.append(batch[3])
        # current Q-network outputs, used as the training targets (Q network)
        outputs = self.model.predict(np.asarray(state0_batch), self.batch_size)
        if self.enable_double_dqn:
            # get Q values from both the Q network and the target network
            state1_model_qvals_batch = self.model.predict(np.asarray(state1_batch), self.batch_size)
            state1_target_qvals_batch = self.target_model.predict(np.asarray(state1_batch), self.batch_size)
        else:
            # Q values of the next state (target network)
            target_qvals = self.target_model.predict(np.asarray(state1_batch), self.batch_size)
        for i in range(self.batch_size):
            if self.enable_double_dqn:
                action = np.argmax(state1_model_qvals_batch[i])  # choose the action with the online model
                maxq = state1_target_qvals_batch[i][action]      # take its Q value from the target model
            else:
                maxq = np.max(target_qvals[i])
            # priority calculation
            if self.enable_rescaling_priority:
                tmp = rescaling_inverse(maxq)
            else:
                tmp = maxq
            tmp = reward_batch[i] + (self.gamma ** self.multireward_steps) * tmp
            tmp *= weights[i]
            if self.enable_rescaling_priority:
                tmp = rescaling(tmp, self.rescaling_epsilon)
            priority = abs(tmp - outputs[i][action_batch[i]]) ** self.per_alpha
            # update the Q value
            if self.enable_rescaling_train:
                maxq = rescaling_inverse(maxq)
            td_error = reward_batch[i] + (self.gamma ** self.multireward_steps) * maxq
            td_error *= weights[i]
            if self.enable_rescaling_train:
                td_error = rescaling(td_error, self.rescaling_epsilon)
            outputs[i][action_batch[i]] = td_error
            # update the priority
            self.memory.update(indexes[i], batchs[i], priority)
        # train
        self.model.train_on_batch(np.asarray(state0_batch), np.asarray(outputs))
        #self.model.fit(np.asarray(state0_batch), np.asarray(outputs), batch_size=self.batch_size, epochs=1, verbose=0)
    # training with the stateful LSTM
    def train_model_ful(self, indexes, batchs, weights):
        # process each experience
        for batch_i, batch in enumerate(batchs):
            states = batch[0]
            action = batch[1]
            reward = batch[2]
            hidden_state = batch[3]
            prioritys = []
            # burn-in
            self.lstm.reset_states(hidden_state)
            for i in range(self.burnin_length):
                self.model.predict(np.asarray([[states[i]]]), 1)
            # save the hidden state after burn-in
            hidden_state = [K.get_value(self.lstm.states[0]), K.get_value(self.lstm.states[1])]
            # from here on, update one sequence step at a time
            for i in range(self.input_sequence):
                state0 = [states[self.burnin_length + i]]
                state1 = [states[self.burnin_length + i + self.multireward_steps]]
                # output of the current Q network
                self.lstm.reset_states(hidden_state)
                output = self.model.predict(np.asarray([state0]), 1)[0]
                # outputs of both the Q network and the target network
                if self.enable_double_dqn:
                    self.lstm.reset_states(hidden_state)
                    self.target_lstm.reset_states(hidden_state)
                    state1_model_qvals = self.model.predict(np.asarray([state1]), 1)[0]
                    state1_target_qvals = self.target_model.predict(np.asarray([state1]), 1)[0]
                    action_q = np.argmax(state1_model_qvals)
                    maxq = state1_target_qvals[action_q]
                else:
                    self.target_lstm.reset_states(hidden_state)
                    target_qvals = self.target_model.predict(np.asarray([state1]), 1)[0]
                    maxq = np.max(target_qvals)
                # priority calculation
                if self.enable_rescaling_priority:
                    tmp = rescaling_inverse(maxq)
                else:
                    tmp = maxq
                tmp = reward[i] + (self.gamma ** self.multireward_steps) * tmp
                tmp *= weights[batch_i]
                if self.enable_rescaling_priority:
                    tmp = rescaling(tmp)
                priority = abs(tmp - output[action[i]]) ** self.per_alpha
                prioritys.append(priority)
                # for the Q-value update
                if self.enable_rescaling_train:
                    maxq = rescaling_inverse(maxq)
                td_error = reward[i] + (self.gamma ** self.multireward_steps) * maxq
                td_error *= weights[batch_i]
                if self.enable_rescaling_train:
                    td_error = rescaling(td_error, self.rescaling_epsilon)
                output[action[i]] = td_error
                # train
                self.lstm.reset_states(hidden_state)
                self.model.fit(
                    np.asarray([state0]),
                    np.asarray([output]),
                    batch_size=1,
                    epochs=1,
                    verbose=0,
                    shuffle=False
                )
                # save the hidden state for the next step
                hidden_state = [K.get_value(self.lstm.states[0]), K.get_value(self.lstm.states[1])]
            # update the priority of the sampled sequence
            priority = self.priority_exponent * np.max(prioritys) + (1-self.priority_exponent) * np.average(prioritys)
            self.memory.update(indexes[batch_i], batch, priority)
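# A hedged aside (not in the original gist): the sequence priority computed above follows the
# R2D2 mixing rule p = eta * max(p_i) + (1 - eta) * mean(p_i), with eta = priority_exponent.
def _mixed_priority(prioritys, eta=0.9):
    return eta * np.max(prioritys) + (1 - eta) * np.average(prioritys)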
#---------------------------------------------------
# actor
#---------------------------------------------------
class ActorLogger(rl.callbacks.Callback):
def __init__(self, index, logger_q, interval):
self.index = index
self.interval = interval
self.logger_q = logger_q
def on_train_begin(self, logs):
self.t0 = time.time()
self.reward_min = None
self.reward_max = None
def on_episode_end(self, episode, logs):
if self.reward_min is None:
self.reward_min = logs["episode_reward"]
self.reward_max = logs["episode_reward"]
else:
if self.reward_min > logs["episode_reward"]:
self.reward_min = logs["episode_reward"]
if self.reward_max < logs["episode_reward"]:
self.reward_max = logs["episode_reward"]
if time.time() - self.t0 > self.interval:
self.t0 = time.time()
self.logger_q.put({
"name": "actor" + str(self.index),
"reward_min": self.reward_min,
"reward_max": self.reward_max,
"nb_steps": logs["nb_steps"],
})
self.reward_min = None
self.reward_max = None
def actor_run_cpu(
actor_index,
actor_func,
model_args,
args,
create_processor_func,
experience_q,
model_sync_q,
logger_q,
actors_end_q,
):
with tf.device("/device:CPU:0"):
actor_run(
actor_index,
actor_func,
model_args,
args,
create_processor_func,
experience_q,
model_sync_q,
logger_q,
actors_end_q)
def actor_run(
actor_index,
actor_func,
model_args,
args,
create_processor_func,
experience_q,
model_sync_q,
logger_q,
actors_end_q,
):
print("Actor{} Starts!".format(actor_index))
try:
actor = Actor(
actor_index,
model_args,
args,
experience_q,
model_sync_q,
processor=create_processor_func()
)
# model load
if os.path.isfile( args["load_weights_path"] ):
actor.model.load_weights(args["load_weights_path"])
        # for the logger
callbacks = [
ActorLogger(actor_index, logger_q, args["logger_interval"])
]
# run
actor_func(actor_index, actor, callbacks=callbacks)
except KeyboardInterrupt:
pass
except Exception:
print(traceback.format_exc())
finally:
print("Actor{} End!".format(actor_index))
actors_end_q.put(1)
from collections import deque
class Actor(rl.core.Agent):
def __init__(self,
actor_index,
model_args,
args,
experience_q,
model_sync_q,
**kwargs):
super(Actor, self).__init__(**kwargs)
self.actor_index = actor_index
self.experience_q = experience_q
self.model_sync_q = model_sync_q
self.nb_actions = args["nb_actions"]
self.input_shape = args["input_shape"]
self.input_sequence = args["input_sequence"]
self.actor_model_sync_interval = args["actor_model_sync_interval"]
self.gamma = args["gamma"]
self.epsilon = args["epsilon"]
self.multireward_steps = args["multireward_steps"]
self.action_interval = args["action_interval"]
self.burnin_length = args["burnin_length"]
self.lstm_type = args["lstm_type"]
self.enable_dueling_network = args["enable_dueling_network"]
self.enable_noisynet = args["enable_noisynet"]
self.enable_rescaling_priority = args["enable_rescaling_priority"]
self.rescaling_epsilon = args["rescaling_epsilon"]
self.priority_exponent = args["priority_exponent"]
self.per_alpha = args["per_alpha"]
# local memory
self.local_memory = deque()
self.local_memory_update_size = args["local_memory_update_size"]
self.learner_train_num = 0
# model
self.model = build_compile_model(**model_args)
        # with lstm_ful, keep a handle to the LSTM layer
if self.lstm_type == "lstm_ful":
self.lstm = self.model.get_layer("lstm")
self.compiled = True
def reset_states(self):
self.repeated_action = 0
self.recent_action = [ 0 for _ in range(self.input_sequence)]
self.recent_reward = [ 0 for _ in range(self.input_sequence + self.multireward_steps - 1)]
obs_length = self.burnin_length + self.input_sequence + self.multireward_steps
self.recent_observations = [np.zeros(self.input_shape) for _ in range(obs_length)]
if self.lstm_type == "lstm_ful":
self.model.reset_states()
self.recent_hidden_state = [
[K.get_value(self.lstm.states[0]), K.get_value(self.lstm.states[1])]
for _ in range(self.burnin_length + self.input_sequence)
]
def compile(self, optimizer, metrics=[]):
self.compiled = True
def load_weights(self, filepath):
print("WARNING: Not Loaded. Please use 'load_weights_path' param.")
def save_weights(self, filepath, overwrite=False):
print("WARNING: Not Saved. Please use 'save_weights_path' param.")
    def forward(self, observation):
        self.recent_observations.append(observation)  # append to the end
        self.recent_observations.pop(0)               # drop the oldest
        if self.training:
            #--- send the experience
            if self.lstm_type == "lstm_ful":
                # compute the priority
                prioritys = []
                rewards = []
                for i in range(self.input_sequence):
                    state0 = [self.recent_observations[self.burnin_length + i]]
                    state1 = [self.recent_observations[self.burnin_length + i + self.multireward_steps]]
                    hidden_state = self.recent_hidden_state[i]
                    action = self.recent_action[i]
                    # Multi-Step learning
                    reward = 0
                    for j in range(self.multireward_steps):
                        reward += self.recent_reward[i+j] * (self.gamma ** j)
                    rewards.append(reward)
                    # output of the current Q network
                    self.lstm.reset_states(hidden_state)
                    state0_qvals = self.model.predict(np.asarray([state0]), 1)[0]
                    self.lstm.reset_states(hidden_state)
                    state1_qvals = self.model.predict(np.asarray([state1]), 1)[0]
                    maxq = np.max(state1_qvals)
                    # priority calculation
                    if self.enable_rescaling_priority:
                        maxq = rescaling_inverse(maxq)
                    td_error = reward + (self.gamma ** self.multireward_steps) * maxq
                    if self.enable_rescaling_priority:
                        td_error = rescaling(td_error, self.rescaling_epsilon)
                    priority = abs(td_error - state0_qvals[action]) ** self.per_alpha
                    prioritys.append(priority)
                # aggregate the per-step priorities into the sequence priority
                priority = self.priority_exponent * np.max(prioritys) + (1-self.priority_exponent) * np.average(prioritys)
                # add to the local memory
                self.local_memory.append((
                    self.recent_observations[:],
                    self.recent_action[:],
                    rewards,
                    self.recent_hidden_state[0],
                    priority,
                ))
            else:
                # Multi-Step learning
                reward = 0
                for i, r in enumerate(reversed(self.recent_reward)):
                    reward += r * (self.gamma ** i)
                state0 = self.recent_observations[self.burnin_length:self.burnin_length+self.input_sequence]
                state1 = self.recent_observations[-self.input_sequence:]
                action = self.recent_action[-1]
                # compute the TD error for the priority
                state0_qvals = self.model.predict(np.asarray([state0]), 1)[0]
                state1_qvals = self.model.predict(np.asarray([state1]), 1)[0]
                maxq = np.max(state1_qvals)
                # priority calculation
                if self.enable_rescaling_priority:
                    maxq = rescaling_inverse(maxq)
                td_error = reward + (self.gamma ** self.multireward_steps) * maxq
                if self.enable_rescaling_priority:
                    td_error = rescaling(td_error, self.rescaling_epsilon)
                priority = abs(td_error - state0_qvals[action]) ** self.per_alpha
                # add to the local memory
                self.local_memory.append((
                    state0,
                    action,
                    reward,
                    state1,
                    priority,
                ))
        # frame skip (select an action every action_interval steps)
        action = self.repeated_action
        if self.step % self.action_interval == 0:
            if self.lstm_type == "lstm_ful":
                # restore the hidden state
                self.lstm.reset_states(self.recent_hidden_state[-1])
            # select the action
            action = self.select_action()
            if self.lstm_type == "lstm_ful":
                # save the hidden state
                self.recent_hidden_state.append([K.get_value(self.lstm.states[0]), K.get_value(self.lstm.states[1])])
                self.recent_hidden_state.pop(0)
            # remember it for the repeated frames
            self.repeated_action = action
        # keep it for backward()
        self.recent_action.append(action)  # append to the end
        self.recent_action.pop(0)          # drop the oldest
        return action
    # split out into a function since it is long
    def select_action(self):
        # when NoisyNet is enabled, it handles exploration instead
        if self.training and not self.enable_noisynet:
            # epsilon-greedy
            if self.epsilon > np.random.uniform(0, 1):
                # random action
                action = np.random.randint(0, self.nb_actions)
            else:
                action = self._get_qmax_action()
        else:
            action = self._get_qmax_action()
        return action
    # used from two places; returns the action with the highest current Q value
    def _get_qmax_action(self):
        if self.lstm_type == "lstm_ful":
            # only the latest observation
            state1 = [self.recent_observations[-1]]
            q_values = self.model.predict(np.asarray([state1]), batch_size=1)[0]
        else:
            # the last input_sequence observations
            state1 = self.recent_observations[-self.input_sequence:]
            q_values = self.model.predict(np.asarray([state1]), batch_size=1)[0]
        return np.argmax(q_values)
    def backward(self, reward, terminal):
        self.recent_reward.append(reward)  # append to the end
        self.recent_reward.pop(0)          # drop the oldest
        if self.training:
            # periodically sync the model from the learner
            if self.step % self.actor_model_sync_interval == 0:
                # send a request
                self.model_sync_q[0].put(1)  # request
            # if new weights have arrived, update the model
            if not self.model_sync_q[1].empty():
                weights = self.model_sync_q[1].get(timeout=1)
                # drain the queue (just in case)
                while not self.model_sync_q[1].empty():
                    self.model_sync_q[1].get(timeout=1)
                self.model.set_weights(weights)
            # once the local memory is large enough, send it to the RemoteMemory
            if len(self.local_memory) > self.local_memory_update_size:
                # batch into one list, since putting items on the shared queue one by one is slow
                data = []
                while len(self.local_memory) > 0:
                    data.append(self.local_memory.pop())
                self.experience_q.put(data)
        return []
@property
def layers(self):
return self.model.layers[:]
#--------------------------------------------------------
class ReplayMemory():
def __init__(self, capacity):
self.capacity= capacity
self.index = 0
self.buffer = []
def add(self, experience, priority):
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.index] = experience
self.index = (self.index + 1) % self.capacity
def update(self, idx, experience, priority):
pass
def sample(self, batch_size, steps):
batchs = random.sample(self.buffer, batch_size)
indexes = np.empty(batch_size, dtype='float32')
weights = [ 1 for _ in range(batch_size)]
return (indexes, batchs, weights)
def __len__(self):
return len(self.buffer)
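# A hedged usage sketch (not in the original gist) of the memory interface shared by the
# classes in this section: add(experience, priority), sample(batch_size, step) returning
# (indexes, batchs, weights), and update(index, batch, priority) after training.
def _memory_interface_example():
    memory = ReplayMemory(capacity=100)
    for i in range(10):
        memory.add(("dummy_experience", i), priority=1.0)
    indexes, batchs, weights = memory.sample(4, 0)
    for idx, batch, w in zip(indexes, batchs, weights):
        memory.update(idx, batch, priority=1.0)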
#--------------------------------------------------------
import heapq
class _head_wrapper():
def __init__(self, data):
self.d = data
def __eq__(self, other):
return True
class PERGreedyMemory():
def __init__(self, capacity):
self.buffer = []
self.capacity = capacity
def add(self, experience, priority):
if self.capacity <= len(self.buffer):
            # if over capacity, drop the last element
self.buffer.pop()
        # a new experience enters with the given priority (the maximum at first)
experience = _head_wrapper(experience)
heapq.heappush(self.buffer, (-priority, experience))
def update(self, idx, experience, priority):
        # negate the priority because heapq pops the smallest value
experience = _head_wrapper(experience)
heapq.heappush(self.buffer, (-priority, experience))
def sample(self, batch_size, step):
        # pop them (they are added back after training via update)
batchs = [heapq.heappop(self.buffer)[1].d for _ in range(batch_size)]
indexes = np.empty(batch_size, dtype='float32')
weights = [ 1 for _ in range(batch_size)]
return (indexes, batchs, weights)
def __len__(self):
return len(self.buffer)
#---------------------------------------------------
#copy from https://github.com/jaromiru/AI-blog/blob/5aa9f0b/SumTree.py
import numpy
class SumTree:
write = 0
def __init__(self, capacity):
self.capacity = capacity
self.tree = numpy.zeros( 2*capacity - 1 )
self.data = numpy.zeros( capacity, dtype=object )
def _propagate(self, idx, change):
parent = (idx - 1) // 2
self.tree[parent] += change
if parent != 0:
self._propagate(parent, change)
def _retrieve(self, idx, s):
left = 2 * idx + 1
right = left + 1
if left >= len(self.tree):
return idx
if s <= self.tree[left]:
return self._retrieve(left, s)
else:
return self._retrieve(right, s-self.tree[left])
def total(self):
return self.tree[0]
def add(self, p, data):
idx = self.write + self.capacity - 1
self.data[self.write] = data
self.update(idx, p)
self.write += 1
if self.write >= self.capacity:
self.write = 0
def update(self, idx, p):
change = p - self.tree[idx]
self.tree[idx] = p
self._propagate(idx, change)
def get(self, s):
idx = self._retrieve(0, s)
dataIdx = idx - self.capacity + 1
return (idx, self.tree[idx], self.data[dataIdx])
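# A hedged usage sketch (not from the original gist): the SumTree stores priorities at the
# leaves of a binary tree, so get(s) walks down to the leaf whose cumulative priority range
# contains s, which is what proportional prioritized sampling needs.
def _sumtree_example():
    tree = SumTree(capacity=4)
    for i, p in enumerate([1.0, 2.0, 3.0, 4.0]):
        tree.add(p, "exp{}".format(i))
    assert tree.total() == 10.0
    idx, priority, data = tree.get(2.5)  # 2.5 falls into the range (1, 3] owned by "exp1"
    assert data == "exp1"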
class PERProportionalMemory():
def __init__(self, capacity, beta_initial, beta_steps, enable_is):
self.capacity = capacity
self.tree = SumTree(capacity)
self.beta_initial = beta_initial
self.beta_steps = beta_steps
self.enable_is = enable_is
def add(self, experience, priority):
self.tree.add(priority, experience)
def update(self, index, experience, priority):
self.tree.update(index, priority)
def sample(self, batch_size, step):
indexes = []
batchs = []
weights = np.empty(batch_size, dtype='float32')
if self.enable_is:
            # beta starts low and is annealed toward 1 by the end of training
beta = self.beta_initial + (1 - self.beta_initial) * step / self.beta_steps
        # split the total priority into equal sections and draw one random value from each
total = self.tree.total()
section = total / batch_size
for i in range(batch_size):
r = section*i + random.random()*section
(idx, priority, experience) = self.tree.get(r)
indexes.append(idx)
batchs.append(experience)
if self.enable_is:
                # compute the importance-sampling weight
weights[i] = (self.capacity * priority / total) ** (-beta)
else:
                weights[i] = 1  # 1 when IS is disabled
if self.enable_is:
            # normalize by the maximum weight for stability
weights = weights / weights.max()
        return (indexes, batchs, weights)
def __len__(self):
return self.tree.write
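# A hedged aside (not in the original gist): the importance-sampling weight computed above is
# w_i = (N * P(i)) ** (-beta) with P(i) = priority_i / total, normalized by the batch maximum.
def _is_weight(capacity, priority, total_priority, beta):
    return (capacity * priority / total_priority) ** (-beta)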
#------------------------------------
import bisect
class _bisect_wrapper():
def __init__(self, data):
self.d = data
self.priority = 0
self.p = 0
def __lt__(self, o): # a<b
return self.priority > o.priority
class PERRankBaseMemory():
def __init__(self, capacity, alpha, beta_initial, beta_steps, enable_is):
self.capacity = capacity
self.buffer = []
self.alpha = alpha
self.beta_initial = beta_initial
self.beta_steps = beta_steps
self.enable_is = enable_is
def add(self, experience, priority):
if self.capacity <= len(self.buffer):
            # if over capacity, drop the last element
self.buffer.pop()
experience = _bisect_wrapper(experience)
experience.priority = priority
bisect.insort(self.buffer, experience)
def update(self, index, experience, priority):
experience = _bisect_wrapper(experience)
experience.priority = priority
bisect.insort(self.buffer, experience)
def sample(self, batch_size, step):
indexes = []
batchs = []
weights = np.empty(batch_size, dtype='float32')
if self.enable_is:
            # beta starts low and is annealed toward 1 by the end of training
beta = self.beta_initial + (1 - self.beta_initial) * step / self.beta_steps
total = 0
for i, o in enumerate(self.buffer):
o.index = i
o.p = (len(self.buffer) - i) ** self.alpha
total += o.p
o.p_total = total
        # split the total into equal sections and draw one random value from each
index_lst = []
section = total / batch_size
rand = []
for i in range(batch_size):
rand.append(section*i + random.random()*section)
rand_i = 0
for i in range(len(self.buffer)):
if rand[rand_i] < self.buffer[i].p_total:
index_lst.append(i)
rand_i += 1
if rand_i >= len(rand):
break
for i, index in enumerate(reversed(index_lst)):
            o = self.buffer.pop(index)  # popped from the back, so the remaining indexes do not shift
batchs.append(o.d)
indexes.append(index)
if self.enable_is:
                # compute the importance-sampling weight
priority = o.p
weights[i] = (self.capacity * priority / total) ** (-beta)
else:
                weights[i] = 1  # 1 when IS is disabled
if self.enable_is:
            # normalize by the maximum weight for stability
weights = weights / weights.max()
return (indexes, batchs, weights)
def __len__(self):
return len(self.buffer)
#----------------------------------------
# from : https://github.com/LuEE-C/Noisy-A3C-Keras/blob/master/NoisyDense.py
# from : https://github.com/keiohta/tf2rl/blob/atari/tf2rl/networks/noisy_dense.py
class NoisyDense(Layer):
def __init__(self, units,
sigma_init=0.02,
activation=None,
use_bias=True,
kernel_initializer='glorot_uniform',
bias_initializer='zeros',
kernel_regularizer=None,
bias_regularizer=None,
activity_regularizer=None,
kernel_constraint=None,
bias_constraint=None,
**kwargs):
if 'input_shape' not in kwargs and 'input_dim' in kwargs:
kwargs['input_shape'] = (kwargs.pop('input_dim'),)
super(NoisyDense, self).__init__(**kwargs)
self.units = units
self.sigma_init = sigma_init
self.activation = activations.get(activation)
self.use_bias = use_bias
self.kernel_initializer = initializers.get(kernel_initializer)
self.bias_initializer = initializers.get(bias_initializer)
self.kernel_regularizer = regularizers.get(kernel_regularizer)
self.bias_regularizer = regularizers.get(bias_regularizer)
self.activity_regularizer = regularizers.get(activity_regularizer)
self.kernel_constraint = constraints.get(kernel_constraint)
self.bias_constraint = constraints.get(bias_constraint)
def build(self, input_shape):
assert len(input_shape) >= 2
self.input_dim = input_shape[-1]
self.kernel_shape = tf.constant((self.input_dim, self.units))
self.bias_shape = tf.constant((self.units,))
self.kernel = self.add_weight(shape=(self.input_dim, self.units),
initializer=self.kernel_initializer,
name='kernel',
regularizer=self.kernel_regularizer,
constraint=self.kernel_constraint)
self.sigma_kernel = self.add_weight(shape=(self.input_dim, self.units),
initializer=initializers.Constant(value=self.sigma_init),
name='sigma_kernel'
)
if self.use_bias:
self.bias = self.add_weight(shape=(self.units,),
initializer=self.bias_initializer,
name='bias',
regularizer=self.bias_regularizer,
constraint=self.bias_constraint)
self.sigma_bias = self.add_weight(shape=(self.units,),
initializer=initializers.Constant(value=self.sigma_init),
name='sigma_bias')
else:
self.bias = None
self.epsilon_bias = None
self.epsilon_kernel = K.zeros(shape=(self.input_dim, self.units))
self.epsilon_bias = K.zeros(shape=(self.units,))
self.sample_noise()
super(NoisyDense, self).build(input_shape)
def call(self, X):
#perturbation = self.sigma_kernel * self.epsilon_kernel
#perturbed_kernel = self.kernel + perturbation
perturbed_kernel = self.sigma_kernel * K.random_uniform(shape=self.kernel_shape)
output = K.dot(X, perturbed_kernel)
if self.use_bias:
#bias_perturbation = self.sigma_bias * self.epsilon_bias
#perturbed_bias = self.bias + bias_perturbation
perturbed_bias = self.bias + self.sigma_bias * K.random_uniform(shape=self.bias_shape)
output = K.bias_add(output, perturbed_bias)
if self.activation is not None:
output = self.activation(output)
return output
def compute_output_shape(self, input_shape):
assert input_shape and len(input_shape) >= 2
assert input_shape[-1]
output_shape = list(input_shape)
output_shape[-1] = self.units
return tuple(output_shape)
def sample_noise(self):
K.set_value(self.epsilon_kernel, np.random.normal(0, 1, (self.input_dim, self.units)))
K.set_value(self.epsilon_bias, np.random.normal(0, 1, (self.units,)))
def remove_noise(self):
K.set_value(self.epsilon_kernel, np.zeros(shape=(self.input_dim, self.units)))
K.set_value(self.epsilon_bias, np.zeros(shape=self.units,))
#---------------------------------------------------
#-----------------------------------------------------
# for visualizing the network
#-----------------------------------------------------
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.animation
import cv2
class ObservationLogger(rl.callbacks.Callback):
def __init__(self):
self.observations = []
def on_step_end(self, step, logs):
self.observations.append(logs["observation"])
agent = None
logger = None
def grad_cam(c_output, c_val, img, shape):
global agent
if agent.lstm_type == "":
c_output = c_output[0]
c_val = c_val[0]
else:
c_output = c_output[0][-1]
c_val = c_val[0][-1]
weights = np.mean(c_val, axis=(0, 1))
cam = np.dot(c_output, weights)
cam = cv2.resize(cam, shape, cv2.INTER_LINEAR)
cam = np.maximum(cam, 0)
cam = cam / cam.max()
cam = cv2.applyColorMap(np.uint8(255 * cam), cv2.COLORMAP_JET)
rate = 0.4
cam = cv2.addWeighted(src1=img, alpha=(1-rate), src2=cam, beta=rate, gamma=0)
    cam = cv2.cvtColor(cam, cv2.COLOR_BGR2RGB)  # convert BGR to RGB
return cam
def plot(frame):
if frame % 30 == 0: # debug
print(frame)
global agent, logger
observations = logger.observations
input_sequence = agent.input_sequence
model = agent.model
enable_dueling_network = agent.enable_dueling_network
    # wait until enough frames for one input have accumulated
if frame < input_sequence:
return
    # build the input variables
    # the input needs input_sequence frames (see the DQN article)
input_state = observations[frame - input_sequence:frame]
    # grab the observation shape while we are at it
shape = np.asarray(observations[0]).shape
    # build the original image for display
    # the observation is (w, h) normalized to 0~1, so convert it back to image format
img = np.asarray(observations[frame]) # (w,h)
img *= 255
img = cv2.cvtColor(np.uint8(img), cv2.COLOR_GRAY2BGR) # (w,h) -> (w,h,3)
c1_output = model.get_layer("c1").output
c2_output = model.get_layer("c2").output
c3_output = model.get_layer("c3").output
if enable_dueling_network:
v_output = model.get_layer("v").output
adv_output = model.get_layer("adv").output
    # run the prediction
prediction = model.predict(np.asarray([input_state]), 1)[0]
class_idx = np.argmax(prediction)
class_output = model.output[0][class_idx]
    # define the gradients
    # the adv layer has one output per action, so use the predicted action index
    # the v layer has a single unit, so use index 0
grads_c1 = K.gradients(class_output, c1_output)[0]
grads_c2 = K.gradients(class_output, c2_output)[0]
grads_c3 = K.gradients(class_output, c3_output)[0]
if enable_dueling_network:
grads_v = K.gradients(v_output[0][0], model.input)[0]
grads_adv = K.gradients(adv_output[0][class_idx], model.input)[0]
    # define the function and compute everything in one call
if enable_dueling_network:
grads_func = K.function([model.input, K.learning_phase()],
[c1_output, grads_c1, c2_output, grads_c2, c3_output, grads_c3, grads_adv, grads_v])
        # compute the gradients
(c1_output, c1_val, c2_output, c2_val, c3_output, c3_val, adv_val, v_val) = grads_func([np.asarray([input_state]), 0])
adv_val = adv_val[0][input_sequence-1]
v_val = v_val[0][input_sequence-1]
# SaliencyMap
adv_val = np.abs(adv_val.reshape(shape))
v_val = np.abs(v_val.reshape(shape))
        # Grad-CAM computation and rendering (factored into a function to avoid writing it three times)
cam1 = grad_cam(c1_output, c1_val, img, shape)
cam2 = grad_cam(c2_output, c2_val, img, shape)
cam3 = grad_cam(c3_output, c3_val, img, shape)
imgs = [img, cam1, cam2, cam3, adv_val, v_val]
names = ["original", "c1", "c2", "c3", "advance", "value"]
cmaps = ["", "", "", "", "gray", "gray"]
else:
grads_func = K.function([model.input, K.learning_phase()],
[c1_output, grads_c1, c2_output, grads_c2, c3_output, grads_c3, grads_adv])
        # compute the gradients
(c1_output, c1_val, c2_output, c2_val, c3_output, c3_val, adv_val) = grads_func([np.asarray([input_state]), 0])
adv_val = adv_val[0][input_sequence-1]
# SaliencyMap
adv_val = np.abs(adv_val.reshape(shape))
        # Grad-CAM computation and rendering (same helper as above)
cam1 = grad_cam(c1_output, c1_val, img, shape)
cam2 = grad_cam(c2_output, c2_val, img, shape)
cam3 = grad_cam(c3_output, c3_val, img, shape)
imgs = [img, cam1, cam2, cam3, adv_val]
names = ["original", "c1", "c2", "c3", "advance"]
cmaps = ["", "", "", "", "gray"]
# plot
for i in range(len(imgs)):
plt.subplot(2, 3, i+1)
        plt.gca().tick_params(labelbottom="off", bottom="off")  # hide the x axis
        plt.gca().tick_params(labelleft="off", left="off")      # hide the y axis
plt.title(names[i]).set_fontsize(12)
if cmaps[i] == "":
plt.imshow(imgs[i])
else:
plt.imshow(imgs[i], cmap=cmaps[i])
def plot_logs(learner_logs, actors_logs):
n = len(actors_logs) + 1
train_num = [ l["train_num"] for l in learner_logs]
actor_names = []
actor_steps = []
actor_reward_min = []
actor_reward_max = []
for name, logs in actors_logs.items():
actor_names.append(name)
steps = []
reward_min = []
reward_max = []
for log in logs:
steps.append(log["nb_steps"])
reward_min.append(log["reward_min"])
reward_max.append(log["reward_max"])
actor_steps.append(steps)
actor_reward_min.append(reward_min)
actor_reward_max.append(reward_max)
plt.subplot(n,1,1)
plt.ylabel("Trains,Steps")
plt.plot(train_num, label="train_num")
for i in range(len(actor_names)):
plt.plot(actor_steps[i], label=actor_names[i])
#-- legend
from matplotlib.font_manager import FontProperties
plt.subplots_adjust(left=0.1, right=0.85, bottom=0.1, top=0.95)
plt.legend(bbox_to_anchor=(1.00, 1), loc='upper left', borderaxespad=0, fontsize=8)
# reward
for i in range(len(actor_names)):
plt.subplot(n,1,i+2)
plt.plot(actor_reward_min[i], label="min")
plt.plot(actor_reward_max[i], label="max")
plt.ylabel("Actor" + str(i) + " Reward")
plt.show()
#-----------------------------------------------------
# for recording movies
#-----------------------------------------------------
import time
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.animation
class MovieLogger(rl.callbacks.Callback):
def __init__(self):
self.frames = []
self.history = []
def on_action_end(self, action, logs):
self.frames.append(self.env.render(mode='rgb_array'))
def on_step_end(self, step, logs):
self.history.append(logs)
#-----------------------
def view(self, interval=10, start_frame=0, end_frame=0, gifname="", mp4name=""):
assert start_frame<len(self.frames), "start frame is over frames({})".format(len(self.frames))
if end_frame == 0:
end_frame = len(self.frames)
elif end_frame > len(self.frames):
end_frame = len(self.frames)
self.start_frame = start_frame
self.patch = plt.imshow(self.frames[0])
plt.axis('off')
ani = matplotlib.animation.FuncAnimation(plt.gcf(), self._plot, frames=end_frame - start_frame, interval=interval)
if gifname != "":
#ani.save(gifname, writer="pillow", fps=5)
ani.save(gifname, writer="imagemagick", fps=60)
if mp4name != "":
ani.save(mp4name, writer="ffmpeg")
#plt.show()
def _plot(self, frame):
if frame % 100 == 0:
print("{}f".format(frame))
#plt.imshow(self.frames[frame + self.start_frame])
self.patch.set_data(self.frames[frame + self.start_frame])
#-----------------------------------------------------------
# main
#-----------------------------------------------------------
ENV_NAME = "Pendulum-v0"
def create_processor():
return PendulumProcessorForDQN(enable_image=False)
def create_processor_image():
return PendulumProcessorForDQN(enable_image=True, image_size=84)
def create_optimizer():
return Adam(lr=0.00025)
def actor_func(index, actor, callbacks):
env = gym.make(ENV_NAME)
if index == 0:
verbose = 1
else:
verbose = 0
actor.fit(env, nb_steps=200_000, visualize=False, verbose=verbose, callbacks=callbacks)
#--------------------------------------
def main(image):
global agent, logger
env = gym.make(ENV_NAME)
if image:
processor = create_processor_image
input_shape = (84, 84)
else:
processor = create_processor
input_shape = env.observation_space.shape
    # arguments
    args = {
        # model settings
        "input_shape": input_shape,
        "enable_image_layer": image,
        "nb_actions": 5,
        "input_sequence": 4,             # number of input frames
        "dense_units_num": 64,           # number of units in the Dense layers
        "metrics": [],                   # for the optimizer
        "enable_dueling_network": True,  # dueling network flag
        "dueling_network_type": "ave",   # dueling network aggregation type
        "enable_noisynet": False,        # NoisyNet flag
        "lstm_type": "lstm",             # LSTM variant
        "lstm_units_num": 64,            # number of LSTM units
        # learner settings
        "remote_memory_capacity": 500_000,          # memory capacity
        "remote_memory_warmup_size": 200,           # number of warmup steps before training starts
        "remote_memory_type": "per_proportional",   # memory type
        "per_alpha": 0.8,             # PER priority exponent
        "per_beta_initial": 0.0,      # initial IS exponent
        "per_beta_steps": 100_000,    # number of steps over which beta is annealed
        "per_enable_is": False,       # whether to enable importance sampling
        "batch_size": 16,             # batch size
        "target_model_update": 1500,  # target network update interval
        "enable_double_dqn": True,    # Double DQN flag
        "enable_rescaling_priority": False,  # whether to rescale when computing priorities
        "enable_rescaling_train": False,     # whether to rescale when training
        "rescaling_epsilon": 0.001,   # rescaling constant
        "burnin_length": 20,          # burn-in length
        "priority_exponent": 0.9,     # sequence priority mixing factor
        # actor settings
        "local_memory_update_size": 50,    # size at which the LocalMemory is flushed to the RemoteMemory
        "actor_model_sync_interval": 500,  # interval for syncing the model from the learner
        "gamma": 0.99,           # Q-learning discount factor
        "epsilon": 0.4,          # epsilon-greedy
        "epsilon_alpha": 2,      # epsilon-greedy (per-actor exponent)
        "multireward_steps": 1,  # multi-step reward
        "action_interval": 1,    # interval at which actions are selected
        # misc
        "load_weights_path": "",  # file to load weights from
        #"load_weights_path": "qiita08r2d2.h5",  # file to load weights from
        "save_weights_path": "qiita08_r2d2_image_lstm.h5",  # file to save weights to
        "save_overwrite": True,   # overwrite on save
        "logger_interval": 10,    # logging interval (seconds)
        "enable_GPU": True,       # whether to use the GPU
    }
manager = R2D2Manager(
actor_func=actor_func,
        num_actors=2,  # number of actors
args=args,
create_processor_func=processor,
create_optimizer_func=create_optimizer,
)
agent, learner_logs, actors_logs = manager.train()
#--- plot
plot_logs(learner_logs, actors_logs)
#--- test
    agent.processor.mode = "test"  # return the environment's raw reward
agent.test(env, nb_episodes=5, visualize=True)
    view = MovieLogger()  # for recording the movie
logger = ObservationLogger()
agent.test(env, nb_episodes=1, visualize=False, callbacks=[view, logger])
    view.view(interval=1, gifname="anim1.gif")  # save the movie
    #--- visualize the network
if image:
        plt.figure(figsize=(8.0, 6.0), dpi=100)  # set the figure size
plt.axis('off')
#ani = matplotlib.animation.FuncAnimation(plt.gcf(), plot, frames=150, interval=5)
ani = matplotlib.animation.FuncAnimation(plt.gcf(), plot, frames=len(logger.observations), interval=5)
#ani.save('anim2.mp4', writer="ffmpeg")
ani.save('anim2.gif', writer="imagemagick", fps=60)
#plt.show()
if __name__ == '__main__':
    # switch by commenting in/out
#main(image=False)
main(image=True)
    # check GPU devices
#from tensorflow.python.client import device_lib
#print(device_lib.list_local_devices())