@mitmul
Created March 26, 2013 08:59
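# 10-armed bandit testbed: compares greedy, epsilon-greedy, and softmax
# (Boltzmann) action selection, averaging reward and optimal-action-rate
# curves over many randomly generated tasks.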
require "gnuplot"
require "narray"
# Plot each (name => values) series in y as a line against the shared x axis.
def draw_chart(x, y)
  Gnuplot.open do |gp|
    Gnuplot::Plot.new(gp) do |plot|
      y.each do |name, value|
        if x.size == value.size
          plot.data << Gnuplot::DataSet.new([x, value]) do |ds|
            ds.with = "lines"
            ds.title = name
          end
        end
      end
    end
  end
end
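# Example usage (a sketch, not part of the original flow): any hash of
# name => array pairs whose lengths match x will be drawn as lines.
#
#   xs = (0..9).to_a
#   draw_chart(xs, { "squares" => xs.map { |v| v * v },
#                    "cubes"   => xs.map { |v| v**3 } })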
# Draw a histogram of the values in prob_array[0]. nrand returns values
# in [-6, 6], so (value * 10).to_i + 60 maps them onto bins 0..120.
def make_hist(prob_array)
  hist = Array.new(121, 0)
  prob_array[0].each do |prob|
    hist[(prob * 10).to_i + 60] += 1
  end
  draw_chart(NArray[0..hist.size - 1].to_a, { "Normal Distribution!!!!" => hist })
end
# Approximate a standard normal sample as the sum of 12 uniform
# variates minus 6 (central limit / Irwin-Hall approximation).
def nrand
  12.times.inject(0) { |a, _| a + rand } - 6.0
end
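# Sanity check (a sketch, not called anywhere below): make_hist can
# visualize nrand's distribution; a large sample should look bell-shaped.
#
#   make_hist([Array.new(100_000) { nrand }])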
def greedy(try, reward_array, exp)
  q_t_a = Array.new(10, 0.0)
  r_sum = Array.new(10, 0.0)
  result = 0.0
  choices = Array.new(10, 0)
  path = []
  # The truly optimal action
  exp_reward_array = (0..9).map { |i| reward_array[i][exp] }
  opt_choice = NVector[exp_reward_array].flatten.sort_index[-1]
  opt_select = 0
  opt_path = []
  try.times do |t|
    choice = NVector[q_t_a].flatten.sort_index[-1]
    opt_select += 1 if choice == opt_choice
    choices[choice] += 1
    reward = nrand + reward_array[choice][exp]
    r_sum[choice] += reward
    result += reward
    q_t_a[choice] = r_sum[choice] / choices[choice]
    path << reward
    # Divide by t + 1 (trials so far) to avoid 0/0 on the first trial.
    opt_path << opt_select.to_f / (t + 1)
  end
  [path, opt_path]
end
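# Example (a sketch): run the pure greedy agent once on a single
# 10-armed task; reward_array is indexed as reward_array[arm][task].
#
#   arms = Array.new(10) { [nrand] }
#   rewards, opt_ratio = greedy(1000, arms, 0)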
def epsilon_greedy(try, reward_array, exp, epsilon)
  # Optimistic initial action values push even epsilon = 0 to explore early.
  q_t_a = Array.new(10, 5.0)
  r_sum = Array.new(10, 0.0)
  result = 0.0
  choices = Array.new(10, 0)
  path = []
  # The truly optimal action
  exp_reward_array = (0..9).map { |i| reward_array[i][exp] }
  opt_choice = NVector[exp_reward_array].flatten.sort_index[-1]
  opt_select = 0
  opt_path = []
  try.times do |t|
    # With probability epsilon explore a random arm, otherwise exploit.
    choice = if rand <= epsilon
               rand(10)
             else
               NVector[q_t_a].flatten.sort_index[-1]
             end
    opt_select += 1 if choice == opt_choice
    choices[choice] += 1
    reward = nrand + reward_array[choice][exp]
    r_sum[choice] += reward
    result += reward
    q_t_a[choice] = r_sum[choice] / choices[choice]
    path << reward
    opt_path << opt_select.to_f / (t + 1)
  end
  [path, opt_path]
end
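# Example (a sketch): same task interface as greedy, with a 10% chance
# of picking a random arm on each trial.
#
#   arms = Array.new(10) { [nrand] }
#   rewards, opt_ratio = epsilon_greedy(1000, arms, 0, 0.1)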
# Sample an index from a discrete probability distribution. Falls back
# to a uniformly random index when the probabilities do not cover r
# (e.g. when they have been zeroed out).
def get_occurrence_id(distribution)
  occur_id = rand(distribution.size)
  prob_sum = 0.0
  r = rand
  distribution.each_with_index do |d, i|
    if prob_sum <= r && r < prob_sum + d
      occur_id = i
      break
    end
    prob_sum += d
  end
  occur_id
end
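# Example (a sketch): indices are drawn in proportion to their weights,
# so 0 should come up roughly 70% of the time here.
#
#   counts = Hash.new(0)
#   10_000.times { counts[get_occurrence_id([0.7, 0.2, 0.1])] += 1 }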
def softmax(try, reward_array, exp, tau)
  q_t_a = Array.new(10, 0.0)
  r_sum = Array.new(10, 0.0)
  result = 0.0
  choices = Array.new(10, 0)
  path = []
  # The truly optimal action
  exp_reward_array = (0..9).map { |i| reward_array[i][exp] }
  opt_choice = NVector[exp_reward_array].flatten.sort_index[-1]
  opt_select = 0
  opt_path = []
  choice_prob = []
  try.times do |t|
    # Selection probability of each action (Boltzmann distribution)
    denominator = q_t_a.inject(0) { |sum, qta| sum + Math.exp(qta / tau) }
    if denominator.infinite?
      # exp overflowed; zero the probabilities so get_occurrence_id
      # falls back to a uniformly random choice.
      q_t_a.each_with_index do |_, i|
        choice_prob[i] = 0
      end
    else
      q_t_a.each_with_index do |qta, i|
        numerator = Math.exp(qta / tau)
        choice_prob[i] = numerator / denominator
      end
    end
    # Choose an action according to the selection probabilities
    choice = get_occurrence_id(choice_prob)
    opt_select += 1 if choice == opt_choice
    choices[choice] += 1
    reward = nrand + reward_array[choice][exp]
    r_sum[choice] += reward
    result += reward
    q_t_a[choice] = r_sum[choice] / choices[choice]
    path << reward
    opt_path << opt_select.to_f / (t + 1)
  end
  [path, opt_path]
end
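# Example (a sketch): low tau makes the Boltzmann policy nearly greedy,
# high tau makes it nearly uniform.
#
#   arms = Array.new(10) { [nrand] }
#   rewards, opt_ratio = softmax(1000, arms, 0, 0.5)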
def greedy_experiment
  try = 1000
  exp = 2000
  # Generate the tasks: 10 arms x 2000 tasks, each arm's true reward
  # drawn from an approximately standard normal distribution.
  reward_array = []
  10.times do
    reward_array << exp.times.map { nrand }
  end
  # Average reward over time (+= rebinds each name to a new NVector,
  # so the chained assignment does not alias the accumulators)
  greedy_path = ep_greedy_path = ep2_greedy_path = NVector.float(try)
  # Fraction of optimal actions over time
  greedy_opt_path = ep_greedy_opt_path = ep2_greedy_opt_path = NVector.float(try)
  # Run the experiments
  exp.times do |i|
    greedy_result = epsilon_greedy(try, reward_array, i, 0.0)
    greedy_path += NVector.to_na(greedy_result[0])
    greedy_opt_path += NVector.to_na(greedy_result[1])
    ep_greedy_result = epsilon_greedy(try, reward_array, i, 0.01)
    ep_greedy_path += NVector.to_na(ep_greedy_result[0])
    ep_greedy_opt_path += NVector.to_na(ep_greedy_result[1])
    ep2_greedy_result = epsilon_greedy(try, reward_array, i, 0.1)
    ep2_greedy_path += NVector.to_na(ep2_greedy_result[0])
    ep2_greedy_opt_path += NVector.to_na(ep2_greedy_result[1])
  end
  greedy_path = (greedy_path / exp).to_a
  greedy_opt_path = (greedy_opt_path / exp).to_a
  ep_greedy_path = (ep_greedy_path / exp).to_a
  ep_greedy_opt_path = (ep_greedy_opt_path / exp).to_a
  ep2_greedy_path = (ep2_greedy_path / exp).to_a
  ep2_greedy_opt_path = (ep2_greedy_opt_path / exp).to_a
  draw_chart(NArray[0..try - 1].to_a, { "greedy" => greedy_path,
                                        "epsilon(=0.01) greedy" => ep_greedy_path,
                                        "epsilon(=0.1) greedy" => ep2_greedy_path })
  draw_chart(NArray[0..try - 1].to_a, { "greedy" => greedy_opt_path,
                                        "epsilon(=0.01) greedy" => ep_greedy_opt_path,
                                        "epsilon(=0.1) greedy" => ep2_greedy_opt_path })
end
def softmax_experiment
  try = 1000
  exp = 2000
  # Generate the tasks (same setup as greedy_experiment)
  reward_array = []
  10.times do
    reward_array << exp.times.map { nrand }
  end
  # Average reward and fraction of optimal actions over time
  softmax_path_a = softmax_path_b = softmax_path_c = NVector.float(try)
  softmax_opt_path_a = softmax_opt_path_b = softmax_opt_path_c = NVector.float(try)
  # Run the experiments
  exp.times do |i|
    softmax_result_a = softmax(try, reward_array, i, 0.1)
    softmax_path_a += NVector.to_na(softmax_result_a[0])
    softmax_opt_path_a += NVector.to_na(softmax_result_a[1])
    softmax_result_b = softmax(try, reward_array, i, 0.5)
    softmax_path_b += NVector.to_na(softmax_result_b[0])
    softmax_opt_path_b += NVector.to_na(softmax_result_b[1])
    softmax_result_c = softmax(try, reward_array, i, 1)
    softmax_path_c += NVector.to_na(softmax_result_c[0])
    softmax_opt_path_c += NVector.to_na(softmax_result_c[1])
  end
  softmax_path_a = (softmax_path_a / exp).to_a
  softmax_opt_path_a = (softmax_opt_path_a / exp).to_a
  softmax_path_b = (softmax_path_b / exp).to_a
  softmax_opt_path_b = (softmax_opt_path_b / exp).to_a
  softmax_path_c = (softmax_path_c / exp).to_a
  softmax_opt_path_c = (softmax_opt_path_c / exp).to_a
  draw_chart(NArray[0..try - 1].to_a, { "softmax(tau=0.1)" => softmax_path_a,
                                        "softmax(tau=0.5)" => softmax_path_b,
                                        "softmax(tau=1)" => softmax_path_c })
  draw_chart(NArray[0..try - 1].to_a, { "softmax(tau=0.1)" => softmax_opt_path_a,
                                        "softmax(tau=0.5)" => softmax_opt_path_b,
                                        "softmax(tau=1)" => softmax_opt_path_c })
end
def greedy_softmax_experiment
  try = 1000
  exp = 2000
  # Generate the tasks
  reward_array = []
  10.times do
    reward_array << exp.times.map { nrand }
  end
  # Average reward and fraction of optimal actions over time
  greedy_path = ep_greedy_path = ep2_greedy_path = NVector.float(try)
  greedy_opt_path = ep_greedy_opt_path = ep2_greedy_opt_path = NVector.float(try)
  softmax_path_a = softmax_path_b = softmax_path_c = NVector.float(try)
  softmax_opt_path_a = softmax_opt_path_b = softmax_opt_path_c = NVector.float(try)
  # Run the experiments
  exp.times do |i|
    greedy_result = epsilon_greedy(try, reward_array, i, 0.0)
    greedy_path += NVector.to_na(greedy_result[0])
    greedy_opt_path += NVector.to_na(greedy_result[1])
    ep_greedy_result = epsilon_greedy(try, reward_array, i, 0.01)
    ep_greedy_path += NVector.to_na(ep_greedy_result[0])
    ep_greedy_opt_path += NVector.to_na(ep_greedy_result[1])
    ep2_greedy_result = epsilon_greedy(try, reward_array, i, 0.1)
    ep2_greedy_path += NVector.to_na(ep2_greedy_result[0])
    ep2_greedy_opt_path += NVector.to_na(ep2_greedy_result[1])
    softmax_result_a = softmax(try, reward_array, i, 0.1)
    softmax_path_a += NVector.to_na(softmax_result_a[0])
    softmax_opt_path_a += NVector.to_na(softmax_result_a[1])
    softmax_result_b = softmax(try, reward_array, i, 0.5)
    softmax_path_b += NVector.to_na(softmax_result_b[0])
    softmax_opt_path_b += NVector.to_na(softmax_result_b[1])
    softmax_result_c = softmax(try, reward_array, i, 1)
    softmax_path_c += NVector.to_na(softmax_result_c[0])
    softmax_opt_path_c += NVector.to_na(softmax_result_c[1])
  end
  greedy_path = (greedy_path / exp).to_a
  greedy_opt_path = (greedy_opt_path / exp).to_a
  ep_greedy_path = (ep_greedy_path / exp).to_a
  ep_greedy_opt_path = (ep_greedy_opt_path / exp).to_a
  ep2_greedy_path = (ep2_greedy_path / exp).to_a
  ep2_greedy_opt_path = (ep2_greedy_opt_path / exp).to_a
  softmax_path_a = (softmax_path_a / exp).to_a
  softmax_opt_path_a = (softmax_opt_path_a / exp).to_a
  softmax_path_b = (softmax_path_b / exp).to_a
  softmax_opt_path_b = (softmax_opt_path_b / exp).to_a
  softmax_path_c = (softmax_path_c / exp).to_a
  softmax_opt_path_c = (softmax_opt_path_c / exp).to_a
  draw_chart(NArray[0..try - 1].to_a, { "greedy" => greedy_path,
                                        "epsilon(=0.01) greedy" => ep_greedy_path,
                                        "epsilon(=0.1) greedy" => ep2_greedy_path,
                                        "softmax(tau=0.1)" => softmax_path_a,
                                        "softmax(tau=0.5)" => softmax_path_b,
                                        "softmax(tau=1)" => softmax_path_c })
  # Plot the optimal-action curves as well, mirroring the other experiments.
  draw_chart(NArray[0..try - 1].to_a, { "greedy" => greedy_opt_path,
                                        "epsilon(=0.01) greedy" => ep_greedy_opt_path,
                                        "epsilon(=0.1) greedy" => ep2_greedy_opt_path,
                                        "softmax(tau=0.1)" => softmax_opt_path_a,
                                        "softmax(tau=0.5)" => softmax_opt_path_b,
                                        "softmax(tau=1)" => softmax_opt_path_c })
end
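# The individual experiments can be run the same way:
#   greedy_experiment
#   softmax_experiment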
greedy_softmax_experiment