import numpy as np

# num_ep (episodes), num_bandit (arms) and num_iter (pulls per episode) are assumed
# to be defined earlier in the full gist.

# l softmax
l_pull_count = np.zeros((num_ep,num_bandit))
l_estimation = np.zeros((num_ep,num_bandit))
l_reward = np.zeros((num_ep,num_iter))
l_optimal_pull = np.zeros((num_ep,num_iter))
l_regret_total = np.zeros((num_ep,num_iter))
for eps in range(num_ep):
    temp_pull_count = np.zeros(num_bandit)
    temp_estimation = np.zeros(num_bandit) + 1/num_bandit   # start from a uniform estimate
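
# A minimal sketch of the per-pull softmax (Boltzmann) step this block sets up:
# pull probability proportional to exp(estimate / temperature). bandit_prob (true
# Bernoulli arm probabilities) and temperature are my assumptions, not the gist's;
# temp_estimation / temp_pull_count match the per-episode arrays above.
import numpy as np

def softmax_step(temp_estimation, temp_pull_count, bandit_prob, temperature=0.1):
    logits = temp_estimation / temperature
    probs = np.exp(logits - logits.max())          # subtract max for numerical stability
    probs /= probs.sum()
    arm = np.random.choice(len(bandit_prob), p=probs)
    reward = float(np.random.rand() < bandit_prob[arm])
    temp_pull_count[arm] += 1
    # incremental running mean of the pulled arm's reward
    temp_estimation[arm] += (reward - temp_estimation[arm]) / temp_pull_count[arm]
    return arm, reward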
# k neural network (with adam)
k_pull_count = np.zeros((num_ep,num_bandit))
k_estimation = np.zeros((num_ep,num_bandit))
k_reward = np.zeros((num_ep,num_iter))
k_optimal_pull = np.zeros((num_ep,num_iter))
k_regret_total = np.zeros((num_ep,num_iter))
# logistic activation and its derivative, used by the per-arm value network
def sigmoid(x): return 1/(1+np.exp(-x))
def d_sigmoid(x): return sigmoid(x)*(1-sigmoid(x))
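
# A minimal sketch of one way the sigmoid/d_sigmoid pair above could be used: one
# weight per arm, estimated value = sigmoid(w[arm]), squared-error loss against the
# observed reward, and a per-weight Adam update. The names w, m, v, t and the Adam
# hyper-parameters are my assumptions, not the gist's actual network.
import numpy as np

def adam_value_update(w, m, v, t, arm, reward,
                      lr=0.001, b1=0.9, b2=0.999, adam_eps=1e-8):
    pred = sigmoid(w[arm])
    grad = (pred - reward) * d_sigmoid(w[arm])     # d/dw of 0.5 * (pred - reward)^2
    m[arm] = b1 * m[arm] + (1 - b1) * grad         # first-moment estimate
    v[arm] = b2 * v[arm] + (1 - b2) * grad ** 2    # second-moment estimate
    m_hat = m[arm] / (1 - b1 ** t)                 # bias correction (t counts updates, from 1)
    v_hat = v[arm] / (1 - b2 ** t)
    w[arm] -= lr * m_hat / (np.sqrt(v_hat) + adam_eps)
    return sigmoid(w)                              # current value estimate for every arm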
# j Thompson Sampling (uniform) (slow)
j_pull_count = np.zeros((num_ep,num_bandit))
j_estimation = np.zeros((num_ep,num_bandit))
j_reward = np.zeros((num_ep,num_iter))
j_optimal_pull = np.zeros((num_ep,num_iter))
j_regret_total = np.zeros((num_ep,num_iter))
for eps in range(num_ep):
    temp_pull_count = np.zeros(num_bandit)
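
# A hedged sketch of a "uniform" Thompson step: under a uniform Beta(1,1) prior over
# each arm's success rate, draw one posterior sample per arm by evaluating the
# unnormalised Beta density on a grid (brute force, which would explain the "(slow)"
# note above), then pull the arm with the largest sample. The successes/failures
# counters and bandit_prob are my assumed bookkeeping, not taken from the gist.
import numpy as np

def uniform_posterior_sample(successes, failures, grid_size=1000):
    theta = np.linspace(1e-6, 1 - 1e-6, grid_size)
    density = theta ** successes * (1 - theta) ** failures   # Beta(1+s, 1+f) up to a constant
    density /= density.sum()
    return np.random.choice(theta, p=density)

def thompson_uniform_step(successes, failures, bandit_prob):
    samples = [uniform_posterior_sample(successes[a], failures[a])
               for a in range(len(bandit_prob))]
    arm = int(np.argmax(samples))
    reward = float(np.random.rand() < bandit_prob[arm])
    successes[arm] += reward
    failures[arm] += 1 - reward
    return arm, reward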
# i Thompson Sampling (beta) (slow)
i_pull_count = np.zeros((num_ep,num_bandit))
i_estimation = np.zeros((num_ep,num_bandit))
i_reward = np.zeros((num_ep,num_iter))
i_optimal_pull = np.zeros((num_ep,num_iter))
i_regret_total = np.zeros((num_ep,num_iter))
for eps in range(num_ep):
    temp_pull_count = np.zeros(num_bandit)
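
# A minimal sketch of the standard Beta-posterior Thompson step: one np.random.beta
# draw per arm with a Beta(1,1) prior, pull the argmax, update the counts. The
# successes/failures arrays and bandit_prob are my assumed bookkeeping.
import numpy as np

def thompson_beta_step(successes, failures, bandit_prob):
    samples = np.random.beta(1 + successes, 1 + failures)   # one posterior draw per arm
    arm = int(np.argmax(samples))
    reward = float(np.random.rand() < bandit_prob[arm])
    successes[arm] += reward
    failures[arm] += 1 - reward
    return arm, reward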
# h UCB1-Tuned
h_pull_count = np.zeros((num_ep,num_bandit))
h_estimation = np.zeros((num_ep,num_bandit))
h_reward = np.zeros((num_ep,num_iter))
h_optimal_pull = np.zeros((num_ep,num_iter))
h_regret_total = np.zeros((num_ep,num_iter))
for eps in range(num_ep):
    temp_pull_count = np.zeros(num_bandit)
    temp_estimation = np.zeros(num_bandit)
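
# A minimal sketch of the UCB1-Tuned selection rule this block names:
# index_i = mean_i + sqrt( (ln t / n_i) * min(1/4, V_i) ), where
# V_i = (running mean of squared rewards) - mean_i^2 + sqrt(2 ln t / n_i).
# temp_sq_estimation is my assumed extra array, and every arm is assumed to have been
# pulled once already so temp_pull_count > 0.
import numpy as np

def ucb1_tuned_select(temp_estimation, temp_sq_estimation, temp_pull_count, t):
    n = temp_pull_count
    variance_bound = temp_sq_estimation - temp_estimation ** 2 + np.sqrt(2 * np.log(t) / n)
    index = temp_estimation + np.sqrt((np.log(t) / n) * np.minimum(0.25, variance_bound))
    return int(np.argmax(index))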
# g UCB1
g_pull_count = np.zeros((num_ep,num_bandit))
g_estimation = np.zeros((num_ep,num_bandit))
g_reward = np.zeros((num_ep,num_iter))
g_optimal_pull = np.zeros((num_ep,num_iter))
g_regret_total = np.zeros((num_ep,num_iter))
for eps in range(num_ep):
    temp_pull_count = np.zeros(num_bandit)
    temp_estimation = np.zeros(num_bandit)
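
# A minimal sketch of the classic UCB1 rule this block sets up: pull the arm with the
# largest mean-plus-confidence index, after an assumed warm-up pull of every arm so
# that temp_pull_count > 0.
import numpy as np

def ucb1_select(temp_estimation, temp_pull_count, t):
    index = temp_estimation + np.sqrt(2 * np.log(t) / temp_pull_count)
    return int(np.argmax(index))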
# f Linear Reward Penalty
f_pull_count = np.zeros((num_ep,num_bandit))
f_estimation = np.zeros((num_ep,num_bandit))
f_reward = np.zeros((num_ep,num_iter))
f_optimal_pull = np.zeros((num_ep,num_iter))
f_regret_total = np.zeros((num_ep,num_iter))
for eps in range(num_ep):
    alpha = 0.01
    beta = 0.001
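
# A minimal sketch of the linear reward-penalty (L_R-P) update matching the alpha/beta
# above: p is the pull-probability vector over arms (my assumed name). On a reward the
# mass shifts toward the pulled arm by alpha; on a penalty it shifts away by beta and
# is spread evenly over the other arms, so p still sums to 1.
import numpy as np

def linear_reward_penalty_update(p, arm, reward, alpha=0.01, beta=0.001):
    r = len(p)
    if reward == 1:                       # reward: reinforce the pulled arm
        p = (1 - alpha) * p
        p[arm] += alpha
    else:                                 # penalty: push mass to the other arms
        p = (1 - beta) * p
        p[np.arange(r) != arm] += beta / (r - 1)
    return p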
# e Linear Reward Inaction
e_pull_count = np.zeros((num_ep,num_bandit))
e_estimation = np.zeros((num_ep,num_bandit))
e_reward = np.zeros((num_ep,num_iter))
e_optimal_pull = np.zeros((num_ep,num_iter))
e_regret_total = np.zeros((num_ep,num_iter))
for eps in range(num_ep):
    learning_rate = 0.1
    temp_pull_count = np.zeros(num_bandit)
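
# A minimal sketch of the linear reward-inaction (L_R-I) update the learning_rate
# above suggests: identical to reward-penalty on a success, but on a failure the pull
# probabilities p (my assumed name for the probability vector) are left untouched.
def linear_reward_inaction_update(p, arm, reward, learning_rate=0.1):
    if reward == 1:                       # reward: reinforce the pulled arm
        p = (1 - learning_rate) * p
        p[arm] += learning_rate
    return p                              # inaction: unchanged on a penalty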
# d decaying e greedy
d_pull_count = np.zeros((num_ep,num_bandit))
d_estimation = np.zeros((num_ep,num_bandit))
d_reward = np.zeros((num_ep,num_iter))
d_optimal_pull = np.zeros((num_ep,num_iter))
d_regret_total = np.zeros((num_ep,num_iter))
for eps in range(num_ep):
    epsilon = 1.0
    temp_pull_count = np.zeros(num_bandit)
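
# A minimal sketch of one decaying-epsilon-greedy pull, consistent with epsilon
# starting at 1.0 above: explore with probability epsilon, otherwise exploit the
# current estimates, then shrink epsilon. bandit_prob, temp_estimation and the 0.999
# decay factor are my assumptions.
import numpy as np

def decay_e_greedy_step(temp_estimation, temp_pull_count, bandit_prob, epsilon, decay=0.999):
    if np.random.rand() < epsilon:
        arm = np.random.randint(len(bandit_prob))      # explore
    else:
        arm = int(np.argmax(temp_estimation))          # exploit
    reward = float(np.random.rand() < bandit_prob[arm])
    temp_pull_count[arm] += 1
    temp_estimation[arm] += (reward - temp_estimation[arm]) / temp_pull_count[arm]
    return arm, reward, epsilon * decay                # decayed epsilon for the next pull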
# c e greedy
c_pull_count = np.zeros((num_ep,num_bandit))
c_estimation = np.zeros((num_ep,num_bandit))
c_reward = np.zeros((num_ep,num_iter))
c_optimal_pull = np.zeros((num_ep,num_iter))
c_regret_total = np.zeros((num_ep,num_iter))
for eps in range(num_ep):
    epsilon = np.random.uniform(0,1)
    temp_pull_count = np.zeros(num_bandit)
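
# A minimal sketch of how one episode's inner loop could fill the bookkeeping arrays
# above (c_reward, c_optimal_pull, c_regret_total), using the per-episode epsilon drawn
# above. bandit_prob and the regret definition (best arm mean minus pulled arm mean)
# are my assumptions.
import numpy as np

def run_e_greedy_episode(bandit_prob, num_iter, epsilon):
    num_bandit = len(bandit_prob)
    temp_pull_count = np.zeros(num_bandit)
    temp_estimation = np.zeros(num_bandit)
    reward_hist = np.zeros(num_iter)
    optimal_pull = np.zeros(num_iter)
    regret = np.zeros(num_iter)
    best_arm = int(np.argmax(bandit_prob))
    for it in range(num_iter):
        if np.random.rand() < epsilon:
            arm = np.random.randint(num_bandit)          # explore
        else:
            arm = int(np.argmax(temp_estimation))        # exploit
        reward = float(np.random.rand() < bandit_prob[arm])
        temp_pull_count[arm] += 1
        temp_estimation[arm] += (reward - temp_estimation[arm]) / temp_pull_count[arm]
        reward_hist[it] = reward
        optimal_pull[it] = float(arm == best_arm)
        regret[it] = bandit_prob[best_arm] - bandit_prob[arm]
    return temp_pull_count, temp_estimation, reward_hist, optimal_pull, regret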