# b greedy: pure greedy bandit with randomly initialized value estimates
import numpy as np

# assumed experiment setup -- these names are used but not defined in this
# snippet; the values here are illustrative placeholders only
num_ep = 2000                                  # number of independent episodes
num_iter = 1000                                # pulls per episode
gt_prob = np.array([0.2, 0.5, 0.75, 0.4])      # ground-truth reward probability per arm
num_bandit = len(gt_prob)
optimal_choice = np.argmax(gt_prob)            # index of the best arm

b_pull_count = np.zeros((num_ep, num_bandit))
b_estimation = np.zeros((num_ep, num_bandit))
b_reward = np.zeros((num_ep, num_iter))
b_optimal_pull = np.zeros((num_ep, num_iter))
b_regret_total = np.zeros((num_ep, num_iter))

for eps in range(num_ep):
    # per-episode buffers; estimates start at a random value in [0, 1)
    temp_pull_count = np.zeros(num_bandit)
    temp_estimation = np.random.uniform(0, 1, num_bandit)
    temp_reward = np.zeros(num_iter)
    temp_optimal_pull = np.zeros(num_iter)
    temp_regret = np.zeros(num_iter)

    for iter in range(num_iter):
        # select bandit / get reward / increase count / update estimate
        current_choice = np.argmax(temp_estimation)
        current_reward = 1 if np.random.uniform(0, 1) < gt_prob[current_choice] else 0
        temp_pull_count[current_choice] = temp_pull_count[current_choice] + 1
        # incremental mean with step 1/(n+1): the random initial estimate
        # counts as one prior observation
        temp_estimation[current_choice] = temp_estimation[current_choice] + (1 / (temp_pull_count[current_choice] + 1)) * (current_reward - temp_estimation[current_choice])

        # accumulate reward, optimal-arm indicator, and regret
        temp_reward[iter] = current_reward if iter == 0 else temp_reward[iter - 1] + current_reward
        temp_optimal_pull[iter] = 1 if current_choice == optimal_choice else 0
        temp_regret[iter] = (gt_prob[optimal_choice] - gt_prob[current_choice]) if iter == 0 else temp_regret[iter - 1] + (gt_prob[optimal_choice] - gt_prob[current_choice])

    # store the episode's statistics
    b_pull_count[eps, :] = temp_pull_count
    b_estimation[eps, :] = temp_estimation
    b_reward[eps, :] = temp_reward
    b_optimal_pull[eps, :] = temp_optimal_pull
    b_regret_total[eps, :] = temp_regret

print('Ground Truth')
print(gt_prob)
print('Expected')
print(b_estimation.mean(0))
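A quick way to inspect these results is to average over episodes and plot the cumulative regret and the fraction of optimal pulls. This is a minimal sketch, not part of the original gist; it assumes matplotlib is available and reuses the arrays filled above.

import matplotlib.pyplot as plt

# average over the num_ep episodes
mean_regret = b_regret_total.mean(0)    # cumulative regret at each step
mean_optimal = b_optimal_pull.mean(0)   # fraction of pulls that hit the best arm

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(mean_regret)
ax1.set_title('Mean cumulative regret')
ax1.set_xlabel('iteration')
ax2.plot(mean_optimal)
ax2.set_title('Fraction of optimal pulls')
ax2.set_xlabel('iteration')
plt.tight_layout()
plt.show()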