# b greedy: pure greedy bandit with randomly initialized value estimates
import numpy as np

# assumed experiment setup -- these names are used but not defined in this
# snippet; the values here are illustrative placeholders only
num_ep = 2000                                  # number of independent episodes
num_iter = 1000                                # pulls per episode
gt_prob = np.array([0.2, 0.5, 0.75, 0.4])      # ground-truth reward probability per arm
num_bandit = len(gt_prob)
optimal_choice = np.argmax(gt_prob)            # index of the best arm

b_pull_count = np.zeros((num_ep, num_bandit))
b_estimation = np.zeros((num_ep, num_bandit))
b_reward = np.zeros((num_ep, num_iter))
b_optimal_pull = np.zeros((num_ep, num_iter))
b_regret_total = np.zeros((num_ep, num_iter))

for eps in range(num_ep):
    # per-episode buffers; estimates start at a random value in [0, 1)
    temp_pull_count = np.zeros(num_bandit)
    temp_estimation = np.random.uniform(0, 1, num_bandit)
    temp_reward = np.zeros(num_iter)
    temp_optimal_pull = np.zeros(num_iter)
    temp_regret = np.zeros(num_iter)

    for iter in range(num_iter):
        # select bandit / get reward / increase count / update estimate
        current_choice = np.argmax(temp_estimation)
        current_reward = 1 if np.random.uniform(0, 1) < gt_prob[current_choice] else 0
        temp_pull_count[current_choice] = temp_pull_count[current_choice] + 1
        # incremental mean with step 1/(n+1): the random initial estimate
        # counts as one prior observation
        temp_estimation[current_choice] = temp_estimation[current_choice] + (1 / (temp_pull_count[current_choice] + 1)) * (current_reward - temp_estimation[current_choice])

        # accumulate reward, optimal-arm indicator, and regret
        temp_reward[iter] = current_reward if iter == 0 else temp_reward[iter - 1] + current_reward
        temp_optimal_pull[iter] = 1 if current_choice == optimal_choice else 0
        temp_regret[iter] = (gt_prob[optimal_choice] - gt_prob[current_choice]) if iter == 0 else temp_regret[iter - 1] + (gt_prob[optimal_choice] - gt_prob[current_choice])

    # store the episode's statistics
    b_pull_count[eps, :] = temp_pull_count
    b_estimation[eps, :] = temp_estimation
    b_reward[eps, :] = temp_reward
    b_optimal_pull[eps, :] = temp_optimal_pull
    b_regret_total[eps, :] = temp_regret

print('Ground Truth')
print(gt_prob)
print('Expected')
print(b_estimation.mean(0))
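A quick way to inspect these results is to average over episodes and plot the cumulative regret and the fraction of optimal pulls. This is a minimal sketch, not part of the original gist; it assumes matplotlib is available and reuses the arrays filled above.

import matplotlib.pyplot as plt

# average over the num_ep episodes
mean_regret = b_regret_total.mean(0)    # cumulative regret at each step
mean_optimal = b_optimal_pull.mean(0)   # fraction of pulls that hit the best arm

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(mean_regret)
ax1.set_title('Mean cumulative regret')
ax1.set_xlabel('iteration')
ax2.plot(mean_optimal)
ax2.set_title('Fraction of optimal pulls')
ax2.set_xlabel('iteration')
plt.tight_layout()
plt.show()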