Skip to content

Instantly share code, notes, and snippets.

@nagataka
Last active May 15, 2021 11:39
Show Gist options
  • Save nagataka/c02b9acd6e8a8d7696e09f8a129d3362 to your computer and use it in GitHub Desktop.
Save nagataka/c02b9acd6e8a8d7696e09f8a129d3362 to your computer and use it in GitHub Desktop.
Epsilon-Greedy written in python
import random
class EpsilonGreedy():
def __init__(self, epsilon, counts, values):
self.epsilon = epsilon # probability of explore
self.counts = counts # number of pulls for each arms
self.values = values # average amount of reward we've gotten from each arms
return
def initialize(self, n_arms):
self.counts = [0 for col in range(n_arms)]
self.values = [0.0 for col in range(n_arms)]
return
def ind_max(self, x):
m = max(x)
return x.index(m)
def select_arm(self):
if random.random() > self.epsilon:
return self.ind_max(self.values)
else:
return random.randrange(len(self.values))
def update(self, chosen_arm, reward):
self.counts[chosen_arm] = self.counts[chosen_arm] + 1
n = self.counts[chosen_arm]
value = self.values[chosen_arm]
new_value = ((n-1) / float(n)) * value + (1 / float(n)) * reward # weighted average of the previously estimated value and the reward we just received
self.values[chosen_arm] = new_value
return
class BernoulliArm():
def __init__(self, p):
self.p = p
def draw(self):
if random.random() > self.p:
return 0.0
else:
return 1.0
def test_algorithm(algo, arms, num_sims, horizon):
chosen_arms = [0.0 for i in range(num_sims * horizon)]
rewards = [0.0 for i in range(num_sims * horizon)]
cumulative_rewards = [0.0 for i in range(num_sims * horizon)]
sim_nums = [0.0 for i in range(num_sims * horizon)]
times = [0.0 for i in range(num_sims * horizon)]
for sim in range(num_sims):
sim = sim + 1
algo.initialize(len(arms))
for t in range(horizon):
t = t + 1
index = (sim -1) * horizon + t - 1 #???
sim_nums[index] = sim
times[index] = t
chosen_arm = algo.select_arm()
chosen_arms[index] = chosen_arm
reward = arms[chosen_arms[index]].draw()
rewards[index] = reward
if t == 1:
cumulative_rewards[index] = reward
else:
cumulative_rewards[index] = cumulative_rewards[index - 1] + reward
algo.update(chosen_arm, reward)
return [sim_nums, times, chosen_arms, rewards, cumulative_rewards]
random.seed(1)
means = [0.1, 0.1, 0.1, 0.1, 0.9]
n_arms = len(means)
random.shuffle(means)
arms = list(map(lambda mu: BernoulliArm(mu), means))
# print("Best arm is "+ str(ind_max(means)))
f = open("~/algorithms/epsilon_greedy/standard_results.tsv", "w")
for epsilon in [0.1, 0.2, 0.3, 0.4, 0.5]:
algo = EpsilonGreedy(epsilon, [], [])
algo.initialize(n_arms)
results = test_algorithm(algo, arms, 5000, 250)
for i in range(len(results[0])):
f.write(str(epsilon) + "\t")
f.write("\t".join([str(results[j][i]) for j in range(len(results))]) + "\n")
f.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment