Skip to content

Instantly share code, notes, and snippets.

@conormm
Created September 21, 2018 16:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save conormm/d5149a6e2191be6403e8a954012b042c to your computer and use it in GitHub Desktop.
# coding: utf-8
# Jupyter-notebook export (nbconvert): the "In[...]" markers throughout are
# the original notebook cell numbers and are not sequential.
# In[150]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
# IPython-only magic: `get_ipython` exists only inside an IPython/Jupyter
# session, so this line raises NameError when run as a plain script.
get_ipython().run_line_magic('matplotlib', 'inline')
# In[25]:
# Four slot machines; machine i pays out 1 with probability payoffs[i],
# otherwise 0 (a Bernoulli pull).
machines = [0, 1, 2, 3]
payoffs = [0.07, 0.11, 0.22, 0.24]

# In[26]:
# Baseline strategy: pick a machine uniformly at random on every trial and
# accumulate the total reward.  (The original also created an unused
# `payouts` list that was never appended to; removed.)
n_payouts_random = 0  # running total reward over all trials
for _ in range(10000):
    machine = np.random.choice(machines)
    # one Bernoulli pull of the chosen machine
    n_payouts_random += np.random.binomial(1, p=payoffs[machine])
print(f"Sum of the reward is: {n_payouts_random}")
# In[275]:
# Epsilon-greedy bandit: explore at random for the first `n_learning` trials,
# then exploit the machine with the best empirical payout rate, while still
# exploring a uniformly random machine with probability `e` (2%) each trial.
#
# Fixes vs. the original:
#  * `machine` was read before any assignment in this cell (it relied on a
#    variable leaked from an earlier cell) -- now always assigned explicitly.
#  * `if i % 100:` updated the estimates on every trial EXCEPT multiples of
#    100, the opposite of the stated intent -- estimates now update each trial.
#  * `successes / m_chosen` divided by zero for never-chosen machines,
#    yielding NaNs -- guarded with np.divide(..., where=...).
#  * unused counter `t` removed.
machines = [0, 1, 2, 3]
n_learning = 2000
payoffs = [0.07, 0.11, 0.10, 0.08]
e = 0.02
n_trials = 10000
successes = np.zeros(4)   # payout count per machine
m_chosen = np.zeros(4)    # pull count per machine
m_prob = np.zeros(4)      # empirical payout rate per machine
n_payouts_greedy = 0
regret = np.zeros(n_trials)

for i in range(n_trials):
    # non-stationary arm: machine 3 alternates between a high and a low rate
    payoffs[3] = .15 if i % 3 == 0 else .007

    if i < n_learning:
        # pure exploration phase: uniform random machine
        machine = np.random.choice(machines)
    else:
        # exploitation: machine with the best payout rate observed so far
        machine = int(np.argmax(m_prob))

    # epsilon step: with probability e, override with a random machine
    if np.random.uniform(0, 1) > (1 - e):
        machine = np.random.choice(machines)

    m = np.random.binomial(1, p=payoffs[machine])

    # update counts and empirical payout estimates (0/0 guarded to 0)
    m_chosen[machine] += 1
    successes[machine] += m
    m_prob = np.divide(successes, m_chosen,
                       out=np.zeros(4), where=m_chosen > 0)

    n_payouts_greedy += m
    # pseudo-regret of this pull against the best empirical machine
    regret[i] = np.max(m_prob) - m_prob[machine]

print(f"Sum of the reward is: {n_payouts_greedy}")
# In[248]:
# NOTE(review): in the notebook this cell was executed AFTER the
# Thompson-sampling cell below, so `a` and `b` already existed.  Run top to
# bottom as a script they are not yet defined and the bare prints raised
# NameError -- guard them so the script keeps running either way.
if "a" in globals() and "b" in globals():
    print(a)
    print(b)
# In[277]:
# Thompson sampling: each machine's payout probability carries a Beta(a, b)
# posterior (flat Beta(1, 1) prior).  Each trial samples one value per machine
# from its posterior, plays the machine with the highest draw, and applies the
# conjugate update (a, b) += (reward, 1 - reward) to that machine.
#
# Fixes vs. the original:
#  * the posterior refresh drew a full (4, 200) matrix via
#    np.random.beta(..., size=shape)[m] only to keep one row -- now draws
#    just the 200 samples for machine m.
#  * regret used `thetam[k]` where `k` was the stale inner-loop index
#    (always 3); regret is now measured against the true payoffs of the
#    chosen machine.
#  * unused `m_prob` and `shape` removed.
machines = [0, 1, 2, 3]
payoffs = [0.07, 0.11, 0.10, 0.08]
n_trials = 10000
n_machines = len(machines)
n_post_samples = 200

a = np.ones(n_machines)        # Beta alpha: 1 + successes per machine
b = np.ones(n_machines)        # Beta beta:  1 + failures per machine
thetam = np.zeros(n_machines)  # this trial's posterior draw per machine
m = 0                          # machine played last trial (row to refresh)

# cached posterior samples per machine; priors start uniform on (0, 1)
beta_post = np.random.uniform(0, 1, size=(n_machines, n_post_samples))
regret = np.zeros(n_trials)
total_reward = 0

for i in range(n_trials):
    # refresh the sample cache only for the machine whose posterior changed
    beta_post[m, :] = np.random.beta(a[m], b[m], size=n_post_samples)
    # non-stationary arm, same pattern as the greedy cell
    payoffs[3] = .15 if i % 3 == 0 else .007

    for k in range(n_machines):
        # Thompson step: sample from each posterior instead of taking its
        # mean -- machines with high uncertainty keep getting explored
        thetam[k] = np.random.choice(beta_post[k, :])

    # play the machine with the highest posterior draw
    m = machines[int(np.argmax(thetam))]
    reward = np.random.binomial(1, p=payoffs[m])

    # true regret: best available payout this trial minus the chosen one
    regret[i] = max(payoffs) - payoffs[m]

    # conjugate Beta-Bernoulli update
    a[m] += reward
    b[m] += 1 - reward
    total_reward += reward

print(total_reward)
print(a)
print(b)
# In[278]:
# Left panel: the raw cached posterior samples per machine.
# Right panel: a KDE of each machine's posterior sample distribution.
plt.figure(figsize=(14, 4))
plt.subplot(121)
for i in range(len(machines)):
    plt.plot(beta_post[i, :], alpha=.4, label=i)
plt.subplot(122)
for i in range(len(machines)):
    # sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
    # kdeplot reproduces distplot(..., hist=False)
    sns.kdeplot(beta_post[i, :], label=i)
# Scratch cells left over from interactive exploration.  Each bare expression
# below only displayed its value in the notebook; as a script they evaluate
# and discard their results (they rely on `k`, `a`, `b`, `beta_post` leaking
# from the Thompson-sampling cell above).
# In[210]:
np.random.choice(beta_post[k, :])
# In[241]:
b
# In[242]:
a
# In[211]:
beta_post[k, :]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment