Skip to content

Instantly share code, notes, and snippets.

@conormm
Created September 21, 2018 16:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save conormm/d5149a6e2191be6403e8a954012b042c to your computer and use it in GitHub Desktop.
# coding: utf-8
# Jupyter-notebook export (nbconvert): the "In[...]" markers throughout are
# the original notebook cell numbers and are not sequential.
# In[150]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
# IPython-only magic: `get_ipython` exists only inside an IPython/Jupyter
# session, so this line raises NameError when run as a plain script.
get_ipython().run_line_magic('matplotlib', 'inline')
# In[25]:
# Four slot machines; machine i pays out 1 with probability payoffs[i],
# otherwise 0 (a Bernoulli pull).
machines = [0, 1, 2, 3]
payoffs = [0.07, 0.11, 0.22, 0.24]

# In[26]:
# Baseline strategy: pick a machine uniformly at random on every trial and
# accumulate the total reward.  (The original also created an unused
# `payouts` list that was never appended to; removed.)
n_payouts_random = 0  # running total reward over all trials
for _ in range(10000):
    machine = np.random.choice(machines)
    # one Bernoulli pull of the chosen machine
    n_payouts_random += np.random.binomial(1, p=payoffs[machine])
print(f"Sum of the reward is: {n_payouts_random}")
# In[275]:
# Epsilon-greedy bandit: explore at random for the first `n_learning` trials,
# then exploit the machine with the best empirical payout rate, while still
# exploring a uniformly random machine with probability `e` (2%) each trial.
#
# Fixes vs. the original:
#  * `machine` was read before any assignment in this cell (it relied on a
#    variable leaked from an earlier cell) -- now always assigned explicitly.
#  * `if i % 100:` updated the estimates on every trial EXCEPT multiples of
#    100, the opposite of the stated intent -- estimates now update each trial.
#  * `successes / m_chosen` divided by zero for never-chosen machines,
#    yielding NaNs -- guarded with np.divide(..., where=...).
#  * unused counter `t` removed.
machines = [0, 1, 2, 3]
n_learning = 2000
payoffs = [0.07, 0.11, 0.10, 0.08]
e = 0.02
n_trials = 10000
successes = np.zeros(4)   # payout count per machine
m_chosen = np.zeros(4)    # pull count per machine
m_prob = np.zeros(4)      # empirical payout rate per machine
n_payouts_greedy = 0
regret = np.zeros(n_trials)

for i in range(n_trials):
    # non-stationary arm: machine 3 alternates between a high and a low rate
    payoffs[3] = .15 if i % 3 == 0 else .007

    if i < n_learning:
        # pure exploration phase: uniform random machine
        machine = np.random.choice(machines)
    else:
        # exploitation: machine with the best payout rate observed so far
        machine = int(np.argmax(m_prob))

    # epsilon step: with probability e, override with a random machine
    if np.random.uniform(0, 1) > (1 - e):
        machine = np.random.choice(machines)

    m = np.random.binomial(1, p=payoffs[machine])

    # update counts and empirical payout estimates (0/0 guarded to 0)
    m_chosen[machine] += 1
    successes[machine] += m
    m_prob = np.divide(successes, m_chosen,
                       out=np.zeros(4), where=m_chosen > 0)

    n_payouts_greedy += m
    # pseudo-regret of this pull against the best empirical machine
    regret[i] = np.max(m_prob) - m_prob[machine]

print(f"Sum of the reward is: {n_payouts_greedy}")
# In[248]:
# NOTE(review): in the notebook this cell was executed AFTER the
# Thompson-sampling cell below, so `a` and `b` already existed.  Run top to
# bottom as a script they are not yet defined and the bare prints raised
# NameError -- guard them so the script keeps running either way.
if "a" in globals() and "b" in globals():
    print(a)
    print(b)
# In[277]:
# Thompson sampling: each machine's payout probability carries a Beta(a, b)
# posterior (flat Beta(1, 1) prior).  Each trial samples one value per machine
# from its posterior, plays the machine with the highest draw, and applies the
# conjugate update (a, b) += (reward, 1 - reward) to that machine.
#
# Fixes vs. the original:
#  * the posterior refresh drew a full (4, 200) matrix via
#    np.random.beta(..., size=shape)[m] only to keep one row -- now draws
#    just the 200 samples for machine m.
#  * regret used `thetam[k]` where `k` was the stale inner-loop index
#    (always 3); regret is now measured against the true payoffs of the
#    chosen machine.
#  * unused `m_prob` and `shape` removed.
machines = [0, 1, 2, 3]
payoffs = [0.07, 0.11, 0.10, 0.08]
n_trials = 10000
n_machines = len(machines)
n_post_samples = 200

a = np.ones(n_machines)        # Beta alpha: 1 + successes per machine
b = np.ones(n_machines)        # Beta beta:  1 + failures per machine
thetam = np.zeros(n_machines)  # this trial's posterior draw per machine
m = 0                          # machine played last trial (row to refresh)

# cached posterior samples per machine; priors start uniform on (0, 1)
beta_post = np.random.uniform(0, 1, size=(n_machines, n_post_samples))
regret = np.zeros(n_trials)
total_reward = 0

for i in range(n_trials):
    # refresh the sample cache only for the machine whose posterior changed
    beta_post[m, :] = np.random.beta(a[m], b[m], size=n_post_samples)
    # non-stationary arm, same pattern as the greedy cell
    payoffs[3] = .15 if i % 3 == 0 else .007

    for k in range(n_machines):
        # Thompson step: sample from each posterior instead of taking its
        # mean -- machines with high uncertainty keep getting explored
        thetam[k] = np.random.choice(beta_post[k, :])

    # play the machine with the highest posterior draw
    m = machines[int(np.argmax(thetam))]
    reward = np.random.binomial(1, p=payoffs[m])

    # true regret: best available payout this trial minus the chosen one
    regret[i] = max(payoffs) - payoffs[m]

    # conjugate Beta-Bernoulli update
    a[m] += reward
    b[m] += 1 - reward
    total_reward += reward

print(total_reward)
print(a)
print(b)
# In[278]:
# Left panel: the raw cached posterior samples per machine.
# Right panel: a KDE of each machine's posterior sample distribution.
plt.figure(figsize=(14, 4))
plt.subplot(121)
for i in range(len(machines)):
    plt.plot(beta_post[i, :], alpha=.4, label=i)
plt.subplot(122)
for i in range(len(machines)):
    # sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
    # kdeplot reproduces distplot(..., hist=False)
    sns.kdeplot(beta_post[i, :], label=i)
# Scratch cells left over from interactive exploration.  Each bare expression
# below only displayed its value in the notebook; as a script they evaluate
# and discard their results (they rely on `k`, `a`, `b`, `beta_post` leaking
# from the Thompson-sampling cell above).
# In[210]:
np.random.choice(beta_post[k, :])
# In[241]:
b
# In[242]:
a
# In[211]:
beta_post[k, :]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment