@conormm
Created September 29, 2018 19:21
import numpy as np

# BaseSampler is assumed to be defined elsewhere and to initialize
# self.variants, self.a, self.b, self.thetas, self.thetaregret,
# self.ad_i, self.r_i, self.i, and self.reward.
class ThompsonSampler(BaseSampler):
    def __init__(self, env):
        super().__init__(env)

    def choose_k(self):
        # Sample a payout rate for each variant from its Beta(a, b) posterior
        # (this is the Thompson sampling step). Sampling, rather than using the
        # posterior mean, drives exploration: variants with greater uncertainty
        # can still produce the highest draw and get selected.
        self.theta = np.random.beta(self.a, self.b)
        # Play the variant with the highest sampled payout probability.
        self.k = self.variants[np.argmax(self.theta)]
        return self.k

    def update(self):
        # Conjugate Beta-Bernoulli update: (a, b) <- (a, b) + (reward, 1 - reward).
        self.a[self.k] += self.reward
        # b is only incremented on a miss: 1 - 0 = 1, 1 - 1 = 0.
        self.b[self.k] += 1 - self.reward
        # Log the sampled rate, per-step regret, chosen variant, and reward.
        self.thetas[self.i] = self.theta[self.k]
        self.thetaregret[self.i] = np.max(self.thetas) - self.theta[self.k]
        self.ad_i[self.i] = self.k
        self.r_i[self.i] = self.reward
        self.i += 1
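
For context, here is a minimal, hypothetical harness for running the class above. The BaseSampler stand-in only initializes the attributes ThompsonSampler reads and writes; the gist's actual base class is not shown here and may differ. (The stand-in would need to be defined before the ThompsonSampler class statement above runs.)

import numpy as np

# Hypothetical stand-in for BaseSampler: just the state ThompsonSampler uses.
class BaseSampler:
    def __init__(self, env, n_iter=1000):
        self.env = env                       # true payout rate per variant
        self.variants = np.arange(len(env))  # variant indices
        self.a = np.ones(len(env))           # Beta prior: alpha (successes + 1)
        self.b = np.ones(len(env))           # Beta prior: beta (failures + 1)
        self.thetas = np.zeros(n_iter)       # sampled rate of the chosen variant
        self.thetaregret = np.zeros(n_iter)  # per-step regret estimate
        self.ad_i = np.zeros(n_iter)         # chosen variant per step
        self.r_i = np.zeros(n_iter)          # observed reward per step
        self.i = 0
        self.reward = 0

# Simulate a Bernoulli bandit with hidden payout rates and run the sampler.
env = [0.05, 0.10, 0.30]
sampler = ThompsonSampler(env)
rng = np.random.default_rng(42)
for _ in range(1000):
    k = sampler.choose_k()
    sampler.reward = int(rng.random() < env[k])  # 1 on payout, 0 otherwise
    sampler.update()

print("pulls per variant:", np.bincount(sampler.ad_i.astype(int), minlength=len(env)))
print("total reward:", sampler.r_i.sum())

With these rates, the pull counts should concentrate on the highest-paying variant as the posteriors sharpen.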