# maxpagels/linear-thompson-contextual-bandit.py

## linear-thompson-contextual-bandit.py
import numpy as np

class ContextualThompson(object):
  """Linear Thompson-sampling contextual bandit with one model per arm.

  Every arm keeps three statistics:

  * ``B``      -- d x d matrix: identity plus the sum of context outer products,
  * ``f``      -- d x 1 vector: sum of ``reward * context``,
  * ``mu_hat`` -- d x 1 vector: ``inv(B).dot(f)``.

  ``get_action`` draws one parameter vector per arm from
  ``N(mu_hat, v**2 * inv(B))`` and returns the arm whose draw scores the
  context highest.  ``reward`` folds an observed reward into the statistics
  of the arm that was played.

  Args:
    d: Length of each context vector.
    R: Multiplier applied to the sampling scale ``v``.
    epsilon: Divisor inside the square root of ``v``; must be non-zero.
    delta: Divisor of the step counter inside the logarithm of ``v``; the
      logarithm is negative (and ``v`` becomes NaN) whenever ``t < delta``.
    n_arms: Number of arms.
  """

  def __init__(self, d=10, R=0.01, epsilon=0.5, delta=1.0, n_arms=10):
    self.n_arms = n_arms
    self.d = d
    self.R = R
    self.delta = delta
    self.epsilon = epsilon
    self.t = 0  # number of get_action() calls made so far
    self.mu_hat = [np.zeros((self.d, 1)) for _ in range(n_arms)]
    self.f = [np.zeros((self.d, 1)) for _ in range(n_arms)]
    self.B = [np.identity(self.d) for _ in range(n_arms)]
    # Cached inverse of every B[arm].  B[arm] only changes in reward(), so
    # the inverse is refreshed there instead of being recomputed for each
    # arm on every get_action() call.  Code that assigns to self.B directly
    # must refresh the matching entry here as well.
    self.B_inv = [np.identity(self.d) for _ in range(n_arms)]

  def get_action(self, context):
    """Sample a parameter vector for every arm and pick the best arm.

    Increments the step counter ``t`` and draws from NumPy's global random
    state (``np.random``).

    Args:
      context: Sequence of ``d`` numbers describing the current round.

    Returns:
      Index of the arm with the highest sampled score, as returned by
      ``np.argmax`` (the lowest index wins ties).
    """
    self.t += 1
    # With the defaults the first call evaluates log(1 / 1.0) == 0, hence
    # v == 0 and the draw adds no noise to mu_hat.
    v = self.R * np.sqrt(9 / self.epsilon * self.d * np.log(self.t / self.delta))
    x = np.array(context)  # converted once, reused for every arm
    scores = []
    for arm in range(self.n_arms):
      mu_tilde = np.random.multivariate_normal(self.mu_hat[arm].flat, v**2 * self.B_inv[arm])
      scores.append(x.dot(mu_tilde))
    return np.argmax(scores)

  def reward(self, context, action, reward):
    """Update the statistics of arm ``action`` with an observed reward.

    Args:
      context: The context vector that was passed to ``get_action``.
      action: Index of the arm that was played.
      reward: Numeric reward observed for that arm.
    """
    cn = np.array([context]).T  # column vector, shape (d, 1) for a flat context
    self.B[action] += cn.dot(cn.T)
    self.f[action] += reward * cn
    # One inversion per update; the result is reused by get_action().
    self.B_inv[action] = np.linalg.inv(self.B[action])
    self.mu_hat[action] = self.B_inv[action].dot(self.f[action])
	import numpy as np

	class ContextualThompson(object):
	  """Linear Thompson-sampling contextual bandit with one model per arm.

	  Every arm keeps three statistics:

	  * ``B``      -- d x d matrix: identity plus the sum of context outer products,
	  * ``f``      -- d x 1 vector: sum of ``reward * context``,
	  * ``mu_hat`` -- d x 1 vector: ``inv(B).dot(f)``.

	  ``get_action`` draws one parameter vector per arm from
	  ``N(mu_hat, v**2 * inv(B))`` and returns the arm whose draw scores the
	  context highest.  ``reward`` folds an observed reward into the statistics
	  of the arm that was played.

	  Args:
	    d: Length of each context vector.
	    R: Multiplier applied to the sampling scale ``v``.
	    epsilon: Divisor inside the square root of ``v``; must be non-zero.
	    delta: Divisor of the step counter inside the logarithm of ``v``; the
	      logarithm is negative (and ``v`` becomes NaN) whenever ``t < delta``.
	    n_arms: Number of arms.
	  """

	  def __init__(self, d=10, R=0.01, epsilon=0.5, delta=1.0, n_arms=10):
	    self.n_arms = n_arms
	    self.d = d
	    self.R = R
	    self.delta = delta
	    self.epsilon = epsilon
	    self.t = 0  # number of get_action() calls made so far
	    self.mu_hat = [np.zeros((self.d, 1)) for _ in range(n_arms)]
	    self.f = [np.zeros((self.d, 1)) for _ in range(n_arms)]
	    self.B = [np.identity(self.d) for _ in range(n_arms)]
	    # Cached inverse of every B[arm].  B[arm] only changes in reward(), so
	    # the inverse is refreshed there instead of being recomputed for each
	    # arm on every get_action() call.  Code that assigns to self.B directly
	    # must refresh the matching entry here as well.
	    self.B_inv = [np.identity(self.d) for _ in range(n_arms)]

	  def get_action(self, context):
	    """Sample a parameter vector for every arm and pick the best arm.

	    Increments the step counter ``t`` and draws from NumPy's global random
	    state (``np.random``).

	    Args:
	      context: Sequence of ``d`` numbers describing the current round.

	    Returns:
	      Index of the arm with the highest sampled score, as returned by
	      ``np.argmax`` (the lowest index wins ties).
	    """
	    self.t += 1
	    # With the defaults the first call evaluates log(1 / 1.0) == 0, hence
	    # v == 0 and the draw adds no noise to mu_hat.
	    v = self.R * np.sqrt(9 / self.epsilon * self.d * np.log(self.t / self.delta))
	    x = np.array(context)  # converted once, reused for every arm
	    scores = []
	    for arm in range(self.n_arms):
	      # The covariance is v squared times the inverse of B[arm].
	      mu_tilde = np.random.multivariate_normal(self.mu_hat[arm].flat, v**2 * self.B_inv[arm])
	      scores.append(x.dot(mu_tilde))
	    return np.argmax(scores)

	  def reward(self, context, action, reward):
	    """Update the statistics of arm ``action`` with an observed reward.

	    Args:
	      context: The context vector that was passed to ``get_action``.
	      action: Index of the arm that was played.
	      reward: Numeric reward observed for that arm.
	    """
	    cn = np.array([context]).T  # column vector, shape (d, 1) for a flat context
	    self.B[action] += cn.dot(cn.T)
	    self.f[action] += reward * cn
	    # One inversion per update; the result is reused by get_action().
	    self.B_inv[action] = np.linalg.inv(self.B[action])
	    self.mu_hat[action] = self.B_inv[action].dot(self.f[action])