Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save robinvanemden/b61b047f9a9cc5d069d5fc89a4cc1a06 to your computer and use it in GitHub Desktop.
Save robinvanemden/b61b047f9a9cc5d069d5fc89a4cc1a06 to your computer and use it in GitHub Desktop.
import numpy as np
class ContextualThompson(object):
    """Thompson Sampling for contextual bandits with linear payoffs.

    Each arm keeps a Gaussian posterior over a d-dimensional coefficient
    vector.  An action is chosen by drawing one coefficient sample per arm
    and picking the arm whose sample scores the context highest.
    """

    def __init__(self, d=10, R=0.01, epsilon=0.5, delta=1.0, n_arms=10):
        """Set up per-arm sufficient statistics.

        d: context dimensionality.
        R, epsilon, delta: hyper-parameters controlling the exploration
            scale used in get_action.
        n_arms: number of arms.
        """
        self.n_arms = n_arms
        self.d = d
        self.R = R
        self.delta = delta
        self.epsilon = epsilon
        self.t = 0  # round counter, bumped on every get_action call
        # Per-arm posterior mean, response accumulator, and design matrix.
        self.mu_hat = [np.zeros((self.d, 1)) for _ in range(n_arms)]
        self.f = [np.zeros((self.d, 1)) for _ in range(n_arms)]
        self.B = [np.identity(self.d) for _ in range(n_arms)]

    def get_action(self, context):
        """Draw one posterior sample per arm and return the argmax arm index."""
        self.t += 1
        # Exploration scale: grows with log(t/delta), shrinks with epsilon.
        scale = self.R * np.sqrt(9 / self.epsilon * self.d * np.log(self.t / self.delta))
        ctx = np.array(context)
        sampled_scores = [
            ctx.dot(
                np.random.multivariate_normal(
                    self.mu_hat[arm_idx].flat,
                    scale ** 2 * np.linalg.inv(self.B[arm_idx]),
                )
            )
            for arm_idx in range(self.n_arms)
        ]
        return np.argmax(sampled_scores)

    def reward(self, context, action, reward):
        """Fold the observed reward into the chosen arm's posterior."""
        col = np.array([context]).T  # context as a (d, 1) column vector
        self.B[action] = self.B[action] + col.dot(col.T)
        self.f[action] = self.f[action] + reward * col
        # Posterior mean of the linear model: mu_hat = B^{-1} f.
        self.mu_hat[action] = np.linalg.inv(self.B[action]).dot(self.f[action])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment