@fohria
Created April 6, 2020 15:45
pymc3 bug report
import theano
import theano.tensor as tt
import numpy as np
import pymc3 as pm
import arviz as az
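

# update_qvalsQL does one trial of the Q-learning model: choice probabilities
# are a softmax over the current Q-values scaled by tau, and the chosen
# action's Q-value is then updated with the temporal-difference error
#   Q[a] <- Q[a] + alpha * (reward + gamma * max(Q) - Q[a])
# Note that probs is computed from the Q-values before the update, so it is
# the probability of the action taken on this trial.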
def update_qvalsQL(action, reward, qvals, alpha, tau, gamma):
    probs = tt.nnet.softmax(qvals * tau)
    probs = probs[0]  # because softmax returns array inside array
    error = reward - qvals[action] + gamma * tt.max(qvals)
    qvals = tt.set_subtensor(qvals[action],
                             qvals[action] + alpha * error)
    return [qvals, probs]
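

# categorical_actionsQL runs update_qvalsQL over the whole action/reward
# sequence with theano.scan. qvals is carried from trial to trial through
# outputs_info, while probs is only collected (the None entry means it is not
# fed back). output[1] stacks the per-trial probability vectors, which serve
# as p for the Categorical likelihood below.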
def categorical_actionsQL(actions, rewards, alpha, tau, gamma):
    # initial qvalues for each action
    qvals_init = 0.5 * tt.ones((2), dtype='float64')
    output, updates = theano.scan(fn=update_qvalsQL,
                                  sequences=[actions, rewards],
                                  outputs_info=[qvals_init, None],
                                  non_sequences=[alpha, tau, gamma])
    return output[1]


actions = theano.shared(np.array([0,1,0,1,1,0,0], dtype='int16'))
rewards = theano.shared(np.array([4,2,6,3,2,5,4], dtype='int16'))
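
# Optional sanity check (a quick sketch, using arbitrary placeholder values
# 0.1 / 1.0 / 0.9 for alpha, tau and gamma): evaluating the scan output with
# fixed parameters should give one probability row per trial, i.e. shape
# (7, 2) for these data.
check_probs = categorical_actionsQL(actions, rewards,
                                    tt.constant(0.1),
                                    tt.constant(1.0),
                                    tt.constant(0.9))
print(check_probs.eval().shape)  # expected: (7, 2)
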
with pm.Model() as qlearn3:
    alpha = pm.Beta('alpha', alpha=1, beta=1)
    tau = pm.HalfNormal('tau', 10)
    gamma = pm.Beta('gamma', alpha=1, beta=1)
    probs = categorical_actionsQL(actions, rewards, alpha, tau, gamma)
    like = pm.Categorical('like', p=probs, observed=actions)
    trace = pm.sample()

print(az.summary(trace))