Minimal code example to recreate the issue: the same PyMC3 model yields finite test-point log-probabilities when its inputs are theano shared variables, but the Binomial likelihood evaluates to -inf when the inputs are wrapped in pm.Minibatch.
import pandas as pd
import pymc3 as pm
from sklearn.preprocessing import LabelEncoder
from theano import shared
import theano.tensor as tt

measurement = pd.read_csv('anonymized_data.csv')
predictors = ['network', 'content_id', 'genre_id', 'weekday_id', 'stratified_hour_id']
train = measurement.loc[measurement.leave_out_set == 0]
test = measurement.loc[measurement.leave_out_set == 1]

# Mimic the encoding done in the actual program so that the encoders can be
# used during model construction (only their classes_.size is consulted below).
encoders = {}
for predictor in predictors:
    encoder = LabelEncoder()
    encoder.fit(measurement[predictor])
    encoders[predictor] = encoder
def construct_model(model_variables, encoders):
    with pm.Model() as model:
        intercept = pm.Normal('intercept', mu=0.0, tau=1 / (2 ** 2), shape=1)

        # Non-centered parameterization for each categorical predictor:
        # coeff = 0.0 + offset * sigma, with one entry per encoded level.
        coeffs = {}
        for predictor, encoder in encoders.items():
            sigma = pm.HalfNormal(predictor + '_coeff_sigma', sd=5)
            offset = pm.Normal(
                predictor + '_coeff_offset', mu=0, sd=1,
                shape=encoder.classes_.size
            )
            coeffs[predictor] = pm.Deterministic(
                predictor + '_coeff', 0.0 + offset * sigma
            )

        # Linear predictor: the intercept plus each row's coefficient for
        # every categorical predictor.
        link_argument = intercept + sum(
            coeffs[predictor][model_variables[predictor]]
            for predictor in coeffs
        )
        omega = pm.Deterministic('omega', pm.invlogit(link_argument))
        kappa = pm.HalfStudentT('kappa', nu=3, sd=3)

        # Mean parameter for the individual data points.
        mu = pm.Beta(
            'mu', alpha=omega * kappa + 1, beta=(1 - omega) * kappa + 1,
            # number of observations
            shape=model_variables['y_obs'].eval().size,
            # total_size=model_variables['y_obs'].shared.eval().size
        )
        likelihood = pm.Binomial(
            'likelihood',
            p=mu,
            n=tt.cast(model_variables['n'], 'int64'),
            observed=tt.cast(model_variables['y_obs'], 'int64'),
            # total_size=model_variables['y_obs'].shared.eval().size
        )
        # Rescale coefficients to be deflections from the baseline:
        # b_0 = pm.Deterministic('b_0', tt.mean(link_argument))
        # b_1 = pm.Deterministic('b_1', link_argument[model_variables['network']] - b_0)
        # b_2 = pm.Deterministic('b_2', link_argument[model_variables['content_id']] - b_0)
        # b_3 = pm.Deterministic('b_3', link_argument[model_variables['genre_id']] - b_0)
        # b_4 = pm.Deterministic('b_4', link_argument[model_variables['weekday_id']] - b_0)
        # b_5 = pm.Deterministic('b_5', link_argument[model_variables['stratified_hour_id']] - b_0)
    return model
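
# Side note (illustrative, not part of the original repro): the Beta prior on
# mu uses the mode/concentration parameterization, i.e. omega is the mode of
# each row's mu and kappa its concentration. A quick numeric check with
# arbitrary values:
_omega, _kappa = 0.3, 10.0
_alpha = _omega * _kappa + 1
_beta = (1 - _omega) * _kappa + 1
# The mode of Beta(a, b) for a, b > 1 is (a - 1) / (a + b - 2), which reduces
# to omega under this parameterization.
assert abs((_alpha - 1) / (_alpha + _beta - 2) - _omega) < 1e-12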

# Creating theano shared variables results in no error: every RV's logp at the
# test point evaluates to a finite value.
model_variables = {}
# Nominal predictors of the model.
for predictor in predictors:
    model_variables[predictor] = shared(train[predictor].values)
model_variables['y_obs'] = shared(train.y.values)
model_variables['n'] = shared(train.n.values)

model = construct_model(model_variables, encoders)
for RV in model.basic_RVs:
    print(RV.name, RV.logp(model.test_point))
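
# Aside (sketch, not exercised in this repro): with shared variables, the
# held-out set could later be swapped in without rebuilding the model, e.g.:
# for predictor in predictors:
#     model_variables[predictor].set_value(test[predictor].values)
# model_variables['y_obs'].set_value(test.y.values)
# model_variables['n'].set_value(test.n.values)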

# However, when the same data are wrapped in pm.Minibatch, the model's test
# point evaluates to -inf for the likelihood.
batch_size = 2000
random_seed = 42
model_variables = {}
# Nominal predictors of the model.
for predictor in predictors:
    model_variables[predictor] = pm.Minibatch(train[predictor].values, batch_size=batch_size)
model_variables['y_obs'] = pm.Minibatch(train.y.values, batch_size=batch_size)
model_variables['n'] = pm.Minibatch(train.n.values, batch_size=batch_size)

model = construct_model(model_variables, encoders)
for RV in model.basic_RVs:
    print(RV.name, RV.logp(model.test_point))
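
# Aside (a possible workaround sketch, not from the original repro): drawing y
# and n through a single pm.Minibatch would keep their rows aligned, since one
# random stream then picks the row indices for both columns, e.g.:
# yn = pm.Minibatch(train[['y', 'n']].values, batch_size=batch_size)
# y_mb, n_mb = yn[:, 0], yn[:, 1]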

# Find all rows where y > n, i.e. rows that would make the Binomial likelihood
# evaluate to -inf.
df = pd.DataFrame({'y': model_variables['y_obs'].eval(), 'n': model_variables['n'].eval()})
minibatch = df.loc[df['y'] > df['n']]

df = pd.DataFrame({'y': model_variables['y_obs'].shared.eval(), 'n': model_variables['n'].shared.eval()})
minibatch_shared = df.loc[df['y'] > df['n']]

df = pd.DataFrame({'y': shared(train.y.values).eval(), 'n': shared(train.n.values).eval()})
shared_df = df.loc[df['y'] > df['n']]

# These counts show that eval() seems to be returning mismatched indices for
# the minibatch slices, which is what causes the likelihood to blow up.
print(len(minibatch), len(minibatch_shared), len(shared_df))
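
# Minimal illustration (with made-up data, added for clarity) of the eval()
# behaviour above: consecutive eval() calls on the same pm.Minibatch draw
# different random slices, so an out-of-band eval() can desynchronize variables
# that are expected to be sliced with the same row indices.
import numpy as np  # only needed for this illustration
demo = pm.Minibatch(np.arange(10000), batch_size=5)
print(demo.eval())  # one random slice of 5 values
print(demo.eval())  # almost surely a different slice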