Created
March 12, 2018 14:29
-
-
Save anonymous/50c8dc69fb9dea41eea5032cf5e31cd7 to your computer and use it in GitHub Desktop.
Minimal code example to recreate issue
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import pymc3 as pm | |
from sklearn.preprocessing import LabelEncoder | |
from theano import shared | |
import theano.tensor as tt | |
measurement = pd.read_csv('anonymized_data.csv') | |
predictors = ['network', 'content_id', 'genre_id', 'weekday_id', 'stratified_hour_id'] | |
train = measurement.loc[measurement.leave_out_set == 0] | |
test = measurement.loc[measurement.leave_out_set == 1] | |
# This is to mimic encoding done in actual program so as to be used during model construction | |
encoders = {} | |
for predictor in predictors: | |
encoder = LabelEncoder() | |
encoder.fit(measurement[predictor]) | |
encoders[predictor] = encoder | |
def construct_model(model_variables, encoders): | |
with pm.Model() as model: | |
intercept = pm.Normal( | |
'intercept', mu=0.0, tau=1 / (2 ** 2), shape=1 | |
) | |
network_coeff_sigma = pm.HalfNormal('network_coeff_sigma', sd=5) | |
network_coeff_offset = pm.Normal( | |
'network_coeff_offset', mu=0, sd=1, | |
shape=encoders['network'].classes_.size | |
) | |
network_coeff = pm.Deterministic( | |
'network_coeff', | |
0.0 + network_coeff_offset * network_coeff_sigma | |
) | |
content_id_coeff_sigma = pm.HalfNormal('content_id_coeff_sigma', sd=5) | |
content_id_coeff_offset = pm.Normal( | |
'content_id_coeff_offset', mu=0, sd=1, | |
shape=encoders['content_id'].classes_.size | |
) | |
content_id_coeff = pm.Deterministic( | |
'content_id_coeff', | |
0.0 + content_id_coeff_offset * content_id_coeff_sigma | |
) | |
genre_id_coeff_sigma = pm.HalfNormal('genre_id_coeff_sigma', sd=5) | |
genre_id_coeff_offset = pm.Normal( | |
'genre_id_coeff_offset', mu=0, sd=1, | |
shape=encoders['genre_id'].classes_.size | |
) | |
genre_id_coeff = pm.Deterministic( | |
'genre_id_coeff', | |
0.0 + genre_id_coeff_offset * genre_id_coeff_sigma | |
) | |
weekday_id_coeff_sigma = pm.HalfNormal('weekday_id_coeff_sigma', sd=5) | |
weekday_id_coeff_offset = pm.Normal( | |
'weekday_id_coeff_offset', mu=0, sd=1, | |
shape=encoders['weekday_id'].classes_.size | |
) | |
weekday_id_coeff = pm.Deterministic( | |
'weekday_id_coeff', | |
0.0 + weekday_id_coeff_offset * weekday_id_coeff_sigma | |
) | |
stratified_hour_id_coeff_sigma = pm.HalfNormal('stratified_hour_id_coeff_sigma', sd=5) | |
stratified_hour_id_coeff_offset = pm.Normal( | |
'stratified_hour_id_coeff_offset', mu=0, sd=1, | |
shape=encoders['stratified_hour_id'].classes_.size | |
) | |
stratified_hour_id_coeff = pm.Deterministic( | |
'stratified_hour_id_coeff', | |
0.0 + stratified_hour_id_coeff_offset * stratified_hour_id_coeff_sigma | |
) | |
# Parameters for categories | |
link_argument = ( | |
intercept + | |
network_coeff[model_variables['network']] + | |
content_id_coeff[model_variables['content_id']] + | |
genre_id_coeff[model_variables['genre_id']] + | |
weekday_id_coeff[model_variables['weekday_id']] + | |
stratified_hour_id_coeff[model_variables['stratified_hour_id']] | |
) | |
omega = pm.Deterministic('omega', pm.invlogit(link_argument)) | |
kappa = pm.HalfStudentT('kappa', nu=3, sd=3) | |
# Mean parameter for individual data | |
mu = pm.Beta( | |
'mu', alpha=omega * kappa + 1, beta=(1 - omega) * kappa + 1, | |
# number of observations | |
shape=model_variables['y_obs'].eval().size, | |
# total_size=model_variables['y_obs'].shared.eval().size | |
) | |
likelihood = pm.Binomial( | |
'likelihood', | |
p=mu, | |
n=tt.cast(model_variables['n'], 'int64'), | |
observed=tt.cast(model_variables['y_obs'], 'int64'), | |
# total_size=model_variables['y_obs'].shared.eval().size | |
) | |
# Rescale coefficients to be deflections from baseline | |
# b_0 = pm.Deterministic('b_0', tt.mean(link_argument)) | |
# b_1 = pm.Deterministic('b_1', link_argument[model_variables['network']] - b_0) | |
# b_2 = pm.Deterministic('b_2', link_argument[model_variables['content_id']] - b_0) | |
# b_3 = pm.Deterministic('b_3', link_argument[model_variables['genre_id']] - b_0) | |
# b_4 = pm.Deterministic('b_4', link_argument[model_variables['weekday_id']] - b_0) | |
# b_5 = pm.Deterministic('b_5', link_argument[model_variables['stratified_hour_id']] - b_0) | |
return model | |
# creating shared variables results in no error in test point evaluation | |
model_variables = {} | |
# Nominal Predictors of model | |
for predictor in predictors: | |
model_variables[predictor] = shared(train[predictor].values) | |
model_variables['y_obs'] = shared(train.y.values) | |
model_variables['n'] = shared(train.n.values) | |
model = construct_model(model_variables, encoders) | |
for RV in model.basic_RVs: | |
print(RV.name, RV.logp(model.test_point)) | |
# However, when repeating with minibatch, model test point evaluates to inf for likelihood | |
batch_size = 2000 | |
random_seed = 42 | |
model_variables = {} | |
# Nominal Predictors of model | |
for predictor in encoders: | |
encoder = encoders[predictor] | |
model_variables[predictor] = pm.Minibatch(train[predictor].values, batch_size=batch_size) | |
model_variables['y_obs'] = pm.Minibatch(train.y.values, batch_size=batch_size) | |
model_variables['n'] = pm.Minibatch(train.n.values, batch_size=batch_size) | |
model = construct_model(model_variables, encoders) | |
for RV in model.basic_RVs: | |
print(RV.name, RV.logp(model.test_point)) | |
# Find all rows where y > n, i.e. rows that would cause likelihood to be inf. | |
df = pd.DataFrame({'y': model_variables['y_obs'].eval(), 'n': model_variables['y_obs'].eval()}) | |
minibatch = df.loc[df['y'] > df['n']] | |
df = pd.DataFrame({'y': model_variables['y_obs'].shared.eval(), 'n': model_variables['y_obs'].shared.eval()}) | |
minibatch_shared = df.loc[df['y'] > df['n']] | |
df = pd.DataFrame({'y': shared(train.y.values).eval(), 'n': shared(train.n.values).eval()}) | |
shared_df = df.loc[df['y'] > df['n']] | |
# these results show that eval() seems to be giving the wrong index and is what is causing the likelihood to blow up. | |
len(minibatch), len(minibatch_shared), len(shared_df) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment