Skip to content

Instantly share code, notes, and snippets.

@tmellan
Created April 25, 2020 15:28
Show Gist options
  • Save tmellan/6a1f8e93ff7dbbb2339075dd6aaa57a0 to your computer and use it in GitHub Desktop.
Save tmellan/6a1f8e93ff7dbbb2339075dd6aaa57a0 to your computer and use it in GitHub Desktop.
pool hangs
#!/usr/bin/env python
# coding: utf-8
# In[1]:
get_ipython().run_line_magic('cd', '~/Documents/TI')
# In[2]:
import pystan
import os
import pandas as pd
import numpy as np
import scipy
import time
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns
import multiprocessing
from TI_functions import *
sns.set()
# In[3]:
#os.environ['STAN_NUM_THREADS'] = "1"
# In[4]:
pool=multiprocessing.Pool(10)
lambdaVals = np.arange(0,1,0.1)
# In[5]:
#### load and clean the data ##########################################
diabData = pd.read_csv('diabetes.csv')
df = diabData[(diabData.BMI != 0) & (diabData.Glucose != 0) & (diabData.BloodPressure != 0) & (diabData.Insulin != 0) & (diabData.SkinThickness != 0)]
df.drop(columns = ['BloodPressure','SkinThickness','Insulin'], inplace = True)
x = df.iloc[:, :-1].values # remove the output
x = preprocessing.scale(x) # standardise x
y = df['Outcome'].values
# the data should be cleaned, expect 532 rows not 644
# In[6]:
#### build a Stan model ######################################################
code = """
data {
int N; //the number of observations
int K; //the number of features (variables)
int y[N]; //the response
matrix[N,K] X; //the model matrix (all observations)
}
parameters {
real theta0; // intercept
vector[K] theta; // regression coefficients
}
transformed parameters {}
model {
theta0 ~ normal(0,1); // prior for the intercept
for(i in 1:K)
theta[i] ~ normal(0,1); // prior for the regression coefficients
y ~ bernoulli_logit(theta0 + X * theta); // likelihood
}
generated quantities {
// this must be wrong but i'm desperate
real log_lik;
log_lik = binomial_lpmf(y | N, inv_logit(theta0 + X * theta));
}"""
# In[7]:
model = pystan.StanModel(model_code=code)
data = {'N': x.shape[0], 'K': x.shape[1], 'y': y, 'X':x}
fit = model.sampling(data=data, iter=50000,n_jobs=-1)
print(fit)
# In[8]:
df = fit.to_dataframe()
samples = df.loc[:, df.columns.str.startswith('theta')].values
cov = np.cov(np.array(samples), rowvar = False)
covinv = np.linalg.inv(cov)
theta_mean = samples.mean(axis = 0)
mu = theta_mean.copy() # these look similar to the examples on kaggle
mu
# In[9]:
# i want to find the posterior mode and I want it quick
print(df[df.log_lik == df.log_lik.max()])
a = df[df.log_lik == df.log_lik.max()]
theta_mode = a.loc[:, a.columns.str.startswith('theta')].values
mu = theta_mode.reshape(theta_mean.shape)
mu
# In[10]:
def f(x):
xcp = x.copy()
if len(x) == 5:
x_tmp = np.r_[1,xcp]
else:
x_tmp = xcp
x_tmp[0] = 1
exp = np.exp((x_tmp * theta_mean).sum())
pi = exp / (1+ exp)
return pi
def fArray(array_args):
return np.apply_along_axis(f,1,array_args)
y_pred = fArray(x)
y_pred_bin = list(y_pred)
y_pred_bin = np.asarray([round(el) for el in y_pred_bin])
from sklearn.metrics import confusion_matrix
cf = confusion_matrix(y, y_pred_bin) # this also looks similar to the kaggle example
cf
# In[11]:
# this is built using the sampled theta values from the mcmc
def frefCovarianceArray(array_args):
colOnes = np.ones([array_args.shape[0],1])
array_args = np.c_[colOnes,array_args] # to handle the intercept
f0 = f(mu)
return np.apply_along_axis(frefMatrixCovariance,1,array_args, f0, mu, cov)
# In[12]:
###### TI ##########################################
def replace_in_stan_code(stan_code, str_to_replace, value):
return stan_code.replace(str_to_replace, str(value))
custom_pdf_code2 = """
functions {
real lf(vector x, vector peak)
{
real ex = exp(x' * peak);
real result = ex / (1 + ex);
return result;
}
real lfref(vector x, real f0, vector peak, matrix C)
{
return f0 - 0.5 * (x-peak)' * C * (x-peak);
}
real path_lpdf(vector x, real f0, vector peak, matrix C, real lambda)
{
row_vector[6] x_tmp = append_col(1, x');
vector[6] x_new = x_tmp';
return lambda * log(lf(x_new, peak)) + (1-lambda) * lfref(x_new, f0, peak, C);
}
}
data {
int N; // number of variables
matrix[N,N] C; // inverse of the covariates matrix
vector[N] peak; // typically mean or mode
real f0; // log(f(peak))
real lambda; // used for creating a path between f and f_ref
}
transformed data {}
parameters {
vector[N-1] y;
}
transformed parameters {}
model {
y ~ path(f0, peak, C, lambda);
}
generated quantities {}
"""
# In[13]:
t0 = time.time()
m = pystan.StanModel(model_code=custom_pdf_code2)
print(round(time.time() - t0,2), 'seconds elapsed for the stan model compilation')
# In[14]:
data = {'N': len(mu),'C': covinv, 'peak': mu, 'f0': np.log(f(mu))}
def get_expect_for_lambda(lam):
n_iter = 50000 #change
data.update({'lambda': lam})
fit = m.sampling(data=data, iter=n_iter, n_jobs=1)
df = fit.to_dataframe()
samples = df.loc[:, df.columns.str.startswith('y')].values
vals = np.log(fArray(samples) / frefCovarianceArray(samples))
expects = vals.mean()
variance = vals.var()
sdErrs = np.sqrt(variance/len(vals))
return {'expects': expects, 'variance': variance,
'sdErrs': sdErrs, 'steps': n_iter}
def MCMC_for_all_lambdas(lambdaVals):
"""Execute TI MCMC for multiple lambdas.
lambdaVals: list of values of lambdas"""
t0 = time.time()
lambdaOutput = {}
for l in lambdaVals:
lam = round(l,1)
print(lam)
lambdaOutput.update({lam: get_expect_for_lambda(lam)})
print(time.time() - t0, 'seconds elapsed')
return lambdaOutput
# In[15]:
#Serial version
#lambda_dict = MCMC_for_all_lambdas(lambdaVals)
# In[18]:
poolout=pool.map(MCMC_for_one_lambdas, lambdaVals)
lambda_dict=dict((key,d[key]) for d in poolout for key in d)
lambda_dict
# In[19]:
lambdaVals = list(lambda_dict.keys())
expectsPerLambda = []
# if lambda_dict[0]['expects'] == 1 * np.inf:
# print('Expectation for lambda = 0 is -inf, removing this value')
# del lambda_dict[0]
for lam in lambdaVals:
expectsPerLambda.append(lambda_dict[lam]['expects'])
tck = scipy.interpolate.splrep(lambdaVals, expectsPerLambda, s=0)
xnew = np.linspace(0, 1)
ynew = scipy.interpolate.splev(xnew, tck, der=0)
plt.plot(xnew,ynew)
plt.scatter(lambdaVals, expectsPerLambda)
plt.xlim([0,1])
plt.xlabel('lambda')
plt.ylabel('expectation')
plt.show()
# calculate exponent of the integral of the interpolated line
logzExtra = scipy.interpolate.splint(lambdaVals[0], lambdaVals[-1], tck, full_output=0)
logzExtra
# In[20]:
logzExtra_covariance = logzExtra
# logzExtra_covariance = get_Zextra(lambda_dict)
logzRefCovariance = np.log(gaussianIntegral(np.linalg.inv(cov))) + np.log(f(mu))
logzTI_covariance = logzExtra_covariance + logzRefCovariance
print('Covariance approximation')
print('log zRef = ', logzRefCovariance)
print('log zExtra = ', logzExtra_covariance)
print('log(zTI) = ', logzTI_covariance)
# In[21]:
pool.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment