Created
April 25, 2020 15:28
-
-
Save tmellan/6a1f8e93ff7dbbb2339075dd6aaa57a0 to your computer and use it in GitHub Desktop.
pool hangs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
# In[1]: | |
get_ipython().run_line_magic('cd', '~/Documents/TI') | |
# In[2]: | |
import pystan | |
import os | |
import pandas as pd | |
import numpy as np | |
import scipy | |
import time | |
import matplotlib.pyplot as plt | |
from sklearn import preprocessing | |
import seaborn as sns | |
import multiprocessing | |
from TI_functions import * | |
sns.set() | |
# In[3]: | |
#os.environ['STAN_NUM_THREADS'] = "1" | |
# In[4]: | |
pool=multiprocessing.Pool(10) | |
lambdaVals = np.arange(0,1,0.1) | |
# In[5]: | |
#### load and clean the data ########################################## | |
diabData = pd.read_csv('diabetes.csv') | |
df = diabData[(diabData.BMI != 0) & (diabData.Glucose != 0) & (diabData.BloodPressure != 0) & (diabData.Insulin != 0) & (diabData.SkinThickness != 0)] | |
df.drop(columns = ['BloodPressure','SkinThickness','Insulin'], inplace = True) | |
x = df.iloc[:, :-1].values # remove the output | |
x = preprocessing.scale(x) # standardise x | |
y = df['Outcome'].values | |
# the data should be cleaned, expect 532 rows not 644 | |
# In[6]: | |
#### build a Stan model ###################################################### | |
code = """ | |
data { | |
int N; //the number of observations | |
int K; //the number of features (variables) | |
int y[N]; //the response | |
matrix[N,K] X; //the model matrix (all observations) | |
} | |
parameters { | |
real theta0; // intercept | |
vector[K] theta; // regression coefficients | |
} | |
transformed parameters {} | |
model { | |
theta0 ~ normal(0,1); // prior for the intercept | |
for(i in 1:K) | |
theta[i] ~ normal(0,1); // prior for the regression coefficients | |
y ~ bernoulli_logit(theta0 + X * theta); // likelihood | |
} | |
generated quantities { | |
// this must be wrong but i'm desperate | |
real log_lik; | |
log_lik = binomial_lpmf(y | N, inv_logit(theta0 + X * theta)); | |
}""" | |
# In[7]: | |
model = pystan.StanModel(model_code=code) | |
data = {'N': x.shape[0], 'K': x.shape[1], 'y': y, 'X':x} | |
fit = model.sampling(data=data, iter=50000,n_jobs=-1) | |
print(fit) | |
# In[8]: | |
df = fit.to_dataframe() | |
samples = df.loc[:, df.columns.str.startswith('theta')].values | |
cov = np.cov(np.array(samples), rowvar = False) | |
covinv = np.linalg.inv(cov) | |
theta_mean = samples.mean(axis = 0) | |
mu = theta_mean.copy() # these look similar to the examples on kaggle | |
mu | |
# In[9]: | |
# i want to find the posterior mode and I want it quick | |
print(df[df.log_lik == df.log_lik.max()]) | |
a = df[df.log_lik == df.log_lik.max()] | |
theta_mode = a.loc[:, a.columns.str.startswith('theta')].values | |
mu = theta_mode.reshape(theta_mean.shape) | |
mu | |
# In[10]: | |
def f(x): | |
xcp = x.copy() | |
if len(x) == 5: | |
x_tmp = np.r_[1,xcp] | |
else: | |
x_tmp = xcp | |
x_tmp[0] = 1 | |
exp = np.exp((x_tmp * theta_mean).sum()) | |
pi = exp / (1+ exp) | |
return pi | |
def fArray(array_args): | |
return np.apply_along_axis(f,1,array_args) | |
y_pred = fArray(x) | |
y_pred_bin = list(y_pred) | |
y_pred_bin = np.asarray([round(el) for el in y_pred_bin]) | |
from sklearn.metrics import confusion_matrix | |
cf = confusion_matrix(y, y_pred_bin) # this also looks similar to the kaggle example | |
cf | |
# In[11]: | |
# this is built using the sampled theta values from the mcmc | |
def frefCovarianceArray(array_args): | |
colOnes = np.ones([array_args.shape[0],1]) | |
array_args = np.c_[colOnes,array_args] # to handle the intercept | |
f0 = f(mu) | |
return np.apply_along_axis(frefMatrixCovariance,1,array_args, f0, mu, cov) | |
# In[12]: | |
###### TI ########################################## | |
def replace_in_stan_code(stan_code, str_to_replace, value): | |
return stan_code.replace(str_to_replace, str(value)) | |
custom_pdf_code2 = """ | |
functions { | |
real lf(vector x, vector peak) | |
{ | |
real ex = exp(x' * peak); | |
real result = ex / (1 + ex); | |
return result; | |
} | |
real lfref(vector x, real f0, vector peak, matrix C) | |
{ | |
return f0 - 0.5 * (x-peak)' * C * (x-peak); | |
} | |
real path_lpdf(vector x, real f0, vector peak, matrix C, real lambda) | |
{ | |
row_vector[6] x_tmp = append_col(1, x'); | |
vector[6] x_new = x_tmp'; | |
return lambda * log(lf(x_new, peak)) + (1-lambda) * lfref(x_new, f0, peak, C); | |
} | |
} | |
data { | |
int N; // number of variables | |
matrix[N,N] C; // inverse of the covariates matrix | |
vector[N] peak; // typically mean or mode | |
real f0; // log(f(peak)) | |
real lambda; // used for creating a path between f and f_ref | |
} | |
transformed data {} | |
parameters { | |
vector[N-1] y; | |
} | |
transformed parameters {} | |
model { | |
y ~ path(f0, peak, C, lambda); | |
} | |
generated quantities {} | |
""" | |
# In[13]: | |
t0 = time.time() | |
m = pystan.StanModel(model_code=custom_pdf_code2) | |
print(round(time.time() - t0,2), 'seconds elapsed for the stan model compilation') | |
# In[14]: | |
data = {'N': len(mu),'C': covinv, 'peak': mu, 'f0': np.log(f(mu))} | |
def get_expect_for_lambda(lam): | |
n_iter = 50000 #change | |
data.update({'lambda': lam}) | |
fit = m.sampling(data=data, iter=n_iter, n_jobs=1) | |
df = fit.to_dataframe() | |
samples = df.loc[:, df.columns.str.startswith('y')].values | |
vals = np.log(fArray(samples) / frefCovarianceArray(samples)) | |
expects = vals.mean() | |
variance = vals.var() | |
sdErrs = np.sqrt(variance/len(vals)) | |
return {'expects': expects, 'variance': variance, | |
'sdErrs': sdErrs, 'steps': n_iter} | |
def MCMC_for_all_lambdas(lambdaVals): | |
"""Execute TI MCMC for multiple lambdas. | |
lambdaVals: list of values of lambdas""" | |
t0 = time.time() | |
lambdaOutput = {} | |
for l in lambdaVals: | |
lam = round(l,1) | |
print(lam) | |
lambdaOutput.update({lam: get_expect_for_lambda(lam)}) | |
print(time.time() - t0, 'seconds elapsed') | |
return lambdaOutput | |
# In[15]: | |
#Serial version | |
#lambda_dict = MCMC_for_all_lambdas(lambdaVals) | |
# In[18]: | |
poolout=pool.map(MCMC_for_one_lambdas, lambdaVals) | |
lambda_dict=dict((key,d[key]) for d in poolout for key in d) | |
lambda_dict | |
# In[19]: | |
lambdaVals = list(lambda_dict.keys()) | |
expectsPerLambda = [] | |
# if lambda_dict[0]['expects'] == 1 * np.inf: | |
# print('Expectation for lambda = 0 is -inf, removing this value') | |
# del lambda_dict[0] | |
for lam in lambdaVals: | |
expectsPerLambda.append(lambda_dict[lam]['expects']) | |
tck = scipy.interpolate.splrep(lambdaVals, expectsPerLambda, s=0) | |
xnew = np.linspace(0, 1) | |
ynew = scipy.interpolate.splev(xnew, tck, der=0) | |
plt.plot(xnew,ynew) | |
plt.scatter(lambdaVals, expectsPerLambda) | |
plt.xlim([0,1]) | |
plt.xlabel('lambda') | |
plt.ylabel('expectation') | |
plt.show() | |
# calculate exponent of the integral of the interpolated line | |
logzExtra = scipy.interpolate.splint(lambdaVals[0], lambdaVals[-1], tck, full_output=0) | |
logzExtra | |
# In[20]: | |
logzExtra_covariance = logzExtra | |
# logzExtra_covariance = get_Zextra(lambda_dict) | |
logzRefCovariance = np.log(gaussianIntegral(np.linalg.inv(cov))) + np.log(f(mu)) | |
logzTI_covariance = logzExtra_covariance + logzRefCovariance | |
print('Covariance approximation') | |
print('log zRef = ', logzRefCovariance) | |
print('log zExtra = ', logzExtra_covariance) | |
print('log(zTI) = ', logzTI_covariance) | |
# In[21]: | |
pool.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment