tmellan/pima-indians_multiprocess.py

## pima-indians_multiprocess.py
#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('cd', '~/Documents/TI')


# In[2]:


import pystan
import os
import pandas as pd
import numpy as np
import scipy
import time
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns
import multiprocessing
from TI_functions import *
sns.set()


# In[3]:


#os.environ['STAN_NUM_THREADS'] = "1"


# In[4]:


pool=multiprocessing.Pool(10)
lambdaVals = np.arange(0,1,0.1)


# In[5]:


#### load and clean the data ##########################################

diabData = pd.read_csv('diabetes.csv')
df = diabData[(diabData.BMI != 0) & (diabData.Glucose != 0) & (diabData.BloodPressure != 0) & (diabData.Insulin != 0) & (diabData.SkinThickness != 0)]
df.drop(columns = ['BloodPressure','SkinThickness','Insulin'], inplace = True)


x = df.iloc[:, :-1].values # remove the output
x = preprocessing.scale(x) # standardise x
y = df['Outcome'].values

# the data should be cleaned, expect 532 rows not 644


# In[6]:


#### build a Stan model ######################################################

code = """
data {
  int N; //the number of  observations
  int K; //the number of features (variables)
  int y[N]; //the response
  matrix[N,K] X; //the model matrix (all observations)
}
parameters {
  real theta0; // intercept
  vector[K] theta; // regression coefficients
}
transformed parameters {}
model {
  theta0 ~ normal(0,1); // prior for the intercept

  for(i in 1:K)
    theta[i] ~ normal(0,1); // prior for the regression coefficients

  y ~ bernoulli_logit(theta0 + X * theta); // likelihood
}
generated quantities {
    // this must be wrong but i'm desperate
    real log_lik;
    log_lik = binomial_lpmf(y | N, inv_logit(theta0 + X * theta));
}"""


# In[7]:


model = pystan.StanModel(model_code=code)
data = {'N': x.shape[0], 'K': x.shape[1], 'y': y, 'X':x}
fit = model.sampling(data=data, iter=50000,n_jobs=-1)
print(fit)


# In[8]:


df = fit.to_dataframe()
samples = df.loc[:, df.columns.str.startswith('theta')].values
cov = np.cov(np.array(samples), rowvar = False)
covinv = np.linalg.inv(cov)
theta_mean = samples.mean(axis = 0)
mu = theta_mean.copy()  # these look similar to the examples on kaggle
mu


# In[9]:


# i want to find the posterior mode and I want it quick
print(df[df.log_lik == df.log_lik.max()])
a = df[df.log_lik == df.log_lik.max()]
theta_mode = a.loc[:, a.columns.str.startswith('theta')].values
mu = theta_mode.reshape(theta_mean.shape)
mu


# In[10]:


def f(x):
    xcp = x.copy()
    if len(x) == 5:
        x_tmp = np.r_[1,xcp]
    else:
        x_tmp = xcp
        x_tmp[0] = 1
    exp = np.exp((x_tmp * theta_mean).sum())
    pi = exp / (1+ exp)
    return pi

def fArray(array_args):
    return np.apply_along_axis(f,1,array_args)

y_pred = fArray(x)
y_pred_bin = list(y_pred)
y_pred_bin = np.asarray([round(el) for el in y_pred_bin])
from sklearn.metrics import confusion_matrix
cf = confusion_matrix(y, y_pred_bin) # this also looks similar to the kaggle example
cf


# In[11]:


# this is built using the sampled theta values from the mcmc
def frefCovarianceArray(array_args):
    colOnes = np.ones([array_args.shape[0],1])
    array_args = np.c_[colOnes,array_args] # to handle the intercept
    f0 = f(mu)
    return np.apply_along_axis(frefMatrixCovariance,1,array_args, f0, mu, cov)


# In[12]:


###### TI ##########################################

def replace_in_stan_code(stan_code, str_to_replace, value):
    return stan_code.replace(str_to_replace, str(value))

custom_pdf_code2 = """
functions {
    real lf(vector x, vector peak)
    {
     real ex = exp(x' * peak);
     real result = ex / (1 + ex);
     return result;
    }
    real lfref(vector x, real f0, vector peak, matrix C)
    {
     return f0 - 0.5 * (x-peak)' * C * (x-peak);
     }
    real path_lpdf(vector x, real f0, vector peak, matrix C, real lambda)
    {
    row_vector[6] x_tmp = append_col(1, x');
    vector[6] x_new = x_tmp';
    return lambda * log(lf(x_new, peak)) + (1-lambda) * lfref(x_new, f0, peak, C);
    }
    }
data {
      int N; // number of variables
      matrix[N,N] C; // inverse of the covariates matrix
      vector[N] peak; // typically mean or mode
      real f0; // log(f(peak))
      real lambda; // used for creating a path between f and f_ref
      }
transformed data {}
parameters {
    vector[N-1] y;
}
transformed parameters {}
model {
       y ~ path(f0, peak, C, lambda);
}
generated quantities {}
"""


# In[13]:


t0 = time.time()
m = pystan.StanModel(model_code=custom_pdf_code2)
print(round(time.time() - t0,2), 'seconds elapsed for the stan model compilation')


# In[14]:


data = {'N': len(mu),'C': covinv, 'peak': mu, 'f0': np.log(f(mu))}

def get_expect_for_lambda(lam):
    n_iter = 50000 #change
    data.update({'lambda': lam})
    fit = m.sampling(data=data, iter=n_iter, n_jobs=1)
    df = fit.to_dataframe()
    samples = df.loc[:, df.columns.str.startswith('y')].values
    vals = np.log(fArray(samples) / frefCovarianceArray(samples))
    expects = vals.mean()
    variance = vals.var()
    sdErrs = np.sqrt(variance/len(vals))
    return {'expects': expects, 'variance': variance,
                    'sdErrs': sdErrs, 'steps': n_iter}

def MCMC_for_all_lambdas(lambdaVals):
    """Execute TI MCMC for multiple lambdas.
    lambdaVals: list of values of lambdas"""
    t0 = time.time()
    lambdaOutput = {}
    for l in lambdaVals:
        lam = round(l,1)
        print(lam)
        lambdaOutput.update({lam: get_expect_for_lambda(lam)})
    print(time.time() - t0, 'seconds elapsed')
    return lambdaOutput


# In[15]:


#Serial version
#lambda_dict = MCMC_for_all_lambdas(lambdaVals)


# In[18]:


poolout=pool.map(MCMC_for_one_lambdas, lambdaVals)
lambda_dict=dict((key,d[key]) for d in poolout for key in d)
lambda_dict


# In[19]:


lambdaVals = list(lambda_dict.keys())
expectsPerLambda = []
# if lambda_dict[0]['expects'] == 1 * np.inf:
#     print('Expectation for lambda = 0 is -inf, removing this value')
#     del lambda_dict[0]
for lam in lambdaVals:
    expectsPerLambda.append(lambda_dict[lam]['expects'])
tck = scipy.interpolate.splrep(lambdaVals, expectsPerLambda, s=0)
xnew = np.linspace(0, 1)
ynew = scipy.interpolate.splev(xnew, tck, der=0)
plt.plot(xnew,ynew)
plt.scatter(lambdaVals, expectsPerLambda)
plt.xlim([0,1])
plt.xlabel('lambda')
plt.ylabel('expectation')
plt.show()
# calculate exponent of the integral of the interpolated line
logzExtra = scipy.interpolate.splint(lambdaVals[0], lambdaVals[-1], tck, full_output=0)
logzExtra


# In[20]:


logzExtra_covariance = logzExtra
# logzExtra_covariance = get_Zextra(lambda_dict)
logzRefCovariance = np.log(gaussianIntegral(np.linalg.inv(cov))) + np.log(f(mu))
logzTI_covariance = logzExtra_covariance + logzRefCovariance

print('Covariance approximation')
print('log zRef = ', logzRefCovariance)
print('log zExtra = ', logzExtra_covariance)
print('log(zTI) = ', logzTI_covariance)


# In[21]:


pool.close()
	#!/usr/bin/env python
	# coding: utf-8

	# In[1]:


	get_ipython().run_line_magic('cd', '~/Documents/TI')


	# In[2]:


	import pystan
	import os
	import pandas as pd
	import numpy as np
	import scipy
	import time
	import matplotlib.pyplot as plt
	from sklearn import preprocessing
	import seaborn as sns
	import multiprocessing
	from TI_functions import *
	sns.set()


	# In[3]:


	#os.environ['STAN_NUM_THREADS'] = "1"


	# In[4]:


	pool=multiprocessing.Pool(10)
	lambdaVals = np.arange(0,1,0.1)


	# In[5]:


	#### load and clean the data ##########################################

	diabData = pd.read_csv('diabetes.csv')
	df = diabData[(diabData.BMI != 0) & (diabData.Glucose != 0) & (diabData.BloodPressure != 0) & (diabData.Insulin != 0) & (diabData.SkinThickness != 0)]
	df.drop(columns = ['BloodPressure','SkinThickness','Insulin'], inplace = True)


	x = df.iloc[:, :-1].values # remove the output
	x = preprocessing.scale(x) # standardise x
	y = df['Outcome'].values

	# the data should be cleaned, expect 532 rows not 644


	# In[6]:


	#### build a Stan model ######################################################

	code = """
	data {
	int N; //the number of observations
	int K; //the number of features (variables)
	int y[N]; //the response
	matrix[N,K] X; //the model matrix (all observations)
	}
	parameters {
	real theta0; // intercept
	vector[K] theta; // regression coefficients
	}
	transformed parameters {}
	model {
	theta0 ~ normal(0,1); // prior for the intercept

	for(i in 1:K)
	theta[i] ~ normal(0,1); // prior for the regression coefficients

	y ~ bernoulli_logit(theta0 + X * theta); // likelihood
	}
	generated quantities {
	// this must be wrong but i'm desperate
	real log_lik;
	log_lik = binomial_lpmf(y \| N, inv_logit(theta0 + X * theta));
	}"""


	# In[7]:


	model = pystan.StanModel(model_code=code)
	data = {'N': x.shape[0], 'K': x.shape[1], 'y': y, 'X':x}
	fit = model.sampling(data=data, iter=50000,n_jobs=-1)
	print(fit)


	# In[8]:


	df = fit.to_dataframe()
	samples = df.loc[:, df.columns.str.startswith('theta')].values
	cov = np.cov(np.array(samples), rowvar = False)
	covinv = np.linalg.inv(cov)
	theta_mean = samples.mean(axis = 0)
	mu = theta_mean.copy() # these look similar to the examples on kaggle
	mu


	# In[9]:


	# i want to find the posterior mode and I want it quick
	print(df[df.log_lik == df.log_lik.max()])
	a = df[df.log_lik == df.log_lik.max()]
	theta_mode = a.loc[:, a.columns.str.startswith('theta')].values
	mu = theta_mode.reshape(theta_mean.shape)
	mu


	# In[10]:


	def f(x):
	xcp = x.copy()
	if len(x) == 5:
	x_tmp = np.r_[1,xcp]
	else:
	x_tmp = xcp
	x_tmp[0] = 1
	exp = np.exp((x_tmp * theta_mean).sum())
	pi = exp / (1+ exp)
	return pi

	def fArray(array_args):
	return np.apply_along_axis(f,1,array_args)

	y_pred = fArray(x)
	y_pred_bin = list(y_pred)
	y_pred_bin = np.asarray([round(el) for el in y_pred_bin])
	from sklearn.metrics import confusion_matrix
	cf = confusion_matrix(y, y_pred_bin) # this also looks similar to the kaggle example
	cf


	# In[11]:


	# this is built using the sampled theta values from the mcmc
	def frefCovarianceArray(array_args):
	colOnes = np.ones([array_args.shape[0],1])
	array_args = np.c_[colOnes,array_args] # to handle the intercept
	f0 = f(mu)
	return np.apply_along_axis(frefMatrixCovariance,1,array_args, f0, mu, cov)


	# In[12]:


	###### TI ##########################################

	def replace_in_stan_code(stan_code, str_to_replace, value):
	return stan_code.replace(str_to_replace, str(value))

	custom_pdf_code2 = """
	functions {
	real lf(vector x, vector peak)
	{
	real ex = exp(x' * peak);
	real result = ex / (1 + ex);
	return result;
	}
	real lfref(vector x, real f0, vector peak, matrix C)
	{
	return f0 - 0.5 * (x-peak)' * C * (x-peak);
	}
	real path_lpdf(vector x, real f0, vector peak, matrix C, real lambda)
	{
	row_vector[6] x_tmp = append_col(1, x');
	vector[6] x_new = x_tmp';
	return lambda * log(lf(x_new, peak)) + (1-lambda) * lfref(x_new, f0, peak, C);
	}
	}
	data {
	int N; // number of variables
	matrix[N,N] C; // inverse of the covariates matrix
	vector[N] peak; // typically mean or mode
	real f0; // log(f(peak))
	real lambda; // used for creating a path between f and f_ref
	}
	transformed data {}
	parameters {
	vector[N-1] y;
	}
	transformed parameters {}
	model {
	y ~ path(f0, peak, C, lambda);
	}
	generated quantities {}
	"""


	# In[13]:


	t0 = time.time()
	m = pystan.StanModel(model_code=custom_pdf_code2)
	print(round(time.time() - t0,2), 'seconds elapsed for the stan model compilation')


	# In[14]:


	data = {'N': len(mu),'C': covinv, 'peak': mu, 'f0': np.log(f(mu))}

	def get_expect_for_lambda(lam):
	n_iter = 50000 #change
	data.update({'lambda': lam})
	fit = m.sampling(data=data, iter=n_iter, n_jobs=1)
	df = fit.to_dataframe()
	samples = df.loc[:, df.columns.str.startswith('y')].values
	vals = np.log(fArray(samples) / frefCovarianceArray(samples))
	expects = vals.mean()
	variance = vals.var()
	sdErrs = np.sqrt(variance/len(vals))
	return {'expects': expects, 'variance': variance,
	'sdErrs': sdErrs, 'steps': n_iter}

	def MCMC_for_all_lambdas(lambdaVals):
	"""Execute TI MCMC for multiple lambdas.
	lambdaVals: list of values of lambdas"""
	t0 = time.time()
	lambdaOutput = {}
	for l in lambdaVals:
	lam = round(l,1)
	print(lam)
	lambdaOutput.update({lam: get_expect_for_lambda(lam)})
	print(time.time() - t0, 'seconds elapsed')
	return lambdaOutput


	# In[15]:


	#Serial version
	#lambda_dict = MCMC_for_all_lambdas(lambdaVals)


	# In[18]:


	poolout=pool.map(MCMC_for_one_lambdas, lambdaVals)
	lambda_dict=dict((key,d[key]) for d in poolout for key in d)
	lambda_dict


	# In[19]:


	lambdaVals = list(lambda_dict.keys())
	expectsPerLambda = []
	# if lambda_dict[0]['expects'] == 1 * np.inf:
	# print('Expectation for lambda = 0 is -inf, removing this value')
	# del lambda_dict[0]
	for lam in lambdaVals:
	expectsPerLambda.append(lambda_dict[lam]['expects'])
	tck = scipy.interpolate.splrep(lambdaVals, expectsPerLambda, s=0)
	xnew = np.linspace(0, 1)
	ynew = scipy.interpolate.splev(xnew, tck, der=0)
	plt.plot(xnew,ynew)
	plt.scatter(lambdaVals, expectsPerLambda)
	plt.xlim([0,1])
	plt.xlabel('lambda')
	plt.ylabel('expectation')
	plt.show()
	# calculate exponent of the integral of the interpolated line
	logzExtra = scipy.interpolate.splint(lambdaVals[0], lambdaVals[-1], tck, full_output=0)
	logzExtra


	# In[20]:


	logzExtra_covariance = logzExtra
	# logzExtra_covariance = get_Zextra(lambda_dict)
	logzRefCovariance = np.log(gaussianIntegral(np.linalg.inv(cov))) + np.log(f(mu))
	logzTI_covariance = logzExtra_covariance + logzRefCovariance

	print('Covariance approximation')
	print('log zRef = ', logzRefCovariance)
	print('log zExtra = ', logzExtra_covariance)
	print('log(zTI) = ', logzTI_covariance)


	# In[21]:


	pool.close()