Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Partial Correlation significance using kfolds
#notebook:
#https://github.com/thistleknot/python-ml/blob/master/code/pcorr-significance.ipynb
import pandas as pd
import numpy as np
from scipy import stats # For in-built method to get PCC
import scipy
from sklearn.model_selection import KFold
import pingouin as pg
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
from zca import zca
import seaborn as sns
import sklearn.linear_model
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence
import statsmodels.tools
import matplotlib.pyplot as plt
zca = zca.ZCA()
target = 'Poverty'
all_data = pd.read_csv('../data/raw/states.csv')
scaler.fit(np.array(all_data[target]).reshape(-1, 1))
num_folds = 10
kfold = KFold(n_splits=num_folds, shuffle=True)
kfold.get_n_splits(all_data.index)
exclude = 'States'
sig_table = np.zeros(shape=(len(all_data.columns)))
signs_table = np.zeros(shape=(len(all_data.columns)))
p_threshold = .05
New_Names = all_data.columns[2:]
iteration = 0
for train_index, test_index in kfold.split(all_data):
#print(iteration)
max_pvalue = 1
subset = all_data.iloc[train_index].loc[:, ~all_data.columns.isin([exclude])]
#skip y and states
set_ = subset.loc[:, ~subset.columns.isin([target])].columns.tolist()
n=len(subset)
while(max_pvalue>=.05):
dist = scipy.stats.beta(n/2 - 1, n/2 - 1, loc=-1, scale=2)
p_values = pd.DataFrame(2*dist.cdf(-abs(subset.pcorr()[target]))).T
p_values.columns = list(subset.columns)
max_pname = p_values.idxmax(axis=1)[0]
max_pvalue = p_values[max_pname].values[0]
if (max_pvalue > .05):
set_.remove(max_pname)
temp = [target]
temp.extend(set_)
subset = subset[temp]
winners = p_values.loc[:, ~p_values.columns.isin([target])].columns.tolist()
sig_table = (sig_table + np.where(all_data.columns.isin(winners),1,0)).copy()
signs_table[all_data.columns.get_indexer(winners)]+=np.where(subset.pcorr()[target][winners]<0,-1,1)
significance = pd.DataFrame(sig_table).T
significance.columns = list(all_data.columns)
display(significance)
sign = pd.DataFrame(signs_table).T
sign.columns = list(all_data.columns)
display(sign)
purity = abs((sign/num_folds)*(sign/significance)).T.replace([np.inf, -np.inf, np.NaN], 0)
display(purity.T)
threshold = .5
chosen = list(purity.T.columns.values[np.array(purity.T>=threshold).reshape(len(all_data.columns,))])
dataSet = pd.concat([all_data[target],all_data[chosen]],axis=1)
y_scaled = pd.DataFrame(scaler.transform(np.array(dataSet[target]).reshape(-1, 1)))
y_scaled.columns=[target]
display(chosen)
zca_data = pd.concat([y_scaled,pd.DataFrame(zca.fit_transform(dataSet[chosen]),columns=chosen)],axis=1)
zca_data.pcorr()
sns.pairplot(zca_data)
#model = sklearn.linear_model.LinearRegression()
data_set_wConstant = statsmodels.tools.tools.add_constant(zca_data)
y = data_set_wConstant[target]
X = data_set_wConstant[data_set_wConstant.columns.drop(target)]
#results = model.fit(X, y)
model = sm.OLS(y,X)
results = model.fit()
results.summary()
pd.DataFrame(results.get_influence().resid_studentized_internal).hist()
plt.scatter(results.fittedvalues*scaler.scale_[0] + scaler.mean_[0], y*scaler.scale_[0] + scaler.mean_[0])
@thistleknot

This comment has been minimized.

Copy link
Owner Author

@thistleknot thistleknot commented Jul 26, 2021

v1
image

@thistleknot

This comment has been minimized.

Copy link
Owner Author

@thistleknot thistleknot commented Jul 26, 2021

v2
pcorr5

@thistleknot

This comment has been minimized.

Copy link
Owner Author

@thistleknot thistleknot commented Jul 26, 2021

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment