Skip to content

Instantly share code, notes, and snippets.

@krassowski
Created March 22, 2020 11:58
Show Gist options
  • Save krassowski/788a72c770838102a96daec2e5f4a901 to your computer and use it in GitHub Desktop.
Save krassowski/788a72c770838102a96daec2e5f4a901 to your computer and use it in GitHub Desktop.
Loadings similarity method - a simplified version
from pandas import DataFrame
from sklearn.pipeline import Pipeline
from sklearn.base import clone
from sklearn.model_selection import KFold
from scipy.stats import pearsonr
from numpy import Inf
def _take_rows(data, positions):
    """Return the rows of ``data`` located at the integer ``positions``."""
    # Plain ``frame[positions]`` on a pandas DataFrame looks the values up as
    # column labels, so pandas containers must be sliced through ``.iloc``.
    if hasattr(data, "iloc"):
        return data.iloc[positions]
    return data[positions]


def loadings_similarity_simple(
    pipeline: Pipeline, data,
    limit_to_n_components=float("inf"), cv=KFold(),
    method=pearsonr, threshold=0.5
):
    """Compare component loadings fitted on the train and test part of each split.

    For every split produced by ``cv``, one clone of ``pipeline`` is fitted on
    the train rows and another clone on the test rows. Each of the leading
    train components is then compared, in order, against the leading test
    components; the scan stops at the first test component whose absolute
    correlation with the train component reaches ``threshold``.

    Parameters
    ----------
    pipeline
        Pipeline to clone and fit. Its final step must expose a
        ``components_`` attribute after fitting (e.g. a PCA-like estimator).
    data
        Samples to split. Rows are selected positionally, through ``.iloc``
        for pandas objects and through ``data[indices]`` otherwise.
    limit_to_n_components
        Upper bound on the number of leading components compared; by default
        all components of the train fit are used.
    cv
        Splitter providing ``split(data)``. Note that the default instance is
        created once, when the function is defined.
    method
        Callable returning a pair whose first item is the correlation between
        two loading vectors (the second item is ignored).
    threshold
        Minimal absolute correlation for a train component to be kept.

    Returns
    -------
    DataFrame
        One row per (split, train component), splits in iteration order, with
        columns:

        - ``n``: 1-based index of the train component,
        - ``best_abs_corr``: highest absolute correlation among the test
          components that were scanned; because the scan stops at the first
          match this is not necessarily the maximum over all test components,
        - ``threshold``: the threshold that was applied,
        - ``corresponding_pc_in_test``: 1-based index of the first matching
          test component, or ``None`` if none matched,
        - ``keep``: whether a matching test component was found,
        - ``keep_n``: number of kept components in the row's split (the same
          value is repeated on every row of that split).
    """
    # NOTE: the default limit is spelled ``float("inf")`` (the same value as
    # the former ``numpy.Inf``) so that the signature itself does not depend
    # on an alias that was removed in NumPy 2.0.
    result = []
    keeps = []

    # Cloned once; ``fit`` is called again for every split.
    train_pipeline = clone(pipeline)
    test_pipeline = clone(pipeline)

    for train_positions, test_positions in cv.split(data):
        train_pipeline.fit(_take_rows(data, train_positions))
        train_loadings = train_pipeline.steps[-1][1].components_

        test_pipeline.fit(_take_rows(data, test_positions))
        test_loadings = test_pipeline.steps[-1][1].components_

        # Both bounds are constant within a split, so compute them once.
        components_n = min(limit_to_n_components, len(train_loadings))
        components_m = min(components_n, len(test_loadings))

        cv_keeps = []
        # ``zip`` with an explicit range (rather than slicing) yields nothing
        # for a zero or negative limit.
        for n, trained_loading in zip(range(1, components_n + 1), train_loadings):
            abs_correlations = []
            corresponding_pc_in_test = None
            keep = False

            for m, test_loading in zip(range(1, components_m + 1), test_loadings):
                correlation, _pvalue = method(test_loading, trained_loading)
                abs_correlation = abs(correlation)
                abs_correlations.append(abs_correlation)
                if abs_correlation >= threshold:
                    # First sufficiently similar test component wins.
                    keep = True
                    corresponding_pc_in_test = m
                    break

            cv_keeps.append(keep)
            result.append({
                'n': n,
                'best_abs_corr': max(abs_correlations),
                'threshold': threshold,
                'corresponding_pc_in_test': corresponding_pc_in_test,
                'keep': keep,
            })

        # One ``keep_n`` entry per row emitted for this split.
        keeps.extend([sum(cv_keeps)] * components_n)

    df = DataFrame(result)
    df['keep_n'] = keeps
    return df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment