Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
A parallel and sparse approach to calculating correlations between columns of a Pandas DataFrame
def calculate_correlations(df, correlation_threshold=0.3, pvalue_threshold=0.05, n_jobs=None):
    """Compute thresholded pairwise Pearson correlations between columns of ``df``.

    Only the lower triangle (including the diagonal) is evaluated: entry
    ``[i, j]`` is computed for ``j <= i``. Each pair is correlated over the
    rows where *both* columns are non-NaN; pairs with fewer than two such
    rows are skipped because ``pearsonr`` cannot be evaluated on them.

    The function relies on the module-level names ``np``, ``sp`` (with
    ``sp.sparse`` available), ``pearsonr`` and ``joblib``.
    NOTE(review): ``pearsonr`` is presumably ``scipy.stats.pearsonr`` -- confirm
    against the file's imports.

    Args:
        df: DataFrame whose columns are correlated with each other. Column
            values must be convertible to ``float``.
        correlation_threshold: Keep a pair only if its correlation is
            ``>=`` this value. The comparison is signed (not on the absolute
            value), so with a positive threshold negative correlations are
            dropped. ``None`` disables the filter.
        pvalue_threshold: Keep a pair only if its p-value is ``<=`` this
            value. ``None`` disables the filter.
        n_jobs: Forwarded to ``joblib.Parallel`` when not ``None``. With the
            default ``None``, ``joblib.Parallel()`` is built with its own
            defaults, exactly as before this parameter existed.

    Returns:
        dict with four CSR matrices of shape ``(n_columns, n_columns)``:
        ``"correlation_matrix"`` (Pearson r), ``"pvalues"``, ``"overlap"``
        (number of rows where both columns are non-NaN) and ``"mask"``
        (True where a pair passed the filters and was stored).
    """
    n_columns = df.shape[1]
    shape = (n_columns, n_columns)

    def column_corr(col1_idx):
        """Return the retained ``(col2_idx, r, pvalue, n_overlap)`` tuples for one row."""
        col1 = np.asarray(df.iloc[:, col1_idx], dtype=float)
        col1_valid = ~np.isnan(col1)
        kept = []
        # Lower triangle only: col2_idx runs from 0 up to and including col1_idx.
        for col2_idx in range(col1_idx + 1):
            col2 = np.asarray(df.iloc[:, col2_idx], dtype=float)
            both_valid = col1_valid & ~np.isnan(col2)
            n_overlap = int(both_valid.sum())
            if n_overlap < 2:
                # pearsonr needs at least two paired observations.
                continue
            # Correlate only the pairwise-complete rows; passing NaNs through
            # would make the result NaN and silently drop the pair.
            correlation, pvalue = pearsonr(col1[both_valid], col2[both_valid])
            passes_correlation = (
                correlation_threshold is None or correlation >= correlation_threshold
            )
            passes_pvalue = pvalue_threshold is None or pvalue <= pvalue_threshold
            if passes_correlation and passes_pvalue:
                kept.append((col2_idx, correlation, pvalue, n_overlap))
        return kept

    # Only forward n_jobs when the caller set it, so the default call is
    # identical to the original ``joblib.Parallel()``.
    if n_jobs is None:
        parallel = joblib.Parallel()
    else:
        parallel = joblib.Parallel(n_jobs=n_jobs)
    rows = parallel(
        joblib.delayed(column_corr)(col_idx) for col_idx in range(n_columns)
    )

    # LIL format is used while filling because it supports cheap incremental
    # row assignment; the results are converted to CSR on return.
    correlation_matrix = sp.sparse.lil_matrix(shape)
    pvalues = sp.sparse.lil_matrix(shape)
    overlap = sp.sparse.lil_matrix(shape)
    mask = sp.sparse.lil_matrix(shape, dtype=bool)

    for row_idx, row in enumerate(rows):
        if not row:
            continue
        # Unzip once instead of re-walking the row for every matrix.
        col_indices, row_correlations, row_pvalues, row_overlaps = (
            list(values) for values in zip(*row)
        )
        correlation_matrix[row_idx, col_indices] = row_correlations
        pvalues[row_idx, col_indices] = row_pvalues
        overlap[row_idx, col_indices] = row_overlaps
        mask[row_idx, col_indices] = True

    return {
        "correlation_matrix": correlation_matrix.tocsr(),
        "pvalues": pvalues.tocsr(),
        "overlap": overlap.tocsr(),
        "mask": mask.tocsr(),
    }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.