findCorrelation in python
import pandas as pd | |
import numpy as np | |
def find_correlation(df, thresh=0.9):
    """
    Given a numeric pd.DataFrame, find highly correlated features and
    return a list of features to remove.

    Correlation strength is judged on the absolute value of the pairwise
    correlation from ``df.corr()``, so a strongly negative relationship
    counts as redundant just like a strongly positive one.

    params:
    - df : pd.DataFrame of numeric columns
    - thresh : correlation threshold; for each group of features whose
               absolute correlation is greater than this value, all but
               one feature are returned for removal

    returns:
    - list of column labels to drop, each label listed at most once, in
      the order in which they were first selected
    """
    corr = df.corr().abs()
    labels = corr.columns.tolist()
    # Zero out the diagonal and the upper triangle so every pair is looked
    # at once and a feature is never compared against itself.
    lower = np.tril(corr.values, k=-1)

    already_in = set()   # features already claimed as a partner of an earlier column
    selected = set()     # features already placed in the output (prevents duplicates)
    to_remove = []

    for pos, col in enumerate(labels):
        if col in already_in:
            continue
        # NaN correlations (e.g. constant columns) compare False and are skipped.
        partners = [labels[row] for row in np.flatnonzero(lower[:, pos] > thresh)]
        if not partners:
            continue
        already_in.update(partners)
        # The first partner is kept as the group's representative; the
        # remaining partners and the current column are marked for removal.
        for feature in partners[1:] + [col]:
            if feature not in selected:
                selected.add(feature)
                to_remove.append(feature)

    return to_remove
This comment has been minimized.
This comment has been minimized.
Here is a website that I often refer to for categorical columns and ways to convert them. Hope that helps.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This comment has been minimized.
Hi,
Nice to see this; it was useful.
This works only for numerical columns.
How can it be done for categorical columns?