Skip to content

Instantly share code, notes, and snippets.

@snakers4
Created March 16, 2017 09:53
Show Gist options
  • Save snakers4/bd0c9ce8bc823159907be6f28ef2676f to your computer and use it in GitHub Desktop.
Save snakers4/bd0c9ce8bc823159907be6f28ef2676f to your computer and use it in GitHub Desktop.
# Unsophisticated pairwise correlation screen: list the column pairs of `sDf`
# whose correlation magnitude reaches `threshold`, strongest first, print
# them, and collect them in `corrDf` (columns 'A', 'B', 'corr').
data_corr = sDf.corr()
# NOTE(review): only the first `size` columns are compared, so the last column
# of the correlation matrix is never paired with anything. Presumably that
# column is the target and is skipped on purpose -- confirm; use
# data_corr.shape[0] here if every column should be included.
size = data_corr.shape[0] - 1
# Set the threshold to select only highly correlated attributes
threshold = 0.5
# Column names must come from the correlation matrix itself: corr() can drop
# non-numeric columns, in which case positions in sDf.columns no longer match.
cols = list(data_corr.columns)
# Read each cell once from the underlying array instead of repeated .iloc calls.
corr_values = data_corr.values

# List of [correlation, i, j] for every pair i < j at or above the threshold
corr_list = []
for i in range(size):
    for j in range(i + 1, size):  # j > i avoids the diagonal and mirrored pairs
        v = corr_values[i, j]
        # NaN fails both comparisons and is therefore skipped.
        # NOTE(review): a correlation of exactly +1 is excluded while -1 is
        # kept; behaviour preserved from the original -- confirm it is intended.
        if (threshold <= v < 1) or v <= -threshold:
            corr_list.append([v, i, j])

# Sort to show the strongest correlations (by absolute value) first
s_corr_list = sorted(corr_list, key=lambda x: -abs(x[0]))

# Print correlations and column names
for v, i, j in s_corr_list:
    print ("%s and %s = %.2f" % (cols[i], cols[j], v))

# Build the result frame in one step, one row per pair. Assigning rows with
# corrDf.loc[i] would reuse the label i for every pair sharing the same first
# column and silently overwrite earlier rows.
corrDf = pd.DataFrame(
    [(cols[i], cols[j], v) for v, i, j in s_corr_list],
    columns=['A', 'B', 'corr'],
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment