Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
def show_similars(cols, threshold=0.90, df=None):
    """Print pairs of very similar variables (high Cramer V statistic).

    For every unordered pair of names in ``cols`` a contingency table is
    built with ``pd.crosstab`` and handed to ``cramers_corrected_stat``.
    A pair is printed as ``(col1, col2) score`` when its statistic is
    strictly greater than ``threshold``; ``score`` is the statistic
    multiplied by 100 and truncated to an integer.

    Args:
        cols: Iterable of column names to compare pairwise.
        threshold: Only pairs whose statistic is strictly above this value
            are reported.
        df: Table to read the columns from (presumably a pandas DataFrame).
            When omitted, the module-level ``dfX`` is used, which preserves
            the behavior of the original two-argument signature.

    Returns:
        None. Matching pairs are written to standard output.
    """
    # Local import keeps the block self-contained without touching the
    # file's import section.
    from itertools import combinations

    data = dfX if df is None else df

    # combinations() yields each unordered pair exactly once, in the same
    # order the nested "i1 < i2" loops produced them.
    for col1, col2 in combinations(cols, 2):
        cm12 = pd.crosstab(data[col1], data[col2]).values  # contingency table
        cv12 = cramers_corrected_stat(cm12)  # Cramer V statistic
        if cv12 > threshold:
            print((col1, col2), int(cv12 * 100))
# Compare these columns pairwise and report only the pairs scoring above 0.95.
show_similars(
    ['basin', 'region', 'region_code', 'district_code', 'lga'],
    threshold=0.95,
)
# Output (score is the statistic * 100, truncated to an integer):
# ('region', 'region_code') 99
# ('region', 'lga') 99
# ('region_code', 'lga') 97
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment