Last active
September 23, 2016 08:33
-
-
Save georgiana-b/72d08d0b1f08cc39c7eca364277c6a73 to your computer and use it in GitHub Desktop.
Find pairs of duplicated columns, both exact duplicates and percentual duplicates
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas | |
def find_exact_duplicate_columns(frame): | |
"""Find pairs of columns that are exact duplicates | |
i.e. each value should match with the other's column value for all rows. | |
Adapted from this thoughtful answer: http://stackoverflow.com/a/32961145 | |
""" | |
dups = [] | |
columns = frame.columns | |
values = frame | |
columns_len = len(columns) | |
for i in range(columns_len): | |
ia = values.iloc[:,i].values | |
for j in range(i+1, columns_len): | |
ja = values.iloc[:,j].values | |
if pandas.core.common.array_equivalent(ia, ja): | |
if frame[columns[i]].count() > 0: | |
dups.append((columns[i], columns[j])) | |
break | |
return dups | |
def percentual_column_comparison(frame, min_similar_percent): | |
"""Find pairs of columns that have a pretty high chance of holding the | |
same data i.e. percent of not null similar values between 2 columns. | |
Args: | |
frame (DataFrame): the frame whose columns should be compared | |
min_similar_percent (integer): minimum percent of values that should | |
be similar for columns to be canditates | |
""" | |
results = pandas.DataFrame(columns=["column_combination", "same_and_not_null_percent"]) | |
for index, column in enumerate(frame.columns): | |
check_columns = frame.columns[index+1:] | |
nulls = frame[column].notnull() | |
percent_not_null = nulls.sum() | |
for check_column in check_columns: | |
compare = frame[check_column] == frame[column] | |
same_and_not_null = nulls & compare | |
similar_percent = (100 * same_and_not_null.sum()) / percent_not_null | |
if similar_percent > min_similar_percent: | |
columns_combo = '{0}-{1}'.format(column, check_column) | |
results.loc[len(results)]=[columns_combo, similar_percent] | |
return results | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment