Skip to content

Instantly share code, notes, and snippets.

@georgiana-b
Last active September 23, 2016 08:33
Show Gist options
  • Save georgiana-b/72d08d0b1f08cc39c7eca364277c6a73 to your computer and use it in GitHub Desktop.
Save georgiana-b/72d08d0b1f08cc39c7eca364277c6a73 to your computer and use it in GitHub Desktop.
Find pairs of duplicated columns, both exact duplicates and percentual duplicates
import pandas
def find_exact_duplicate_columns(frame):
"""Find pairs of columns that are exact duplicates
i.e. each value should match with the other's column value for all rows.
Adapted from this thoughtful answer: http://stackoverflow.com/a/32961145
"""
dups = []
columns = frame.columns
values = frame
columns_len = len(columns)
for i in range(columns_len):
ia = values.iloc[:,i].values
for j in range(i+1, columns_len):
ja = values.iloc[:,j].values
if pandas.core.common.array_equivalent(ia, ja):
if frame[columns[i]].count() > 0:
dups.append((columns[i], columns[j]))
break
return dups
def percentual_column_comparison(frame, min_similar_percent):
"""Find pairs of columns that have a pretty high chance of holding the
same data i.e. percent of not null similar values between 2 columns.
Args:
frame (DataFrame): the frame whose columns should be compared
min_similar_percent (integer): minimum percent of values that should
be similar for columns to be canditates
"""
results = pandas.DataFrame(columns=["column_combination", "same_and_not_null_percent"])
for index, column in enumerate(frame.columns):
check_columns = frame.columns[index+1:]
nulls = frame[column].notnull()
percent_not_null = nulls.sum()
for check_column in check_columns:
compare = frame[check_column] == frame[column]
same_and_not_null = nulls & compare
similar_percent = (100 * same_and_not_null.sum()) / percent_not_null
if similar_percent > min_similar_percent:
columns_combo = '{0}-{1}'.format(column, check_column)
results.loc[len(results)]=[columns_combo, similar_percent]
return results
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment