Skip to content

Instantly share code, notes, and snippets.

@mmatkinson
Created August 5, 2016 19:52
Show Gist options
  • Save mmatkinson/9babe3e5665125302a3b5c21441a6892 to your computer and use it in GitHub Desktop.
Save mmatkinson/9babe3e5665125302a3b5c21441a6892 to your computer and use it in GitHub Desktop.
compare two tables & all of their values
import pandas as pd
def _melt_to_long(frame, index_cols):
    """Melt *frame* to one row per (index_cols..., variable) and index it that way."""
    long_index = index_cols + ['variable']
    melted = pd.melt(frame, id_vars=index_cols)
    # Sort on the id columns alone *before* 'variable' joins the index, so the
    # rows reach the join in the same order the earlier implementation used.
    ordered = melted.set_index(index_cols).sort_index()
    return ordered.reset_index().set_index(long_index)


def df_diff(index_cols, data1, data2, lsuffix='_1'):
    """
    Line up every value of two dataframes side by side.

    usage:
        comparisondf = df_diff(['unique_id', 'date'], current_df, new_df, lsuffix='_curr')

    parameters:
        index_cols: column name(s) that identify a row in both dataframes; a
            list, a tuple, or a single column name.
        data1: first dataframe; its values land in the column 'value' + lsuffix.
        data2: second dataframe; its values land in the column 'value'.
        lsuffix: suffix appended to data1's value column to keep the two
            value columns apart.

    returns:
        single dataframe with index_cols on the index, as well as all other
        variables stacked on the index (level 'variable'), and the values in
        each dataframe along the columns. The join is an outer join, so a
        key / variable pair present in only one input is kept and the other
        side's value is missing.

    todo: make this take a list of dfs
    """
    # Accept a bare column name or any iterable of names; the melt and the
    # index construction below both need a real list.
    if isinstance(index_cols, str):
        index_cols = [index_cols]
    else:
        index_cols = list(index_cols)

    data1_melt = _melt_to_long(data1, index_cols)
    data2_melt = _melt_to_long(data2, index_cols)
    return data1_melt.join(data2_melt, lsuffix=lsuffix, how='outer')
def make_comparison(val_list, e=0.05):
    """
    Generic comparison function: report whether a set of values disagree.

    Can be used in df.apply() -- where columns are the values to compare.
    eg: mydf[mydf.apply(lambda x: make_comparison(x, e=0.01), axis=1)]
    -- displays different values & ignores same values

    parameters:
        val_list: the values to compare; a pandas Series (such as a row passed
            in by df.apply) or any other sequence, which is wrapped in a Series.
        e: acceptable threshold of difference for numeric values, measured as
            each value's distance from the mean of all the values.

    returns:
        bool, True when the values are considered different:
          - if any value is a str: True when there is more than one distinct value
          - otherwise, if any value is null: True when nulls and non-nulls are mixed
          - otherwise: True when any value lies more than e away from the
            mean, in either direction
    """
    values = val_list if isinstance(val_list, pd.Series) else pd.Series(val_list)

    # Strings cannot be compared within a tolerance, so any string in the
    # group switches the whole comparison to exact distinct-value counting.
    if any(isinstance(item, str) for item in values):
        return len(set(values)) > 1

    null_mask = values.isnull()
    if null_mask.any():
        # All-null counts as "same"; a mix of null and non-null is a difference.
        return not null_mask.all()

    deviation = values - values.mean()
    # Test the values themselves (``in`` on a Series looks at its index
    # labels) and check both directions so a low outlier is caught too.
    return bool(((deviation > e) | (deviation < -e)).any())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment