Skip to content

Instantly share code, notes, and snippets.

@motin
Created November 15, 2022 14:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save motin/d74c49b0803d9716787aee2298ed2d79 to your computer and use it in GitHub Desktop.
Save motin/d74c49b0803d9716787aee2298ed2d79 to your computer and use it in GitHub Desktop.
df1 = pd.DataFrame([
{"url": "a", "Foo": '=C$2:C2', "bar": 'Cat'},
{"url": "b", "Foo": '=C$2:C3', "bar": 'Mouse'},
{"url": "c", "Foo": '=C$2:C2', "bar": 'Dog'},
{"url": "d", "Foo": '=C$2:C5', "bar": 'Eagle'},
{"url": "e", "Foo": 'C$2:C6', "bar": 'Albatross'},
{"url": "f", "Foo": '', "bar": 'Albatross'},
])
def df_add_col(df):
df_new_col = df.copy()
df_new_col["zoo"] = "New col"
return df_new_col
df2_new_col = df_add_col(df1)
def df_add_row(df):
df_with_new_row = pd.DataFrame([{"url": "g", "Foo": 'Goo', "bar": 'Tiger'}])
return pd.concat([df, df_with_new_row]).reset_index(drop=True)
df3_new_row = df_add_row(df1)
df4_new_col_new_row = df_add_col(df_add_row(df1))
df5_changed_contents = df1.copy()
df5_changed_contents.at[2,'bar'] = 'Snake'
df6_same_contents_different_index = df1.copy()
df6_same_contents_different_index = df6_same_contents_different_index.set_index(df6_same_contents_different_index.index + 1)
df7_dupes = df1.copy()
df7_dupes.at[0,'url'] = 'z'
df7_dupes.at[1,'url'] = 'z'
df7_dupes.at[2,'url'] = 'z'
display(df7_dupes)
import pandas as pd
from pandas.errors import MergeError
def display_df_comparison_raw(a_df: pd.DataFrame, b_df: pd.DataFrame, primary_key_columns: list, what: str):
try:
outer_merge = a_df.reset_index().merge(
b_df.reset_index(),
on=primary_key_columns,
how="outer",
indicator=True,
suffixes=("_a", "_b"),
validate="1:1",
).sort_values(by=["index_a", *primary_key_columns])
#print("outer_merge")
#display(outer_merge)
#print("Shared rows:")
shared_rows_index_a = outer_merge[outer_merge["_merge"] == "both"].set_index("index_a").index
shared_rows_index_b = outer_merge[outer_merge["_merge"] == "both"].set_index("index_b").index
#display(shared_rows)
print(f"{what} only in A:")
rows_only_in_a = outer_merge[outer_merge["_merge"] == "left_only"].set_index("index_a").index
if len(rows_only_in_a) == 0:
print("None")
else:
display(a_df.loc[rows_only_in_a])
print(f"{what} only in B:")
rows_only_in_b = outer_merge[outer_merge["_merge"] == "right_only"].set_index("index_b").index
if len(rows_only_in_b) == 0:
print("None")
else:
display(b_df.loc[rows_only_in_b])
# for the columns and rows that are common, show the actual differences
print(f"Diffs within:")
shared_columns = list(set(a_df.columns) & set(b_df.columns))
a_df_shared_columns = a_df[shared_columns]
b_df_shared_columns = b_df[shared_columns]
#print("a_df_shared_columns", a_df_shared_columns)
#print("b_df_shared_columns", b_df_shared_columns)
#print("shared_rows_index_a", shared_rows_index_a)
#print("shared_rows_index_b", shared_rows_index_b)
a_df_to_compare = a_df_shared_columns.loc[shared_rows_index_a].reset_index(drop=False).rename(columns={"index_a": "Original Index"}).set_index(primary_key_columns, drop=True)
b_df_to_compare = b_df_shared_columns.loc[shared_rows_index_b].reset_index(drop=False).rename(columns={"index_b": "Original Index"}).set_index(primary_key_columns, drop=True)
if a_df_to_compare.equals(b_df_to_compare):
print(f"Identical")
else:
#display(a_df_to_compare)
#display(b_df_to_compare)
comparison_result_df = a_df_to_compare.compare(b_df_to_compare) # pandas 1.5.0+ , result_names=('a', 'b')
display(comparison_result_df)
except MergeError as mergeError:
print("MergeError", mergeError)
print("Duplicates along the primary key columns:")
print("A")
duplicates_a_df = a_df[a_df.duplicated(
subset=primary_key_columns,
keep=False,
)].sort_values(by=primary_key_columns)
display(duplicates_a_df)
print("B")
duplicates_b_df = b_df[b_df.duplicated(
subset=primary_key_columns,
keep=False,
)].sort_values(by=primary_key_columns)
display(duplicates_b_df)
def display_df_comparison(a_df: pd.DataFrame, b_df: pd.DataFrame, primary_key_columns: list):
# data = GsheetsWorksheetData(df, header_row_number=0, attributes_to_columns_map={'foo': 'Foo'})
if a_df.equals(b_df):
print("Identical dataframes")
return
print("-- Columns --")
#print("Shared columns:")
#shared_columns = list(set(a_df.columns) & set(b_df.columns))
#print(shared_columns)
a_df_dtypes_df = pd.DataFrame(a_df.dtypes, columns=["dtype"]).reset_index().rename(columns={"index": "column"})
b_df_dtypes_df = pd.DataFrame(b_df.dtypes, columns=["dtype"]).reset_index().rename(columns={"index": "column"})
display_df_comparison_raw(a_df_dtypes_df, b_df_dtypes_df, primary_key_columns=["column"], what="Columns")
print("-- Rows / cell contents --")
display_df_comparison_raw(a_df, b_df, primary_key_columns=primary_key_columns, what="Rows")
print("Columns data only in A:")
columns_only_in_a = list(set(a_df.columns) - set(b_df.columns))
if len(columns_only_in_a) == 0:
print("None")
else:
display(a_df[columns_only_in_a])
print("Columns data only in B:")
columns_only_in_b = list(set(b_df.columns) - set(a_df.columns))
if len(columns_only_in_b) == 0:
print("None")
else:
display(b_df[columns_only_in_b])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment