Created
November 15, 2022 14:08
-
-
Save motin/d74c49b0803d9716787aee2298ed2d79 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
df1 = pd.DataFrame([ | |
{"url": "a", "Foo": '=C$2:C2', "bar": 'Cat'}, | |
{"url": "b", "Foo": '=C$2:C3', "bar": 'Mouse'}, | |
{"url": "c", "Foo": '=C$2:C2', "bar": 'Dog'}, | |
{"url": "d", "Foo": '=C$2:C5', "bar": 'Eagle'}, | |
{"url": "e", "Foo": 'C$2:C6', "bar": 'Albatross'}, | |
{"url": "f", "Foo": '', "bar": 'Albatross'}, | |
]) | |
def df_add_col(df): | |
df_new_col = df.copy() | |
df_new_col["zoo"] = "New col" | |
return df_new_col | |
df2_new_col = df_add_col(df1) | |
def df_add_row(df): | |
df_with_new_row = pd.DataFrame([{"url": "g", "Foo": 'Goo', "bar": 'Tiger'}]) | |
return pd.concat([df, df_with_new_row]).reset_index(drop=True) | |
df3_new_row = df_add_row(df1) | |
df4_new_col_new_row = df_add_col(df_add_row(df1)) | |
df5_changed_contents = df1.copy() | |
df5_changed_contents.at[2,'bar'] = 'Snake' | |
df6_same_contents_different_index = df1.copy() | |
df6_same_contents_different_index = df6_same_contents_different_index.set_index(df6_same_contents_different_index.index + 1) | |
df7_dupes = df1.copy() | |
df7_dupes.at[0,'url'] = 'z' | |
df7_dupes.at[1,'url'] = 'z' | |
df7_dupes.at[2,'url'] = 'z' | |
display(df7_dupes) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from pandas.errors import MergeError | |
def display_df_comparison_raw(a_df: pd.DataFrame, b_df: pd.DataFrame, primary_key_columns: list, what: str): | |
try: | |
outer_merge = a_df.reset_index().merge( | |
b_df.reset_index(), | |
on=primary_key_columns, | |
how="outer", | |
indicator=True, | |
suffixes=("_a", "_b"), | |
validate="1:1", | |
).sort_values(by=["index_a", *primary_key_columns]) | |
#print("outer_merge") | |
#display(outer_merge) | |
#print("Shared rows:") | |
shared_rows_index_a = outer_merge[outer_merge["_merge"] == "both"].set_index("index_a").index | |
shared_rows_index_b = outer_merge[outer_merge["_merge"] == "both"].set_index("index_b").index | |
#display(shared_rows) | |
print(f"{what} only in A:") | |
rows_only_in_a = outer_merge[outer_merge["_merge"] == "left_only"].set_index("index_a").index | |
if len(rows_only_in_a) == 0: | |
print("None") | |
else: | |
display(a_df.loc[rows_only_in_a]) | |
print(f"{what} only in B:") | |
rows_only_in_b = outer_merge[outer_merge["_merge"] == "right_only"].set_index("index_b").index | |
if len(rows_only_in_b) == 0: | |
print("None") | |
else: | |
display(b_df.loc[rows_only_in_b]) | |
# for the columns and rows that are common, show the actual differences | |
print(f"Diffs within:") | |
shared_columns = list(set(a_df.columns) & set(b_df.columns)) | |
a_df_shared_columns = a_df[shared_columns] | |
b_df_shared_columns = b_df[shared_columns] | |
#print("a_df_shared_columns", a_df_shared_columns) | |
#print("b_df_shared_columns", b_df_shared_columns) | |
#print("shared_rows_index_a", shared_rows_index_a) | |
#print("shared_rows_index_b", shared_rows_index_b) | |
a_df_to_compare = a_df_shared_columns.loc[shared_rows_index_a].reset_index(drop=False).rename(columns={"index_a": "Original Index"}).set_index(primary_key_columns, drop=True) | |
b_df_to_compare = b_df_shared_columns.loc[shared_rows_index_b].reset_index(drop=False).rename(columns={"index_b": "Original Index"}).set_index(primary_key_columns, drop=True) | |
if a_df_to_compare.equals(b_df_to_compare): | |
print(f"Identical") | |
else: | |
#display(a_df_to_compare) | |
#display(b_df_to_compare) | |
comparison_result_df = a_df_to_compare.compare(b_df_to_compare) # pandas 1.5.0+ , result_names=('a', 'b') | |
display(comparison_result_df) | |
except MergeError as mergeError: | |
print("MergeError", mergeError) | |
print("Duplicates along the primary key columns:") | |
print("A") | |
duplicates_a_df = a_df[a_df.duplicated( | |
subset=primary_key_columns, | |
keep=False, | |
)].sort_values(by=primary_key_columns) | |
display(duplicates_a_df) | |
print("B") | |
duplicates_b_df = b_df[b_df.duplicated( | |
subset=primary_key_columns, | |
keep=False, | |
)].sort_values(by=primary_key_columns) | |
display(duplicates_b_df) | |
def display_df_comparison(a_df: pd.DataFrame, b_df: pd.DataFrame, primary_key_columns: list): | |
# data = GsheetsWorksheetData(df, header_row_number=0, attributes_to_columns_map={'foo': 'Foo'}) | |
if a_df.equals(b_df): | |
print("Identical dataframes") | |
return | |
print("-- Columns --") | |
#print("Shared columns:") | |
#shared_columns = list(set(a_df.columns) & set(b_df.columns)) | |
#print(shared_columns) | |
a_df_dtypes_df = pd.DataFrame(a_df.dtypes, columns=["dtype"]).reset_index().rename(columns={"index": "column"}) | |
b_df_dtypes_df = pd.DataFrame(b_df.dtypes, columns=["dtype"]).reset_index().rename(columns={"index": "column"}) | |
display_df_comparison_raw(a_df_dtypes_df, b_df_dtypes_df, primary_key_columns=["column"], what="Columns") | |
print("-- Rows / cell contents --") | |
display_df_comparison_raw(a_df, b_df, primary_key_columns=primary_key_columns, what="Rows") | |
print("Columns data only in A:") | |
columns_only_in_a = list(set(a_df.columns) - set(b_df.columns)) | |
if len(columns_only_in_a) == 0: | |
print("None") | |
else: | |
display(a_df[columns_only_in_a]) | |
print("Columns data only in B:") | |
columns_only_in_b = list(set(b_df.columns) - set(a_df.columns)) | |
if len(columns_only_in_b) == 0: | |
print("None") | |
else: | |
display(b_df[columns_only_in_b]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment