-
-
Save wvigiloliver/b54ce0e06b04ab235736dbcc1a1058af to your computer and use it in GitHub Desktop.
Compare Excel sheets with Pandas
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Command-Line Interface Program to Compare Excel Sheets using Python | |
Author: https://github.com/sanzoghenzo | |
Inspired by https://pbpython.com/excel-diff-pandas-update.html | |
""" | |
import argparse # Module to integrate Python code with command-line interfaces | |
import pandas as pd | |
import numpy as np | |
def report_diff(x): | |
"""Helper function to use with groupby.apply to highlight changes in cell values""" | |
return x[0] if x[0] == x[1] or pd.isna(x).all() else f'{x[0]} ---> {x[1]}' | |
def strip(x): | |
"""Helper function to use with applymap to strip all whitespaces from the dataframe""" | |
return x.strip() if isinstance(x, str) else x | |
def diff_pd(old_df, new_df, idx_col): | |
"""Identify differences between two pandas DataFrames using a key column containing unique row identifers | |
Args: | |
old_df (pd.DataFrame): first dataframe | |
new_df (pd.DataFrame): second dataframe | |
idx_col (str): column name of the index, needs to be present in both DataFrames | |
Note: Key column is assumed to have a unique row identifier, i.e. no duplicates | |
""" | |
# setting the column name as index for fast operations | |
old_df = old_df.set_index(idx_col) | |
new_df = new_df.set_index(idx_col) | |
# get the added and removed rows | |
old_keys = old_df.index | |
new_keys = new_df.index | |
removed_keys = np.setdiff1d(old_keys, new_keys) | |
added_keys = np.setdiff1d(new_keys, old_keys) | |
out_data = { | |
'removed': old_df.loc[removed_keys], | |
'added': new_df.loc[added_keys] | |
} | |
# focusing on common data of both dataframes | |
common_keys = np.intersect1d(old_keys, new_keys, assume_unique=True) | |
common_columns = np.intersect1d(old_df.columns, new_df.columns, assume_unique=True) | |
new_common = new_df.loc[common_keys, common_columns].applymap(strip) | |
old_common = old_df.loc[common_keys, common_columns].applymap(strip) | |
# get the changed rows keys by dropping identical rows | |
# (indexes are ignored, so we'll reset them) | |
common_data = pd.concat([old_common.reset_index(), new_common.reset_index()]) | |
changed_keys = common_data.drop_duplicates(keep=False)[idx_col].unique() | |
# combining the changed rows via multi level columns | |
df_all_changes = pd.concat([old_common.loc[changed_keys], new_common.loc[changed_keys]], | |
axis='columns', keys=['old', 'new']) | |
df_all_changes = df_all_changes.swaplevel(axis='columns')[new_common.columns] | |
# using report_diff to merge the changes in a single cell with "-->" | |
df_changed = df_all_changes.groupby(level=0, axis=1).apply( | |
lambda frame: frame.apply(report_diff, axis=1)) | |
out_data['changed'] = df_changed | |
return out_data | |
def compare_excel(path1, path2, out_path, sheet_name, index_col_name=None, **kwargs): | |
old_df = pd.read_excel(path1, sheet_name=sheet_name, **kwargs) | |
new_df = pd.read_excel(path2, sheet_name=sheet_name, **kwargs) | |
diff = diff_pd(old_df, new_df, index_col_name) | |
with pd.ExcelWriter(out_path) as writer: | |
for sname, data in diff.items(): | |
data.to_excel(writer, sheet_name=sname) | |
print(f"Differences saved in {out_path}") | |
def main(): | |
cfg = argparse.ArgumentParser( | |
description="Compares two Excel sheets and outputs the differences to a separate Excel file. " | |
"A column name can be specified as the unique row identifier." | |
) | |
cfg.add_argument("path1", help="Fist Excel file") | |
cfg.add_argument("path2", help="Second Excel file") | |
cfg.add_argument("sheetname", help="Name of the sheet to compare.") | |
cfg.add_argument("-c", "--index-column", help="Name of the column with unique row identifier", | |
required=True) | |
cfg.add_argument("-o", "--output-path", help="Path of the comparison results", | |
default="compared.xlsx") | |
cfg.add_argument("--skiprows", help='Excel row containing the table headers', type=int, action='append', | |
default=None) | |
opt = cfg.parse_args() | |
compare_excel(opt.path1, opt.path2, opt.output_path, opt.sheetname, opt.index_column, | |
skiprows=opt.skiprows) | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment