Skip to content

Instantly share code, notes, and snippets.

@wvigiloliver
Forked from sanzoghenzo/compare.py
Last active January 2, 2020 04:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wvigiloliver/b54ce0e06b04ab235736dbcc1a1058af to your computer and use it in GitHub Desktop.
Save wvigiloliver/b54ce0e06b04ab235736dbcc1a1058af to your computer and use it in GitHub Desktop.
Compare Excel sheets with Pandas
"""
Command-Line Interface Program to Compare Excel Sheets using Python
Author: https://github.com/sanzoghenzo
Inspired by https://pbpython.com/excel-diff-pandas-update.html
"""
import argparse # Module to integrate Python code with command-line interfaces
import pandas as pd
import numpy as np
def report_diff(x):
"""Helper function to use with groupby.apply to highlight changes in cell values"""
return x[0] if x[0] == x[1] or pd.isna(x).all() else f'{x[0]} ---> {x[1]}'
def strip(x):
"""Helper function to use with applymap to strip all whitespaces from the dataframe"""
return x.strip() if isinstance(x, str) else x
def diff_pd(old_df, new_df, idx_col):
"""Identify differences between two pandas DataFrames using a key column containing unique row identifers
Args:
old_df (pd.DataFrame): first dataframe
new_df (pd.DataFrame): second dataframe
idx_col (str): column name of the index, needs to be present in both DataFrames
Note: Key column is assumed to have a unique row identifier, i.e. no duplicates
"""
# setting the column name as index for fast operations
old_df = old_df.set_index(idx_col)
new_df = new_df.set_index(idx_col)
# get the added and removed rows
old_keys = old_df.index
new_keys = new_df.index
removed_keys = np.setdiff1d(old_keys, new_keys)
added_keys = np.setdiff1d(new_keys, old_keys)
out_data = {
'removed': old_df.loc[removed_keys],
'added': new_df.loc[added_keys]
}
# focusing on common data of both dataframes
common_keys = np.intersect1d(old_keys, new_keys, assume_unique=True)
common_columns = np.intersect1d(old_df.columns, new_df.columns, assume_unique=True)
new_common = new_df.loc[common_keys, common_columns].applymap(strip)
old_common = old_df.loc[common_keys, common_columns].applymap(strip)
# get the changed rows keys by dropping identical rows
# (indexes are ignored, so we'll reset them)
common_data = pd.concat([old_common.reset_index(), new_common.reset_index()])
changed_keys = common_data.drop_duplicates(keep=False)[idx_col].unique()
# combining the changed rows via multi level columns
df_all_changes = pd.concat([old_common.loc[changed_keys], new_common.loc[changed_keys]],
axis='columns', keys=['old', 'new'])
df_all_changes = df_all_changes.swaplevel(axis='columns')[new_common.columns]
# using report_diff to merge the changes in a single cell with "-->"
df_changed = df_all_changes.groupby(level=0, axis=1).apply(
lambda frame: frame.apply(report_diff, axis=1))
out_data['changed'] = df_changed
return out_data
def compare_excel(path1, path2, out_path, sheet_name, index_col_name=None, **kwargs):
old_df = pd.read_excel(path1, sheet_name=sheet_name, **kwargs)
new_df = pd.read_excel(path2, sheet_name=sheet_name, **kwargs)
diff = diff_pd(old_df, new_df, index_col_name)
with pd.ExcelWriter(out_path) as writer:
for sname, data in diff.items():
data.to_excel(writer, sheet_name=sname)
print(f"Differences saved in {out_path}")
def main():
cfg = argparse.ArgumentParser(
description="Compares two Excel sheets and outputs the differences to a separate Excel file. "
"A column name can be specified as the unique row identifier."
)
cfg.add_argument("path1", help="Fist Excel file")
cfg.add_argument("path2", help="Second Excel file")
cfg.add_argument("sheetname", help="Name of the sheet to compare.")
cfg.add_argument("-c", "--index-column", help="Name of the column with unique row identifier",
required=True)
cfg.add_argument("-o", "--output-path", help="Path of the comparison results",
default="compared.xlsx")
cfg.add_argument("--skiprows", help='Excel row containing the table headers', type=int, action='append',
default=None)
opt = cfg.parse_args()
compare_excel(opt.path1, opt.path2, opt.output_path, opt.sheetname, opt.index_column,
skiprows=opt.skiprows)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment