Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Compare Excel sheets with Pandas
"""
Compare two Excel sheets
Inspired by https://pbpython.com/excel-diff-pandas-update.html
For the documentation, download this file and type:
python compare.py --help
"""
import argparse
import pandas as pd
import numpy as np
def report_diff(x):
    """Collapse an (old, new) value pair into a single diff cell.

    Returns the unchanged value when both entries agree (or both are
    missing), otherwise an "old ---> new" marker string.
    """
    old, new = x[0], x[1]
    if old == new or pd.isna(x).all():
        return old
    return f'{old} ---> {new}'
def strip(x):
    """Trim surrounding whitespace from string cells, pass others through."""
    if isinstance(x, str):
        return x.strip()
    return x
def diff_pd(old_df, new_df, idx_col):
    """
    Identify differences between two pandas DataFrames using a key column.

    Key column is assumed to have a unique row identifier, i.e. no duplicates.

    Args:
        old_df (pd.DataFrame): first dataframe
        new_df (pd.DataFrame): second dataframe
        idx_col (str|list(str)): column name(s) of the index,
            needs to be present in both DataFrames

    Returns:
        dict: up to three entries -- "removed", "added" and "changed" --
        each mapped to a non-empty DataFrame; empty results are omitted,
        so an empty dict means no differences at all.
    """
    # setting the column name as index for fast operations
    old_df = old_df.set_index(idx_col)
    new_df = new_df.set_index(idx_col)
    # get the added and removed rows
    old_keys = old_df.index
    new_keys = new_df.index
    if isinstance(old_keys, pd.MultiIndex):
        # composite key: use the MultiIndex set operations,
        # np.setdiff1d does not work on tuples of key values
        removed_keys = old_keys.difference(new_keys)
        added_keys = new_keys.difference(old_keys)
    else:
        removed_keys = np.setdiff1d(old_keys, new_keys)
        added_keys = np.setdiff1d(new_keys, old_keys)
    # populate the output data with non empty dataframes
    out_data = {}
    removed = old_df.loc[removed_keys]
    if not removed.empty:
        out_data["removed"] = removed
    added = new_df.loc[added_keys]
    if not added.empty:
        out_data["added"] = added
    # focusing on common data of both dataframes
    common_keys = np.intersect1d(old_keys, new_keys, assume_unique=True)
    common_columns = np.intersect1d(
        old_df.columns, new_df.columns, assume_unique=True
    )
    # strip whitespace so purely cosmetic edits are not reported as changes
    new_common = new_df.loc[common_keys, common_columns].applymap(strip)
    old_common = old_df.loc[common_keys, common_columns].applymap(strip)
    # get the changed rows keys by dropping identical rows
    # (indexes are ignored, so we'll reset them)
    common_data = pd.concat(
        [old_common.reset_index(), new_common.reset_index()], sort=True
    )
    changed_keys = common_data.drop_duplicates(keep=False)[idx_col]
    if isinstance(changed_keys, pd.Series):
        # single key column: deduplicate (each changed row appears twice,
        # once from old_common and once from new_common)
        changed_keys = changed_keys.unique()
    else:
        # multiple key columns: rebuild a MultiIndex of unique key tuples
        changed_keys = changed_keys.drop_duplicates().set_index(idx_col).index
    # combining the changed rows via multi level columns
    df_all_changes = pd.concat(
        [old_common.loc[changed_keys], new_common.loc[changed_keys]],
        axis='columns',
        keys=['old', 'new']
    ).swaplevel(axis='columns')
    # using report_diff to merge the changes in a single cell with "-->"
    # NOTE(review): groupby(..., axis=1) and applymap above are deprecated
    # in pandas >= 2.1 -- confirm the target pandas version before upgrading
    df_changed = df_all_changes.groupby(level=0, axis=1).apply(
        lambda frame: frame.apply(report_diff, axis=1))
    # add changed dataframe to output data only if non empty
    if not df_changed.empty:
        out_data['changed'] = df_changed
    return out_data
def compare_excel(
    path1, path2, out_path, sheet_name, index_col_name, **kwargs
):
    """Diff one sheet of two Excel workbooks and write the result.

    Reads ``sheet_name`` from both files, computes the differences with
    ``diff_pd`` and, when any are found, saves one sheet per category
    ("removed", "added", "changed") to ``out_path``. Extra keyword
    arguments are forwarded to ``pd.read_excel``.
    """
    old_df = pd.read_excel(path1, sheet_name=sheet_name, **kwargs)
    new_df = pd.read_excel(path2, sheet_name=sheet_name, **kwargs)
    diff = diff_pd(old_df, new_df, index_col_name)
    if not diff:
        print("No differences spotted")
        return
    with pd.ExcelWriter(out_path) as writer:
        for sname, data in diff.items():
            data.to_excel(writer, sheet_name=sname)
    print(f"Differences saved in {out_path}")
def build_parser():
    """Build the command line interface for the comparison script.

    Returns:
        argparse.ArgumentParser: parser exposing the positional arguments
        ``path1``, ``path2``, ``sheetname`` and ``key_column`` (one or
        more columns), plus the optional ``--output-path`` and
        ``--skiprows`` flags.
    """
    cfg = argparse.ArgumentParser(
        description="Compares two Excel sheets and outputs the differences "
        "to a separate Excel file."
    )
    # fixed typo: help text previously read "Fist Excel file"
    cfg.add_argument("path1", help="First Excel file")
    cfg.add_argument("path2", help="Second Excel file")
    cfg.add_argument("sheetname", help="Name of the sheet to compare.")
    cfg.add_argument(
        "key_column",
        # fixed: a space was missing between "notation." and "Use"
        help="Name of the column(s) with unique row identifier. It has to be "
        "the actual text of the first row, not the excel notation. "
        "Use multiple times to create a composite index.",
        nargs="+",
    )
    cfg.add_argument("-o", "--output-path", default="compared.xlsx",
                     help="Path of the comparison results")
    # action='append' collects the repeated flag into a list for
    # pd.read_excel's skiprows parameter
    cfg.add_argument("--skiprows", type=int, action='append', default=None,
                     help="Excel row index to skip; repeat the flag to skip "
                          "several rows, e.g. blank rows above the table "
                          "headers")
    return cfg
def main():
    """CLI entry point: parse the arguments and run the comparison."""
    options = build_parser().parse_args()
    compare_excel(
        options.path1,
        options.path2,
        options.output_path,
        options.sheetname,
        options.key_column,
        skiprows=options.skiprows,
    )
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
"""Excel compare test suite."""
import io
import pandas as pd
import compare
def test_parser():
    """The CLI parser maps positional and optional args to the namespace."""
    argv = [
        "test1.xlsx", "test2.xlsx", "Sheet 1",
        "Col1", "Col2", "-o", "output.xlsx",
    ]
    opt = compare.build_parser().parse_args(argv)
    assert opt.path1 == "test1.xlsx"
    assert opt.path2 == "test2.xlsx"
    assert opt.output_path == "output.xlsx"
    assert opt.sheetname == "Sheet 1"
    assert opt.key_column == ["Col1", "Col2"]
    assert opt.skiprows is None
def build_excel_stream(df, sheetname):
    """Serialize *df* into an in-memory xlsx workbook and return the buffer."""
    workbook = io.BytesIO()
    writer = pd.ExcelWriter(workbook, engine="xlsxwriter")
    with writer:
        df.to_excel(writer, sheet_name=sheetname, index=False)
    return workbook
def sample_xlsx(df_1, df_2):
    """Write the two sample frames to separate in-memory workbooks."""
    return (
        build_excel_stream(df_1, "Sheet1"),
        build_excel_stream(df_2, "Sheet1"),
    )
def sample_dfs():
    """Two single-key frames: one changed, one added and one removed row."""
    columns = ["ID", "Name", "Flavour Description"]
    rows_old = [
        (123456, "Lemonade", "Fuzzy"),
        (654321, "Cola", "Fuzzy"),
        (543219, "Orange", "Fuzzy"),
        (432198, "Fruit Punch", "Fuzzy"),
        (765432, "Tobacco", "Smoky"),
    ]
    rows_new = [
        (123456, "Lemonade", "Fuzzy"),
        (654321, "Cola", "Bubbly"),
        (543219, "Orange", "Fuzzy"),
        (432198, "Fruit Punch", "Fuzzy"),
        (876543, "Soda", "Sugary"),
    ]
    return (
        pd.DataFrame(rows_old, columns=columns),
        pd.DataFrame(rows_new, columns=columns),
    )
def run_assertion(diff):
    """Check that diff holds exactly the expected one-row sheets."""
    expectations = {
        "changed": "Fuzzy ---> Bubbly",
        "added": "Sugary",
        "removed": "Smoky",
    }
    for sheet, flavour in expectations.items():
        frame = diff[sheet]
        assert len(frame) == 1
        assert frame.iloc[0]["Flavour Description"] == flavour
    print("OK.")
def test_single_index():
    """diff_pd detects all three difference kinds with a single key."""
    diff = compare.diff_pd(*sample_dfs(), ["ID"])
    run_assertion(diff)
def test_single_index_excel():
    """End-to-end comparison through Excel buffers with a single key."""
    xlsx_1, xlsx_2 = sample_xlsx(*sample_dfs())
    diff_io = io.BytesIO()
    compare.compare_excel(xlsx_1, xlsx_2, diff_io, "Sheet1", "ID")
    run_assertion(pd.read_excel(diff_io, sheet_name=None))
def sample_multiindex_dfs():
    """Two frames whose unique key is the (ID, Flavour ID) column pair."""
    shared_names = ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch"]
    flavour_ids = [1, 2, None, None, None, None]
    df_1 = pd.DataFrame({
        "ID": [123456, 123456, 654321, 543219, 432198, 765432],
        "Name": shared_names + ["Tobacco"],
        "Flavour ID": flavour_ids,
        "Flavour Description": ["Fuzzy"] * 5 + ["Smoky"],
    })
    df_2 = pd.DataFrame({
        "ID": [123456, 123456, 654321, 543219, 432198, 876543],
        "Name": shared_names + ["Soda"],
        "Flavour ID": flavour_ids,
        "Flavour Description": ["Fuzzy", "Bubbly"] + ["Fuzzy"] * 3 + ["Sugary"],
    })
    return df_1, df_2
def test_multiindex():
    """diff_pd detects all three difference kinds with a composite key."""
    diff = compare.diff_pd(*sample_multiindex_dfs(), ["ID", "Flavour ID"])
    run_assertion(diff)
def test_multiindex_excel():
    """End-to-end comparison through Excel buffers with a composite key."""
    xlsx_1, xlsx_2 = sample_xlsx(*sample_multiindex_dfs())
    diff_io = io.BytesIO()
    compare.compare_excel(xlsx_1, xlsx_2, diff_io, "Sheet1", ["ID", "Flavour ID"])
    run_assertion(pd.read_excel(diff_io, sheet_name=None))
def test_no_diffs():
    """Comparing a frame with itself yields no differences at all."""
    df_1, _ = sample_multiindex_dfs()
    assert not compare.diff_pd(df_1, df_1, ["ID", "Flavour ID"])
    print("OK.")
# Lightweight runner so the suite can be executed without pytest.
if __name__ == '__main__':
    test_multiindex()
    test_multiindex_excel()
    test_single_index()
    test_single_index_excel()
    test_parser()
    # fixed: previously called test_no_diff(), which does not exist
    # (the function is named test_no_diffs) and raised NameError
    test_no_diffs()
@sanzoghenzo

This comment has been minimized.

Copy link
Owner Author

@sanzoghenzo sanzoghenzo commented Jan 3, 2020

Readme

TL;DR

Just type the following in a prompt to get help:

python compare.py -h

Preamble

This script originates from this great blog post by Chris Moffitt, and tries to make things reusable.

We (the author of the post and me) are making a few assumptions about the data we try to compare:

  • the tables in the excel sheet start at column A and the first row is used as header (but you can skip initial empty/non data rows with --skiprows);
  • the tables have a column that holds unique values, like a primary key of a database table, that way we can detect added/removed/changed rows in a simpler way and regardless of their order;

To get a sense of this, just download the sample data linked in the blog post near the beginning.

Usage

You can use the script in different ways:

  • using the command python compare.py and provide the parameters inside a command/shell prompt;
  • calling the compare_excel function inside another python script, providing the files, sheet and key column as arguments;
  • if your script already has dataframes loaded, you can use diff_pd directly.

CLI usage

The first method uses argparse, a built-in library that allows you to quickly create command line interfaces.
It also automatically adds the -h switch to display the help as many other commands do, simply type:

python compare.py -h

to get the info for every parameters you need to specify (and optional ones).

The command to issue with the sample data from the blog is:

python compare.py sample-address-1.xlsx sample-address-2.xlsx Sheet1 'account number'

If you have any problem with the code...

...please help me to help you.

  • Can you provide me some minimal test data to reproduce the bug?
  • Can you post the command you ran?
  • Can you post the full error message, or at least tell me the line that throws the error?

If you don't provide me this information, I will not be able to perform the code analysis in a short time, so that I'll not be afraid to ignore you.
Life is short, for both of us! :)

@wvigiloliver

This comment has been minimized.

Copy link

@wvigiloliver wvigiloliver commented Jan 6, 2020

Hello, the script is great! Just wondering if you could share some advice on how it could be modified to show differences between the numeric columns in the two files in percentage terms (ie. x/y-1) rather than with the '--->' symbol. The non-numeric columns could be kept the same to not overly complicate things, but the ideal case would be to show the '--->' if there is indeed a difference. Thanks in advance.

@sanzoghenzo

This comment has been minimized.

Copy link
Owner Author

@sanzoghenzo sanzoghenzo commented Jan 7, 2020

What a great idea! Personally I only needed to check what changed in terms of absolute values, but I can see the usefulness of your request.

You only need to change the report_diff function to handle the case of numeric values, something like that (untested):

def report_diff(x):
    if x[0] == x[1] or pd.isna(x).all():
        # same value, keeping it
        return x[0]
    if x.str.isnumeric().all():
        # different numeric values, calculating the variation
        return x[0] / x[1] - 1
    # falling back to show the two values
    return f'{x[0]} ---> {x[1]}'

The above only shows the numeric value without the %, for it you should format the cell (harder since you need to get their index somehow) or use an f-string to format it directly.

Of course, this doesn't cover the edge cases (division by zero is the first case I can think of), I'll leave it to you as an exercise ;)

@wvigiloliver

This comment has been minimized.

Copy link

@wvigiloliver wvigiloliver commented Jan 7, 2020

Thanks. I assume I can still keep the non-numeric columns as the if x.str.isnumeric().all(): handles whether a value is numeric or not. That said, I tried the code in spreadsheets with only two columns, the first being the col with the unique row identifier and the other being a col with only numeric values and an error came up: AttributeError: ('Can only use .str accessor with string values, which use np.object_ dtype in pandas', 'occurred at index 2'). i believe this is related to the fact that you can only use .str with string columns, although I tried modifying my column to have text and numbers, but I guess it's always read by python as numeric regardless.

@sanzoghenzo

This comment has been minimized.

Copy link
Owner Author

@sanzoghenzo sanzoghenzo commented Jan 10, 2020

Of course I should have tested it...

Instead of checking if they are numbers, go the pythonic way and just try the calculation, falling back to the arrow notation on except (TypeError, ZeroDivisionError)

@kanemurphy1

This comment has been minimized.

Copy link

@kanemurphy1 kanemurphy1 commented Feb 4, 2020

Hello,

Do you have any suggestion for a scenario where the unique identifier is duplicated in the key-column, but there is an additional identifier? I have case within one excel file where this does and doesn't occur, as per the example below.

ID Name Flavour ID Flavour Description
123456 Lemonade 1 Fuzzy
123456 Lemonade 2 Fuzzy
654321 Cola   Fuzzy
543219 Orange   Fuzzy
432198 Fruit Punch   Fuzzy
ID Name Flavour ID Flavour Description
123456 Lemonade 1 Fuzzy
123456 Lemonade 2 Bubbly
654321 Cola   Fuzzy
543219 Orange   Fuzzy
432198 Fruit Punch   Fuzzy

Thank you!

@sanzoghenzo

This comment has been minimized.

Copy link
Owner Author

@sanzoghenzo sanzoghenzo commented Feb 5, 2020

Hi @kanemurphy1,
I've updated the code to handle multiple columns as index, and I've written a test suite with your sample data (and some more data to test the added and removed rows).
Now you can specify multiple columns in the command line, and they are used as a MultiIndex.
This is what I've changed:

  • using "nargs='+'" in argument key_column to allow multiple columns to be specified;
  • renaming the argument key_column, because otherwise it showed in the namespace as key-column;
  • handle the case of common_keys being a DataFrame and transforming it into a MultiIndex.
  • handle the case of old_keys and new_keys being a MultiIndex and using the difference method to get added and removed row indices.
@roborobrob

This comment has been minimized.

Copy link

@roborobrob roborobrob commented Feb 15, 2020

Hello sanzoghenzo,

When trying to run the latest code with the sample data files, I get following error:

KeyError: 'None of ["'name'"] are in the columns'

Any ideas why? Thanks!

@elmikele

This comment has been minimized.

Copy link

@elmikele elmikele commented Feb 19, 2020

Hello Sanzoghenzo,
first of all many thanks for all your efforts.
I have two questions:

  1. If I run your latest code, I get the follwing error at line 38:
    line 38, in diff_pd: if isinstance(old_keys, pd.MultiIndex):
    NameError: name 'old_keys' is not defined
    The earlier version works without problems for me. Do you know what the problem could be?
  2. You and Chris Moffit are assuming that "the tables have a column that holds unique values, like a primary key of a database table, that way we can detect added/removed/changed rows in a simpler way and regardless of their order"
    In my case, I would like to compare two excel files, which have a unique column; however, the identifiers in this column are a mix of integers and strings like this: '234564' or '1-19003' or 'XY/1/17/1254/001' or 'AF446444'
    Is there anyway that I can use that such a column for indexing the single rows?

Thanks a lot!

@sanzoghenzo

This comment has been minimized.

Copy link
Owner Author

@sanzoghenzo sanzoghenzo commented Feb 23, 2020

Hello sanzoghenzo,

When trying to run the latest code with the sample data files, I get following error:
KeyError: 'None of ["'name'"] are in the columns'
Any ideas why? Thanks!

Hi @roborobrob, You've been too generic in describing the issue, I can't help you.
Please specify which sample data you use (I suppose Chris Moffitt's, I haven't really ever tested them) and the exact command/function you're calling.

Hello Sanzoghenzo,
first of all many thanks for all your efforts.

Hi @elmikele,
I'm glad to help!

I have two questions:

1. If I run your latest code, I get the following error at line 38:
   line 38, in diff_pd:    if isinstance(old_keys, pd.MultiIndex):
   NameError: name 'old_keys' is not defined
   The earlier version works without problems for me. Do you know what the problem could be?

I'm sorry, I forgot to copy the two lines before line 38 that create old_keys and new_keys variables (they're the indexes of the two data frames, I extracted them to reuse in the next lines).
Now it's fixed!

2. You and Chris Moffit are assuming that "the tables have a column that holds unique values, like a primary key of a database table, that way we can detect added/removed/changed rows in a simpler way and regardless of their order"
   In my case, I would like to compare two excel files, which have a unique column; however, the identifiers in this column are a mix of integers and strings like this: '234564' or '1-19003' or 'XY/1/17/1254/001' or 'AF446444'
   Is there anyway that I can use that such a column for indexing the single rows?

I've done a quick test and the problem is that an index needs to have values of the same (or comparable) type to be sortable.
Pandas gets excel values from xlrd or openpyxl, and they convert the numbers into ints or floats.
One option is to force the type of your unique column as text directly inside Excel;
The other option is to add a converter in this script (or in a script that calls the diff_pd function), using the astype method, such as the following (untested):

def compare_excel_mixed_index(path1, path2, out_path, sheet_name, index_col_name, **kwargs):
    old_df = pd.read_excel(path1, sheet_name=sheet_name, **kwargs)
    new_df = pd.read_excel(path2, sheet_name=sheet_name, **kwargs)
    old_df[index_col_name] = old_df[index_col_name].astype(str)
    new_df[index_col_name] = new_df[index_col_name].astype(str)
    diff = diff_pd(old_df, new_df, index_col_name)
    with pd.ExcelWriter(out_path) as writer:
        for sname, data in diff.items():
            data.to_excel(writer, sheet_name=sname)
    print(f"Differences saved in {out_path}")

I will not add this to my script, since I prefer to do a normalization/cleaning of my data as a separate step.
Hope this helps!

@elmikele

This comment has been minimized.

Copy link

@elmikele elmikele commented Feb 23, 2020

Hi Sanzoghenzo,
many thanks for this swift reply.
The code works now perfectly fine.
Your suggestion to force all data of the indexing column into text directly in Excel is probably the easiest and it works perfectly fine! Sorry that I haven't thought about this. I will consider, however, your approach using the astype method, if I want to automate the workflow a bit more.
Thanks again! This saves a lot of manual work now!

@dhayams

This comment has been minimized.

Copy link

@dhayams dhayams commented Mar 3, 2020

I am getting this error with latest version of pandas

KeyError: 'Passing list-likes to .loc or [] with any missing labels is no longer supported

Please do mention which version of pandas are supported

@sanzoghenzo

This comment has been minimized.

Copy link
Owner Author

@sanzoghenzo sanzoghenzo commented Mar 4, 2020

I am getting this error with latest version of pandas

KeyError: 'Passing list-likes to .loc or [] with any missing labels is no longer supported

Please do mention which version of pandas are supported

Hi @dhayams,
please help me to help you.
Can you provide me some minimal test data to reproduce the bug?
Can you post the command you ran?
Can you post the full error message, or at least tell me the line that throws the error?
I had no trouble running my tests with the latest pandas version.
I suspected that the problem is when there's no changes, so I added a test for it but everything is ok.
I could try to investigate other parts of code, but I prefer to save time for other important things and wait for your help :)

BTW: pre-1.0 pandas should work ok. This error was a simple deprecation warning until 0.25

@javipkt

This comment has been minimized.

Copy link

@javipkt javipkt commented Jan 6, 2021

Hi! Thanks for the script! Great work!

I have an issue when picking 2 columns as key column (the 1st and 2nd ones). The error is "File "D:\ANACONDA\lib\site-packages\pandas\core\indexes\multi.py", line 1902, in getitem
if level_codes[key] == -1:
IndexError: index 0 is out of bounds for axis 0 with size 0"

Numero Indice Nombre Apellido Edad
1500001 a Name1 Surname1 39
1500002 b Name2 Surname2 52
1500003 c Name3 Surname3 18
1500004 d Name4 Surname4 76
1500005 e Name5 Surname5 23
1500006 f Name6 Surname6 99
Numero Indice Nombre Apellido Edad
1500001 a Name1 Surname1 39
1500002 b Name2 Surme2 52
1500003 c Name3 Surname3 23
1500005 e Name5 Surname5 23
1500006 f Name6 Surname6 88

If instead of the second column I select another, it does not happen and it is correctly processed. I am using Anaconda 2020.11 with Python 3.8.5. Please, let me know if you need further information.

Thanks!

@sanzoghenzo

This comment has been minimized.

Copy link
Owner Author

@sanzoghenzo sanzoghenzo commented Jan 9, 2021

hi @javipkt,
You caught a dormant bug!
This occurs when there are no rows added (or removed), and we passed an empty dataframe to data.to_excel.
It is corrected now.

@javipkt

This comment has been minimized.

Copy link

@javipkt javipkt commented Jan 9, 2021

Thanks! It works like a charm now!

I have added to my version the following code to convert the data to strings and drop and fill nan values:
.................................
def compare_excel(
path1, path2, out_path, sheet_name, index_col_name, **kwargs
):
old_df = pd.read_excel(path1, sheet_name=sheet_name, **kwargs)
new_df = pd.read_excel(path2, sheet_name=sheet_name, **kwargs)

# droping all those rows which have any 'nan' value in it.
old_df.dropna(axis = 1, how ='all', inplace = True)
new_df.dropna(axis = 1, how ='all', inplace = True)

# filling in all remaining 'nan' value with blanks
old_df.fillna("", inplace = True)
new_df.fillna("", inplace = True)

# convert all dataframes' columns to string type to prevent errors
old_df = old_df.astype(str)
new_df = new_df.astype(str)

My Python skills are very basic, so probably this can be done in a better way.

@javipkt

This comment has been minimized.

Copy link

@javipkt javipkt commented Jan 9, 2021

Btw, I think it would be a good feature to highlight the text of the changed values that contains "--->" in red color for instance. So it would be easier to see at a glance the changed values amongst all the other values.

@sanzoghenzo

This comment has been minimized.

Copy link
Owner Author

@sanzoghenzo sanzoghenzo commented Jan 9, 2021

Btw, I think it would be a good feature to highlight the text of the changed values that contains "--->" in red color for instance. So it would be easier to see at a glance the changed values amongst all the other values.

Well, this is something that can be done, but requires the use of xlsxwriter to manipulate the excel file.
It could also be used to output a single sheet with, i.e., removed rows highlighted in red, added rows in green, changed cells in yellow...

But it has to be rewritten by a lot, and the excel part cannot be decoupled from the core logic anymore.

Unfortunately I don't think I have time to do any of this soon 😅

@gbozzetti

This comment has been minimized.

Copy link

@gbozzetti gbozzetti commented Feb 16, 2021

Hi @sanzoghenzo,

your last revision on compare_excel function throws an error when all the values in the dictionary diff are empty (so no removed, no added, no changed)

image

A possible solution I've successfully tested could be as below (check if the diff dictionary has all the values empty):

def compare_excel(
        path1, path2, out_path, sheet_name, index_col_name, **kwargs
):
    old_df = pd.read_excel(path1, sheet_name=sheet_name, **kwargs)
    new_df = pd.read_excel(path2, sheet_name=sheet_name, **kwargs)
    diff = diff_pd(old_df, new_df, index_col_name)
    if not all(value.empty for value in diff.values()):
        with pd.ExcelWriter(out_path) as writer:
            for sname, data in diff.items():
                if not data.empty:
                    data.to_excel(writer, sheet_name=sname)
        print(f"Differences saved in {out_path}")
    else:
        print("No differences spotted")

I hope it helps.

Ciao

@sanzoghenzo

This comment has been minimized.

Copy link
Owner Author

@sanzoghenzo sanzoghenzo commented Feb 16, 2021

Ciao @gbozzetti, thanks for raising this issue and for your good fix!
(tip: you can use any in place of not all)

I've gone the other way around: I moved the checks in the diff_pd function; the returning dictionary has only items with valid data, so we can simply check for if diff:.

This also makes things simpler for other functions that use diff_pd: if "added" in diff:... and so on...

Well, one can argue that I broke the API (one expected to always find a 3 items dictionary with a data frame in it), but I think it's cleaner this way...

Thanks again for the heads up!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment