Skip to content

Instantly share code, notes, and snippets.

@sanzoghenzo
Last active April 12, 2024 04:48
Show Gist options
  • Star 19 You must be signed in to star a gist
  • Fork 11 You must be signed in to fork a gist
  • Save sanzoghenzo/73275613c592331180a24cb2ddfd5bcb to your computer and use it in GitHub Desktop.
Save sanzoghenzo/73275613c592331180a24cb2ddfd5bcb to your computer and use it in GitHub Desktop.
Compare Excel sheets with Pandas
"""
Compare two Excel sheets
Inspired by https://pbpython.com/excel-diff-pandas-update.html
For the documentation, download this file and type:
python compare.py --help
"""
import argparse
import pandas as pd
import numpy as np
def report_diff(x):
    """Merge an (old, new) value pair into a single report cell.

    Used with ``DataFrame.apply(axis=1)``, so ``x`` is a two-element
    Series holding the old value first and the new value second.

    Args:
        x (pd.Series): two-element series ``[old, new]``.

    Returns:
        The old value when the pair is equal or both values are NaN,
        otherwise the string ``"old ---> new"``.
    """
    # Use positional .iloc access: in the apply() call sites the Series
    # index carries the 'old'/'new' column labels, and plain x[0] on a
    # labeled Series relies on a deprecated positional fallback.
    old, new = x.iloc[0], x.iloc[1]
    # NaN != NaN, so an explicit all-NaN check keeps untouched blanks quiet.
    return old if old == new or pd.isna(x).all() else f'{old} ---> {new}'
def strip(x):
    """Trim surrounding whitespace from string cells; leave other values alone.

    Meant for ``DataFrame.applymap``, where cells may be any dtype.
    """
    if isinstance(x, str):
        return x.strip()
    return x
def diff_pd(old_df, new_df, idx_col):
    """
    Identify differences between two pandas DataFrames using a key column.

    Key column is assumed to have a unique row identifier, i.e. no duplicates.

    Args:
        old_df (pd.DataFrame): first dataframe
        new_df (pd.DataFrame): second dataframe
        idx_col (str|list(str)): column name(s) of the index,
            needs to be present in both DataFrames

    Returns:
        dict: only the non-empty results among
            "removed" (rows whose key exists only in ``old_df``),
            "added" (rows whose key exists only in ``new_df``) and
            "changed" (common rows rewritten with "old ---> new" cells);
            an empty dict means no differences were found.
    """
    # setting the column name as index for fast operations
    old_df = old_df.set_index(idx_col)
    new_df = new_df.set_index(idx_col)
    # get the added and removed rows
    old_keys = old_df.index
    new_keys = new_df.index
    if isinstance(old_keys, pd.MultiIndex):
        # np.setdiff1d does not handle MultiIndex tuples; use the
        # pandas set operations for composite keys instead.
        removed_keys = old_keys.difference(new_keys)
        added_keys = new_keys.difference(old_keys)
    else:
        removed_keys = np.setdiff1d(old_keys, new_keys)
        added_keys = np.setdiff1d(new_keys, old_keys)
    # populate the output data with non empty dataframes
    out_data = {}
    # removed_keys/added_keys are guaranteed present in their frame,
    # so .loc cannot raise the missing-label KeyError here.
    removed = old_df.loc[removed_keys]
    if not removed.empty:
        out_data["removed"] = removed
    added = new_df.loc[added_keys]
    if not added.empty:
        out_data["added"] = added
    # focusing on common data of both dataframes
    common_keys = np.intersect1d(old_keys, new_keys, assume_unique=True)
    common_columns = np.intersect1d(
        old_df.columns, new_df.columns, assume_unique=True
    )
    # strip whitespace so purely cosmetic edits don't register as changes
    new_common = new_df.loc[common_keys, common_columns].applymap(strip)
    old_common = old_df.loc[common_keys, common_columns].applymap(strip)
    # get the changed rows keys by dropping identical rows
    # (indexes are ignored, so we'll reset them)
    common_data = pd.concat(
        [old_common.reset_index(), new_common.reset_index()], sort=True
    )
    changed_keys = common_data.drop_duplicates(keep=False)[idx_col]
    if isinstance(changed_keys, pd.Series):
        # single key column: deduplicate keys appearing in both halves
        changed_keys = changed_keys.unique()
    else:
        # multiple key columns: rebuild a MultiIndex from the key frame
        changed_keys = changed_keys.drop_duplicates().set_index(idx_col).index
    # combining the changed rows via multi level columns
    df_all_changes = pd.concat(
        [old_common.loc[changed_keys], new_common.loc[changed_keys]],
        axis='columns',
        keys=['old', 'new']
    ).swaplevel(axis='columns')
    # using report_diff to merge the changes in a single cell with "-->"
    # NOTE(review): applymap and groupby(..., axis=1) are deprecated in
    # recent pandas releases — confirm the supported pandas version.
    df_changed = df_all_changes.groupby(level=0, axis=1).apply(
        lambda frame: frame.apply(report_diff, axis=1))
    # add changed dataframe to output data only if non empty
    if not df_changed.empty:
        out_data['changed'] = df_changed
    return out_data
def compare_excel(
    path1, path2, out_path, sheet_name, index_col_name, **kwargs
):
    """Compare one sheet of two Excel files and save the differences.

    Args:
        path1: first Excel file (path or file-like object).
        path2: second Excel file (path or file-like object).
        out_path: destination for the comparison workbook.
        sheet_name: name of the sheet to compare in both files.
        index_col_name: column name(s) used as the unique row key.
        **kwargs: forwarded verbatim to ``pd.read_excel``.
    """
    old_df = pd.read_excel(path1, sheet_name=sheet_name, **kwargs)
    new_df = pd.read_excel(path2, sheet_name=sheet_name, **kwargs)
    diff = diff_pd(old_df, new_df, index_col_name)
    # An empty dict means the sheets are identical — nothing to write.
    if not diff:
        print("No differences spotted")
        return
    # One output sheet per category ("added", "removed", "changed").
    with pd.ExcelWriter(out_path) as writer:
        for sname, data in diff.items():
            data.to_excel(writer, sheet_name=sname)
    print(f"Differences saved in {out_path}")
def build_parser():
    """Build the command-line argument parser for the compare tool.

    Returns:
        argparse.ArgumentParser: parser exposing path1, path2, sheetname,
        key_column (one or more), -o/--output-path and --skiprows.
    """
    cfg = argparse.ArgumentParser(
        description="Compares two Excel sheets and outputs the differences "
        "to a separate Excel file."
    )
    # fixed typo: "Fist" -> "First"
    cfg.add_argument("path1", help="First Excel file")
    cfg.add_argument("path2", help="Second Excel file")
    cfg.add_argument("sheetname", help="Name of the sheet to compare.")
    cfg.add_argument(
        "key_column",
        # fixed missing space between the concatenated help fragments
        help="Name of the column(s) with unique row identifier. It has to be "
        "the actual text of the first row, not the excel notation. "
        "Use multiple times to create a composite index.",
        nargs="+",
    )
    cfg.add_argument(
        "-o",
        "--output-path",
        default="compared.xlsx",
        help="Path of the comparison results",
    )
    # action='append' lets --skiprows be given multiple times; note the
    # resulting value is always a list (or None when absent).
    cfg.add_argument(
        "--skiprows",
        help='Excel row containing the table headers',
        type=int,
        action='append',
        default=None,
    )
    return cfg
def main():
    """Command-line entry point: parse the arguments and run the comparison."""
    options = build_parser().parse_args()
    compare_excel(
        options.path1,
        options.path2,
        options.output_path,
        options.sheetname,
        options.key_column,
        skiprows=options.skiprows,
    )


if __name__ == '__main__':
    main()
"""Excel compare test suite."""
import io
import pandas as pd
import compare
def test_parser():
    """The CLI parser accepts multiple key columns and the -o option."""
    args = [
        "test1.xlsx", "test2.xlsx", "Sheet 1",
        "Col1", "Col2",
        "-o", "output.xlsx",
    ]
    namespace = compare.build_parser().parse_args(args)
    assert namespace.path1 == "test1.xlsx"
    assert namespace.path2 == "test2.xlsx"
    assert namespace.output_path == "output.xlsx"
    assert namespace.sheetname == "Sheet 1"
    assert namespace.key_column == ["Col1", "Col2"]
    assert namespace.skiprows is None
def build_excel_stream(df, sheetname):
    """Serialize a DataFrame to an in-memory Excel workbook.

    Returns an ``io.BytesIO`` positioned wherever the writer left it,
    ready to be handed to ``pd.read_excel``.
    """
    stream = io.BytesIO()
    writer = pd.ExcelWriter(stream, engine="xlsxwriter")
    with writer:
        df.to_excel(writer, sheet_name=sheetname, index=False)
    return stream
def sample_xlsx(df_1, df_2):
    """Turn both sample frames into in-memory workbooks on sheet "Sheet1"."""
    return (
        build_excel_stream(df_1, "Sheet1"),
        build_excel_stream(df_2, "Sheet1"),
    )
def sample_dfs():
    """Return two single-key sample frames.

    They differ by one changed description (Cola), one removed row
    (Tobacco) and one added row (Soda).
    """
    shared_ids = [123456, 654321, 543219, 432198]
    shared_names = ["Lemonade", "Cola", "Orange", "Fruit Punch"]
    df_1 = pd.DataFrame({
        "ID": shared_ids + [765432],
        "Name": shared_names + ["Tobacco"],
        "Flavour Description": ["Fuzzy"] * 4 + ["Smoky"],
    })
    df_2 = pd.DataFrame({
        "ID": shared_ids + [876543],
        "Name": shared_names + ["Soda"],
        "Flavour Description": ["Fuzzy", "Bubbly", "Fuzzy", "Fuzzy", "Sugary"],
    })
    return df_1, df_2
def run_assertion(diff):
    """Assert the diff contains exactly one changed, added and removed row."""
    expectations = (
        ("changed", "Fuzzy ---> Bubbly"),
        ("added", "Sugary"),
        ("removed", "Smoky"),
    )
    for category, flavour in expectations:
        frame = diff[category]
        assert len(frame) == 1
        assert frame.iloc[0]["Flavour Description"] == flavour
    print("OK.")
def test_single_index():
    """diff_pd spots all three kinds of change when keyed on one column."""
    first, second = sample_dfs()
    run_assertion(compare.diff_pd(first, second, ["ID"]))
def test_single_index_excel():
    """The full Excel round trip works with a single key column."""
    workbook_1, workbook_2 = sample_xlsx(*sample_dfs())
    result = io.BytesIO()
    compare.compare_excel(workbook_1, workbook_2, result, "Sheet1", "ID")
    run_assertion(pd.read_excel(result, sheet_name=None))
def sample_multiindex_dfs():
    """Return two sample frames keyed by the composite (ID, Flavour ID).

    Lemonade appears twice with different Flavour IDs; only the second
    occurrence changes between the frames.
    """
    shared_ids = [123456, 123456, 654321, 543219, 432198]
    shared_names = ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch"]
    flavour_ids = [1, 2, None, None, None, None]
    df_1 = pd.DataFrame({
        "ID": shared_ids + [765432],
        "Name": shared_names + ["Tobacco"],
        "Flavour ID": flavour_ids,
        "Flavour Description": ["Fuzzy"] * 5 + ["Smoky"],
    })
    df_2 = pd.DataFrame({
        "ID": shared_ids + [876543],
        "Name": shared_names + ["Soda"],
        "Flavour ID": flavour_ids,
        "Flavour Description": ["Fuzzy", "Bubbly", "Fuzzy", "Fuzzy", "Fuzzy", "Sugary"],
    })
    return df_1, df_2
def test_multiindex():
    """diff_pd handles a composite (ID, Flavour ID) key."""
    first, second = sample_multiindex_dfs()
    run_assertion(compare.diff_pd(first, second, ["ID", "Flavour ID"]))
def test_multiindex_excel():
    """The full Excel round trip works with a composite key."""
    workbook_1, workbook_2 = sample_xlsx(*sample_multiindex_dfs())
    result = io.BytesIO()
    compare.compare_excel(
        workbook_1, workbook_2, result, "Sheet1", ["ID", "Flavour ID"]
    )
    run_assertion(pd.read_excel(result, sheet_name=None))
def test_no_diffs():
    """Comparing a frame with itself yields an empty diff dict."""
    frame, _ = sample_multiindex_dfs()
    assert not compare.diff_pd(frame, frame, ["ID", "Flavour ID"])
    print("OK.")
if __name__ == '__main__':
    # Run the whole suite when executed directly (no pytest needed).
    test_multiindex()
    test_multiindex_excel()
    test_single_index()
    test_single_index_excel()
    test_parser()
    # fixed: was test_no_diff(), which is undefined and raised NameError —
    # the function is named test_no_diffs.
    test_no_diffs()
@kanemurphy1
Copy link

Hello,

Do you have any suggestion for a scenario where the unique identifier is duplicated in the key-column, but there is an additional identifier? I have case within one excel file where this does and doesn't occur, as per the example below.

ID Name Flavour ID Flavour Description
123456 Lemonade 1 Fuzzy
123456 Lemonade 2 Fuzzy
654321 Cola   Fuzzy
543219 Orange   Fuzzy
432198 Fruit Punch   Fuzzy
ID Name Flavour ID Flavour Description
123456 Lemonade 1 Fuzzy
123456 Lemonade 2 Bubbly
654321 Cola   Fuzzy
543219 Orange   Fuzzy
432198 Fruit Punch   Fuzzy

Thank you!

@sanzoghenzo
Copy link
Author

sanzoghenzo commented Feb 5, 2020

Hi @kanemurphy1,
I've updated the code to handle multiple columns as index, and I've written a test suite with your sample data (and some more data to test the added and removed rows).
Now you can specify multiple columns in the command line, and they are used as a MultiIndex.
This is what I've changed:

  • using "nargs='+'" in argument key_column to allow multiple columns to be specified;
  • renaming the argument key_column, because otherwise it showed in the namespace as key-column;
  • handle the case of common_keys being a DataFrame and transforming it into a MultiIndex.
  • handle the case of old_keys and new_keys being a MultiIndex and using the difference method to get added and removed row indices.

@roborobrob
Copy link

Hello sanzoghenzo,

When trying to run the latest code with the sample data files, I get following error:

KeyError: 'None of ["'name'"] are in the columns'

Any ideas why? Thanks!

@elmikele
Copy link

Hello Sanzoghenzo,
first of all many thanks for all your efforts.
I have two questions:

  1. If I run your latest code, I get the follwing error at line 38:
    line 38, in diff_pd: if isinstance(old_keys, pd.MultiIndex):
    NameError: name 'old_keys' is not defined
    The earlier version works without problems for me. Do you know what the problem could be?
  2. You and Chris Moffit are assuming that "the tables have a column that holds unique values, like a primary key of a database table, that way we can detect added/removed/changed rows in a simpler way and regardless of their order"
    In my case, I would like to compare two excel files, which have a unique column; however, the identifiers in this column are a mix of integers and strings like this: '234564' or '1-19003' or 'XY/1/17/1254/001' or 'AF446444'
    Is there anyway that I can use that such a column for indexing the single rows?

Thanks a lot!

@sanzoghenzo
Copy link
Author

Hello sanzoghenzo,

When trying to run the latest code with the sample data files, I get following error:
KeyError: 'None of ["'name'"] are in the columns'
Any ideas why? Thanks!

Hi @roborobrob, You've been too generic in describing the issue, I can't help you.
Please specify which sample data you use (I suppose Chris Moffitt's, I haven't really ever tested them) and the exact command/function you're calling.

Hello Sanzoghenzo,
first of all many thanks for all your efforts.

Hi @elmikele,
I'm glad to help!

I have two questions:

1. If I run your latest code, I get the following error at line 38:
   line 38, in diff_pd:    if isinstance(old_keys, pd.MultiIndex):
   NameError: name 'old_keys' is not defined
   The earlier version works without problems for me. Do you know what the problem could be?

I'm sorry, I forgot to copy the two lines before line 38 that create old_keys and new_keys variables (they're the indexes of the two data frames, I extracted them to reuse in the next lines).
Now it's fixed!

2. You and Chris Moffit are assuming that "the tables have a column that holds unique values, like a primary key of a database table, that way we can detect added/removed/changed rows in a simpler way and regardless of their order"
   In my case, I would like to compare two excel files, which have a unique column; however, the identifiers in this column are a mix of integers and strings like this: '234564' or '1-19003' or 'XY/1/17/1254/001' or 'AF446444'
   Is there anyway that I can use that such a column for indexing the single rows?

I've done a quick test and the problem is that an index needs to have values of the same (or comparable) type to be sortable.
Pandas gets excel values from xlrd or openpyxl, and they convert the numbers into ints or floats.
One option is to force the type of your unique column as text directly inside Excel;
The other option is to add a converter in this script (or in a script that calls the diff_pd function), using the astype method, such as the following (untested):

def compare_excel_mixed_index(path1, path2, out_path, sheet_name, index_col_name, **kwargs):
    old_df = pd.read_excel(path1, sheet_name=sheet_name, **kwargs)
    new_df = pd.read_excel(path2, sheet_name=sheet_name, **kwargs)
    old_df[index_col_name] = old_df[index_col_name].astype(str)
    new_df[index_col_name] = new_df[index_col_name].astype(str)
    diff = diff_pd(old_df, new_df, index_col_name)
    with pd.ExcelWriter(out_path) as writer:
        for sname, data in diff.items():
            data.to_excel(writer, sheet_name=sname)
    print(f"Differences saved in {out_path}")

I will not add this to my script, since I prefer to do a normalization/cleaning of my data as a separate step.
Hope this helps!

@elmikele
Copy link

Hi Sanzoghenzo,
many thanks for this swift reply.
The code works now perfectly fine.
Your suggestion to force all data of the indexing column into text directly in Excel is probably the easiest and it works perfectly fine! Sorry that I haven't thought about this. I will consider, however, your approach using the astype method, if I want to automate the workflow a bit more.
Thanks again! This saves a lot of manual work now!

@dhayams
Copy link

dhayams commented Mar 3, 2020

I am getting this error with latest version of pandas

KeyError: 'Passing list-likes to .loc or [] with any missing labels is no longer supported

Please do mention which version of pandas are supported

@sanzoghenzo
Copy link
Author

I am getting this error with latest version of pandas

KeyError: 'Passing list-likes to .loc or [] with any missing labels is no longer supported

Please do mention which version of pandas are supported

Hi @dhayams,
please help me to help you.
Can you provide me some minimal test data to reproduce the bug?
Can you post the command you ran?
Can you post the full error message, or at least tell me the line that throws the error?
I had no trouble running my tests with the latest pandas version.
I suspected that the problem is when there's no changes, so I added a test for it but everything is ok.
I could try to investigate other parts of code, but I prefer to save time for other important things and wait for your help :)

BTW: pre-1.0 pandas should work ok. This error was a simple deprecation warning until 0.25

@javipkt
Copy link

javipkt commented Jan 6, 2021

Hi! Thanks for the script! Great work!

I have an issue when picking 2 columns as key column (the 1st and 2nd ones). The error is "File "D:\ANACONDA\lib\site-packages\pandas\core\indexes\multi.py", line 1902, in getitem
if level_codes[key] == -1:
IndexError: index 0 is out of bounds for axis 0 with size 0"

Numero Indice Nombre Apellido Edad
1500001 a Name1 Surname1 39
1500002 b Name2 Surname2 52
1500003 c Name3 Surname3 18
1500004 d Name4 Surname4 76
1500005 e Name5 Surname5 23
1500006 f Name6 Surname6 99
Numero Indice Nombre Apellido Edad
1500001 a Name1 Surname1 39
1500002 b Name2 Surme2 52
1500003 c Name3 Surname3 23
1500005 e Name5 Surname5 23
1500006 f Name6 Surname6 88

If instead of the second column I select another, it does not happen and it is correctly processed. I am using Anaconda 2020.11 with Python 3.8.5. Please, let me know if you need further information.

Thanks!

@sanzoghenzo
Copy link
Author

hi @javipkt,
You caught a dormant bug!
This occurs when there are no rows added (or removed), and we passed an empty dataframe to data.to_excel.
It is corrected now.

@javipkt
Copy link

javipkt commented Jan 9, 2021

Thanks! It works like a charm now!

I have added to my version the following code to convert the data to strings and drop and fill nan values:
.................................
def compare_excel(
path1, path2, out_path, sheet_name, index_col_name, **kwargs
):
old_df = pd.read_excel(path1, sheet_name=sheet_name, **kwargs)
new_df = pd.read_excel(path2, sheet_name=sheet_name, **kwargs)

# droping all those rows which have any 'nan' value in it.
old_df.dropna(axis = 1, how ='all', inplace = True)
new_df.dropna(axis = 1, how ='all', inplace = True)

# filling in all remaining 'nan' value with blanks
old_df.fillna("", inplace = True)
new_df.fillna("", inplace = True)

# convert all dataframes' columns to string type to prevent errors
old_df = old_df.astype(str)
new_df = new_df.astype(str)

My Python skills are very basic, so probably this can be done in a better way.

@javipkt
Copy link

javipkt commented Jan 9, 2021

Btw, I think it would be a good feature to highlight the text of the changed values that contains "--->" in red color for instance. So it would be easier to see at a glance the changed values amongst all the other values.

@sanzoghenzo
Copy link
Author

Btw, I think it would be a good feature to highlight the text of the changed values that contains "--->" in red color for instance. So it would be easier to see at a glance the changed values amongst all the other values.

Well, this is something that can be done, but requires the use of xlsxwriter to manipulate the excel file.
It could also be used to output a single sheet with, i.e., removed rows highlighted in red, added rows in green, changed cells in yellow...

But it has to be rewritten by a lot, and the excel part cannot be decoupled from the core logic anymore.

Unfortunately I don't think I have time to do any of this soon 😅

@gbozzetti
Copy link

Hi @sanzoghenzo,

your last revision on compare_excel function throws an error when all the values in the dictionary diff are empty (so no removed, no added, no changed)

image

A possible solution I've successfully tested could be as below (check if the diff dictionary has all the values empty):

def compare_excel(
        path1, path2, out_path, sheet_name, index_col_name, **kwargs
):
    old_df = pd.read_excel(path1, sheet_name=sheet_name, **kwargs)
    new_df = pd.read_excel(path2, sheet_name=sheet_name, **kwargs)
    diff = diff_pd(old_df, new_df, index_col_name)
    if not all(value.empty for value in diff.values()):
        with pd.ExcelWriter(out_path) as writer:
            for sname, data in diff.items():
                if not data.empty:
                    data.to_excel(writer, sheet_name=sname)
        print(f"Differences saved in {out_path}")
    else:
        print("No differences spotted")

I hope it helps.

Ciao

@sanzoghenzo
Copy link
Author

Ciao @gbozzetti, thanks for raising this issue and for your good fix!
(tip: you can use any in place of not all)

I've gone the other way around: I moved the checks in the diff_pd function; the returning dictionary has only items with valid data, so we can simply check for if diff:.

This also makes things simpler for other functions that use diff_pd: if "added" in diff:... and so on...

Well, one can argue that I broke the API (one expected to always find a 3 items dictionary with a data frame in it), but I think it's cleaner this way...

Thanks again for the heads up!

@srimaiden
Copy link

srimaiden commented Mar 10, 2021

Hi sanzoghenzo
I used the latest code and I get the KeyError: 'Passing list-likes to .loc or [] with any missing labels is no longer supported, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike'
Could this be a panda version error?
Thanks
Sri

PS: I am trying to compare 2 files with 2600 rows * 68 columns, file is around 1.1mb

@sanzoghenzo
Copy link
Author

Hi sanzoghenzo
I used the latest code and I get the KeyError: 'Passing list-likes to .loc or [] with any missing labels is no longer supported, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike'
Could this be a panda version error?
Thanks
Sri

PS: I am trying to compare 2 files with 2600 rows * 68 columns, file is around 1.1mb

Hi, have you read my answer to a similar question and the end of the readme?

I can't help you with the info you provided.

@nlpsbr
Copy link

nlpsbr commented Mar 17, 2021

@srimaiden
Copy link

srimaiden commented Mar 17, 2021

Hi sanzoghenzo
I used the latest code and I get the KeyError: 'Passing list-likes to .loc or [] with any missing labels is no longer supported, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike'
Could this be a panda version error?
Thanks
Sri
PS: I am trying to compare 2 files with 2600 rows * 68 columns, file is around 1.1mb

Hi, have you read my answer to a similar question and the end of the readme?

I can't help you with the info you provided.

Hi sanzoghenzo,

I understand, I deal with sensitive data so I cannot share. Here is what i figured out from your earlier reply for the same error.
I have Pandas 1.0.3 and I remember this very code working earlier before 1.0 version. I tried rolling back the version but I am unable too and I get whole list of errors if I try to do so.

I stuck to the error and tried to debug a bit and it looks like if we were to use reindex instead of .loc it would resolve the issue. I tried this but wont work. Python is not my primary skill and I am learning at it. Can you or anyone reading this help me out?

These are the problem lines:

removed = old_df.reindex(removed_keys)
if not removed.empty:
out_data["removed"] = removed
added = new_df.reindex(added_keys)
if not added.empty:
out_data["added"] = added
# focusing on common data of both dataframes
common_keys = np.intersect1d(old_keys, new_keys, assume_unique=True)
common_columns = np.intersect1d(
old_df.columns, new_df.columns, assume_unique=True
)
new_common = new_df.reindex(common_keys, common_columns).applymap(strip)
old_common = old_df.reindex(common_keys, common_columns).applymap(strip)
# get the changed rows keys by dropping identical rows
# (indexes are ignored, so we'll reset them)
common_data = pd.concat(
[old_common.reset_index(), new_common.reset_index()], sort=True
)
changed_keys = common_data.drop_duplicates(keep=False)[idx_col]
if isinstance(changed_keys, pd.Series):
changed_keys = changed_keys.unique()
else:
changed_keys = changed_keys.drop_duplicates().set_index(idx_col).index
# combining the changed rows via multi level columns
df_all_changes = pd.concat(
[old_common.reindex(changed_keys), new_common.reindex(changed_keys)],

@sanzoghenzo
Copy link
Author

Hi @srimaiden,
You still told me half of the story... How does the "reindex" way not work? It outputs the same error or another one? You need to understand that stack traces (the awful messages that python spits on error) are important for us developers to quickly get to the problem.

Anyway, I'm betting you're trying to use as key a column that has blank cells in it.

Keep also in mind that the "used range" of an excel sheet can stretch to blank rows under the actual table if you applied formatting to it.
I usually clean my Excel files in this way:

  • go down to the last row (ctrl+ down arrow)
  • select the entire first empty row
  • select all the rows till the sheet end with ctrl+shift+down arrow
  • delete all content with del and remove the rows with ctrl+-

I do the same with the columns at the right of the table.

Hope this helps.

@SKDMaxout
Copy link

Hi sanzoghenzo,

I have the below requirement : right now i have the files in .txt format with | separated .

1)Rows or records in File1 but not in File2.
2)Rows or records in File2 but not in File1.
3)Rows or records that in both files but values changed for a given column.

what I did for the first 2 requirements as 👍

import pandas as pd
from pandas.util.testing import assert_frame_equal

df_first = pd.read_csv('./a.txt', sep="|")
df_second = pd.read_csv('./b.txt', sep="|")

df=pd.merge(df_first, df_second, on=['pk1','pk2'],how='left', indicator=True)
first_minus_second=df.query('_merge=="left_only"')
first_minus_second.to_excel("./first_minus_second.xlsx","sheet1" ,index=False)
first_minus_second.to_csv("./first_minus_second.csv",sep="|",index=False)
df=pd.merge(df_second,df_first, on=['pk1','pk2'],how='left', indicator=True)
second_minus_first=df.query('_merge=="left_only"')
second_minus_first.to_csv("./second_minus_first.csv",sep="|",index=False)
second_minus_first.to_excel("./second_minus_first.xlsx","sheet1" , index=False)

For the 3rd requirement the Compare method does not work since the DataFrame Shape mismatch error is coming:+1:

assert_frame_equal(df_first, df_second )

AssertionError: DataFrame are different

DataFrame shape mismatch
[left]: (8447, 125)
[right]: (8427, 125)

Could you tell me how i can use your code for all my requirements ?A new excel file will be generated with the row difference between both files and most importantly * Rows or records that in both files but values changed for a given column*

Need your help and guidance ..Thanks

@sanzoghenzo
Copy link
Author

sanzoghenzo commented May 5, 2021

Hi sanzoghenzo,

I have the below requirement : right now i have the files in .txt format with | separated .

1)Rows or records in File1 but not in File2.
2)Rows or records in File2 but not in File1.
3)Rows or records that in both files but values changed for a given column.

what I did for the first 2 requirements as 👍

import pandas as pd
from pandas.util.testing import assert_frame_equal

df_first = pd.read_csv('./a.txt', sep="|")
df_second = pd.read_csv('./b.txt', sep="|")

...

Hi SKDMaxout,
if you use

diff = diff_pd(df_first, df_second, ['pk1','pk2'])

you get a dictionary that can contain the keys "added", "removed" and "changed".
To save it to an Excel file, you can use the code from this line.

I don't have any solution ready for the "changed rows by a given column", since this was not the scope of the tool;
You can always filter the resulting Excel sheet by "-->" in the column that you need.
Alternatively, if you only need the "given column" instead of the entire record, you can change the common_columns variable to hold a list with only the column name.

Hope this helps

@beow
Copy link

beow commented Jun 13, 2021

Hi sanzoghenzo,
I get this error message when running the script

$ python compare.py "test1.xlsx" "test2.xlsx" "Sheet1" "Product Number"
Traceback (most recent call last):
  File "compare.py", line 134, in <module>
    main()
  File "compare.py", line 129, in main
    compare_excel(opt.path1, opt.path2, opt.output_path, opt.sheetname,
  File "compare.py", line 94, in compare_excel
    diff = diff_pd(old_df, new_df, index_col_name)
  File "compare.py", line 46, in diff_pd
    removed_keys = np.setdiff1d(old_keys, new_keys)
  File "<__array_function__ internals>", line 5, in setdiff1d
  File "/home/beos/.local/lib/python3.8/site-packages/numpy/lib/arraysetops.py", line 785, in setdiff1d
    ar1 = unique(ar1)
  File "<__array_function__ internals>", line 5, in unique
  File "/home/beos/.local/lib/python3.8/site-packages/numpy/lib/arraysetops.py", line 263, in unique
    ret = _unique1d(ar, return_index, return_inverse, return_counts)
  File "/home/beos/.local/lib/python3.8/site-packages/numpy/lib/arraysetops.py", line 311, in _unique1d
    ar.sort()
TypeError: '<' not supported between instances of 'str' and 'int'

Any idea what is causing it? Thanks for a useful script otherwise.

@beow
Copy link

beow commented Jun 13, 2021

Comment to myself above:
The index/key_column ("Product Number") contained both string and integer value items. When removing the few rows with integer valued keys the script worked. Maybe something to consider or at least document?

@sanzoghenzo
Copy link
Author

@beow you can ensure a column has all the data in the same type by converting the idx_col to string as the first thing in the diff_pd function:

old_df[idx_col] = old_df[idx_col].astype(str)
new_df[idx_col] = new_df[idx_col].astype(str)

@beow
Copy link

beow commented Jun 18, 2021

Thanks sanzoghenzo. Have another problem with the "--skiprows" parameter. When I use it like "--skiprows 3" it does not remove Excel rows 1 and 2 as expected, it rather seems to remove Excel row 4 only (row number, zero indexed) and not the range of rows before 3. This has the effect that the index column can't be found since it never finds the key name... gets this error

KeyError: "None of [Index(['Product Number'], dtype='object')] are in the [columns]"

If I remove the first rows from the Excel docs (row 1 and 2 in this case) and don't use the "--skiprows" argument it works fine.

@sanzoghenzo
Copy link
Author

@beow, you're right, passing a single number as --skiprows doesn't behave as stated in the read_excel documentation.
This is because I built the argparse to be able to specify --skiprows multiple times, and in doing so I always feed a list to read_excel.

You have two options:

  • If you're sure your rows to skip are always at the beginning of the sheet, simply remove the action='append' argument from line 122;
  • Otherwise, add a line before 129 to handle opt.skiprows: skip = skip[0] if skip and len(skip) == 1 else skip and then use skip instead of opt.skiprows as the last parameter of the compare_excel call.

@beow
Copy link

beow commented Jun 20, 2021

Excellent advice @sanzoghenzo, I chose the first method and all works fine now. Thanks for a very useful tool!

@cmondi27
Copy link

@sanzighenzo May I know what would be the best approach to find the differences between two Excel or CSV file if they contain duplicate ids in each files. For instance, Excel 'A' has 123 as an Id but it is repeated 5 times with different column value in Excel A, where as Excel B with 123 id has 7 rows with different column values.
I'm really searching to find the difference for this scenarios.
Thanks.

@sanzoghenzo
Copy link
Author

@sanzighenzo May I know what would be the best approach to find the differences between two Excel or CSV file if they contain duplicate ids in each files. For instance, Excel 'A' has 123 as an Id but it is repeated 5 times with different column value in Excel A, where as Excel B with 123 id has 7 rows with different column values.

Hi @cmondi27,

It's hard to solve a problem without having a sample of the data; this is the reason I ask for a minimum sample at the end of the Readme.

Does the table have only the ID and the value columns (let's call them "ID" and "Value")?
If so, you could add another column ("Check") with a constant value (let's say "1") and use both "ID" and "Value" columns as keys;
that way you'll end up with the two sheets "added" and "removed" (and no "changed" since the "Check" value is always 1).

You could do this (adding the "check" column and then removing it from the added and removed dataframes) via script, just

old_df["check"] = 1
new_df["check"] = 1

before calling diff_pd and then, for each item in diff (didn't test it, it may be wrong):

data = data.reset_index()
del data["check"]
data.to_excel(writer, sheet_name=sname, index=None)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment