Last active
September 30, 2017 14:21
-
-
Save bmweiner/1b7b837feb280057918ccd1e83f0898b to your computer and use it in GitHub Desktop.
Compare equality of two pandas dataframes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Compare two pandas dataframes for equality**\n", | |
"\n", | |
"Specify df1, df2, and absolute/relative tolerance for numeric comparison.\n", | |
"\n", | |
"1. Check Shapes - Number of (rows, cols) - Match\n", | |
"\n", | |
"2. Check Column Names Match\n", | |
"\n", | |
"3. Check Values Match\n", | |
"\n", | |
" * Values are compared by index - indices that exist in both frames are compared\n", | |
" * Numeric values are compared with `numpy.isclose`\n", | |
" * Object values are compared with `numpy.equal`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pprint\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"\n", | |
"results = dict(setup=[], shapes=[], columns=[], values=[])\n", | |
"def print_results(r):\n", | |
" for k, v in r:\n", | |
" print(k + ':', v)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# specify input settings\n", | |
"fname1 = 'data1.csv'\n", | |
"fname2 = 'data2.csv'\n", | |
"desc = 'Comparison of df1 and df2'\n", | |
"\n", | |
"df1 = pd.read_csv(fname1)\n", | |
"df2 = pd.read_csv(fname2)\n", | |
"\n", | |
"atol = 1e-4\n", | |
"rtol = 0" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Filename 1: data1.csv\n", | |
"Filename 2: data2.csv\n", | |
"Description: Comparison of df1 and df2\n", | |
"Absolute Numeric Tolerance: 0.0001\n", | |
"Relative Numeric Tolerance: 0\n" | |
] | |
} | |
], | |
"source": [ | |
"r = results['setup']\n", | |
"r.clear()\n", | |
"\n", | |
"r.append(('Filename 1', fname1))\n", | |
"r.append(('Filename 2', fname2))\n", | |
"r.append(('Description', desc))\n", | |
"r.append(('Absolute Numeric Tolerance', atol))\n", | |
"r.append(('Relative Numeric Tolerance', rtol))\n", | |
"\n", | |
"print_results(r)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Shapes Match: False\n", | |
"df1 Shape: (7, 4)\n", | |
"df2 Shape: (6, 3)\n" | |
] | |
} | |
], | |
"source": [ | |
"# Compare Shapes (row, col)\n", | |
"r = results['shapes']\n", | |
"r.clear()\n", | |
"\n", | |
"r.append(('Shapes Match', df1.shape == df2.shape))\n", | |
"for i, df in enumerate([df1, df2]):\n", | |
" r.append(('df{} Shape'.format(i+1), df.shape))\n", | |
" \n", | |
"print_results(r)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Columns Match: False\n", | |
"df1 Columns: ['char' 'num' 'mix' 'extra']\n", | |
"df2 Columns: ['char' 'num' 'mix']\n", | |
"df1 Only: ['extra']\n", | |
"df2 Only: []\n" | |
] | |
} | |
], | |
"source": [ | |
"# Compare Columns\n", | |
"r = results['columns']\n", | |
"r.clear()\n", | |
"\n", | |
"r.append(('Columns Match', df1.shape == df2.shape))\n", | |
"for i, df in enumerate([df1, df2]):\n", | |
" r.append(('df{} Columns'.format(i+1), df.columns.values))\n", | |
"cols = df1.columns.difference(df2.columns).values\n", | |
"r.append(('df1 Only', cols))\n", | |
"cols = df2.columns.difference(df1.columns).values\n", | |
"r.append(('df2 Only', cols))\n", | |
"print_results(r)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Values Match: False\n", | |
"Max Shape: (7, 4)\n", | |
"Possible Values: 28\n", | |
"Excluded Values: 10\n", | |
"Shape Compared: (6, 3)\n", | |
"Total Values Compared: 18\n", | |
"Total Values Matched: 15\n", | |
"Total Values Not Matched: 3\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>i</th>\n", | |
" <th>j</th>\n", | |
" <th>col1</th>\n", | |
" <th>col2</th>\n", | |
" <th>val1</th>\n", | |
" <th>val2</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>num</td>\n", | |
" <td>num</td>\n", | |
" <td>2</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>num</td>\n", | |
" <td>num</td>\n", | |
" <td>5</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>char</td>\n", | |
" <td>char</td>\n", | |
" <td>c</td>\n", | |
" <td>a</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" i j col1 col2 val1 val2\n", | |
"0 1 1 num num 2 NaN\n", | |
"1 4 1 num num 5 2\n", | |
"2 2 0 char char c a" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Compare values\n", | |
"r = results['values']\n", | |
"r.clear()\n", | |
"\n", | |
"x = df1.values\n", | |
"y = df2.values\n", | |
"\n", | |
"# min / max size of arrays\n", | |
"min_dim = tuple(min(a, b) for a, b in zip(x.shape, y.shape))\n", | |
"max_dim = tuple(max(a, b) for a, b in zip(x.shape, y.shape))\n", | |
"\n", | |
"# numeric columns\n", | |
"x_num = [np.issubdtype(t, np.number) for t in df1.dtypes]\n", | |
"y_num = [np.issubdtype(t, np.number) for t in df2.dtypes]\n", | |
"cols_num = np.logical_and(x_num[:min_dim[1]], y_num[:min_dim[1]])\n", | |
"\n", | |
"# numeric check\n", | |
"x = df1.iloc[:min_dim[0], cols_num].values\n", | |
"y = df2.iloc[:min_dim[0], cols_num].values\n", | |
"elements = np.where(~np.isclose(x, y, atol=atol, equal_nan=True))\n", | |
"col_map = np.arange(len(cols_num))[cols_num]\n", | |
"rows = elements[0]\n", | |
"cols = np.array([col_map[col] for col in elements[1]])\n", | |
"idx = np.column_stack((rows, cols))\n", | |
"\n", | |
"# obj check\n", | |
"x = df1.iloc[:min_dim[0], ~cols_num].values\n", | |
"y = df2.iloc[:min_dim[0], ~cols_num].values\n", | |
"elements = np.where(x != y)\n", | |
"col_map = np.arange(len(cols_num))[~cols_num]\n", | |
"rows = elements[0]\n", | |
"cols = np.array([col_map[col] for col in elements[1]])\n", | |
"idx = np.concatenate((idx, np.column_stack((rows, cols))))\n", | |
"\n", | |
"# table of mismatch values\n", | |
"columns = (\"i\", \"j\", \"col1\", \"col2\", \"val1\", \"val2\")\n", | |
"data = []\n", | |
"for i, j in idx:\n", | |
" z = ((i, j, df1.columns[j], df2.columns[j], df1.iloc[i, j], df2.iloc[i, j]))\n", | |
" data.append(z)\n", | |
"df = pd.DataFrame(data, None, columns)\n", | |
"\n", | |
"r.append(('Values Match', df.empty))\n", | |
"r.append(('Max Shape', max_dim))\n", | |
"r.append(('Possible Values', np.prod(max_dim)))\n", | |
"r.append(('Excluded Values', np.prod(max_dim) - np.prod(min_dim)))\n", | |
"r.append(('Shape Compared', min_dim))\n", | |
"r.append(('Total Values Compared', np.prod(min_dim)))\n", | |
"r.append(('Total Values Matched', np.prod(min_dim) - idx.shape[0]))\n", | |
"r.append(('Total Values Not Matched', df.shape[0]))\n", | |
"\n", | |
"print_results(r)\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# export results\n", | |
"with open('report.txt', 'w') as f:\n", | |
" f.write(pprint.pformat(results))\n", | |
"df.to_csv('results_data.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
char | num | mix | extra | |
---|---|---|---|---|
a | 1 | h | 1 | |
b | 2 | i | 1 | |
c | 3 | 3532 | 0 | |
d | 4 | k | 0 | |
e | 5 | 42 | 1 | |
f | 6 | m | 1 | |
g | 7 | n | 1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
char | num | mix | |
---|---|---|---|
a | 1 | h | |
b | i | ||
a | 3 | 3532 | |
d | 4 | k | |
e | 2 | 42 | |
f | 6 | m |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{'columns': [('Columns Match', False), | |
('df1 Columns', | |
array(['char', 'num', 'mix', 'extra'], dtype=object)), | |
('df2 Columns', array(['char', 'num', 'mix'], dtype=object)), | |
('df1 Only', array(['extra'], dtype=object)), | |
('df2 Only', array([], dtype=object))], | |
'setup': [('Filename 1', 'data1.csv'), | |
('Filename 2', 'data2.csv'), | |
('Description', 'Comparison of df1 and df2'), | |
('Absolute Numeric Tolerance', 0.0001), | |
('Relative Numeric Tolerance', 0)], | |
'shapes': [('Shapes Match', False), | |
('df1 Shape', (7, 4)), | |
('df2 Shape', (6, 3))], | |
'values': [('Values Match', False), | |
('Max Shape', (7, 4)), | |
('Possible Values', 28), | |
('Excluded Values', 10), | |
('Shape Compared', (6, 3)), | |
('Total Values Compared', 18), | |
('Total Values Matched', 15), | |
('Total Values Not Matched', 3)]} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
i | j | col1 | col2 | val1 | val2 | |
---|---|---|---|---|---|---|
1 | 1 | num | num | 2 | ||
4 | 1 | num | num | 5 | 2.0 | |
2 | 0 | char | char | c | a |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment