Skip to content

Instantly share code, notes, and snippets.

@bmweiner
Last active September 30, 2017 14:21
Show Gist options
  • Save bmweiner/1b7b837feb280057918ccd1e83f0898b to your computer and use it in GitHub Desktop.
Save bmweiner/1b7b837feb280057918ccd1e83f0898b to your computer and use it in GitHub Desktop.
Compare equality of two pandas dataframes
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Compare two pandas dataframes for equality**\n",
"\n",
"Specify df1, df2, and absolute/relative tolerance for numeric comparison.\n",
"\n",
"1. Check Shapes - Number of (rows, cols) - Match\n",
"\n",
"2. Check Column Names Match\n",
"\n",
"3. Check Values Match\n",
"\n",
" * Values are compared by index - indices that exist in both frames are compared\n",
" * Numeric values are compared with `numpy.isclose`\n",
" * Object values are compared with `numpy.equal`"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pprint\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"results = dict(setup=[], shapes=[], columns=[], values=[])\n",
"def print_results(r):\n",
" for k, v in r:\n",
" print(k + ':', v)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# specify input settings\n",
"fname1 = 'data1.csv'\n",
"fname2 = 'data2.csv'\n",
"desc = 'Comparison of df1 and df2'\n",
"\n",
"df1 = pd.read_csv(fname1)\n",
"df2 = pd.read_csv(fname2)\n",
"\n",
"atol = 1e-4\n",
"rtol = 0"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Filename 1: data1.csv\n",
"Filename 2: data2.csv\n",
"Description: Comparison of df1 and df2\n",
"Absolute Numeric Tolerance: 0.0001\n",
"Relative Numeric Tolerance: 0\n"
]
}
],
"source": [
"r = results['setup']\n",
"r.clear()\n",
"\n",
"r.append(('Filename 1', fname1))\n",
"r.append(('Filename 2', fname2))\n",
"r.append(('Description', desc))\n",
"r.append(('Absolute Numeric Tolerance', atol))\n",
"r.append(('Relative Numeric Tolerance', rtol))\n",
"\n",
"print_results(r)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Shapes Match: False\n",
"df1 Shape: (7, 4)\n",
"df2 Shape: (6, 3)\n"
]
}
],
"source": [
"# Compare Shapes (row, col)\n",
"r = results['shapes']\n",
"r.clear()\n",
"\n",
"r.append(('Shapes Match', df1.shape == df2.shape))\n",
"for i, df in enumerate([df1, df2]):\n",
" r.append(('df{} Shape'.format(i+1), df.shape))\n",
" \n",
"print_results(r)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Columns Match: False\n",
"df1 Columns: ['char' 'num' 'mix' 'extra']\n",
"df2 Columns: ['char' 'num' 'mix']\n",
"df1 Only: ['extra']\n",
"df2 Only: []\n"
]
}
],
"source": [
"# Compare Columns\n",
"r = results['columns']\n",
"r.clear()\n",
"\n",
"r.append(('Columns Match', df1.shape == df2.shape))\n",
"for i, df in enumerate([df1, df2]):\n",
" r.append(('df{} Columns'.format(i+1), df.columns.values))\n",
"cols = df1.columns.difference(df2.columns).values\n",
"r.append(('df1 Only', cols))\n",
"cols = df2.columns.difference(df1.columns).values\n",
"r.append(('df2 Only', cols))\n",
"print_results(r)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Values Match: False\n",
"Max Shape: (7, 4)\n",
"Possible Values: 28\n",
"Excluded Values: 10\n",
"Shape Compared: (6, 3)\n",
"Total Values Compared: 18\n",
"Total Values Matched: 15\n",
"Total Values Not Matched: 3\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>i</th>\n",
" <th>j</th>\n",
" <th>col1</th>\n",
" <th>col2</th>\n",
" <th>val1</th>\n",
" <th>val2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>num</td>\n",
" <td>num</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>num</td>\n",
" <td>num</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>char</td>\n",
" <td>char</td>\n",
" <td>c</td>\n",
" <td>a</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" i j col1 col2 val1 val2\n",
"0 1 1 num num 2 NaN\n",
"1 4 1 num num 5 2\n",
"2 2 0 char char c a"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Compare values\n",
"r = results['values']\n",
"r.clear()\n",
"\n",
"x = df1.values\n",
"y = df2.values\n",
"\n",
"# min / max size of arrays\n",
"min_dim = tuple(min(a, b) for a, b in zip(x.shape, y.shape))\n",
"max_dim = tuple(max(a, b) for a, b in zip(x.shape, y.shape))\n",
"\n",
"# numeric columns\n",
"x_num = [np.issubdtype(t, np.number) for t in df1.dtypes]\n",
"y_num = [np.issubdtype(t, np.number) for t in df2.dtypes]\n",
"cols_num = np.logical_and(x_num[:min_dim[1]], y_num[:min_dim[1]])\n",
"\n",
"# numeric check\n",
"x = df1.iloc[:min_dim[0], cols_num].values\n",
"y = df2.iloc[:min_dim[0], cols_num].values\n",
"elements = np.where(~np.isclose(x, y, atol=atol, equal_nan=True))\n",
"col_map = np.arange(len(cols_num))[cols_num]\n",
"rows = elements[0]\n",
"cols = np.array([col_map[col] for col in elements[1]])\n",
"idx = np.column_stack((rows, cols))\n",
"\n",
"# obj check\n",
"x = df1.iloc[:min_dim[0], ~cols_num].values\n",
"y = df2.iloc[:min_dim[0], ~cols_num].values\n",
"elements = np.where(x != y)\n",
"col_map = np.arange(len(cols_num))[~cols_num]\n",
"rows = elements[0]\n",
"cols = np.array([col_map[col] for col in elements[1]])\n",
"idx = np.concatenate((idx, np.column_stack((rows, cols))))\n",
"\n",
"# table of mismatch values\n",
"columns = (\"i\", \"j\", \"col1\", \"col2\", \"val1\", \"val2\")\n",
"data = []\n",
"for i, j in idx:\n",
" z = ((i, j, df1.columns[j], df2.columns[j], df1.iloc[i, j], df2.iloc[i, j]))\n",
" data.append(z)\n",
"df = pd.DataFrame(data, None, columns)\n",
"\n",
"r.append(('Values Match', df.empty))\n",
"r.append(('Max Shape', max_dim))\n",
"r.append(('Possible Values', np.prod(max_dim)))\n",
"r.append(('Excluded Values', np.prod(max_dim) - np.prod(min_dim)))\n",
"r.append(('Shape Compared', min_dim))\n",
"r.append(('Total Values Compared', np.prod(min_dim)))\n",
"r.append(('Total Values Matched', np.prod(min_dim) - idx.shape[0]))\n",
"r.append(('Total Values Not Matched', df.shape[0]))\n",
"\n",
"print_results(r)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# export results\n",
"with open('report.txt', 'w') as f:\n",
" f.write(pprint.pformat(results))\n",
"df.to_csv('results_data.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
char num mix extra
a 1 h 1
b 2 i 1
c 3 3532 0
d 4 k 0
e 5 42 1
f 6 m 1
g 7 n 1
char num mix
a 1 h
b i
a 3 3532
d 4 k
e 2 42
f 6 m
{'columns': [('Columns Match', False),
('df1 Columns',
array(['char', 'num', 'mix', 'extra'], dtype=object)),
('df2 Columns', array(['char', 'num', 'mix'], dtype=object)),
('df1 Only', array(['extra'], dtype=object)),
('df2 Only', array([], dtype=object))],
'setup': [('Filename 1', 'data1.csv'),
('Filename 2', 'data2.csv'),
('Description', 'Comparison of df1 and df2'),
('Absolute Numeric Tolerance', 0.0001),
('Relative Numeric Tolerance', 0)],
'shapes': [('Shapes Match', False),
('df1 Shape', (7, 4)),
('df2 Shape', (6, 3))],
'values': [('Values Match', False),
('Max Shape', (7, 4)),
('Possible Values', 28),
('Excluded Values', 10),
('Shape Compared', (6, 3)),
('Total Values Compared', 18),
('Total Values Matched', 15),
('Total Values Not Matched', 3)]}
i j col1 col2 val1 val2
1 1 num num 2
4 1 num num 5 2.0
2 0 char char c a
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment