Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save mcleary/c86ebd2da98ca7f0f7b355c8e493c553 to your computer and use it in GitHub Desktop.
Save mcleary/c86ebd2da98ca7f0f7b355c8e493c553 to your computer and use it in GitHub Desktop.
Example of a function to compare two DataFrames independent of row/column ordering and with handling of null values.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:9c26f46f87352d6bedc804325404deca8e9cf8b7e2e0c151b7a2635f27e6d447"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import numpy as np\n",
"import numpy.testing as npt\n",
"import pandas as pd"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def assert_frames_equal(actual, expected, use_close=False):\n",
"    \"\"\"\n",
"    Assert that two DataFrames hold equal items under equal labels.\n",
"\n",
"    Items are paired by (row label, column label), so the ordering of\n",
"    rows and columns is irrelevant.  NaN compares equal to NaN, and\n",
"    infinite values are supported.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    actual : pandas.DataFrame\n",
"    expected : pandas.DataFrame\n",
"    use_close : bool, optional\n",
"        If True, compare items with numpy.testing.assert_allclose\n",
"        instead of numpy.testing.assert_equal.\n",
"\n",
"    Raises\n",
"    ------\n",
"    AssertionError\n",
"        If an input is not a DataFrame, if row or column labels are\n",
"        duplicated, if the frames do not share the same set of row and\n",
"        column labels, or if any pair of items differs.  For an item\n",
"        mismatch the message ends with the offending column and row.\n",
"\n",
"    \"\"\"\n",
"    comp = npt.assert_allclose if use_close else npt.assert_equal\n",
"\n",
"    # Raise explicitly instead of using `assert` statements so that the\n",
"    # checks are not stripped when Python runs with -O.\n",
"    if not (isinstance(actual, pd.DataFrame) and\n",
"            isinstance(expected, pd.DataFrame)):\n",
"        raise AssertionError('Inputs must both be pandas DataFrames.')\n",
"\n",
"    # Pairing items by label is ambiguous if a label occurs twice.\n",
"    for name, frame in (('actual', actual), ('expected', expected)):\n",
"        if not (frame.index.is_unique and frame.columns.is_unique):\n",
"            raise AssertionError(\n",
"                'Row and column labels of {!r} must be unique.'.format(name))\n",
"\n",
"    for i in expected.index:\n",
"        if i not in actual.index:\n",
"            raise AssertionError('Expected row {!r} not found.'.format(i))\n",
"    for j in expected.columns:\n",
"        if j not in actual.columns:\n",
"            raise AssertionError(\n",
"                'Expected column {!r} not found.'.format(j))\n",
"\n",
"    # Labels that exist only in `actual` make the frames unequal too.\n",
"    extra_rows = [i for i in actual.index if i not in expected.index]\n",
"    if extra_rows:\n",
"        raise AssertionError(\n",
"            'Unexpected rows found: {!r}'.format(extra_rows))\n",
"    extra_cols = [j for j in actual.columns if j not in expected.columns]\n",
"    if extra_cols:\n",
"        raise AssertionError(\n",
"            'Unexpected columns found: {!r}'.format(extra_cols))\n",
"\n",
"    for i in expected.index:\n",
"        for j in expected.columns:\n",
"            try:\n",
"                # .at reads a single item with its column's own dtype;\n",
"                # pulling whole rows would upcast mixed int/float rows.\n",
"                comp(actual.at[i, j], expected.at[i, j])\n",
"            except AssertionError as e:\n",
"                # str(e) works on Python 2 and 3; e.message is Python 2\n",
"                # only.\n",
"                raise AssertionError(\n",
"                    str(e) + '\\n\\nColumn: {!r}\\nRow: {!r}'.format(j, i))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 53
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Reference frame: rows x, y, z and columns a, b, with one NaN per column.\n",
"expected = pd.DataFrame(\n",
"    data=dict(a=[1, np.nan, 3], b=[np.nan, 5, 6]),\n",
"    index=list('xyz'),\n",
")"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 54
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Same labels as `expected`, listed in a different row and column order.\n",
"actual = pd.DataFrame(\n",
"    data=[[4, 1], [6, 3], [5, np.nan]],\n",
"    index=list('xzy'),\n",
"    columns=list('ba'),\n",
")"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 55
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# A frame compared against itself must pass without raising.\n",
"assert_frames_equal(actual=actual, expected=actual)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 56
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"assert_frames_equal(actual, expected)\n",
"# Demonstrates a failure: the recorded output shows an AssertionError for\n",
"# column 'b', row 'x' (actual 4.0, expected NaN)."
],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "AssertionError",
"evalue": "\nItems are not equal:\n ACTUAL: 4.0\n DESIRED: nan\n\nColumn: 'b'\nRow: 'x'",
"output_type": "pyerr",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-57-2fa991ae8dd6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0massert_frames_equal\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mactual\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexpected\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m<ipython-input-53-fedbc359fc19>\u001b[0m in \u001b[0;36massert_frames_equal\u001b[0;34m(actual, expected)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mAssertionError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m raise AssertionError(e.message + \n\u001b[0;32m---> 26\u001b[0;31m '\\n\\nColumn: {!r}\\nRow: {!r}'.format(j, i))\n\u001b[0m",
"\u001b[0;31mAssertionError\u001b[0m: \nItems are not equal:\n ACTUAL: 4.0\n DESIRED: nan\n\nColumn: 'b'\nRow: 'x'"
]
}
],
"prompt_number": 57
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment