bmweiner/compare_dataframes.ipynb

## compare_dataframes.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Compare two pandas dataframes for equality**\n",
    "\n",
    "Specify df1, df2, and absolute/relative tolerance for numeric comparison.\n",
    "\n",
    "1. Check Shapes - Number of (rows, cols) - Match\n",
    "\n",
    "2. Check Column Names Match\n",
    "\n",
    "3. Check Values Match\n",
    "\n",
    "  * Values are compared by index - indices that exist in both frames are compared\n",
    "  * Numeric values are compared with `numpy.isclose`\n",
    "  * Object values are compared with `numpy.equal`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pprint\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "results = dict(setup=[], shapes=[], columns=[], values=[])\n",
    "def print_results(r):\n",
    "    for k, v in r:\n",
    "        print(k + ':', v)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# specify input settings\n",
    "fname1 = 'data1.csv'\n",
    "fname2 = 'data2.csv'\n",
    "desc = 'Comparison of df1 and df2'\n",
    "\n",
    "df1 = pd.read_csv(fname1)\n",
    "df2 = pd.read_csv(fname2)\n",
    "\n",
    "atol = 1e-4\n",
    "rtol = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Filename 1: data1.csv\n",
      "Filename 2: data2.csv\n",
      "Description: Comparison of df1 and df2\n",
      "Absolute Numeric Tolerance: 0.0001\n",
      "Relative Numeric Tolerance: 0\n"
     ]
    }
   ],
   "source": [
    "r = results['setup']\n",
    "r.clear()\n",
    "\n",
    "r.append(('Filename 1', fname1))\n",
    "r.append(('Filename 2', fname2))\n",
    "r.append(('Description', desc))\n",
    "r.append(('Absolute Numeric Tolerance', atol))\n",
    "r.append(('Relative Numeric Tolerance', rtol))\n",
    "\n",
    "print_results(r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shapes Match: False\n",
      "df1 Shape: (7, 4)\n",
      "df2 Shape: (6, 3)\n"
     ]
    }
   ],
   "source": [
    "# Compare Shapes (row, col)\n",
    "r = results['shapes']\n",
    "r.clear()\n",
    "\n",
    "r.append(('Shapes Match', df1.shape == df2.shape))\n",
    "for i, df in enumerate([df1, df2]):\n",
    "    r.append(('df{} Shape'.format(i+1), df.shape))\n",
    "    \n",
    "print_results(r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Columns Match: False\n",
      "df1 Columns: ['char' 'num' 'mix' 'extra']\n",
      "df2 Columns: ['char' 'num' 'mix']\n",
      "df1 Only: ['extra']\n",
      "df2 Only: []\n"
     ]
    }
   ],
   "source": [
    "# Compare Columns\n",
    "r = results['columns']\n",
    "r.clear()\n",
    "\n",
    "r.append(('Columns Match', df1.shape == df2.shape))\n",
    "for i, df in enumerate([df1, df2]):\n",
    "    r.append(('df{} Columns'.format(i+1), df.columns.values))\n",
    "cols = df1.columns.difference(df2.columns).values\n",
    "r.append(('df1 Only', cols))\n",
    "cols = df2.columns.difference(df1.columns).values\n",
    "r.append(('df2 Only', cols))\n",
    "print_results(r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Values Match: False\n",
      "Max Shape: (7, 4)\n",
      "Possible Values: 28\n",
      "Excluded Values: 10\n",
      "Shape Compared: (6, 3)\n",
      "Total Values Compared: 18\n",
      "Total Values Matched: 15\n",
      "Total Values Not Matched: 3\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>i</th>\n",
       "      <th>j</th>\n",
       "      <th>col1</th>\n",
       "      <th>col2</th>\n",
       "      <th>val1</th>\n",
       "      <th>val2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>num</td>\n",
       "      <td>num</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>num</td>\n",
       "      <td>num</td>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>char</td>\n",
       "      <td>char</td>\n",
       "      <td>c</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   i  j  col1  col2 val1 val2\n",
       "0  1  1   num   num    2  NaN\n",
       "1  4  1   num   num    5    2\n",
       "2  2  0  char  char    c    a"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Compare values\n",
    "r = results['values']\n",
    "r.clear()\n",
    "\n",
    "x = df1.values\n",
    "y = df2.values\n",
    "\n",
    "# min / max size of arrays\n",
    "min_dim = tuple(min(a, b) for a, b in zip(x.shape, y.shape))\n",
    "max_dim = tuple(max(a, b) for a, b in zip(x.shape, y.shape))\n",
    "\n",
    "# numeric columns\n",
    "x_num = [np.issubdtype(t, np.number) for t in df1.dtypes]\n",
    "y_num = [np.issubdtype(t, np.number) for t in df2.dtypes]\n",
    "cols_num = np.logical_and(x_num[:min_dim[1]], y_num[:min_dim[1]])\n",
    "\n",
    "# numeric check\n",
    "x = df1.iloc[:min_dim[0], cols_num].values\n",
    "y = df2.iloc[:min_dim[0], cols_num].values\n",
    "elements = np.where(~np.isclose(x, y, atol=atol, equal_nan=True))\n",
    "col_map = np.arange(len(cols_num))[cols_num]\n",
    "rows = elements[0]\n",
    "cols = np.array([col_map[col] for col in elements[1]])\n",
    "idx = np.column_stack((rows, cols))\n",
    "\n",
    "# obj check\n",
    "x = df1.iloc[:min_dim[0], ~cols_num].values\n",
    "y = df2.iloc[:min_dim[0], ~cols_num].values\n",
    "elements = np.where(x != y)\n",
    "col_map = np.arange(len(cols_num))[~cols_num]\n",
    "rows = elements[0]\n",
    "cols = np.array([col_map[col] for col in elements[1]])\n",
    "idx = np.concatenate((idx, np.column_stack((rows, cols))))\n",
    "\n",
    "# table of mismatch values\n",
    "columns = (\"i\", \"j\", \"col1\", \"col2\", \"val1\", \"val2\")\n",
    "data = []\n",
    "for i, j in idx:\n",
    "    z = ((i, j, df1.columns[j], df2.columns[j], df1.iloc[i, j], df2.iloc[i, j]))\n",
    "    data.append(z)\n",
    "df = pd.DataFrame(data, None, columns)\n",
    "\n",
    "r.append(('Values Match', df.empty))\n",
    "r.append(('Max Shape', max_dim))\n",
    "r.append(('Possible Values', np.prod(max_dim)))\n",
    "r.append(('Excluded Values', np.prod(max_dim) - np.prod(min_dim)))\n",
    "r.append(('Shape Compared', min_dim))\n",
    "r.append(('Total Values Compared', np.prod(min_dim)))\n",
    "r.append(('Total Values Matched', np.prod(min_dim) - idx.shape[0]))\n",
    "r.append(('Total Values Not Matched', df.shape[0]))\n",
    "\n",
    "print_results(r)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# export results\n",
    "with open('report.txt', 'w') as f:\n",
    "    f.write(pprint.pformat(results))\n",
    "df.to_csv('results_data.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}

## data1.csv

          
            char
            num
            mix
            extra

            
              a
              1
              h
              1

            
              b
              2
              i
              1

            
              c
              3
              3532
              0

            
              d
              4
              k
              0

            
              e
              5
              42
              1

            
              f
              6
              m
              1

            
              g
              7
              n
              1

## data2.csv

          
            char
            num
            mix

            
              a
              1
              h

            
              b
              
              i

            
              a
              3
              3532

            
              d
              4
              k

            
              e
              2
              42

            
              f
              6
              m

## report.txt
{'columns': [('Columns Match', False),
             ('df1 Columns',
              array(['char', 'num', 'mix', 'extra'], dtype=object)),
             ('df2 Columns', array(['char', 'num', 'mix'], dtype=object)),
             ('df1 Only', array(['extra'], dtype=object)),
             ('df2 Only', array([], dtype=object))],
 'setup': [('Filename 1', 'data1.csv'),
           ('Filename 2', 'data2.csv'),
           ('Description', 'Comparison of df1 and df2'),
           ('Absolute Numeric Tolerance', 0.0001),
           ('Relative Numeric Tolerance', 0)],
 'shapes': [('Shapes Match', False),
            ('df1 Shape', (7, 4)),
            ('df2 Shape', (6, 3))],
 'values': [('Values Match', False),
            ('Max Shape', (7, 4)),
            ('Possible Values', 28),
            ('Excluded Values', 10),
            ('Shape Compared', (6, 3)),
            ('Total Values Compared', 18),
            ('Total Values Matched', 15),
            ('Total Values Not Matched', 3)]}

## results_data.csv

          
            i
            j
            col1
            col2
            val1
            val2

            
              1
              1
              num
              num
              2

            
              4
              1
              num
              num
              5
              2.0

            
              2
              0
              char
              char
              c
              a
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Compare two pandas dataframes for equality\n",
	"\n",
	"Specify df1, df2, and absolute/relative tolerance for numeric comparison.\n",
	"\n",
	"1. Check Shapes - Number of (rows, cols) - Match\n",
	"\n",
	"2. Check Column Names Match\n",
	"\n",
	"3. Check Values Match\n",
	"\n",
	" * Values are compared by index - indices that exist in both frames are compared\n",
	" * Numeric values are compared with `numpy.isclose`\n",
	" * Object values are compared with `numpy.equal`"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import pprint\n",
	"import pandas as pd\n",
	"import numpy as np\n",
	"\n",
	"results = dict(setup=[], shapes=[], columns=[], values=[])\n",
	"def print_results(r):\n",
	" for k, v in r:\n",
	" print(k + ':', v)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# specify input settings\n",
	"fname1 = 'data1.csv'\n",
	"fname2 = 'data2.csv'\n",
	"desc = 'Comparison of df1 and df2'\n",
	"\n",
	"df1 = pd.read_csv(fname1)\n",
	"df2 = pd.read_csv(fname2)\n",
	"\n",
	"atol = 1e-4\n",
	"rtol = 0"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Filename 1: data1.csv\n",
	"Filename 2: data2.csv\n",
	"Description: Comparison of df1 and df2\n",
	"Absolute Numeric Tolerance: 0.0001\n",
	"Relative Numeric Tolerance: 0\n"
	]
	}
	],
	"source": [
	"r = results['setup']\n",
	"r.clear()\n",
	"\n",
	"r.append(('Filename 1', fname1))\n",
	"r.append(('Filename 2', fname2))\n",
	"r.append(('Description', desc))\n",
	"r.append(('Absolute Numeric Tolerance', atol))\n",
	"r.append(('Relative Numeric Tolerance', rtol))\n",
	"\n",
	"print_results(r)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Shapes Match: False\n",
	"df1 Shape: (7, 4)\n",
	"df2 Shape: (6, 3)\n"
	]
	}
	],
	"source": [
	"# Compare Shapes (row, col)\n",
	"r = results['shapes']\n",
	"r.clear()\n",
	"\n",
	"r.append(('Shapes Match', df1.shape == df2.shape))\n",
	"for i, df in enumerate([df1, df2]):\n",
	" r.append(('df{} Shape'.format(i+1), df.shape))\n",
	" \n",
	"print_results(r)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Columns Match: False\n",
	"df1 Columns: ['char' 'num' 'mix' 'extra']\n",
	"df2 Columns: ['char' 'num' 'mix']\n",
	"df1 Only: ['extra']\n",
	"df2 Only: []\n"
	]
	}
	],
	"source": [
	"# Compare Columns\n",
	"r = results['columns']\n",
	"r.clear()\n",
	"\n",
	"r.append(('Columns Match', df1.shape == df2.shape))\n",
	"for i, df in enumerate([df1, df2]):\n",
	" r.append(('df{} Columns'.format(i+1), df.columns.values))\n",
	"cols = df1.columns.difference(df2.columns).values\n",
	"r.append(('df1 Only', cols))\n",
	"cols = df2.columns.difference(df1.columns).values\n",
	"r.append(('df2 Only', cols))\n",
	"print_results(r)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Values Match: False\n",
	"Max Shape: (7, 4)\n",
	"Possible Values: 28\n",
	"Excluded Values: 10\n",
	"Shape Compared: (6, 3)\n",
	"Total Values Compared: 18\n",
	"Total Values Matched: 15\n",
	"Total Values Not Matched: 3\n"
	]
	},
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>i</th>\n",
	" <th>j</th>\n",
	" <th>col1</th>\n",
	" <th>col2</th>\n",
	" <th>val1</th>\n",
	" <th>val2</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" <td>num</td>\n",
	" <td>num</td>\n",
	" <td>2</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>4</td>\n",
	" <td>1</td>\n",
	" <td>num</td>\n",
	" <td>num</td>\n",
	" <td>5</td>\n",
	" <td>2</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>2</td>\n",
	" <td>0</td>\n",
	" <td>char</td>\n",
	" <td>char</td>\n",
	" <td>c</td>\n",
	" <td>a</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" i j col1 col2 val1 val2\n",
	"0 1 1 num num 2 NaN\n",
	"1 4 1 num num 5 2\n",
	"2 2 0 char char c a"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Compare values\n",
	"r = results['values']\n",
	"r.clear()\n",
	"\n",
	"x = df1.values\n",
	"y = df2.values\n",
	"\n",
	"# min / max size of arrays\n",
	"min_dim = tuple(min(a, b) for a, b in zip(x.shape, y.shape))\n",
	"max_dim = tuple(max(a, b) for a, b in zip(x.shape, y.shape))\n",
	"\n",
	"# numeric columns\n",
	"x_num = [np.issubdtype(t, np.number) for t in df1.dtypes]\n",
	"y_num = [np.issubdtype(t, np.number) for t in df2.dtypes]\n",
	"cols_num = np.logical_and(x_num[:min_dim[1]], y_num[:min_dim[1]])\n",
	"\n",
	"# numeric check\n",
	"x = df1.iloc[:min_dim[0], cols_num].values\n",
	"y = df2.iloc[:min_dim[0], cols_num].values\n",
	"elements = np.where(~np.isclose(x, y, atol=atol, equal_nan=True))\n",
	"col_map = np.arange(len(cols_num))[cols_num]\n",
	"rows = elements[0]\n",
	"cols = np.array([col_map[col] for col in elements[1]])\n",
	"idx = np.column_stack((rows, cols))\n",
	"\n",
	"# obj check\n",
	"x = df1.iloc[:min_dim[0], ~cols_num].values\n",
	"y = df2.iloc[:min_dim[0], ~cols_num].values\n",
	"elements = np.where(x != y)\n",
	"col_map = np.arange(len(cols_num))[~cols_num]\n",
	"rows = elements[0]\n",
	"cols = np.array([col_map[col] for col in elements[1]])\n",
	"idx = np.concatenate((idx, np.column_stack((rows, cols))))\n",
	"\n",
	"# table of mismatch values\n",
	"columns = (\"i\", \"j\", \"col1\", \"col2\", \"val1\", \"val2\")\n",
	"data = []\n",
	"for i, j in idx:\n",
	" z = ((i, j, df1.columns[j], df2.columns[j], df1.iloc[i, j], df2.iloc[i, j]))\n",
	" data.append(z)\n",
	"df = pd.DataFrame(data, None, columns)\n",
	"\n",
	"r.append(('Values Match', df.empty))\n",
	"r.append(('Max Shape', max_dim))\n",
	"r.append(('Possible Values', np.prod(max_dim)))\n",
	"r.append(('Excluded Values', np.prod(max_dim) - np.prod(min_dim)))\n",
	"r.append(('Shape Compared', min_dim))\n",
	"r.append(('Total Values Compared', np.prod(min_dim)))\n",
	"r.append(('Total Values Matched', np.prod(min_dim) - idx.shape[0]))\n",
	"r.append(('Total Values Not Matched', df.shape[0]))\n",
	"\n",
	"print_results(r)\n",
	"df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# export results\n",
	"with open('report.txt', 'w') as f:\n",
	" f.write(pprint.pformat(results))\n",
	"df.to_csv('results_data.csv', index=False)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.1"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}
	{'columns': [('Columns Match', False),
	('df1 Columns',
	array(['char', 'num', 'mix', 'extra'], dtype=object)),
	('df2 Columns', array(['char', 'num', 'mix'], dtype=object)),
	('df1 Only', array(['extra'], dtype=object)),
	('df2 Only', array([], dtype=object))],
	'setup': [('Filename 1', 'data1.csv'),
	('Filename 2', 'data2.csv'),
	('Description', 'Comparison of df1 and df2'),
	('Absolute Numeric Tolerance', 0.0001),
	('Relative Numeric Tolerance', 0)],
	'shapes': [('Shapes Match', False),
	('df1 Shape', (7, 4)),
	('df2 Shape', (6, 3))],
	'values': [('Values Match', False),
	('Max Shape', (7, 4)),
	('Possible Values', 28),
	('Excluded Values', 10),
	('Shape Compared', (6, 3)),
	('Total Values Compared', 18),
	('Total Values Matched', 15),
	('Total Values Not Matched', 3)]}