Skip to content

Instantly share code, notes, and snippets.

@baldwint
Created July 21, 2017 23:36
Show Gist options
  • Save baldwint/32b570ac47dc22405516b2e807b7b013 to your computer and use it in GitHub Desktop.
Save baldwint/32b570ac47dc22405516b2e807b7b013 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'0.21.0.dev+286.g4efe6560e'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.__version__"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pandas stacking bug\n",
"\n",
"I set out to build a differencing tool for dataframes, and ran into some unexpected behavior in Pandas."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## example dataframes\n",
"\n",
"Construct two dataframes (of object type) that are similar excpet for some missing rows and some changed values."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"ix = pd.Index(['one', 'two', 'three', 'four',\n",
" 'five', 'six', 'seven', 'eight'], name='i1')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"cols = pd.Index(['class1', 'class2', 'class3', 'class4'], name='c1')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"l = len(ix)\n",
"w = len(cols)\n",
"dt = pd.np.arange(l*w).reshape((l,w)).astype(str)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"dt[-2, :] = None # or pd.np.nan"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>c1</th>\n",
" <th>class1</th>\n",
" <th>class2</th>\n",
" <th>class3</th>\n",
" <th>class4</th>\n",
" </tr>\n",
" <tr>\n",
" <th>i1</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>one</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>two</th>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>three</th>\n",
" <td>8</td>\n",
" <td>9</td>\n",
" <td>10</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>four</th>\n",
" <td>12</td>\n",
" <td>13</td>\n",
" <td>14</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>five</th>\n",
" <td>16</td>\n",
" <td>17</td>\n",
" <td>18</td>\n",
" <td>19</td>\n",
" </tr>\n",
" <tr>\n",
" <th>six</th>\n",
" <td>20</td>\n",
" <td>21</td>\n",
" <td>22</td>\n",
" <td>23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>seven</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>eight</th>\n",
" <td>28</td>\n",
" <td>29</td>\n",
" <td>30</td>\n",
" <td>31</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"c1 class1 class2 class3 class4\n",
"i1 \n",
"one 0 1 2 3\n",
"two 4 5 6 7\n",
"three 8 9 10 11\n",
"four 12 13 14 15\n",
"five 16 17 18 19\n",
"six 20 21 22 23\n",
"seven None None None None\n",
"eight 28 29 30 31"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"eg1 = pd.DataFrame(data=dt, columns=cols, index=ix)\n",
"eg2 = eg1.copy(deep=True)\n",
"eg1"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"c1\n",
"class1 object\n",
"class2 object\n",
"class3 object\n",
"class4 object\n",
"dtype: object"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"eg1.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# introduce differences to find\n",
"eg1 = eg1.drop('six').set_value('two', 'class2', -1)\n",
"eg2 = eg2.drop('one').set_value('seven', 'class3', -1)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>c1</th>\n",
" <th>class1</th>\n",
" <th>class2</th>\n",
" <th>class3</th>\n",
" <th>class4</th>\n",
" </tr>\n",
" <tr>\n",
" <th>i1</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>one</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>two</th>\n",
" <td>4</td>\n",
" <td>-1</td>\n",
" <td>6</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>three</th>\n",
" <td>8</td>\n",
" <td>9</td>\n",
" <td>10</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>four</th>\n",
" <td>12</td>\n",
" <td>13</td>\n",
" <td>14</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>five</th>\n",
" <td>16</td>\n",
" <td>17</td>\n",
" <td>18</td>\n",
" <td>19</td>\n",
" </tr>\n",
" <tr>\n",
" <th>seven</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>eight</th>\n",
" <td>28</td>\n",
" <td>29</td>\n",
" <td>30</td>\n",
" <td>31</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"c1 class1 class2 class3 class4\n",
"i1 \n",
"one 0 1 2 3\n",
"two 4 -1 6 7\n",
"three 8 9 10 11\n",
"four 12 13 14 15\n",
"five 16 17 18 19\n",
"seven None None None None\n",
"eight 28 29 30 31"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"eg1"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>c1</th>\n",
" <th>class1</th>\n",
" <th>class2</th>\n",
" <th>class3</th>\n",
" <th>class4</th>\n",
" </tr>\n",
" <tr>\n",
" <th>i1</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>two</th>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>three</th>\n",
" <td>8</td>\n",
" <td>9</td>\n",
" <td>10</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>four</th>\n",
" <td>12</td>\n",
" <td>13</td>\n",
" <td>14</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>five</th>\n",
" <td>16</td>\n",
" <td>17</td>\n",
" <td>18</td>\n",
" <td>19</td>\n",
" </tr>\n",
" <tr>\n",
" <th>six</th>\n",
" <td>20</td>\n",
" <td>21</td>\n",
" <td>22</td>\n",
" <td>23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>seven</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>-1</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>eight</th>\n",
" <td>28</td>\n",
" <td>29</td>\n",
" <td>30</td>\n",
" <td>31</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"c1 class1 class2 class3 class4\n",
"i1 \n",
"two 4 5 6 7\n",
"three 8 9 10 11\n",
"four 12 13 14 15\n",
"five 16 17 18 19\n",
"six 20 21 22 23\n",
"seven None None -1 None\n",
"eight 28 29 30 31"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"eg2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## perform differencing\n",
"\n",
"by concatenating dataframes together and dropping duplicate rows."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['class1', 'class2', 'class3', 'class4']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(cols)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"eg1['Source'] = 'eg1'\n",
"eg2['Source'] = 'eg2'"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['i1', 'class1', 'class2', 'class3', 'class4']"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"index_name = eg1.index.name\n",
"assert index_name == eg2.index.name\n",
"\n",
"cmp_cols = [index_name, ] + list(cols)\n",
"cmp_cols"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"unm = pd.concat(\n",
" (eg1, eg2)\n",
").reset_index().drop_duplicates(\n",
" keep=False,\n",
" subset=cmp_cols,\n",
").set_index(index_name)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>c1</th>\n",
" <th>class1</th>\n",
" <th>class2</th>\n",
" <th>class3</th>\n",
" <th>class4</th>\n",
" <th>Source</th>\n",
" </tr>\n",
" <tr>\n",
" <th>i1</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>one</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>eg1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>two</th>\n",
" <td>4</td>\n",
" <td>-1</td>\n",
" <td>6</td>\n",
" <td>7</td>\n",
" <td>eg1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>seven</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>eg1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>two</th>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>7</td>\n",
" <td>eg2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>six</th>\n",
" <td>20</td>\n",
" <td>21</td>\n",
" <td>22</td>\n",
" <td>23</td>\n",
" <td>eg2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>seven</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>-1</td>\n",
" <td>None</td>\n",
" <td>eg2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"c1 class1 class2 class3 class4 Source\n",
"i1 \n",
"one 0 1 2 3 eg1\n",
"two 4 -1 6 7 eg1\n",
"seven None None None None eg1\n",
"two 4 5 6 7 eg2\n",
"six 20 21 22 23 eg2\n",
"seven None None -1 None eg2"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"unm"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This \"unmatched\" frame includes the orphaned rows on either side, and a pair of rows for each changed row. We can pivot this on `Source` to view a side-by-side comparison within each column.\n",
"\n",
"To retain information about which rows were added/deleted, as opposed to being changed to or from a literal null, add an \"Exists\" flag before pivoting:"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"unm['Exists'] = True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"perform the pivot:"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"egp = unm.set_index(['Source'], append=True).unstack()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"unpaired rows (for additions or deletions) will now contain NaN values in one of the \"Exists\" columns. Fill those with False:"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# blank values in this column pair indicate that the row didn't exist there\n",
"egp.Exists = egp.Exists.fillna(False)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th>c1</th>\n",
" <th colspan=\"2\" halign=\"left\">class1</th>\n",
" <th colspan=\"2\" halign=\"left\">class2</th>\n",
" <th colspan=\"2\" halign=\"left\">class3</th>\n",
" <th colspan=\"2\" halign=\"left\">class4</th>\n",
" <th colspan=\"2\" halign=\"left\">Exists</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Source</th>\n",
" <th>eg1</th>\n",
" <th>eg2</th>\n",
" <th>eg1</th>\n",
" <th>eg2</th>\n",
" <th>eg1</th>\n",
" <th>eg2</th>\n",
" <th>eg1</th>\n",
" <th>eg2</th>\n",
" <th>eg1</th>\n",
" <th>eg2</th>\n",
" </tr>\n",
" <tr>\n",
" <th>i1</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>one</th>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" <td>1</td>\n",
" <td>None</td>\n",
" <td>2</td>\n",
" <td>None</td>\n",
" <td>3</td>\n",
" <td>None</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>seven</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>-1</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>six</th>\n",
" <td>None</td>\n",
" <td>20</td>\n",
" <td>None</td>\n",
" <td>21</td>\n",
" <td>None</td>\n",
" <td>22</td>\n",
" <td>None</td>\n",
" <td>23</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>two</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>-1</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"c1 class1 class2 class3 class4 Exists \n",
"Source eg1 eg2 eg1 eg2 eg1 eg2 eg1 eg2 eg1 eg2\n",
"i1 \n",
"one 0 None 1 None 2 None 3 None True False\n",
"seven None None None None None -1 None None True True\n",
"six None 20 None 21 None 22 None 23 False True\n",
"two 4 4 -1 5 6 6 7 7 True True"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"egp"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It'd be nicer to combine the \"Exists\" columns by human-readable labels:"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"chtype = {\n",
" (True, True): \"Change\",\n",
" (True, False): \"Delete\",\n",
" (False, True): \"Add\",\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"change_type = egp.Exists.apply(tuple, axis=1).map(chtype) #.astype('category')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"i1\n",
"one Delete\n",
"seven Change\n",
"six Add\n",
"two Change\n",
"dtype: object"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"change_type"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"make a human-readable version of this:"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"egh = egp.copy(deep=True)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"egh['Change Type'] = change_type"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th>c1</th>\n",
" <th colspan=\"2\" halign=\"left\">class1</th>\n",
" <th colspan=\"2\" halign=\"left\">class2</th>\n",
" <th colspan=\"2\" halign=\"left\">class3</th>\n",
" <th colspan=\"2\" halign=\"left\">class4</th>\n",
" <th colspan=\"2\" halign=\"left\">Exists</th>\n",
" <th>Change Type</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Source</th>\n",
" <th>eg1</th>\n",
" <th>eg2</th>\n",
" <th>eg1</th>\n",
" <th>eg2</th>\n",
" <th>eg1</th>\n",
" <th>eg2</th>\n",
" <th>eg1</th>\n",
" <th>eg2</th>\n",
" <th>eg1</th>\n",
" <th>eg2</th>\n",
" <th></th>\n",
" </tr>\n",
" <tr>\n",
" <th>i1</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>one</th>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" <td>1</td>\n",
" <td>None</td>\n",
" <td>2</td>\n",
" <td>None</td>\n",
" <td>3</td>\n",
" <td>None</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>Delete</td>\n",
" </tr>\n",
" <tr>\n",
" <th>seven</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>-1</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Change</td>\n",
" </tr>\n",
" <tr>\n",
" <th>six</th>\n",
" <td>None</td>\n",
" <td>20</td>\n",
" <td>None</td>\n",
" <td>21</td>\n",
" <td>None</td>\n",
" <td>22</td>\n",
" <td>None</td>\n",
" <td>23</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>Add</td>\n",
" </tr>\n",
" <tr>\n",
" <th>two</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>-1</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Change</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"c1 class1 class2 class3 class4 Exists \\\n",
"Source eg1 eg2 eg1 eg2 eg1 eg2 eg1 eg2 eg1 eg2 \n",
"i1 \n",
"one 0 None 1 None 2 None 3 None True False \n",
"seven None None None None None -1 None None True True \n",
"six None 20 None 21 None 22 None 23 False True \n",
"two 4 4 -1 5 6 6 7 7 True True \n",
"\n",
"c1 Change Type \n",
"Source \n",
"i1 \n",
"one Delete \n",
"seven Change \n",
"six Add \n",
"two Change "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"egh"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"put it in the index:"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>c1</th>\n",
" <th colspan=\"2\" halign=\"left\">class1</th>\n",
" <th colspan=\"2\" halign=\"left\">class2</th>\n",
" <th colspan=\"2\" halign=\"left\">class3</th>\n",
" <th colspan=\"2\" halign=\"left\">class4</th>\n",
" <th colspan=\"2\" halign=\"left\">Exists</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>Source</th>\n",
" <th>eg1</th>\n",
" <th>eg2</th>\n",
" <th>eg1</th>\n",
" <th>eg2</th>\n",
" <th>eg1</th>\n",
" <th>eg2</th>\n",
" <th>eg1</th>\n",
" <th>eg2</th>\n",
" <th>eg1</th>\n",
" <th>eg2</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Change Type</th>\n",
" <th>i1</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Delete</th>\n",
" <th>one</th>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" <td>1</td>\n",
" <td>None</td>\n",
" <td>2</td>\n",
" <td>None</td>\n",
" <td>3</td>\n",
" <td>None</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Change</th>\n",
" <th>seven</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>-1</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Add</th>\n",
" <th>six</th>\n",
" <td>None</td>\n",
" <td>20</td>\n",
" <td>None</td>\n",
" <td>21</td>\n",
" <td>None</td>\n",
" <td>22</td>\n",
" <td>None</td>\n",
" <td>23</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Change</th>\n",
" <th>two</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>-1</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"c1 class1 class2 class3 class4 Exists \\\n",
"Source eg1 eg2 eg1 eg2 eg1 eg2 eg1 eg2 eg1 \n",
"Change Type i1 \n",
"Delete one 0 None 1 None 2 None 3 None True \n",
"Change seven None None None None None -1 None None True \n",
"Add six None 20 None 21 None 22 None 23 False \n",
"Change two 4 4 -1 5 6 6 7 7 True \n",
"\n",
"c1 \n",
"Source eg2 \n",
"Change Type i1 \n",
"Delete one False \n",
"Change seven True \n",
"Add six True \n",
"Change two True "
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"egh = egh.reset_index().set_index(['Change Type', 'i1'])\n",
"egh.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is nice and readable, but by unstacking it we could focus on just the columns that had changes:"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>Source</th>\n",
" <th>eg1</th>\n",
" <th>eg2</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Change Type</th>\n",
" <th>i1</th>\n",
" <th>c1</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"5\" valign=\"top\">Delete</th>\n",
" <th rowspan=\"5\" valign=\"top\">one</th>\n",
" <th>Exists</th>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class1</th>\n",
" <td>1</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class2</th>\n",
" <td>2</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class3</th>\n",
" <td>3</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class4</th>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"5\" valign=\"top\">Change</th>\n",
" <th rowspan=\"5\" valign=\"top\">seven</th>\n",
" <th>Exists</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class1</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class2</th>\n",
" <td>None</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class3</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class4</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"5\" valign=\"top\">Add</th>\n",
" <th rowspan=\"5\" valign=\"top\">six</th>\n",
" <th>Exists</th>\n",
" <td>None</td>\n",
" <td>20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class1</th>\n",
" <td>None</td>\n",
" <td>21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class2</th>\n",
" <td>None</td>\n",
" <td>22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class3</th>\n",
" <td>None</td>\n",
" <td>23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class4</th>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"5\" valign=\"top\">Change</th>\n",
" <th rowspan=\"5\" valign=\"top\">two</th>\n",
" <th>Exists</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class1</th>\n",
" <td>-1</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class2</th>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class3</th>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class4</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Source eg1 eg2\n",
"Change Type i1 c1 \n",
"Delete one Exists 0 None\n",
" class1 1 None\n",
" class2 2 None\n",
" class3 3 None\n",
" class4 True False\n",
"Change seven Exists None None\n",
" class1 None None\n",
" class2 None -1\n",
" class3 None None\n",
" class4 True True\n",
"Add six Exists None 20\n",
" class1 None 21\n",
" class2 None 22\n",
" class3 None 23\n",
" class4 False True\n",
"Change two Exists 4 4\n",
" class1 -1 5\n",
" class2 6 6\n",
" class3 7 7\n",
" class4 True True"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"egh.stack(level=0, dropna=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"there is a bug here.\n",
"\n",
" - the innermost index values are rotated by one with respect to the data\n",
"\n",
"In another instance, I observed a stranger version of this bug where\n",
"\n",
" - `class1` is missing\n",
" - the rest of the innermost index values are shifted up\n",
" - the label for one of the index levels (`Change Type`) shows up as the final innermost index value\n",
" \n",
"the data seems to all be in the right place but the indexing is incorrect."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Workaround\n",
"\n",
"Per Pandas bug [16925](https://github.com/pandas-dev/pandas/issues/16925), sorting the columns fixes it:"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>Source</th>\n",
" <th>eg1</th>\n",
" <th>eg2</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Change Type</th>\n",
" <th>i1</th>\n",
" <th>c1</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"5\" valign=\"top\">Delete</th>\n",
" <th rowspan=\"5\" valign=\"top\">one</th>\n",
" <th>Exists</th>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class1</th>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class2</th>\n",
" <td>1</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class3</th>\n",
" <td>2</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class4</th>\n",
" <td>3</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"5\" valign=\"top\">Change</th>\n",
" <th rowspan=\"5\" valign=\"top\">seven</th>\n",
" <th>Exists</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class1</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class2</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class3</th>\n",
" <td>None</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class4</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"5\" valign=\"top\">Add</th>\n",
" <th rowspan=\"5\" valign=\"top\">six</th>\n",
" <th>Exists</th>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class1</th>\n",
" <td>None</td>\n",
" <td>20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class2</th>\n",
" <td>None</td>\n",
" <td>21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class3</th>\n",
" <td>None</td>\n",
" <td>22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class4</th>\n",
" <td>None</td>\n",
" <td>23</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"5\" valign=\"top\">Change</th>\n",
" <th rowspan=\"5\" valign=\"top\">two</th>\n",
" <th>Exists</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class1</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class2</th>\n",
" <td>-1</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class3</th>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>class4</th>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Source eg1 eg2\n",
"Change Type i1 c1 \n",
"Delete one Exists True False\n",
" class1 0 None\n",
" class2 1 None\n",
" class3 2 None\n",
" class4 3 None\n",
"Change seven Exists True True\n",
" class1 None None\n",
" class2 None None\n",
" class3 None -1\n",
" class4 None None\n",
"Add six Exists False True\n",
" class1 None 20\n",
" class2 None 21\n",
" class3 None 22\n",
" class4 None 23\n",
"Change two Exists True True\n",
" class1 4 4\n",
" class2 -1 5\n",
" class3 6 6\n",
" class4 7 7"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"egh.sort_index(axis=1).stack(level=0, dropna=False)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"INSTALLED VERSIONS\n",
"------------------\n",
"commit: None\n",
"python: 3.6.2.final.0\n",
"python-bits: 64\n",
"OS: Darwin\n",
"OS-release: 16.6.0\n",
"machine: x86_64\n",
"processor: i386\n",
"byteorder: little\n",
"LC_ALL: None\n",
"LANG: en_US.UTF-8\n",
"LOCALE: en_US.UTF-8\n",
"\n",
"pandas: 0.21.0.dev+286.g4efe6560e\n",
"pytest: None\n",
"pip: 9.0.1\n",
"setuptools: 27.2.0\n",
"Cython: 0.25.2\n",
"numpy: 1.13.1\n",
"scipy: None\n",
"xarray: None\n",
"IPython: 6.1.0\n",
"sphinx: None\n",
"patsy: None\n",
"dateutil: 2.6.1\n",
"pytz: 2017.2\n",
"blosc: None\n",
"bottleneck: None\n",
"tables: None\n",
"numexpr: None\n",
"feather: None\n",
"matplotlib: None\n",
"openpyxl: None\n",
"xlrd: None\n",
"xlwt: None\n",
"xlsxwriter: None\n",
"lxml: None\n",
"bs4: None\n",
"html5lib: None\n",
"sqlalchemy: None\n",
"pymysql: None\n",
"psycopg2: None\n",
"jinja2: None\n",
"s3fs: None\n",
"pandas_gbq: None\n",
"pandas_datareader: None\n"
]
}
],
"source": [
"pd.show_versions()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Environment (conda_pandasdev)",
"language": "python",
"name": "conda_pandasdev"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment