Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save tdhopper/91f03250892c12c6e0d35ca6d2ade1ca to your computer and use it in GitHub Desktop.
Save tdhopper/91f03250892c12c6e0d35ca6d2ade1ca to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"import shelve"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df1 = pd.DataFrame({\"x\": [1,2,3,4], \"y\": [\"a\", \"b\", \"c\", \"a\"], \"z\": [\"m\", \"n\", \"m\", \"n\"]})\n",
"df2 = pd.DataFrame({\"x\": [4,3,2,1], \"y\": [\"d\", \"d\", \"a\", \"a\"], \"z\": [\"n\", \"n\", \"n\", \"n\"]})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Concatenating dataframes is usually pretty easy."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>x</th>\n",
" <th>y</th>\n",
" <th>z</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" <td>m</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>b</td>\n",
" <td>n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>c</td>\n",
" <td>m</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>a</td>\n",
" <td>n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>d</td>\n",
" <td>n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>3</td>\n",
" <td>d</td>\n",
" <td>n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>2</td>\n",
" <td>a</td>\n",
" <td>n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" <td>n</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" x y z\n",
"0 1 a m\n",
"1 2 b n\n",
"2 3 c m\n",
"3 4 a n\n",
"4 4 d n\n",
"5 3 d n\n",
"6 2 a n\n",
"7 1 a n"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.concat([df1, df2], ignore_index=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"However, the handy new Category datatype doesn't allow it if the categories aren't identical."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"for col in ['x', 'y']:\n",
" df1[col] = df1[col].astype('category')\n",
" df2[col] = df2[col].astype('category')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "ValueError",
"evalue": "incompatible categories in categorical concat",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-11-e4fd87734c99>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_index\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/Users/tdhopper/miniconda/envs/concat_categoricals/lib/python3.5/site-packages/pandas/tools/merge.py\u001b[0m in \u001b[0;36mconcat\u001b[0;34m(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)\u001b[0m\n\u001b[1;32m 833\u001b[0m \u001b[0mverify_integrity\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mverify_integrity\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 834\u001b[0m copy=copy)\n\u001b[0;32m--> 835\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 836\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 837\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/tdhopper/miniconda/envs/concat_categoricals/lib/python3.5/site-packages/pandas/tools/merge.py\u001b[0m in \u001b[0;36mget_result\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1023\u001b[0m new_data = concatenate_block_managers(\n\u001b[1;32m 1024\u001b[0m \u001b[0mmgrs_indexers\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnew_axes\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1025\u001b[0;31m concat_axis=self.axis, copy=self.copy)\n\u001b[0m\u001b[1;32m 1026\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1027\u001b[0m \u001b[0mnew_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_consolidate_inplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/tdhopper/miniconda/envs/concat_categoricals/lib/python3.5/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mconcatenate_block_managers\u001b[0;34m(mgrs_indexers, axes, concat_axis, copy)\u001b[0m\n\u001b[1;32m 4472\u001b[0m copy=copy),\n\u001b[1;32m 4473\u001b[0m placement=placement)\n\u001b[0;32m-> 4474\u001b[0;31m for placement, join_units in concat_plan]\n\u001b[0m\u001b[1;32m 4475\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4476\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mBlockManager\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mblocks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/tdhopper/miniconda/envs/concat_categoricals/lib/python3.5/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 4472\u001b[0m copy=copy),\n\u001b[1;32m 4473\u001b[0m placement=placement)\n\u001b[0;32m-> 4474\u001b[0;31m for placement, join_units in concat_plan]\n\u001b[0m\u001b[1;32m 4475\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4476\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mBlockManager\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mblocks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/tdhopper/miniconda/envs/concat_categoricals/lib/python3.5/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mconcatenate_join_units\u001b[0;34m(join_units, concat_axis, copy)\u001b[0m\n\u001b[1;32m 4577\u001b[0m \u001b[0mconcat_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconcat_values\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4578\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4579\u001b[0;31m \u001b[0mconcat_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_concat_compat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mto_concat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mconcat_axis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4580\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4581\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mconcat_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/tdhopper/miniconda/envs/concat_categoricals/lib/python3.5/site-packages/pandas/core/common.py\u001b[0m in \u001b[0;36m_concat_compat\u001b[0;34m(to_concat, axis)\u001b[0m\n\u001b[1;32m 2722\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;34m'category'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtyps\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2723\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcategorical\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0m_concat_compat\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2724\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_concat_compat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mto_concat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2725\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2726\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mnonempty\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/tdhopper/miniconda/envs/concat_categoricals/lib/python3.5/site-packages/pandas/core/categorical.py\u001b[0m in \u001b[0;36m_concat_compat\u001b[0;34m(to_concat, axis)\u001b[0m\n\u001b[1;32m 1948\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcategoricals\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1949\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mcategories\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_dtype_equal\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1950\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"incompatible categories in categorical concat\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1951\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1952\u001b[0m \u001b[0;31m# we've already checked that all categoricals are the same, so if their\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: incompatible categories in categorical concat"
]
}
],
"source": [
"pd.concat([df1, df2], ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def get_possible_values(dataframes, column_names, shelf_name):\n",
" \"\"\"Do first pass over dataframes to find possible categorical values.\"\"\"\n",
" with shelve.open(shelf_name, writeback=True) as shelf:\n",
" for col in column_names:\n",
" shelf[col] = set()\n",
" for df in dataframes:\n",
" for col in column_names:\n",
" for val in df[col]:\n",
" shelf[col].add(val)\n",
" \n",
"def concat(dataframes, categorical_columns, ignore_index=False, shelf_name=\"possible_values.shl\"):\n",
" \"\"\"Concatenate dataframes with unordered categorical columns. \n",
" \n",
" Will mutate categorical columns of origial dataframes.\n",
" \n",
" dataframes: list of dataframes.\n",
" categorical_columns: list of names of unordered, categorical columns.\n",
" ignore_index: same as from pd.concat.\n",
" shelf_name: filename for shelve object to store possible values.\n",
" \"\"\"\n",
" get_possible_values(dataframes, categorical_columns, shelf_name)\n",
" with shelve.open(shelf_name) as shelf:\n",
" for df in dataframes:\n",
" for col in categorical_columns:\n",
" df[col] = pd.Categorical(df[col], categories=shelf[col], ordered=False)\n",
" return pd.concat(dataframes, axis=0, ignore_index=ignore_index)\\\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>x</th>\n",
" <th>y</th>\n",
" <th>z</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" <td>m</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>b</td>\n",
" <td>n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>c</td>\n",
" <td>m</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>a</td>\n",
" <td>n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4</td>\n",
" <td>d</td>\n",
" <td>n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>d</td>\n",
" <td>n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>a</td>\n",
" <td>n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" <td>n</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" x y z\n",
"0 1 a m\n",
"1 2 b n\n",
"2 3 c m\n",
"3 4 a n\n",
"0 4 d n\n",
"1 3 d n\n",
"2 2 a n\n",
"3 1 a n"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = concat([df1, df2], ['x', 'y'])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"x category\n",
"y category\n",
"z object\n",
"dtype: object"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
@tdhopper
Copy link
Author

There is a ticket about this here: pandas-dev/pandas#12699

@nicoa
Copy link

nicoa commented Sep 4, 2017

unfortunately this doesn't work any more in my setup:

INSTALLED VERSIONS
------------------
commit: None
python: 2.7.13.final.0
python-bits: 64
OS: Darwin
OS-release: 15.6.0
machine: x86_64
processor: i386
byteorder: little
LC_ALL: None
LANG: de_DE.UTF-8
LOCALE: None.None

pandas: 0.20.1
pytest: 3.0.6
pip: 9.0.1
setuptools: 34.2.0
Cython: 0.25.2
numpy: 1.12.0
scipy: 0.19.0
xarray: 0.9.1
IPython: 5.2.2
sphinx: 1.5.2
patsy: 0.4.1
dateutil: 2.5.0
pytz: 2016.10
blosc: None
bottleneck: 1.2.1
tables: 3.3.0
numexpr: 2.6.2
feather: None
matplotlib: 2.0.2
openpyxl: 2.4.2
xlrd: 1.0.0
xlwt: 1.2.0
xlsxwriter: 0.9.6
lxml: 3.6.4
bs4: 4.5.3
html5lib: 0.999
sqlalchemy: 1.1.4
pymysql: 0.7.10.None
psycopg2: None
jinja2: 2.9.6
s3fs: None
pandas_gbq: 0.2.0
pandas_datareader: None

@lenguyenthedat
Copy link

lenguyenthedat commented Jan 9, 2018

For anyone who wants a shorter version of the above (without using shelve, which give me this complain below:

  File "---.py", line 104, in get_possible_values
    with shelve.open(shelf_name, writeback=True) as shelf:
AttributeError: DbfilenameShelf instance has no attribute '__exit__'
    def concat(dataframes, categorical_columns, ignore_index=False):
        """Concatenate dataframes with unordered categorical columns.

        Will mutate categorical columns of origial dataframes.

        dataframes: list of dataframes.
        categorical_columns: list of names of unordered, categorical columns.
        ignore_index: same as from pd.concat.
        shelf_name: filename for shelve object to store possible values.
        """

        # Get all possible values for all categorical_columns
        possible_values = {}
        for col in categorical_columns:
            possible_values[col] = set()
        for df in dataframes:
            for col in categorical_columns:
                for val in df[col]:
                    possible_values[col].add(val)

        # Use pd.Categorical() to re-categorizing the values in all columns
        for df in dataframes:
            for col in categorical_columns:
                df[col] = pd.Categorical(
                    df[col], categories=possible_values[col], ordered=False)

        return pd.concat(dataframes, axis=0, ignore_index=ignore_index)\

PS: you won't need to do all this if you are running pandas 0.19 or later. In my case I gotta live with 0.18 and this saved my life today! Thank you @tdhopper !

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment