Skip to content

Instantly share code, notes, and snippets.

@kspeeckaert
Created July 24, 2016 22:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kspeeckaert/012822e05c043db103617f2c738ade88 to your computer and use it in GitHub Desktop.
Save kspeeckaert/012822e05c043db103617f2c738ade88 to your computer and use it in GitHub Desktop.
SO: Pandas gives an error from str.extractall('#')
{
"cells": [
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3.5.1 (default, Dec 26 2015, 18:11:22) \n",
"[GCC 4.2.1 Compatible Apple LLVM 7.0.2 (clang-700.1.81)]\n",
"Pandas: 0.18.1\n"
]
}
],
"source": [
"# Version info\n",
"import sys\n",
"print(sys.version)\n",
"print('Pandas: ', pd.__version__)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Sample data\n",
"data=\"\"\"01, home #sweet home\n",
"01, #happy #life \n",
"02, #world peace\n",
"03, #all are one\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from io import StringIO"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = pd.read_csv(StringIO(data), \n",
" header=None, \n",
" names=['col1', 'col2'],\n",
" index_col=0)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>col2</th>\n",
" </tr>\n",
" <tr>\n",
" <th>col1</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>home #sweet home</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>#happy #life</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>#world peace</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>#all are one</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" col2\n",
"col1 \n",
"1 home #sweet home\n",
"1 #happy #life \n",
"2 #world peace\n",
"3 #all are one"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" <tr>\n",
" <th>col1</th>\n",
" <th>match</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">1</th>\n",
" <th>0</th>\n",
" <td>s</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>h</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>l</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <th>0</th>\n",
" <td>w</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <th>0</th>\n",
" <td>a</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0\n",
"col1 match \n",
"1 0 s\n",
" 0 h\n",
" 1 l\n",
"2 0 w\n",
"3 0 a"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# This should work (will only output the first character of each match)\n",
"df['col2'].str.extractall('#(\\S)')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "AssertionError",
"evalue": "1 columns passed, passed data had 6 columns",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-8-4792d3b6c94f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# This won't work due to a bug\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'col2'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextractall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'(#\\S+)'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/Users/kristof/Development/virtualenv/jupyter/lib/python3.5/site-packages/pandas/core/strings.py\u001b[0m in \u001b[0;36mextractall\u001b[0;34m(self, pat, flags)\u001b[0m\n\u001b[1;32m 1619\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr_extractall\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1620\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mextractall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1621\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mstr_extractall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_orig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mflags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1622\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1623\u001b[0m _shared_docs['find'] = (\"\"\"\n",
"\u001b[0;32m/Users/kristof/Development/virtualenv/jupyter/lib/python3.5/site-packages/pandas/core/strings.py\u001b[0m in \u001b[0;36mstr_extractall\u001b[0;34m(arr, pat, flags)\u001b[0m\n\u001b[1;32m 714\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 715\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 716\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmatch_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 717\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 718\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/kristof/Development/virtualenv/jupyter/lib/python3.5/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, data, index, columns, dtype, copy)\u001b[0m\n\u001b[1;32m 261\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_named_tuple\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fields\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 263\u001b[0;31m \u001b[0marrays\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_to_arrays\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 264\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_ensure_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 265\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/kristof/Development/virtualenv/jupyter/lib/python3.5/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_to_arrays\u001b[0;34m(data, columns, coerce_float, dtype)\u001b[0m\n\u001b[1;32m 5350\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mlist\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtuple\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5351\u001b[0m return _list_to_arrays(data, columns, coerce_float=coerce_float,\n\u001b[0;32m-> 5352\u001b[0;31m dtype=dtype)\n\u001b[0m\u001b[1;32m 5353\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcollections\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMapping\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5354\u001b[0m return _list_of_dict_to_arrays(data, columns,\n",
"\u001b[0;32m/Users/kristof/Development/virtualenv/jupyter/lib/python3.5/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_list_to_arrays\u001b[0;34m(data, columns, coerce_float, dtype)\u001b[0m\n\u001b[1;32m 5429\u001b[0m \u001b[0mcontent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_object_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5430\u001b[0m return _convert_object_array(content, columns, dtype=dtype,\n\u001b[0;32m-> 5431\u001b[0;31m coerce_float=coerce_float)\n\u001b[0m\u001b[1;32m 5432\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5433\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/kristof/Development/virtualenv/jupyter/lib/python3.5/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_convert_object_array\u001b[0;34m(content, columns, coerce_float, dtype)\u001b[0m\n\u001b[1;32m 5487\u001b[0m \u001b[0;31m# caller's responsibility to check for this...\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5488\u001b[0m raise AssertionError('%d columns passed, passed data had %s '\n\u001b[0;32m-> 5489\u001b[0;31m 'columns' % (len(columns), len(content)))\n\u001b[0m\u001b[1;32m 5490\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5491\u001b[0m \u001b[0;31m# provide soft conversion of object dtypes\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mAssertionError\u001b[0m: 1 columns passed, passed data had 6 columns"
]
}
],
"source": [
"# This won't work due to a bug\n",
"df['col2'].str.extractall('(#\\S+)')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
@kspeeckaert
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment