Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save afniedermayer/9dc9015004ae02414f4238d2f6d24004 to your computer and use it in GitHub Desktop.
Save afniedermayer/9dc9015004ae02414f4238d2f6d24004 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Performance Comparison of Different Ways of Iterating Through a DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from __future__ import print_function, division, absolute_import"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"NR_ROWS = 1000"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import time"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def tic():\n",
" tic.start = time.time()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def toc():\n",
" diff = time.time()-tic.start\n",
" print('time: ', diff, 's')\n",
" return diff"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Method 1: Chained Indexing"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"See also http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
"That text basically says that chained indexing is not just slow, but might sometimes also give wrong results."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.DataFrame({'name': ['text {}'.format(i) for i in xrange(NR_ROWS)], 'number': range(NR_ROWS)})"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>number</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>text 0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>text 1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>text 2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>text 3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>text 4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name number\n",
"0 text 0 0\n",
"1 text 1 1\n",
"2 text 2 2\n",
"3 text 3 3\n",
"4 text 4 4"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[:5]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"time: 34.3359999657 s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\andras\\Anaconda\\lib\\site-packages\\IPython\\kernel\\__main__.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
" app.launch_new_instance()\n"
]
}
],
"source": [
"tic()\n",
"for idx, row in df.iterrows():\n",
" df['number'][idx] = df['number'][idx]+10\n",
"method1 = toc()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>number</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>text 0</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>text 1</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>text 2</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>text 3</td>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>text 4</td>\n",
" <td>14</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name number\n",
"0 text 0 10\n",
"1 text 1 11\n",
"2 text 2 12\n",
"3 text 3 13\n",
"4 text 4 14"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Increase the Number of Rows by a Factor 10"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1000"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"NR_ROWS"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"NR_ROWS = 10000"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Method 2: ``.loc`` Property"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This method is (much) faster and always gives the correct result."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.DataFrame({'name': ['text {}'.format(i) for i in xrange(NR_ROWS)], 'number': range(NR_ROWS)})"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"time: 15.4559998512 s\n"
]
}
],
"source": [
"tic()\n",
"for idx, row in df.iterrows():\n",
" df.loc[idx, 'number'] = df.loc[idx, 'number']+10\n",
"method2 = toc()/10"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>number</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>text 0</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>text 1</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>text 2</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>text 3</td>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>text 4</td>\n",
" <td>14</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name number\n",
"0 text 0 10\n",
"1 text 1 11\n",
"2 text 2 12\n",
"3 text 3 13\n",
"4 text 4 14"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Method 3: Vectorized Operations"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This method also always gives the correct results and is even faster. But sometimes it's not possible/convenient to vectorize the code."
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.DataFrame({'name': ['text {}'.format(i) for i in xrange(NR_ROWS)], 'number': range(NR_ROWS)})"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"time: 0.00100016593933 s\n"
]
}
],
"source": [
"tic()\n",
"df['number'] = df['number']+10\n",
"method3 = toc()/10"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>number</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>text 0</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>text 1</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>text 2</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>text 3</td>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>text 4</td>\n",
" <td>14</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name number\n",
"0 text 0 10\n",
"1 text 1 11\n",
"2 text 2 12\n",
"3 text 3 13\n",
"4 text 4 14"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## Method 4: Chained Indexing on Right-Hand-Side"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = pd.DataFrame({'name': ['text {}'.format(i) for i in xrange(NR_ROWS)], 'number': range(NR_ROWS)})"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"time: 12.1629998684 s\n"
]
}
],
"source": [
"tic()\n",
"for idx, row in df.iterrows():\n",
" df.loc[idx, 'number'] = df['number'][idx]+10\n",
"method4 = toc()/10"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>number</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>text 0</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>text 1</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>text 2</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>text 3</td>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>text 4</td>\n",
" <td>14</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name number\n",
"0 text 0 10\n",
"1 text 1 11\n",
"2 text 2 12\n",
"3 text 3 13\n",
"4 text 4 14"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Method 5: ``.ix`` Property"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.DataFrame({'name': ['text {}'.format(i) for i in xrange(NR_ROWS)], 'number': range(NR_ROWS)})"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"time: 10.6949999332 s\n"
]
}
],
"source": [
"tic()\n",
"for idx, row in df.iterrows():\n",
" df.ix[idx, 'number'] = df.ix[idx, 'number']+10\n",
"method5 = toc()/10"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>number</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>text 0</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>text 1</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>text 2</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>text 3</td>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>text 4</td>\n",
" <td>14</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name number\n",
"0 text 0 10\n",
"1 text 1 11\n",
"2 text 2 12\n",
"3 text 3 13\n",
"4 text 4 14"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Method 6: ``.apply`` Method"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.DataFrame({'name': ['text {}'.format(i) for i in xrange(NR_ROWS)], 'number': range(NR_ROWS)})"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def add_value(row):\n",
" row['number'] = row['number']+10\n",
" return row"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"time: 3.4240000248 s\n"
]
}
],
"source": [
"tic()\n",
"df = df.apply(add_value, axis=1)\n",
"method6 = toc()/10"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>number</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>text 0</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>text 1</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>text 2</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>text 3</td>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>text 4</td>\n",
" <td>14</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name number\n",
"0 text 0 10\n",
"1 text 1 11\n",
"2 text 2 12\n",
"3 text 3 13\n",
"4 text 4 14"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Speed Comparison"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"methods = {'Method {}'.format(i+1): globals()['method{}'.format(i+1)] for i in xrange(6)}"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'Method 1': 34.335999965667725,\n",
" 'Method 2': 1.5455999851226807,\n",
" 'Method 3': 0.00010001659393310547,\n",
" 'Method 4': 1.2162999868392945,\n",
" 'Method 5': 1.0694999933242797,\n",
" 'Method 6': 0.3424000024795532}"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"methods"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"mintime = min(methods.values())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Computational time relative to fastest method:"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'Method 1': 343303.03218116803,\n",
" 'Method 2': 15453.435518474374,\n",
" 'Method 3': 1.0,\n",
" 'Method 4': 12160.98188319428,\n",
" 'Method 5': 10693.225506555422,\n",
" 'Method 6': 3423.4319427890346}"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"{k:v/mintime for k, v in methods.iteritems()}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment