Skip to content

Instantly share code, notes, and snippets.

@darribas
Created July 27, 2015 17:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save darribas/41940dfe7bf4f987eeaa to your computer and use it in GitHub Desktop.
Save darribas/41940dfe7bf4f987eeaa to your computer and use it in GitHub Desktop.
Quick comparison between `pandas` and `dask` groupby functionality.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"\n",
"import time\n",
"import pandas as pd\n",
"import numpy as np\n",
"import statsmodels.formula.api as sm\n",
"import dask.dataframe as dk"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Settings:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def reg(s):\n",
"    \"\"\"Fit the OLS model ``y ~ x1 + x2`` on the rows of `s` and return its coefficients.\n",
"\n",
"    `s` is the sub-frame of one group (as passed by ``groupby(...).apply``);\n",
"    the return value is the ``params`` attribute of the fitted model.\n",
"    \"\"\"\n",
"    return sm.ols('y ~ x1 + x2', data=s).fit().params"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# n: number of rows in the synthetic dataset.\n",
"# l: bound for the integer group label; labels are drawn from 1..l+1.\n",
"n, l = 1000000, 100"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Set up a dataset to run a model by groups:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Synthetic regression data: two uniform regressors, a constant and uniform noise.\n",
"x = pd.DataFrame(np.random.random((n, 2)), columns=['x1', 'x2'])\n",
"x['c'] = 1\n",
"x['u'] = np.random.random(n)\n",
"# Group label in 1..l+1 (inclusive). `randint` excludes its upper bound, so\n",
"# `l + 2` draws the same range as the deprecated `random_integers(1, l + 1, n)`.\n",
"x['l'] = np.random.randint(1, l + 2, n)\n",
"# Within a group every coefficient (constant, x1, x2) equals the group label,\n",
"# so y = l * (c + x1 + x2) + u. Computed in one vectorised step instead of\n",
"# looping over the groups and writing each slice back with `.loc`.\n",
"x['y'] = x['l'] * (x['c'] + x['x1'] + x['x2']) + x['u']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`pandas` implementation:"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 654 ms, sys: 117 ms, total: 770 ms\n",
"Wall time: 801 ms\n"
]
}
],
"source": [
"# Time a full groupby-apply in pandas: `reg` is fitted on each group of `l`.\n",
"%time b_pd = x.groupby('l').apply(reg)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`dask` implementation:"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Wrap the in-memory pandas frame as a dask dataframe with 10 partitions.\n",
"xd = dk.from_pandas(x, npartitions=10)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.26 s, sys: 286 ms, total: 1.55 s\n",
"Wall time: 1.45 s\n"
]
}
],
"source": [
"# Same groupby-apply on the dask dataframe; `.compute()` runs the task graph\n",
"# and returns the result, so the timing covers the actual work.\n",
"%time b_dk = xd.groupby('l').apply(reg).compute()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Comparison over dataset sizes:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1min 12s, sys: 23.3 s, total: 1min 36s\n",
"Wall time: 2min 15s\n"
]
}
],
"source": [
"%%time\n",
"# Repeat the pandas-vs-dask groupby-apply timing for several dataset sizes.\n",
"# `l` (group-label bound) and `reg` come from the cells defined earlier.\n",
"ns = [1000, 1000000, 10000000]\n",
"t_pd = {}\n",
"t_dk = {}\n",
"# The loop variable is `size`, not `n`, so the global setting `n` is not overwritten.\n",
"for size in ns:\n",
"    # Fresh synthetic data with `size` rows: y = l * (c + x1 + x2) + u.\n",
"    x = pd.DataFrame(np.random.random((size, 2)), columns=['x1', 'x2'])\n",
"    x['c'] = 1\n",
"    x['u'] = np.random.random(size)\n",
"    # `randint` excludes its upper bound: labels are 1..l+1, the same range as\n",
"    # the deprecated `random_integers(1, l + 1, size)`.\n",
"    x['l'] = np.random.randint(1, l + 2, size)\n",
"    x['y'] = x['l'] * (x['c'] + x['x1'] + x['x2']) + x['u']\n",
"\n",
"    # pandas: time only the groupby-apply.\n",
"    t0 = time.time()\n",
"    b_pd = x.groupby('l').apply(reg)\n",
"    t1 = time.time()\n",
"    t_pd[size] = t1 - t0\n",
"\n",
"    # dask: build the dask dataframe *before* starting the clock so that both\n",
"    # timings cover the same work (the groupby-apply) and not the conversion.\n",
"    xd = dk.from_pandas(x, 10)\n",
"    t0 = time.time()\n",
"    b_dk = xd.groupby('l').apply(reg).compute()\n",
"    t1 = time.time()\n",
"    t_dk[size] = t1 - t0"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>dask</th>\n",
" <th>pandas</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1000</th>\n",
" <td>9.358188</td>\n",
" <td>1.158390</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1000000</th>\n",
" <td>21.577656</td>\n",
" <td>3.056699</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10000000</th>\n",
" <td>34.014051</td>\n",
" <td>19.967284</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" dask pandas\n",
"1000 9.358188 1.158390\n",
"1000000 21.577656 3.056699\n",
"10000000 34.014051 19.967284"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Collect the timings (seconds) in one table: one row per dataset size,\n",
"# one column per library.\n",
"res = pd.DataFrame(dict(pandas=t_pd, dask=t_dk))\n",
"res"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x10c3eead0>"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXMAAAENCAYAAAD9koUjAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xec1NW9//HXocnSpKwuvXeYhWAHlBWIgtIJsDBDjDHq\nNerVGL3R3PwUQ8xNcjUxmsQWvRq+s3RUUKQILCBSxKAzNBGlSpHe27Ln98fsLisu22e+U97Px2Mf\nTvmWD0f24/F8zjlfY61FRERiWwW3AxARkbJTMhcRiQNK5iIicUDJXEQkDiiZi4jEASVzEZE4UGgy\nN8ZUNcasNMZ8ZoxZb4z5n5zPxxljdhpj1uT89ItMuCIiUhBT1DxzY0w1a+1JY0wl4CPgUaAPcMxa\n++cIxCgiIkUocpjFWnsy52UVoCJwKOe9CVdQIiJSMkUmc2NMBWPMZ8BeYJG1dl3OVw8aYz43xrxu\njKkd1ihFRKRQRQ6z5B1ozOXAXOBxYD2wL+er8UADa+1dYYlQRESKVKm4B1prjxhj3geuttZm5n5u\njPknMOvi440x2vRFRKQUrLUlHsYuajZLcu4QijEmCfghsMYYUz/fYUOB4CUC0o+1PPXUU67HEC0/\nagu1hdqi8J/SKqpn3gB4yxhTgVDin2CtXWCM+ZcxpitggS3AvaWOQEREyqzQZG6tDQLdCvj8x2GL\nSERESkwrQCMgLS3N7RCihtriArXFBWqLsiv2bJYSX9gYG65ri4jEE2th/XpYsAAeeshgS1EALfZs\nlvJijNYahYv+4ykSO7ZvDyXv3J+kJOjTp/TXi3jP3BijpBMGaleR6HbwICxaBB9+GErehw9D796h\nBN6nD7RsGTou53e5xL1eJfM4oXYViS4nT8JHH13oeW/aBDfeeCF5ezxQoYCqpZJ5glO7irgrKwtW\nr77Q8169Grp2hb59Q8n72muhSpWir6NknuDUriKRlb9o+eGHsGQJNG9+oed9001Qo0bJr6tkHiY/\n+clPaNKkCePHjy/V+W+++Savv/46S5cuLefIvivW2lUkFl2qaNm3L9x8M1x5ZdnvUdpkHvHZLLHG\nGKMZOCIJqrCi5W9/e6FoGQ2UzItBPV6RxFBQ0bJnz1DP+z/+49JFy2gQpWG5Z82aNXTr1o1atWqR\nnp7O6dOnATh8+DADBgzgyiuvpG7dugwcOJBvvvkm77w333yTVq1aUatWLVq2bElGRkaB13/ssce4\n8cYbOXr0aET+PCJyaVlZsGIF/O53oWGSlBQYPz40fPL887B/P8yeDY88Al26RG8iByXz7zh79ixD\nhgzhjjvu4NChQ4wYMYLp06djjCE7O5u77rqL7du3s337dpKSknjggQcAOHHiBA899BBz5szh6NGj\nLF++nK5du37n2tZa7r77btauXcv8+fOpVauWG39EkYRmLaxbBy+8AIMGQXJyqMd96BA89hjs3g1L\nl8K4caEeeXFmn0SLqBxmKa8h6pKOjqxYsYKsrCweeughAIYPH84111wDQN26dRk6dGjesb/+9a/p\n3bt33vsKFSoQDAZp3LgxKSkppKSk5H137tw50tPTyc7OZtasWVSqFJXNLhKXLlW09Hrhn/8sn6Jl\nNIjKrOLWEPWuXbto1KjRdz5r1qwZAKdOneLhhx9m7ty5HDoUegzq8ePHsdZSvXp1Jk+ezLPPPstd\nd91Fjx49eO6552jXrh0AmzdvJhAIsHLlSiVykTC7uGh56NCF6YLRVrQsTxpmyadBgwbfGQcH2LZt\nG9Zann32WTZt2sSqVas4cuQIixcv/s5m8rfccgvz5s1jz549tG/fnrvvvjvvGh06dOCNN96gf//+\nbNq0KaJ/JpF4d/IkzJsHv/oVXH11aK73669DmzYwdSrs3QuTJsHdd8dvIoco7Zm7pXv37lSqVIkX\nXniB++67j1mzZvHJJ5/Qu3dvjh8/TlJSEpdf
fjkHDx7k6aefzjvv22+/Zfny5fTt25ekpCSqV69O\nxYoVv3Pt9PR0zp49S9++fcnMzKRlPP+tEgmjS6207NMnVLQs7krLeKOeeT6VK1dmxowZvPnmm9Sr\nV48pU6YwfPhwjDE8/PDDnDp1iuTkZLp3707//v3z5p9nZ2fzl7/8hUaNGlGvXj2WLl3KSy+9BHx3\nnvqPf/xjnnzySXr37s327dtd+3OKxJL8RcvBg0NFy3vvjY+iZXnSCtA4oXaVeBKJlZbRSsv5E5za\nVWJZYUXL/NvDJgIl8wSndpVYUthKy8K2h00ESuYJTu0q0aywomXfvolbtCyIknmCU7tKNMm/PeyC\nBbB4MTRrdqHnfeONULOm21FGJyXzBKd2FbclctGyPIUlmRtjqgKLgcuAKsC71tonjDF1gclAM2Ar\nMNJae/iic5XMI0jtKpGmomV4hK1nboypZq09aYypBHwEPAoMAvZba/9kjPkVUMda+/hF5ymZR5Da\nVcLtUkXL3N53Ihcty1PYh1mMMdUI9dJ/AkwHellr9xpj6gOZ1tr2Fx2vZB5Balcpb7lFy9zHouUv\nWvbpA9ddp6JlOISzZ14B+DfQCnjJWvtfxphD1to6Od8b4GDu+3znKZlfQvPmzXn99dfp06dPuV1T\n7SplpaJldAjbY+OstdlAV2PM5cBcY8zNF31vjTEFZpFx48blvU5LSyMtLa2k8cUlPYpOosXFRcuq\nVUPJe8wYeO01FS0jITMzk8zMzDJfp0SzWYwx/w84BfwMSLPW7jHGNAAWaZil+Fq0aMHrr7/+nf3Q\ny0rtKsWRW7TMHTpR0TL6lLZnXmi5whiTbIypnfM6CfghsAaYCdyRc9gdwDslvXE0at68OX/4wx/o\n1KkTdevW5ac//Slnzpzh0KFDhT4yLi0tjSeffJKePXtSq1Ytbr31Vg4cOJD3/YQJE2jWrBnJycn8\n/ve//849V61axQ033ECdOnVo2LAhDz74IOfOncv7/he/+AUpKSlcfvnlpKamsm7duvA3hMSNkydh\n/vzvbw/bqlVibQ+bEHL35C7oB/AQGi//DAgAj+V8Xhf4ENgEzANqF3CuLcilPo8GzZo1sx6Px+7c\nudMePHjQ9ujRw/7mN7+xBw4csDNmzLCnTp2yx44dsyNGjLBDhgzJO69Xr162devW9ssvv7SnTp2y\naWlp9vHHH7fWWrtu3Tpbo0YNu3TpUnvmzBn7yCOP2EqVKtkFCxZYa6399NNP7cqVK+358+ft1q1b\nbYcOHezzzz9vrbV2zpw59qqrrrJHjhyx1lq7ceNGu3v37gJjj+Z2lcg5d87a5cut/d3vrE1Ls7ZG\nDWt79rT2qaesXbLE2jNn3I5QipLzu1xobi7op9Axc2ttEOhWwOcHgb7l9N+T7zFPl894sn2qZMMO\nxhgeeOCBvKcN/fd//zcPPvgg48ePL/SRccYY7rzzTlq3bg3AyJEjmTlzJgDTpk1j4MCB9OzZE4Dx\n48fzt7/9Le/cbt0uNG+zZs245557WLx4MQ899BCVK1fm2LFjbNiwgWuuuSbvyUUiuS5VtOzTJ7Q9\nrIqWiSMqH05R0iRcnpo0aZL3umnTpuzatavQR8blFjLr16+fd15SUhLHjx8HQo+ia9y4cd531apV\no169ennvN23axCOPPMKnn37KyZMnycrK4uqrrwagd+/ePPDAA9x///1s27aNYcOG8eyzz1JTv50J\nbceOC2Pe+YuWo0eraJnINMX/IvkfGrF9+3YaNmzIc889V+gj4wrTsGFDduzYkff+5MmT3xlPv+++\n++jYsSObN2/myJEjPPPMM2RnZ+d9/+CDD7J69WrWr1/Ppk2b+N///d9y+pNKrDh4EKZPh5//HNq2\nhW7dYM4c6NULli2DLVtCSTw9XYk8kUVlz9wt1lr+8Y9/MGDAAJKSknjmmWdIT0/n2LFjl3xkXP5z\nCzJ8+HCu
v/56li1bxjXXXMOTTz75nWR9/PhxatasSbVq1di4cSMvvfQSKSkpAKxevZrz58/TrVs3\nqlWrRtWqVb/3ODqJPydPhpJ0bs87/0rLe+/VSkspmP5K5GOMYcyYMdxyyy20atWKNm3a8Jvf/KbQ\nR8blPzf/69z3nTp14u9//ztjxoyhYcOG1K1b9ztDOc8++ywZGRnUqlWLe+65h/T09Lzvjh49yj33\n3EPdunVp3rw5ycnJPPbYY2FuBYm0rCxYsQKeeSa0IVVKCjz9dGijqr/8Bfbvh9mz4Ze/hC5dlMil\nYNo1MZ9wzP+OlGhuV/kua2HDhgs97/xFy759VbRMdGFbASoiZZe/aLlwIVx2mYqWUr6UzEXCoKCV\nlr17hxL4b3+rBTpS/jTMEifUru4qrGip7WGlJPSkoQSndo2sgraH7dLlwg6D2h5WSkvJPMGpXcNL\nRUuJFCXzBKd2LX+XKlr26RMa/1bRUsIhppK5hIeSedkUVrTU9rASKTGTzEWihYqWEo2UzEWKoKKl\nxAIlc5GLFFa07NMHbrpJRUuJPkrmIqhoKbFPyVwSkoqWEm+UzCUhFFa07NMHUlNVtJTYpmQucUlF\nS0k0SuYSF1S0lESnZC4xq6CiZe5cbxUtJdEomUvMyF+0XLAg9L537wsJXEVLSWRK5hK1VLQUKb6w\nJHNjTBPgX8CVgAVetda+YIwZB/wM2Jdz6BPW2jkXnatknqAuLlp+8gl07Xqh562ipcilhSuZ1wfq\nW2s/M8bUAD4FhgAjgWPW2j8Xcq6SeYIoqGjZtOmFGScqWooUX1ieAWqt3QPsyXl93BizAWiUe88S\nRylx41JFSz3TUsQdxR4zN8Y0BxYDnYBfAncCR4DVwC+ttYcvOl498ziioqVIZISlZ57v4jWAacBD\nOT30l4Df5nw9HngOuOvi88aNG5f3Oi0tjbS0tJLGJy4pqGjZo0cocd9zj4qWIuUlMzOTzMzMMl+n\nyJ65MaYy8B7wgbX2+QK+bw7MstZ6LvpcPfMYoqKlSHQIS8/chB4L9DqwPn8iN8Y0sNbuznk7FAiW\n9MbirsKKlo8+qqKlSKwpajZLT2AJECA0NRHg18BooGvOZ1uAe621ey86Vz3zKJNbtMz9qVIllLy1\n0lIkemjRkHxPYUXL3O1h9UhWkeiiZC6FFi210lIkNiiZJ6D8RcsFC2DVKhUtRWKdknkCyC1a5s44\nWbIEmjTRSkuReKJkHqcKK1refDOkpLgdoYiUJyXzOKGipUhiUzKPUadOwUcfXRg6UdFSJLEpmceI\nwoqWffrA9deraCmSyJTMo5SKliJSEkrmUeRSRcs+fULj3ypaisilKJm76OBByMy8sFjnwIFQ0s5N\n4CpaikhxKZlH0KWKlrmLdVS0FJHSUjIPIxUtRSRSlMzL0cVFy9ztYXN73ipaiki4KJmXkYqWIhIN\nlMxLSEVLEYlGSuZFUNFSRGKBknkBtm+HCRMuFC27dLnQ81bRUkSikZJ5AW66KTRcMmKEipYiEhvC\n8kDnWLZmDWzdCgsXQqW4/VOKiITE7Sjxiy/Cz3+uRC4iiSEuh1n27YO2beHLLyE52ZUQRERKpbTD\nLHHZM3/tNRg+XIlcRBJH3PXMz52DFi3g/fdDs1dERGJJWHrmxpgmxphFxph1xpi1xpj/zPm8rjFm\nvjFmkzFmnjGmdmkDL29vvw2tWyuRi0hiKWqY5RzwC2ttJ+B64H5jTAfgcWC+tbYtsCDnfVR44QX4\nz/90OwoRkcgqNJlba/dYaz/LeX0c2AA0AgYBb+Uc9hYwJJxBFtenn4b2WBk0yO1IREQiq9gFUGNM\nc+AHwEogxVq7N+ervUBUbEP14otw//2ajigiiadYac8YUwOYDjxkrT1m8u1AZa21xpgCK53jxo3L\ne52WlkZaWlpZYi3Ut9/Cu+/Cc8+F7RYiIuUuMzOT9+a9x9pv1xLYGyj1dY
qczWKMqQy8B3xgrX0+\n57ONQJq1do8xpgGwyFrb/qLzIjqb5ZlnQis+X3stYrcUESm142eP8/aGt/EH/azYuYLB7Qfj9Xi5\ntfWt5b83iwl1wd8CDlhrf5Hv8z/lfPZHY8zjQG1r7eMXnRuxZJ47HXH27NDuhyIi0ejc+XPM/3o+\n/qCf9ze9T8+mPfGl+hjUbhDVKlcDwrTRljGmJ7AECAC5Bz4BrAKmAE2BrcBIa+3hi86NWDKfPBle\nfhkWLYrI7UREis1ay6pvVuEEHKasn0LLOi3xeXyM7DSSK6pf8b3jE3rXxB494NFHYejQiNxORKRI\nXx74En/Qjz/op4KpgNfjxevx0qpuq0LPS9hdE1evhm++gYED3Y5ERBLdtye+ZfLayfiDfrYe3sqo\nTqPIGJbB1Q2vxoT50WUx3zO/4w7o3BkeeyzstxIR+Z4TZ0/w7hfv4gQcPt7xMQPbDcTr8dK3ZV8q\nVSh5fzkhh1n27oX27eGrr6Bu3bDeSkQkT1Z2Fgu+XoATdJj1xSy6N+mO1+NlcPvB1KhSo0zXTshh\nlldfhZEjlchFJPystazetRp/0M+ktZNoVrsZXo+XZ3/4LCk13F83GbM987NnoXlzmDsXPJ6w3UZE\nEtzXh77GH/DjBB3OZ5/Hl+pjjGcMbeu1Dcv9Eq5nPmNGaIhFiVxEytv+k/uZsm4KTsBh88HNjOo0\nireGvMV1ja4LeyGztGK2Z969O/zXf8GQqNjiS0Ri3clzJ5n5xUz8QT9Lti3h9ja340v18cOWP6Ry\nxcoRiyOhCqCffAIjRoQKnxUrhuUWIpIAzmefZ+GWhfiDft794l2ubXQtPo+PIe2HUPOymq7ElFDD\nLC++CA88oEQuIiVnrWXNnjX4A34mrp1Iw5oN8aX6+EPfP1C/Rn23wyu1mOuZ79kDHTrA119DnTrl\nfnkRiVNbDm0hI5iBP+jndNbp0IrMVC/tk9sXfXIEJUzP/NVXYdQoJXIRKdqBkweYun4q/qCfjfs3\nMqLjCP456J/c0PiGqC1kllZM9cxzpyPOnw+dOpXrpUUkTpw6d4r3Nr2HE3TI3JpJv9b98Hl83Nr6\nVqpUrOJ2eEVKiJ75tGnQsaMSuYh81/ns8yzethgn4PD2xre5uuHVeD1eJgydQK3LarkdXkTEVM/8\n+uvhiSdg8OByvayIxCBrLYG9AZyAQ8baDFKqp+D1eEnvnE6jWo3cDq/U4r5nvmpVaC+WAQPcjkRE\n3LT9yHYyghk4AYfjZ4/j9XiZP3Y+Ha/o6HZoroqZZK7piCKJ69CpQ0xbPw0n6LD227WM6DiCl25/\niR5Ne1DBFPu59HEtJoZZNB1RJPGczjrN7C9n4wQcFmxZwC2tbsHr8dK/dX8uq3SZ2+GFTVwPs7zy\nCqSnK5GLxLtsm82SbUvwB/zM2DiDLild8KX6eGPwG9SuWtvt8KJa1PfMz56FZs1gwYLQTBYRiT/B\nvUH8QT8ZwQzqJNXB5/Ex2jOaxrUaux1axMVtz3zq1NCThJTIReLLzqM781ZkHjx1EK/Hy/tj3seT\noq1QSyPqe+bXXQe/+Y2e8SkSDw6fPsz09dPxB/18vvdzhrUfhi/Vx43NblQhM0dc9sxXroT9++G2\n29yORERK60zWGT7Y/AH+oJ95X82jT4s+PHDtA9zW5jaqVqrqdnhxI6qT+QsvwP33azqiSKzJttks\n274MJ+AwfcN0Ol/ZGa/Hy6sDXqVOkmYyhEORwyzGmDeA24FvrbWenM/GAT8D9uUc9oS1ds5F55Vp\nmGX37tA4+ZYtUFtFbJGYsH7fepyAgz/op2aVmnmPWGt6eVO3Q4sZ4Rxm+T/gReBf+T6zwJ+ttX8u\n6Q2L65VXYPRoJXKRaLfr2C4mBifiBB32ndjH6M6jmZk+k9SU1LjbmTCaFZnMrbVLjTHNC/gqbP+W\nzpwJJfMFC8J1BxEpi6NnjjJjwwycgM
O/d/+boe2H8twtz9GrWS8qVtC4qBvKMmb+oDHmx8Bq4JfW\n2sPlFBNTp4Ye1KzpiCLR4+z5s8zdPBcn6DBn8xxubn4z9151LwPaDiCpcpLb4SW80ibzl4Df5rwe\nDzwH3HXxQePGjct7nZaWRlpaWpEXthb++ld46qlSRiYi5cZay8c7PsYf9DN1/VTaJ7fH6/Hyj9v+\nQb1q9dwOLy5kZmaSmZlZ5usUa555zjDLrNwCaHG+K20BdMUK8Plg0yaooGmnIq7YuH8j/oAff9BP\n1UpV8wqZzWs3dzu0uBfReebGmAbW2t05b4cCwdJcpyAvvBDaHVGJXCSy9hzfw6S1k3ACDruO7WJ0\n59FMHzmdrvW7qpAZA4ozNXEi0AtIBvYCTwFpQFdCs1q2APdaa/dedF6Je+a7doWW7m/ZApdfXqJT\nRaQUjp05xtsb38Yf9LPqm1UMbjcYX6qPm5vfrEKmS0rbM4+q5fxPPgkHDsDf/x6WkEQEOHf+HPO+\nmoc/6Gf2l7O5sdmN+Dw+BrYbSLXK1dwOL+HFfDI/cya0O+KiRaG9y0Wk/FhrWfnNSvwBP5PXTaZ1\n3db4Un2M7DSS5GrJbocn+cT83ixTpkCXLkrkIuVp04FNeYXMihUq4vP4WH7XclrVbeV2aFLOoiKZ\n505HfPpptyMRiX17j+9l8rrJOAGH7Ue2k945nUk/msRVDa5SITOORUUyX7ECDh+G/v3djkQkNp04\ne4J3Nr6DE3RYvmM5A9sNZPzN4+nTsg+VKkTFr7mEWVT8W37hBXjwQU1HFCmJrOwsPvz6Q5yAw3ub\n3qNH0x6MTR3LtBHTqF6lutvhSYS5XgD95pvQ0n1NRxQpmrWW1btW4wQcJq2bRIvaLfB6vIzqPIor\nq1/pdnhSDmK2APryy+D1KpGLFOarg1/hD/pxAg4Wi8/j46M7P6JNvTZuhyZRwtWe+enToemIS5ZA\nu3ZhCUMkZu07sY8p66bgBB2+OvgV6Z3T8Xq8XNvoWhUy41hM9swnT4Zu3ZTIRXKdPHeSmV/MxAk4\nfLT9I25veztP3vQkfVv2pXLFym6HJ1HMtZ65tXDVVfC73+kZn5LYsrKzWLhlIf6gn5lfzOS6Rtfh\nS/UxpP0QalSp4XZ4EmEx1zP/+GM4dgz69XMrAhH3WGv59+5/4w/6mbh2Io1rNcbn8fHHvn+kfo36\nbocnMci1ZP7ii5qOKIlny6Et+IOhFZlnss7gS/WReUcm7ZI11ihl48owy86dkJoKW7dCrVphub1I\n1Dhw8kBeIXPTgU2M7DgSX6qP6xtfr0KmfE9MDbO8/HLoARRK5BKvTp07xaxNs3ACDou3LaZ/6/48\n0fMJbm11qwqZEhYR75nnTkdcuhTatg3LrUVccT77PJlbM3GCDu9sfIerG16Nz+NjaIeh1LpMPRcp\nnpjpmU+aFJrFokQu8cBay+d7P8cJOExcO5H6Nerj9Xh5pvczNKzZ0O3wJIFENJlbG9qH5fe/j+Rd\nRcrftsPbyAhm4AQdTpw9gdfjZf7Y+XS8oqPboUmCimgyX7YMTpyAW26J5F1FysfBUweZtn4aTsBh\n/b71jOg4glcGvEL3Jt2pYDQtS9wV0WSuhzVLrDmddZr3N72PE3RYuGUht7a6lUe7P0q/1v2oUrGK\n2+GJ5IlYAXTHjtCThDQdUaJdts1m8dbF+IN+ZmyYwQ8a/ACfx8ewDsO4vKp2hJPwivoC6Esvwdix\nSuQSvQJ7A/gDfjLWZlAvqR6+VB+B+wI0rtXY7dBEihSRnvmpU6HpiMuWQRvt2ClRZMeRHUxcOxEn\n4HD49GG8Hi/eVC+dr+zsdmiSoKK6Zz5pElxzjRK5RIfDpw8zbf00/EE/gb0BhncYzt9u+xs9m/ZU\nIVNiVpE9c2PMG8DtwLfWWk/OZ3WByUAzYCsw0lp7+KLzrLUWa0Pb3P7hD3DrreH4I4gU7UzWGWZ/\nOR
sn6PDh1x/St2VffB4ft7W5jcsqXeZ2eCJ5StszL04yvxE4DvwrXzL/E7DfWvsnY8yvgDrW2scv\nOs9aa1m6FO6+G9av1ywWiaxsm81H2z/CCThM3zAdz5UefKk+hncYTp2kOm6HJ1KgsA2zWGuXGmOa\nX/TxIKBXzuu3gEzgcQqghzVLpK37dh1OwCFjbQa1LquFz+Njzb1raHp5U7dDEwmb0o6Zp1hr9+a8\n3gukFHTQ9u2wcCG88UYp7yJSTN8c/SavkLn/5H7GeMYwa/QsUlNS3Q5NJCLKXAC11lpjTIFjNWPG\njKNNG3juOUhLSyMtLa2stxPJc+T0EWZsmIETdFizew3DOgzjL7f+hZua3UTFChXdDk+kWDIzM8nM\nzCzzdYo1NTFnmGVWvjHzjUCatXaPMaYBsMha2/6ic2xysmX5cmjdusxxigBw9vxZ5myegxNwmPvV\nXHq36I3X42VA2wFUrVTV7fBEyizSUxNnAncAf8z55zsFHXTddUrkUnbWWj7e8TFOwGHq+ql0vKIj\nvlQfLw94mbpJdd0OTyQqFGc2y0RCxc5kQuPjTwLvAlOAphQyNXHuXKtNtaTUNuzbkPeItaRKSYxN\nHctoz2ia127udmgiYRO2qYmlZYyx589bzWKREtl9bDeT1k7CCTrsPrabMZ4xeD1eutbvqkesSUKI\nymQermtLfDl25hgzNszAH/Tzya5PGNJ+CD6Pj7TmaSpkSsJRMpeYcu78OeZ+NRcn4PDB5g/o1awX\nvlQfA9sOJKlyktvhibhGyVyinrWWFTtX5BUy29Rrg8/jY0SnESRXS3Y7PJGoENUbbUli+2L/F3mF\nzMoVKuNL9bHiZytoWael26GJxA31zCUs9h7fm1fI3Hl0J+md0vGl+ujWoJsKmSKF0DCLuO742eO8\ns/EdnIDDip0rGNRuEL5UH71b9KZSBf1PoEhxKJmLK7Kys5j/1XycoMP7m96nZ9OeeD1eBrUbRPUq\n1d0OTyTmKJlLxFhr+WTXJzgBh8nrJtOyTku8Hi8jO43kyupXuh2eSExTAVTCbvPBzfgDfpygg8Hg\nS/Wx7KfLaF1XezaIuE09cynUvhP7mLxuMk7AYcvhLaR3Sseb6uWahteokCkSBhpmkXJz4uwJZn4x\nEyfosGz7Mga0HYAv1Uffln1VyBQJMyVzKZOs7CwWfL0Af9DPzC9mckOTG/B5fAxuP5gaVWq4HZ5I\nwlAylxKz1vLp7k/zCplNajXBl+pjVKdRpNQo8OFRIhJmKoBKsX196Gv8gdCKzHPZ5/B5fCz+yWLa\n1mvrdmjppzWGAAAKyElEQVQiUkrqmSeI/Sf3M2XdFPxBP5sObGJUp1H4Un1c1+g6FTJFooiGWeR7\nTp47yawvZuEEHZZuW0r/Nv3xeXzc0uoWKles7HZ4IlIAJXMB4Hz2eRZtXYQTcHj3i3e5ttG1eD1e\nhrYfSs3LarodnogUQck8gVlr+WzPZzgBh4lrJ9KwZkO8Hi/pndNpULOB2+GJSAmoAJqAth7eSkYw\nAyfgcCrrFD6Pj4V3LKR9cnu3QxORCFPPPMYcPHWQqeum4gQdNuzbwMhOI/F6vHRv0l2FTJE4oGGW\nOHY66zTvbXoPJ+CwaOsi+rXuh9fjpV/rflSpWMXt8ESkHCmZx5lsm83irYtxAg5vb3ybbg264Uv1\nMazDMGpdVsvt8EQkTJTM44C1lsDeAP6gn4xgBldUvwKfx0d653Qa1WrkdngiEgGuFECNMVuBo8B5\n4Jy19tqyXC9RbT+ynYxgBv6gn6NnjuL1eJnrm0unKzu5HZqIxIgy9cyNMVuAq6y1Bwv4Tj3zQhw6\ndYhp66fhD/oJfhvkRx1+hC/VR4+mPahgKrgdnoi4xM2piZpCUUxnss7w/pfv4wQcFmxZwA9b/pCH\nr3+Y/q37c1mly9wOT0RiWFl75l8DRwgNs7xirX0t33fqmRMqZC7d
thQn4DBj4wxSU1LxeXwM7zic\n2lVrux2eiEQZt3rmPay1u40xVwDzjTEbrbVLc78cN25c3oFpaWmkpaWV8XaxY+23a3ECDhnBDOok\n1cHr8fLZvZ/R5PImbocmIlEkMzOTzMzMMl+n3GazGGOeAo5ba5/LeZ9wPfOdR3cyMTgRJ+hw8NRB\nxnQegzfVS2pKqtuhiUiMiHjP3BhTDahorT1mjKkO3AI8Xdrrxaojp48wfcN0nIDDZ3s+Y3iH4fy1\n31+5qdlNKmSKSMSUZZglBXg7Zwl5JcBvrZ1XLlFFubPnz/LBlx/gBB3mfTWPPi36cP8193N729up\nWqmq2+GJSALSoqFiyrbZfLzjY5yAw7T10+h0ZSe8Hi8/6vgj6ibVdTs8EYkT2jUxTNbvW5/3iLXq\nVaozNnUsn97zKc1qN3M7NBGRPOqZF2DXsV1MDE7EH/Sz98TevEJml5Qu2plQRMJKe7OU0dEzR5mx\nYQb+oJ/Vu1YztP1QfKk+ejXrRcUKFd0OT0QShJJ5KZw9f5a5m+fiD/r5YPMHpDVPw+fxMaDtAJIq\nJ7kdnogkICXzYrLWsnzncpyAw9T1U2lXrx2+VB8jOo6gXrV6bocnIglOBdAibNy/EX/AT8baDKpU\nrILP42PVz1bRok4Lt0MTESmzuO6Z7zm+h0lrJ+EP+tl5dCejO4/Gl+rjB/V/oEKmiEQlDbPkOHH2\nBO9sfIcJgQms/GYlg9oNwufx0btFbxUyRSTqJXQyP599noVbFjIhMIGZX8yke5Pu+FJ9DG43mOpV\nqkckBhGR8pCQyfzzPZ8zITCBjGAGDWs2xJfqY3Tn0aTUSAnrfUVEwiVhCqA7j+4kI5iBE3A4cuYI\nPo+PBT9eQIcrOrgdmoiIa2KiZ37szLG8nQn/vfvfDOswjLGpY7mx2Y3amVBE4krcDbNkZWcx76t5\nOAGH2V/OplfzXvg8Pga2G6idCUUkbsVFMrfW8unuT3ECDpPWTqJFnRb4PD5GdR5FcrXksMQpIhJN\nYnrMfNvhbfiDfiYEJnD2/Fl8Hh9L71xKm3pt3A5NRCQmuNYzP3z6MNPWT2NCYALrvl3HiI4jGNtl\nLDc0vkELekQkYcXEMMvZ82eZs3kOEwITmPfVPPq27MvY1LHc1uY2qlSsEpY4RERiSdQmc2stK79Z\nyYTPJzBl/RTaJ7dnbOpYRnQcQZ2kOmG5t4hIrIrKMfOnM5/GCTpUMBUYmzpWG1uJiIRJWJP5gVMH\nyBiWwdUNr9Y4uIhIGEXV1EQRkURX2mEWLZ8UEYkDpU7mxph+xpiNxpgvjTG/Ks+gRESkZEqVzI0x\nFYG/Af2AjsBoY4x2urqEzMxMt0OIGmqLC9QWF6gtyq60PfNrgc3W2q3W2nPAJGBw+YUVX/QX9QK1\nxQVqiwvUFmVX2mTeCNiR7/3OnM9ERMQFpU3mmqYiIhJFSjU10RhzPTDOWtsv5/0TQLa19o/5jlHC\nFxEphYgt5zfGVAK+APoAu4BVwGhr7YYSX0xERMqsVCtArbVZxpgHgLlAReB1JXIREfeEbQWoiIhE\nTplXgBZn8ZAx5oWc7z83xvygrPeMVkW1hTHGm9MGAWPMMmNMqhtxRkJxF5UZY64xxmQZY4ZFMr5I\nKubvSJoxZo0xZq0xJjPCIUZMMX5Hko0xc4wxn+W0xU9cCDPsjDFvGGP2GmOChRxTsryZu01taX4I\nDbFsBpoDlYHPgA4XHXMbMDvn9XXAirLcM1p/itkWNwCX57zul8htke+4hcB7wHC343bx70VtYB3Q\nOOd9sttxu9gW44D/yW0H4ABQye3Yw9AWNwI/AIKX+L7EebOsPfPiLB4aBLwFYK1dCdQ2xqSU8b7R\nqMi2sNYut9YeyXm7Emgc4RgjpbiLyh4EpgH7IhlchBWnLcYA0621OwGstfsjHGOkFKctdgO1cl7X\nAg5Ya7MiGGNEWGuXAocKOaTE
ebOsybw4i4cKOiYek1hJF1LdBcwOa0TuKbItjDGNCP0iv5TzUbwW\nb4rz96INUNcYs8gYs9oYMzZi0UVWcdriNaCTMWYX8DnwUIRiizYlzptl3c+8uL+AF8+ZjMdf3GL/\nmYwxNwM/BXqELxxXFactngcet9ZaE9rsPl43vC9OW1QGuhGa6lsNWG6MWWGt/TKskUVecdri18Bn\n1to0Y0wrYL4xpou19liYY4tGJcqbZU3m3wBN8r1vQui/IIUd0zjns3hTnLYgp+j5GtDPWlvY/2bF\nsuK0xVXApJyHliQD/Y0x56y1MyMTYsQUpy12APuttaeAU8aYJUAXIN6SeXHaojvwDIC19itjzBag\nHbA6IhFGjxLnzbIOs6wG2hhjmhtjqgCjgIt/GWcCP4a8laOHrbV7y3jfaFRkWxhjmgIzAJ+1drML\nMUZKkW1hrW1prW1hrW1BaNz8vjhM5FC835F3gZ7GmIrGmGqECl7rIxxnJBSnLTYCfQFyxojbAV9H\nNMroUOK8Waaeub3E4iFjzL05379irZ1tjLnNGLMZOAHcWZZ7RqvitAXwJFAHeCmnR3rOWnutWzGH\nSzHbIiEU83dkozFmDhAAsoHXrLVxl8yL+ffi98D/GWM+J9TZ/C9r7UHXgg4TY8xEoBeQbIzZATxF\naLit1HlTi4ZEROKAHhsnIhIHlMxFROKAkrmISBxQMhcRiQNK5iIi5aQ4G2jlO/bPOZurrTHGfGGM\nKdO6E81mEREpJ8aYG4HjwL+stZ4SnPcA0NVa+7PS3ls9cxGRclLQBlrGmFbGmA9y9t1ZYoxpV8Cp\nY4CJZbl3WZfzi4hI4V4F7rXWbjbGXAf8g9A+PAAYY5oR2hZ4YVluomQuIhImxpgahJ5jMDVn1TdA\nlYsOSwem2jKOeSuZi4iETwVC+6oU9qSgUcDPy+NGIiISBtbao8AWY8yPAExI3uMijTHtgTrW2hVl\nvZeSuYhIOcnZQOtjoJ0xZocx5k7AC9xljPkMWEvoKUK5RlHGwmfevTU1UUQk9qlnLiISB5TMRUTi\ngJK5iEgcUDIXEYkDSuYiInFAyVxEJA4omYuIxAElcxGROPD/AU9i/B2CWxb8AAAAAElFTkSuQmCC\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x10c3d88d0>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Label the figure so it can be read on its own; the trailing `;` suppresses\n",
"# the text repr of the last call.\n",
"ax = res.plot()\n",
"ax.set_xlabel('rows in dataset')\n",
"ax.set_ylabel('wall time (s)')\n",
"ax.set_title('groupby-apply: pandas vs dask');"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
@mrocklin
Copy link

mrocklin commented Aug 3, 2015

Nice comparison.

If your data fits in memory then you should almost always just use Pandas. Full groupby-applies like df.groupby(...).apply(func) are hard to do in parallel and require a full dataset shuffle. Dask (or any parallel library) should perform about as well under groupby-reductions for standard reductions like df.groupby(...).col.mean().

@spott
Copy link

spott commented Feb 8, 2018

If I understand this correctly, you are comparing a pandas groupby to dask converting from pandas then doing a groupby.

Is this really a fair test?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment