Last active
January 29, 2016 10:13
-
-
Save miraculixx/b1fe001041610f30862a to your computer and use it in GitHub Desktop.
benchmarking groupwise apply and transform to set flag based on group aggregation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# http://stackoverflow.com/questions/35046725/pandas-set-value-in-groupby/35047316#35047316\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"\n", | |
"def init(N=1):\n", | |
" \"\"\" generate data \"\"\"\n", | |
" sample = ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c'] \n", | |
" sample = sample * max(int(N / len(sample)), 1)\n", | |
" sample_size = len(sample)\n", | |
" df = pd.DataFrame({\n", | |
" 'letters' : sample, \n", | |
" 'is_min' : np.zeros(sample_size),\n", | |
" 'numbers' : np.random.randn(sample_size)\n", | |
" })\n", | |
" return df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"100 loops, best of 3: 16.6 ms per loop\n", | |
"1 loops, best of 3: 4.32 s per loop\n", | |
"10 loops, best of 3: 21.6 ms per loop\n", | |
"10 loops, best of 3: 56.2 ms per loop\n", | |
"99999 samples\n" | |
] | |
} | |
], | |
"source": [ | |
"# various methods to achieve at the same result\n", | |
"\n", | |
"def idxfun(fun):\n", | |
" \"\"\" generalized transform(idxmax/min) \"\"\"\n", | |
" def inner(g):\n", | |
" return g[g == fun(g)].index\n", | |
" return inner\n", | |
" \n", | |
"def v1(df):\n", | |
" # fastest\n", | |
" def set_is_min(m):\n", | |
" df.loc[df.numbers == m, 'is_min'] = 1\n", | |
" mins = df.groupby('letters').numbers.min().apply(set_is_min)\n", | |
" \n", | |
"def v2(df):\n", | |
" # slowest\n", | |
" mins = df.groupby('letters').numbers.min()\n", | |
" df['is_min'] = df.apply(lambda r : 1 if mins[r.letters] == r.numbers else 0, axis=1)\n", | |
" \n", | |
"def v3(df):\n", | |
" # accepted answer\n", | |
" df.loc[df.groupby('letters')['numbers'].transform('idxmin').unique(), 'is_min']=1\n", | |
" \n", | |
"def v4(df):\n", | |
" # generalized accepted answer - faster than v2, almost 4x slower than fastest \n", | |
" df.loc[df.groupby('letters')['numbers'].transform(idxfun(min)), 'is_min']=1\n", | |
" \n", | |
"# note init() is called for every new timeit run to avoid caching effects\n", | |
"N = 100000\n", | |
"%timeit v1(init(N))\n", | |
"%timeit v2(init(N))\n", | |
"%timeit v3(init(N))\n", | |
"%timeit v4(init(N))\n", | |
"\n", | |
"print len(init(N).index), \"samples\"" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment