Skip to content

Instantly share code, notes, and snippets.

@miraculixx
Last active January 29, 2016 10:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save miraculixx/b1fe001041610f30862a to your computer and use it in GitHub Desktop.
Save miraculixx/b1fe001041610f30862a to your computer and use it in GitHub Desktop.
benchmarking groupwise apply and transform to set flag based on group aggregation
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# http://stackoverflow.com/questions/35046725/pandas-set-value-in-groupby/35047316#35047316\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"def init(N=1, seed=None):\n",
"    \"\"\"Generate a benchmark frame of grouped random numbers.\n",
"\n",
"    The nine-letter pattern a,a,a,b,b,b,c,c,c is repeated N // 9 times\n",
"    (at least once), so the frame always has a multiple of nine rows and\n",
"    may be slightly shorter than N.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    N : int\n",
"        Approximate number of rows wanted.\n",
"    seed : int or None\n",
"        When given, 'numbers' is drawn from a private RandomState seeded\n",
"        with this value, so the frame is reproducible. When None, the\n",
"        global numpy random state is used.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        Columns 'letters' (group key), 'is_min' (all zeros) and 'numbers'\n",
"        (draws from randn).\n",
"    \"\"\"\n",
"    pattern = ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c']\n",
"    # floor division gives the same repeat count on Python 2 and Python 3\n",
"    sample = pattern * max(int(N // len(pattern)), 1)\n",
"    sample_size = len(sample)\n",
"    # only switch to a private generator when a seed is requested\n",
"    rng = np.random if seed is None else np.random.RandomState(seed)\n",
"    df = pd.DataFrame({\n",
"        'letters': sample,\n",
"        'is_min': np.zeros(sample_size),\n",
"        'numbers': rng.randn(sample_size)\n",
"    })\n",
"    return df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100 loops, best of 3: 16.6 ms per loop\n",
"1 loops, best of 3: 4.32 s per loop\n",
"10 loops, best of 3: 21.6 ms per loop\n",
"10 loops, best of 3: 56.2 ms per loop\n",
"99999 samples\n"
]
}
],
"source": [
"# various methods to arrive at the same result\n",
"\n",
"def idxfun(fun):\n",
"    \"\"\"Build a callable that finds where fun(group) occurs in a group.\n",
"\n",
"    fun is a reducer such as min or max. The returned callable takes a\n",
"    Series and gives back the index labels of every entry equal to\n",
"    fun(series), i.e. an idxmin/idxmax lookalike for any reducer.\n",
"    \"\"\"\n",
"    def locate(group):\n",
"        target = fun(group)\n",
"        hits = group == target\n",
"        return group[hits].index\n",
"    return locate\n",
" \n",
"def v1(df):\n",
"    \"\"\"Set is_min to 1 on rows holding the minimum of their own group.\n",
"\n",
"    df is modified in place and nothing is returned. Each row is compared\n",
"    with the minimum of its own 'letters' group, so a value that merely\n",
"    equals another group's minimum is not flagged. Every row tied for its\n",
"    group's minimum is flagged.\n",
"    \"\"\"\n",
"    # was the fastest variant in the recorded run; re-time after this change\n",
"    group_min = df.groupby('letters').numbers.transform('min')\n",
"    df.loc[df.numbers == group_min, 'is_min'] = 1\n",
" \n",
"def v2(df):\n",
"    \"\"\"Replace is_min with a 1/0 flag computed one row at a time.\n",
"\n",
"    The per-group minima are computed once, then every row looks up the\n",
"    minimum of its own letter and compares its number against it. df is\n",
"    modified in place.\n",
"    \"\"\"\n",
"    # slowest\n",
"    group_min = df.groupby('letters').numbers.min()\n",
"\n",
"    def flag(row):\n",
"        if group_min[row.letters] == row.numbers:\n",
"            return 1\n",
"        return 0\n",
"\n",
"    df['is_min'] = df.apply(flag, axis=1)\n",
" \n",
"def v3(df):\n",
"    \"\"\"Set is_min to 1 on the row that idxmin reports for each group.\n",
"\n",
"    transform('idxmin') broadcasts each group's idxmin label to all of its\n",
"    rows; the distinct labels are then used to write the flag through\n",
"    .loc. df is modified in place.\n",
"    \"\"\"\n",
"    # accepted answer\n",
"    numbers_by_letter = df.groupby('letters')['numbers']\n",
"    min_labels = numbers_by_letter.transform('idxmin').unique()\n",
"    df.loc[min_labels, 'is_min'] = 1\n",
" \n",
"def v4(df):\n",
"    \"\"\"Set is_min to 1 using the generic idxfun(min) locator per group.\n",
"\n",
"    Same idea as v3, but the labels come from idxfun(min) passed to\n",
"    transform instead of the built-in 'idxmin'. df is modified in place.\n",
"    \"\"\"\n",
"    # generalized accepted answer - faster than v2, almost 4x slower than fastest\n",
"    find_min = idxfun(min)\n",
"    min_labels = df.groupby('letters')['numbers'].transform(find_min)\n",
"    df.loc[min_labels, 'is_min'] = 1\n",
" \n",
"# note init(N) is rebuilt inside every timed call: v1-v4 write to the frame\n",
"# they receive, so a fresh frame keeps one run from seeing another's flags.\n",
"# Each timing below therefore includes the cost of init(N) as well.\n",
"N = 100000\n",
"%timeit v1(init(N))\n",
"%timeit v2(init(N))\n",
"%timeit v3(init(N))\n",
"%timeit v4(init(N))\n",
"\n",
"# single-argument print(...) gives the same text on Python 2 and Python 3\n",
"print('%d samples' % len(init(N).index))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment