Last active
December 22, 2017 23:38
-
-
Save jiffyclub/63aec19aa75a672fcb58 to your computer and use it in GitHub Desktop.
Example of making a groupby generator for a pandas DataFrame when the groupby column is sorted.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:3acfd6ad2d4ae6e000fc66d6b111334a3d01ab0ae47f4adc4c183c04eabe0d72" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import string\n", | |
"\n", | |
"import numpy as np\n", | |
"import pandas as pd" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 23 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"n = 1000000\n", | |
"\n", | |
"# Seed the global RNG so the random frame is identical on every run.\n", | |
"np.random.seed(0)\n", | |
"\n", | |
"# string.ascii_lowercase exists on both Python 2 and 3; string.lowercase\n", | |
"# is Python 2 only.\n", | |
"df = pd.DataFrame({'alpha': np.random.choice(list(string.ascii_lowercase), n),\n", | |
"                   'num': np.random.random(n)})\n", | |
"\n", | |
"# DataFrame.sort was removed from pandas; sort_values is its replacement.\n", | |
"sorted_df = df.sort_values('alpha')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 36 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def sorted_groupby(df, groupby):\n", | |
"    \"\"\"Yield ``(key, sub_frame)`` pairs for runs of equal values in a column.\n", | |
"\n", | |
"    Walks ``df[groupby]`` once and starts a new group every time the value\n", | |
"    differs from the previous row, so rows sharing a key end up in one\n", | |
"    group only when they are adjacent (e.g. ``df`` is sorted on that column).\n", | |
"\n", | |
"    Parameters\n", | |
"    ----------\n", | |
"    df : pandas.DataFrame\n", | |
"        Frame to split into consecutive row slices.\n", | |
"    groupby : column label\n", | |
"        Column of ``df`` holding the group keys.\n", | |
"\n", | |
"    Yields\n", | |
"    ------\n", | |
"    tuple\n", | |
"        ``(key, df.iloc[start:stop])`` for each run of equal keys, in row\n", | |
"        order. An empty frame yields nothing.\n", | |
"\n", | |
"    \"\"\"\n", | |
"    keys = df[groupby]\n", | |
"    if len(keys) == 0:\n", | |
"        # Nothing to group; keys.iloc[0] below would raise IndexError.\n", | |
"        return\n", | |
"\n", | |
"    start = 0\n", | |
"    prev = keys.iloc[start]\n", | |
"    for i, x in enumerate(keys):\n", | |
"        if x != prev:\n", | |
"            # Key changed: emit the run that just ended, then start a new one.\n", | |
"            yield prev, df.iloc[start:i]\n", | |
"            prev = x\n", | |
"            start = i\n", | |
"    # need to send back the last group\n", | |
"    yield prev, df.iloc[start:]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 43 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Baseline: pandas groupby on the unsorted frame, collected into a dict.\n", | |
"%timeit {name: x for name, x in df.groupby('alpha', sort=False)}" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"10 loops, best of 3: 182 ms per loop\n" | |
] | |
} | |
], | |
"prompt_number": 44 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Same pandas groupby, but on the frame already sorted by 'alpha'.\n", | |
"%timeit {name: x for name, x in sorted_df.groupby('alpha', sort=False)}" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"10 loops, best of 3: 138 ms per loop\n" | |
] | |
} | |
], | |
"prompt_number": 50 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# The sorted_groupby generator on the frame already sorted by 'alpha'.\n", | |
"%timeit {name: x for name, x in sorted_groupby(sorted_df, 'alpha')}" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"10 loops, best of 3: 72.3 ms per loop\n" | |
] | |
} | |
], | |
"prompt_number": 45 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def groupby_to_dicts(gb):\n", | |
"    \"\"\"Map each group name to the ``to_dict()`` form of its frame.\n", | |
"\n", | |
"    ``gb`` is any iterable of ``(name, frame)`` pairs, such as a pandas\n", | |
"    groupby object or a generator yielding such pairs.\n", | |
"    \"\"\"\n", | |
"    converted = {}\n", | |
"    for name, group in gb:\n", | |
"        converted[name] = group.to_dict()\n", | |
"    return converted" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 47 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Both approaches must produce the same groups with the same contents.\n", | |
"assert (groupby_to_dicts(df.groupby('alpha', sort=False)) ==\n", | |
"        groupby_to_dicts(sorted_groupby(sorted_df, 'alpha')))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 49 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment