Skip to content

Instantly share code, notes, and snippets.

@jiffyclub
Last active December 22, 2017 23:38
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jiffyclub/63aec19aa75a672fcb58 to your computer and use it in GitHub Desktop.
Save jiffyclub/63aec19aa75a672fcb58 to your computer and use it in GitHub Desktop.
Example of making a groupby generator for a pandas DataFrame when the groupby column is sorted.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:3acfd6ad2d4ae6e000fc66d6b111334a3d01ab0ae47f4adc4c183c04eabe0d72"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import string\n",
"\n",
"import numpy as np\n",
"import pandas as pd"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 23
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Number of rows in the synthetic benchmark frame.\n",
"n = 1000000\n",
"# Seeded local generator: the same data is produced on every run without\n",
"# touching NumPy's global random state.\n",
"rng = np.random.RandomState(0)\n",
"# string.ascii_lowercase replaces the Python 2-only string.lowercase.\n",
"df = pd.DataFrame({'alpha': rng.choice(list(string.ascii_lowercase), n),\n",
"                   'num': rng.random_sample(n)})\n",
"# DataFrame.sort no longer exists in pandas; sort_values is its replacement.\n",
"sorted_df = df.sort_values('alpha')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 36
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def sorted_groupby(df, groupby):\n",
"    \"\"\"Yield ``(key, sub_frame)`` pairs from a frame sorted by ``groupby``.\n",
"\n",
"    A new group starts wherever the value in column ``groupby`` differs from\n",
"    the value on the previous row, so rows with equal keys must be contiguous;\n",
"    a key that reappears later is yielded again as a separate group. Values\n",
"    that compare unequal to themselves (e.g. NaN) each start a new group.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Frame whose rows are already ordered by ``groupby``.\n",
"    groupby : column label\n",
"        Column holding the group keys.\n",
"\n",
"    Yields\n",
"    ------\n",
"    (key, pandas.DataFrame)\n",
"        The group key and the ``df.iloc`` slice of consecutive rows sharing it.\n",
"        An empty ``df`` yields nothing.\n",
"    \"\"\"\n",
"    keys = df[groupby]\n",
"    # Without this guard an empty frame fails on ``iloc[0]`` below.\n",
"    if len(keys) == 0:\n",
"        return\n",
"    start = 0\n",
"    prev = keys.iloc[start]\n",
"    for i, x in enumerate(keys):\n",
"        if x != prev:\n",
"            # Key changed: rows start..i-1 form the finished group.\n",
"            yield prev, df.iloc[start:i]\n",
"            prev = x\n",
"            start = i\n",
"    # The loop only emits a group when the key changes, so the final run of\n",
"    # rows still has to be sent back here.\n",
"    yield prev, df.iloc[start:]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 43
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Baseline: pandas' own groupby on the unsorted frame, collected into a dict.\n",
"%timeit {name: x for name, x in df.groupby('alpha', sort=False)}"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"10 loops, best of 3: 182 ms per loop\n"
]
}
],
"prompt_number": 44
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Same pandas groupby, but on the frame that is already sorted by 'alpha'.\n",
"%timeit {name: x for name, x in sorted_df.groupby('alpha', sort=False)}"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"10 loops, best of 3: 138 ms per loop\n"
]
}
],
"prompt_number": 50
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Hand-rolled generator, which relies on 'alpha' being sorted in sorted_df.\n",
"%timeit {name: x for name, x in sorted_groupby(sorted_df, 'alpha')}"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"10 loops, best of 3: 72.3 ms per loop\n"
]
}
],
"prompt_number": 45
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def groupby_to_dicts(gb):\n",
"    \"\"\"Map each group name in ``gb`` to its frame's ``to_dict()`` result.\n",
"\n",
"    ``gb`` is any iterable of ``(name, frame)`` pairs. If a name occurs more\n",
"    than once, the last pair wins.\n",
"    \"\"\"\n",
"    as_dicts = {}\n",
"    for key, group in gb:\n",
"        as_dicts[key] = group.to_dict()\n",
"    return as_dicts"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 47
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# The generator over the sorted frame must produce exactly the groups that\n",
"# pandas' groupby produces for the unsorted frame.\n",
"assert (groupby_to_dicts(df.groupby('alpha', sort=False)) ==\n",
"        groupby_to_dicts(sorted_groupby(sorted_df, 'alpha')))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 49
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment