Last active
December 15, 2021 16:36
-
-
Save shoyer/f538ac78ae904c936844 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"version": "0.3.2", | |
"views": {}, | |
"default_view": {}, | |
"name": "numpy groupby.ipynb", | |
"provenance": [] | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "GeA2Av_6vApm", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
}, | |
"output_extras": [] | |
}, | |
"cellView": "both", | |
"executionInfo": { | |
"status": "ok", | |
"timestamp": 1458422405757, | |
"user_tz": 420, | |
"elapsed": 400, | |
"user": { | |
"sessionId": "234853c9abaa862c", | |
"userId": "100105766565685654482", | |
"permissionId": "01386112912994523038", | |
"displayName": "Stephan Hoyer", | |
"color": "#1FA15D", | |
"isMe": true, | |
"isAnonymous": false, | |
"photoUrl": "//lh4.googleusercontent.com/-bwQVXpRw0z8/AAAAAAAAAAI/AAAAAAAAACQ/obT9z9YnNnc/s50-c-k-no/photo.jpg" | |
} | |
}, | |
"outputId": "0a2ee5bb-f5bc-43c4-ebcb-7af22177196c" | |
}, | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd" | |
], | |
"outputs": [], | |
"execution_count": 0 | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "_vzKe7WZvF7m", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
}, | |
"output_extras": [] | |
}, | |
"cellView": "both", | |
"executionInfo": { | |
"status": "ok", | |
"timestamp": 1458425448893, | |
"user_tz": 420, | |
"elapsed": 355, | |
"user": { | |
"sessionId": "63f82df4d7686427", | |
"userId": "100105766565685654482", | |
"permissionId": "01386112912994523038", | |
"displayName": "Stephan Hoyer", | |
"color": "#1FA15D", | |
"isMe": true, | |
"isAnonymous": false, | |
"photoUrl": "//lh4.googleusercontent.com/-bwQVXpRw0z8/AAAAAAAAAAI/AAAAAAAAACQ/obT9z9YnNnc/s50-c-k-no/photo.jpg" | |
} | |
}, | |
"outputId": "3215e6e3-bd6b-4535-9f1e-e30ca933d01c" | |
}, | |
"source": [ | |
"def grouped_sum(array, groups, axis=0, issorted=False):\n", | |
"    \"\"\"Sum the entries of `array` that share a label in `groups`.\n", | |
"\n", | |
"    Args:\n", | |
"        array: array-like holding the values to sum.\n", | |
"        groups: 1-D array-like of labels, one per position of `array`\n", | |
"            along `axis`.\n", | |
"        axis: axis of `array` that `groups` labels; the sums are taken\n", | |
"            along it.\n", | |
"        issorted: pass True when `groups` is already sorted, which skips\n", | |
"            the argsort and the reordering of `array`. Each run of equal\n", | |
"            neighbouring labels is summed as one group, so unsorted labels\n", | |
"            passed with this flag yield one entry per run.\n", | |
"\n", | |
"    Returns:\n", | |
"        A tuple `(uniques, result)`. `uniques` holds one label per group,\n", | |
"        in the order of the sorted labels. `result` holds the matching\n", | |
"        sums: it has the shape of `array`, except that `axis` has one\n", | |
"        entry per group.\n", | |
"\n", | |
"    Raises:\n", | |
"        ValueError: if `groups` is not 1-D, or if its length differs from\n", | |
"            the length of `array` along `axis`.\n", | |
"    \"\"\"\n", | |
"    array = np.asarray(array)\n", | |
"    groups = np.asarray(groups)\n", | |
"\n", | |
"    if groups.ndim != 1:\n", | |
"        raise ValueError(\n", | |
"            'groups must be 1-dimensional, got %d dimensions' % groups.ndim)\n", | |
"    if array.shape[axis] != groups.shape[0]:\n", | |
"        raise ValueError(\n", | |
"            'groups has length %d but array has length %d along axis %d'\n", | |
"            % (groups.shape[0], array.shape[axis], axis))\n", | |
"\n", | |
"    if groups.shape[0] == 0:\n", | |
"        # Nothing to group: the run detection below needs at least one\n", | |
"        # label, so hand back empty outputs directly.\n", | |
"        return groups.copy(), array.copy()\n", | |
"\n", | |
"    if issorted:\n", | |
"        sorted_groups = groups\n", | |
"        ordered_array = array\n", | |
"    else:\n", | |
"        perm = groups.argsort()\n", | |
"        sorted_groups = groups[perm]\n", | |
"        # Reorder along the grouped axis, not always along axis 0.\n", | |
"        ordered_array = np.take(array, perm, axis=axis)\n", | |
"\n", | |
"    # True at every position where a new run of equal labels begins.\n", | |
"    is_start = np.concatenate(\n", | |
"        ([True], sorted_groups[1:] != sorted_groups[:-1]))\n", | |
"    uniques = sorted_groups[is_start]\n", | |
"    starts, = is_start.nonzero()\n", | |
"\n", | |
"    # reduceat sums each slice starts[i]:starts[i + 1] along `axis`; the\n", | |
"    # last slice runs to the end of the axis.\n", | |
"    result = np.add.reduceat(ordered_array, starts, axis=axis)\n", | |
"\n", | |
"    return uniques, result\n" | |
], | |
"outputs": [], | |
"execution_count": 0 | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "wKpvyIyPvyjP", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
}, | |
"output_extras": [] | |
}, | |
"cellView": "both", | |
"executionInfo": { | |
"status": "ok", | |
"timestamp": 1458425348130, | |
"user_tz": 420, | |
"elapsed": 1699, | |
"user": { | |
"sessionId": "63f82df4d7686427", | |
"userId": "100105766565685654482", | |
"permissionId": "01386112912994523038", | |
"displayName": "Stephan Hoyer", | |
"color": "#1FA15D", | |
"isMe": true, | |
"isAnonymous": false, | |
"photoUrl": "//lh4.googleusercontent.com/-bwQVXpRw0z8/AAAAAAAAAAI/AAAAAAAAACQ/obT9z9YnNnc/s50-c-k-no/photo.jpg" | |
} | |
}, | |
"outputId": "0546a9ba-911d-4cef-c2e0-7cbca265d21c" | |
}, | |
"source": [ | |
"# Benchmark inputs: ten million random values, each tagged with one of ten\n", | |
"# integer labels. Both generators are seeded so reruns see the same data.\n", | |
"x = np.random.RandomState(seed=0).randn(10 ** 7)\n", | |
"y = np.random.RandomState(seed=2).randint(10, size=10 ** 7)\n", | |
"df = pd.DataFrame(dict(x=x, y=y))\n", | |
"\n", | |
"# The same values paired with the labels in sorted order, used to time the\n", | |
"# already-sorted case.\n", | |
"y_sorted = y.copy()\n", | |
"y_sorted.sort()\n", | |
"df_sorted = pd.DataFrame(dict(x=x, y=y_sorted))\n" | |
], | |
"outputs": [], | |
"execution_count": 0 | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "PRl66Bnt1lmM", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
}, | |
"output_extras": [ | |
{ | |
"item_id": 1 | |
} | |
] | |
}, | |
"cellView": "both", | |
"executionInfo": { | |
"status": "ok", | |
"timestamp": 1458425499514, | |
"user_tz": 420, | |
"elapsed": 509, | |
"user": { | |
"sessionId": "63f82df4d7686427", | |
"userId": "100105766565685654482", | |
"permissionId": "01386112912994523038", | |
"displayName": "Stephan Hoyer", | |
"color": "#1FA15D", | |
"isMe": true, | |
"isAnonymous": false, | |
"photoUrl": "//lh4.googleusercontent.com/-bwQVXpRw0z8/AAAAAAAAAAI/AAAAAAAAACQ/obT9z9YnNnc/s50-c-k-no/photo.jpg" | |
} | |
}, | |
"outputId": "eafcd3e7-ac1c-4dc1-abc0-6353fe0d521e" | |
}, | |
"source": [ | |
"df.groupby('y').x.sum()" | |
], | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 152, | |
"metadata": {}, | |
"data": { | |
"text/plain": [ | |
"y\n", | |
"0 325.395301\n", | |
"1 116.311628\n", | |
"2 -360.622610\n", | |
"3 342.183991\n", | |
"4 -200.706264\n", | |
"5 964.375425\n", | |
"6 170.656262\n", | |
"7 1651.389744\n", | |
"8 -715.373282\n", | |
"9 734.414114\n", | |
"Name: x, dtype: float64" | |
] | |
} | |
} | |
], | |
"execution_count": 152 | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "coCqa9w11nOM", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
}, | |
"output_extras": [ | |
{ | |
"item_id": 1 | |
} | |
] | |
}, | |
"cellView": "both", | |
"executionInfo": { | |
"status": "ok", | |
"timestamp": 1458425501411, | |
"user_tz": 420, | |
"elapsed": 1421, | |
"user": { | |
"sessionId": "63f82df4d7686427", | |
"userId": "100105766565685654482", | |
"permissionId": "01386112912994523038", | |
"displayName": "Stephan Hoyer", | |
"color": "#1FA15D", | |
"isMe": true, | |
"isAnonymous": false, | |
"photoUrl": "//lh4.googleusercontent.com/-bwQVXpRw0z8/AAAAAAAAAAI/AAAAAAAAACQ/obT9z9YnNnc/s50-c-k-no/photo.jpg" | |
} | |
}, | |
"outputId": "ef2fa8e8-539f-4d0a-c1f5-b74f5163f928" | |
}, | |
"source": [ | |
"grouped_sum(x, y)" | |
], | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 153, | |
"metadata": {}, | |
"data": { | |
"text/plain": [ | |
"(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),\n", | |
" array([ 325.39530127, 116.31162771, -360.62260997, 342.18399102,\n", | |
" -200.70626376, 964.37542492, 170.65626202, 1651.38974376,\n", | |
" -715.37328207, 734.41411426]))" | |
] | |
} | |
} | |
], | |
"execution_count": 153 | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Yekd0qXC0_R7", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
}, | |
"output_extras": [ | |
{ | |
"item_id": 1 | |
} | |
] | |
}, | |
"cellView": "both", | |
"executionInfo": { | |
"status": "ok", | |
"timestamp": 1458425465618, | |
"user_tz": 420, | |
"elapsed": 13683, | |
"user": { | |
"sessionId": "63f82df4d7686427", | |
"userId": "100105766565685654482", | |
"permissionId": "01386112912994523038", | |
"displayName": "Stephan Hoyer", | |
"color": "#1FA15D", | |
"isMe": true, | |
"isAnonymous": false, | |
"photoUrl": "//lh4.googleusercontent.com/-bwQVXpRw0z8/AAAAAAAAAAI/AAAAAAAAACQ/obT9z9YnNnc/s50-c-k-no/photo.jpg" | |
} | |
}, | |
"outputId": "2e2a5999-0d6e-4776-ce06-51466a302589" | |
}, | |
"source": [ | |
"s1 = df.groupby('y').x.sum()\n", | |
"s2 = pd.Series(*grouped_sum(x, y)[::-1])\n", | |
"# Check labels and sums separately: subtracting the two Series would align\n", | |
"# them on the index, and `.mean()` would then skip the NaNs produced by any\n", | |
"# mismatched label. The tolerance is relative because rounding differences\n", | |
"# between the two summation orders grow with the size of the sums.\n", | |
"np.testing.assert_array_equal(s2.index.values, s1.index.values)\n", | |
"np.testing.assert_allclose(s2.values, s1.values, rtol=1e-9, atol=1e-9)\n", | |
"\n", | |
"%timeit df.groupby('y').x.sum()\n", | |
"%timeit grouped_sum(x, y)" | |
], | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"10 loops, best of 3: 189 ms per loop\n", | |
"1 loops, best of 3: 1.04 s per loop\n" | |
], | |
"name": "stdout" | |
} | |
], | |
"execution_count": 0 | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "xLYdnim266Cx", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
}, | |
"output_extras": [ | |
{ | |
"item_id": 2 | |
} | |
] | |
}, | |
"cellView": "both", | |
"executionInfo": { | |
"status": "ok", | |
"timestamp": 1458425489012, | |
"user_tz": 420, | |
"elapsed": 11564, | |
"user": { | |
"sessionId": "63f82df4d7686427", | |
"userId": "100105766565685654482", | |
"permissionId": "01386112912994523038", | |
"displayName": "Stephan Hoyer", | |
"color": "#1FA15D", | |
"isMe": true, | |
"isAnonymous": false, | |
"photoUrl": "//lh4.googleusercontent.com/-bwQVXpRw0z8/AAAAAAAAAAI/AAAAAAAAACQ/obT9z9YnNnc/s50-c-k-no/photo.jpg" | |
} | |
}, | |
"outputId": "94bd9ec1-4f51-44d8-fc26-160592ed4d03" | |
}, | |
"source": [ | |
"s1 = df_sorted.groupby('y').x.sum()\n", | |
"s2 = pd.Series(*grouped_sum(x, y_sorted)[::-1])\n", | |
"s3 = pd.Series(*grouped_sum(x, y_sorted, issorted=True)[::-1])\n", | |
"# Check labels and sums separately: subtracting the Series would align them\n", | |
"# on the index, and `.mean()` would then skip the NaNs produced by any\n", | |
"# mismatched label. The tolerance is relative because rounding differences\n", | |
"# between summation orders grow with the size of the sums.\n", | |
"for candidate in (s2, s3):\n", | |
"    np.testing.assert_array_equal(candidate.index.values, s1.index.values)\n", | |
"    np.testing.assert_allclose(\n", | |
"        candidate.values, s1.values, rtol=1e-9, atol=1e-9)\n", | |
"\n", | |
"%timeit df_sorted.groupby('y').x.sum()\n", | |
"%timeit grouped_sum(x, y_sorted)\n", | |
"%timeit grouped_sum(x, y_sorted, issorted=True)" | |
], | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"10 loops, best of 3: 191 ms per loop\n", | |
"1 loops, best of 3: 370 ms per loop\n", | |
"10 loops, best of 3: 28.2 ms per loop\n" | |
], | |
"name": "stdout" | |
} | |
], | |
"execution_count": 0 | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "vZ8qa6BRzA27", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
} | |
}, | |
"cellView": "both" | |
}, | |
"source": [ | |
"" | |
], | |
"outputs": [], | |
"execution_count": 0 | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment