Skip to content

Instantly share code, notes, and snippets.

@shoyer
Last active December 15, 2021 16:36
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shoyer/f538ac78ae904c936844 to your computer and use it in GitHub Desktop.
Save shoyer/f538ac78ae904c936844 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"version": "0.3.2",
"views": {},
"default_view": {},
"name": "numpy groupby.ipynb",
"provenance": []
}
},
"cells": [
{
"cell_type": "code",
"metadata": {
"id": "GeA2Av_6vApm",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": []
},
"cellView": "both",
"executionInfo": {
"status": "ok",
"timestamp": 1458422405757,
"user_tz": 420,
"elapsed": 400,
"user": {
"sessionId": "234853c9abaa862c",
"userId": "100105766565685654482",
"permissionId": "01386112912994523038",
"displayName": "Stephan Hoyer",
"color": "#1FA15D",
"isMe": true,
"isAnonymous": false,
"photoUrl": "//lh4.googleusercontent.com/-bwQVXpRw0z8/AAAAAAAAAAI/AAAAAAAAACQ/obT9z9YnNnc/s50-c-k-no/photo.jpg"
}
},
"outputId": "0a2ee5bb-f5bc-43c4-ebcb-7af22177196c"
},
"source": [
"import numpy as np\n",
"import pandas as pd"
],
"outputs": [],
"execution_count": 0
},
{
"cell_type": "code",
"metadata": {
"id": "_vzKe7WZvF7m",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": []
},
"cellView": "both",
"executionInfo": {
"status": "ok",
"timestamp": 1458425448893,
"user_tz": 420,
"elapsed": 355,
"user": {
"sessionId": "63f82df4d7686427",
"userId": "100105766565685654482",
"permissionId": "01386112912994523038",
"displayName": "Stephan Hoyer",
"color": "#1FA15D",
"isMe": true,
"isAnonymous": false,
"photoUrl": "//lh4.googleusercontent.com/-bwQVXpRw0z8/AAAAAAAAAAI/AAAAAAAAACQ/obT9z9YnNnc/s50-c-k-no/photo.jpg"
}
},
"outputId": "3215e6e3-bd6b-4535-9f1e-e30ca933d01c"
},
"source": [
"def grouped_sum(array, groups, axis=0, issorted=False):\n",
"    \"\"\"Sum the slices of ``array`` along ``axis`` that share a group label.\n",
"\n",
"    Args:\n",
"        array: array-like holding the values to sum.\n",
"        groups: one-dimensional array-like of labels, one label per position\n",
"            of ``array`` along ``axis``.\n",
"        axis: axis of ``array`` that ``groups`` labels (default 0).\n",
"        issorted: if True, ``groups`` is trusted to already have equal labels\n",
"            adjacent, and the argsort/reorder step is skipped.\n",
"\n",
"    Returns:\n",
"        A ``(uniques, result)`` tuple: the label of each run of equal labels\n",
"        (after sorting, unless ``issorted`` is True) and the matching sums,\n",
"        with ``result.shape[axis] == len(uniques)``.\n",
"\n",
"    Raises:\n",
"        ValueError: if ``groups`` is not one-dimensional or its length does\n",
"            not match ``array`` along ``axis``.\n",
"    \"\"\"\n",
"    array = np.asarray(array)\n",
"    groups = np.asarray(groups)\n",
"\n",
"    if groups.ndim != 1:\n",
"        raise ValueError('groups must be one-dimensional')\n",
"    if array.ndim == 0 or array.shape[axis] != groups.shape[0]:\n",
"        # A silent mismatch would fold unlabeled trailing values into the last\n",
"        # group (issorted=True) or drop/overrun values (issorted=False).\n",
"        raise ValueError(\n",
"            'groups must have one label per element of array along axis')\n",
"\n",
"    if groups.size == 0:\n",
"        # Without this guard the run-start mask below would hold one True for\n",
"        # zero labels and the boolean indexing would raise IndexError.\n",
"        return groups.copy(), np.zeros(array.shape, dtype=array.dtype)\n",
"\n",
"    if issorted:\n",
"        aux = groups\n",
"        ordered_array = array\n",
"    else:\n",
"        perm = groups.argsort()\n",
"        aux = groups[perm]\n",
"        # np.take reorders along the requested axis; plain array[perm] would\n",
"        # always reorder axis 0 regardless of ``axis``.\n",
"        ordered_array = np.take(array, perm, axis=axis)\n",
"\n",
"    # True at the first position of every run of equal labels.\n",
"    flag = np.concatenate(([True], aux[1:] != aux[:-1]))\n",
"    uniques = aux[flag]\n",
"    inv_idx, = flag.nonzero()\n",
"\n",
"    # reduceat sums each [start, next_start) segment along ``axis``.\n",
"    result = np.add.reduceat(ordered_array, inv_idx, axis=axis)\n",
"\n",
"    return uniques, result\n"
],
"outputs": [],
"execution_count": 0
},
{
"cell_type": "code",
"metadata": {
"id": "wKpvyIyPvyjP",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": []
},
"cellView": "both",
"executionInfo": {
"status": "ok",
"timestamp": 1458425348130,
"user_tz": 420,
"elapsed": 1699,
"user": {
"sessionId": "63f82df4d7686427",
"userId": "100105766565685654482",
"permissionId": "01386112912994523038",
"displayName": "Stephan Hoyer",
"color": "#1FA15D",
"isMe": true,
"isAnonymous": false,
"photoUrl": "//lh4.googleusercontent.com/-bwQVXpRw0z8/AAAAAAAAAAI/AAAAAAAAACQ/obT9z9YnNnc/s50-c-k-no/photo.jpg"
}
},
"outputId": "0546a9ba-911d-4cef-c2e0-7cbca265d21c"
},
"source": [
"# Benchmark inputs: ten million normal draws with integer labels in [0, 10).\n",
"n_samples = int(1e7)\n",
"x = np.random.RandomState(0).randn(n_samples)\n",
"y = np.random.RandomState(2).randint(10, size=n_samples)\n",
"y_sorted = np.sort(y)\n",
"\n",
"# df_sorted pairs the same x values with the sorted labels.\n",
"df = pd.DataFrame({'x': x, 'y': y})\n",
"df_sorted = pd.DataFrame({'x': x, 'y': y_sorted})\n"
],
"outputs": [],
"execution_count": 0
},
{
"cell_type": "code",
"metadata": {
"id": "PRl66Bnt1lmM",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{
"item_id": 1
}
]
},
"cellView": "both",
"executionInfo": {
"status": "ok",
"timestamp": 1458425499514,
"user_tz": 420,
"elapsed": 509,
"user": {
"sessionId": "63f82df4d7686427",
"userId": "100105766565685654482",
"permissionId": "01386112912994523038",
"displayName": "Stephan Hoyer",
"color": "#1FA15D",
"isMe": true,
"isAnonymous": false,
"photoUrl": "//lh4.googleusercontent.com/-bwQVXpRw0z8/AAAAAAAAAAI/AAAAAAAAACQ/obT9z9YnNnc/s50-c-k-no/photo.jpg"
}
},
"outputId": "eafcd3e7-ac1c-4dc1-abc0-6353fe0d521e"
},
"source": [
"# Reference result: per-label sums of x computed by pandas.\n",
"df.groupby('y')['x'].sum()"
],
"outputs": [
{
"output_type": "execute_result",
"execution_count": 152,
"metadata": {},
"data": {
"text/plain": [
"y\n",
"0 325.395301\n",
"1 116.311628\n",
"2 -360.622610\n",
"3 342.183991\n",
"4 -200.706264\n",
"5 964.375425\n",
"6 170.656262\n",
"7 1651.389744\n",
"8 -715.373282\n",
"9 734.414114\n",
"Name: x, dtype: float64"
]
}
}
],
"execution_count": 152
},
{
"cell_type": "code",
"metadata": {
"id": "coCqa9w11nOM",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{
"item_id": 1
}
]
},
"cellView": "both",
"executionInfo": {
"status": "ok",
"timestamp": 1458425501411,
"user_tz": 420,
"elapsed": 1421,
"user": {
"sessionId": "63f82df4d7686427",
"userId": "100105766565685654482",
"permissionId": "01386112912994523038",
"displayName": "Stephan Hoyer",
"color": "#1FA15D",
"isMe": true,
"isAnonymous": false,
"photoUrl": "//lh4.googleusercontent.com/-bwQVXpRw0z8/AAAAAAAAAAI/AAAAAAAAACQ/obT9z9YnNnc/s50-c-k-no/photo.jpg"
}
},
"outputId": "ef2fa8e8-539f-4d0a-c1f5-b74f5163f928"
},
"source": [
"# Same aggregation through the NumPy implementation.\n",
"grouped_sum(array=x, groups=y)"
],
"outputs": [
{
"output_type": "execute_result",
"execution_count": 153,
"metadata": {},
"data": {
"text/plain": [
"(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),\n",
" array([ 325.39530127, 116.31162771, -360.62260997, 342.18399102,\n",
" -200.70626376, 964.37542492, 170.65626202, 1651.38974376,\n",
" -715.37328207, 734.41411426]))"
]
}
}
],
"execution_count": 153
},
{
"cell_type": "code",
"metadata": {
"id": "Yekd0qXC0_R7",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{
"item_id": 1
}
]
},
"cellView": "both",
"executionInfo": {
"status": "ok",
"timestamp": 1458425465618,
"user_tz": 420,
"elapsed": 13683,
"user": {
"sessionId": "63f82df4d7686427",
"userId": "100105766565685654482",
"permissionId": "01386112912994523038",
"displayName": "Stephan Hoyer",
"color": "#1FA15D",
"isMe": true,
"isAnonymous": false,
"photoUrl": "//lh4.googleusercontent.com/-bwQVXpRw0z8/AAAAAAAAAAI/AAAAAAAAACQ/obT9z9YnNnc/s50-c-k-no/photo.jpg"
}
},
"outputId": "2e2a5999-0d6e-4776-ce06-51466a302589"
},
"source": [
"s1 = df.groupby('y').x.sum()\n",
"uniques, sums = grouped_sum(x, y)\n",
"s2 = pd.Series(sums, index=uniques)\n",
"# Subtracting two Series aligns on the index and yields NaN for unmatched\n",
"# labels, which .mean() skips; compare labels and values explicitly so a\n",
"# label mismatch cannot pass unnoticed.\n",
"np.testing.assert_array_equal(s2.index.values, s1.index.values)\n",
"np.testing.assert_allclose(s2.values, s1.values, rtol=1e-9, atol=1e-9)\n",
"\n",
"%timeit df.groupby('y').x.sum()\n",
"%timeit grouped_sum(x, y)"
],
"outputs": [
{
"output_type": "stream",
"text": [
"10 loops, best of 3: 189 ms per loop\n",
"1 loops, best of 3: 1.04 s per loop\n"
],
"name": "stdout"
}
],
"execution_count": 0
},
{
"cell_type": "code",
"metadata": {
"id": "xLYdnim266Cx",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{
"item_id": 2
}
]
},
"cellView": "both",
"executionInfo": {
"status": "ok",
"timestamp": 1458425489012,
"user_tz": 420,
"elapsed": 11564,
"user": {
"sessionId": "63f82df4d7686427",
"userId": "100105766565685654482",
"permissionId": "01386112912994523038",
"displayName": "Stephan Hoyer",
"color": "#1FA15D",
"isMe": true,
"isAnonymous": false,
"photoUrl": "//lh4.googleusercontent.com/-bwQVXpRw0z8/AAAAAAAAAAI/AAAAAAAAACQ/obT9z9YnNnc/s50-c-k-no/photo.jpg"
}
},
"outputId": "94bd9ec1-4f51-44d8-fc26-160592ed4d03"
},
"source": [
"s1 = df_sorted.groupby('y').x.sum()\n",
"s2 = pd.Series(*grouped_sum(x, y_sorted)[::-1])\n",
"s3 = pd.Series(*grouped_sum(x, y_sorted, issorted=True)[::-1])\n",
"# Subtracting two Series aligns on the index and yields NaN for unmatched\n",
"# labels, which .mean() skips; compare labels and values explicitly so a\n",
"# label mismatch cannot pass unnoticed.\n",
"for candidate in (s2, s3):\n",
"    np.testing.assert_array_equal(candidate.index.values, s1.index.values)\n",
"    np.testing.assert_allclose(candidate.values, s1.values, rtol=1e-9, atol=1e-9)\n",
"\n",
"%timeit df_sorted.groupby('y').x.sum()\n",
"%timeit grouped_sum(x, y_sorted)\n",
"%timeit grouped_sum(x, y_sorted, issorted=True)"
],
"outputs": [
{
"output_type": "stream",
"text": [
"10 loops, best of 3: 191 ms per loop\n",
"1 loops, best of 3: 370 ms per loop\n",
"10 loops, best of 3: 28.2 ms per loop\n"
],
"name": "stdout"
}
],
"execution_count": 0
},
{
"cell_type": "code",
"metadata": {
"id": "vZ8qa6BRzA27",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"cellView": "both"
},
"source": [
""
],
"outputs": [],
"execution_count": 0
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment