Skip to content

Instantly share code, notes, and snippets.

@tomokishii
Created September 15, 2016 21:10
Show Gist options
  • Save tomokishii/f051de44ecf8844fb74cacb36226a65f to your computer and use it in GitHub Desktop.
Save tomokishii/f051de44ecf8844fb74cacb36226a65f to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pandas grouping operation\n",
"(ref.) http://pandas.pydata.org/pandas-docs/stable/cookbook.html#grouping"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# packages\n",
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.DataFrame({'animal': 'cat dog cat fish dog cat cat'.split(),\n",
" 'size': list('SSMMMLL'),\n",
" 'weight': [8, 10, 11, 1, 20, 12, 12],\n",
" 'adult' : [False] * 5 + [True] * 2})"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>adult</th>\n",
" <th>animal</th>\n",
" <th>size</th>\n",
" <th>weight</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>False</td>\n",
" <td>cat</td>\n",
" <td>S</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>False</td>\n",
" <td>dog</td>\n",
" <td>S</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>False</td>\n",
" <td>cat</td>\n",
" <td>M</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>False</td>\n",
" <td>fish</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>False</td>\n",
" <td>dog</td>\n",
" <td>M</td>\n",
" <td>20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>True</td>\n",
" <td>cat</td>\n",
" <td>L</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>True</td>\n",
" <td>cat</td>\n",
" <td>L</td>\n",
" <td>12</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" adult animal size weight\n",
"0 False cat S 8\n",
"1 False dog S 10\n",
"2 False cat M 11\n",
"3 False fish M 1\n",
"4 False dog M 20\n",
"5 True cat L 12\n",
"6 True cat L 12"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"animal\n",
"cat L\n",
"dog M\n",
"fish M\n",
"dtype: object"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# List the size of the animals with the highest weight.\n",
"df.groupby('animal').apply(lambda subf: subf['size'][subf['weight'].idxmax()])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>adult</th>\n",
" <th>animal</th>\n",
" <th>size</th>\n",
" <th>weight</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>False</td>\n",
" <td>cat</td>\n",
" <td>S</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>False</td>\n",
" <td>cat</td>\n",
" <td>M</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>True</td>\n",
" <td>cat</td>\n",
" <td>L</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>True</td>\n",
" <td>cat</td>\n",
" <td>L</td>\n",
" <td>12</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" adult animal size weight\n",
"0 False cat S 8\n",
"2 False cat M 11\n",
"5 True cat L 12\n",
"6 True cat L 12"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get_group example\n",
"gb = df.groupby(['animal'])\n",
"gb.get_group('cat')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = pd.DataFrame({'A' : [1, 1, 2, 2], 'B' : [1, -1, 1, 2]})\n",
"gb = df.groupby('A')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>B</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" B\n",
"0 1.0\n",
"1 1.0\n",
"2 1.0\n",
"3 2.0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# replace a cell value\n",
"def replace(g):\n",
" mask = g < 0\n",
" g.loc[mask] = g[~mask].mean()\n",
" \n",
" return g\n",
"\n",
"gb.transform(replace)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>code</th>\n",
" <th>data</th>\n",
" <th>flag</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>foo</td>\n",
" <td>0.16</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>bar</td>\n",
" <td>-0.21</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>baz</td>\n",
" <td>0.33</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>foo</td>\n",
" <td>0.45</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>bar</td>\n",
" <td>-0.59</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>baz</td>\n",
" <td>0.62</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" code data flag\n",
"0 foo 0.16 False\n",
"1 bar -0.21 True\n",
"2 baz 0.33 False\n",
"3 foo 0.45 True\n",
"4 bar -0.59 False\n",
"5 baz 0.62 True"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Sort groups by aggregated data\n",
"df = pd.DataFrame({'code': ['foo', 'bar', 'baz'] * 2,\n",
" 'data': [0.16, -0.21, 0.33, 0.45, -0.59, 0.62],\n",
" 'flag': [False, True] * 3})\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>code</th>\n",
" <th>data</th>\n",
" <th>flag</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>bar</td>\n",
" <td>-0.21</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>bar</td>\n",
" <td>-0.59</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>foo</td>\n",
" <td>0.16</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>foo</td>\n",
" <td>0.45</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>baz</td>\n",
" <td>0.33</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>baz</td>\n",
" <td>0.62</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" code data flag\n",
"1 bar -0.21 True\n",
"4 bar -0.59 False\n",
"0 foo 0.16 False\n",
"3 foo 0.45 True\n",
"2 baz 0.33 False\n",
"5 baz 0.62 True"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"code_groups = df.groupby('code')\n",
"agg_n_sort_order = code_groups[['data']].transform(sum).sort_values(by='data')\n",
"sorted_df = df.ix[agg_n_sort_order.index]\n",
"sorted_df"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Color</th>\n",
" <th>Value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Red</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Red</td>\n",
" <td>150</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Red</td>\n",
" <td>50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Blue</td>\n",
" <td>50</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Color Value\n",
"0 Red 100\n",
"1 Red 150\n",
"2 Red 50\n",
"3 Blue 50"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({'Color': 'Red Red Red Blue'.split(),\n",
" 'Value': [100, 150, 50, 50]})\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Color</th>\n",
" <th>Value</th>\n",
" <th>Counts</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Red</td>\n",
" <td>100</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Red</td>\n",
" <td>150</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Red</td>\n",
" <td>50</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Blue</td>\n",
" <td>50</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Color Value Counts\n",
"0 Red 100 3\n",
"1 Red 150 3\n",
"2 Red 50 3\n",
"3 Blue 50 1"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Counts'] = df.groupby(['Color']).transform(len)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Color\n",
"Blue 1\n",
"Red 3\n",
"Name: Counts, dtype: int64"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby('Color')['Counts'].mean()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Color</th>\n",
" <th>Value</th>\n",
" <th>Counts</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Red</td>\n",
" <td>100</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Red</td>\n",
" <td>150</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Red</td>\n",
" <td>50</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Color Value Counts\n",
"0 Red 100 3\n",
"1 Red 150 3\n",
"2 Red 50 3"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['Color'] == 'Red']"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_red = df[df['Color']=='Red']\n",
"len(df_red)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment