Created
June 17, 2018 17:54
-
-
Save BodonFerenc/015a24e629b6dd0944b27c6b64cbca35 to your computer and use it in GitHub Desktop.
Aggregation on multiple columns
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>bucket</th>\n", | |
" <th>qty</th>\n", | |
" <th>risk</th>\n", | |
" <th>weight</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>a</td>\n", | |
" <td>100</td>\n", | |
" <td>10</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>a</td>\n", | |
" <td>500</td>\n", | |
" <td>20</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>b</td>\n", | |
" <td>200</td>\n", | |
" <td>12</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>b</td>\n", | |
" <td>800</td>\n", | |
" <td>60</td>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>b</td>\n", | |
" <td>700</td>\n", | |
" <td>58</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" bucket qty risk weight\n", | |
"0 a 100 10 2\n", | |
"1 a 500 20 3\n", | |
"2 b 200 12 1\n", | |
"3 b 800 60 4\n", | |
"4 b 700 58 3" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"t = pd.DataFrame({'bucket':['a', 'a', 'b', 'b', 'b'], 'weight': [2, 3, 1, 4, 3], \n", | |
" 'qty': [100, 500, 200, 800, 700], 'risk': [10, 20, 12, 60, 58]})\n", | |
"t" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"we would like to see for each `bucket` \n", | |
" * the number of element, as column **NR**\n", | |
" * sum and average of `qty` and `risk`, as columns **TOTAL_QTY**/**TOTAL_RISK** and **AVG_QTY**/**AVG_RISK**\n", | |
" * weighted average of `qty` and `risk`, as columns **W_AVG_QTY** and **W_AVG_RISK**. For weighted average, let us use numpy function [np.average](https://docs.scipy.org/doc/numpy/reference/generated/numpy.average.html)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>NR</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>bucket</th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>a</th>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>b</th>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" NR\n", | |
"bucket \n", | |
"a 2\n", | |
"b 3" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"t.groupby('bucket').agg({'bucket': len}).rename(columns= {'bucket':'NR'})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>TOTAL_QTY</th>\n", | |
" <th>AVG_QTY</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>bucket</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>a</th>\n", | |
" <td>600</td>\n", | |
" <td>300.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>b</th>\n", | |
" <td>1700</td>\n", | |
" <td>566.666667</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" TOTAL_QTY AVG_QTY\n", | |
"bucket \n", | |
"a 600 300.000000\n", | |
"b 1700 566.666667" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"t.groupby('bucket')['qty'].agg([sum, np.mean]).rename(columns={'sum': 'TOTAL_QTY', 'mean': 'AVG_QTY'})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>TOTAL_QTY</th>\n", | |
" <th>TOTAL_QTY</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>bucket</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>a</th>\n", | |
" <td>600</td>\n", | |
" <td>30</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>b</th>\n", | |
" <td>1700</td>\n", | |
" <td>130</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" TOTAL_QTY TOTAL_QTY\n", | |
"bucket \n", | |
"a 600 30\n", | |
"b 1700 130" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"t.groupby('bucket')['qty', 'risk'].agg(sum).rename(columns={'qty': 'TOTAL_QTY', 'risk': 'TOTAL_QTY'})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>TOTAL_QTY</th>\n", | |
" <th>AVG_QTY</th>\n", | |
" <th>TOTAL_RISK</th>\n", | |
" <th>AVG_RISK</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>bucket</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>a</th>\n", | |
" <td>600</td>\n", | |
" <td>300.000000</td>\n", | |
" <td>30</td>\n", | |
" <td>15.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>b</th>\n", | |
" <td>1700</td>\n", | |
" <td>566.666667</td>\n", | |
" <td>130</td>\n", | |
" <td>43.333333</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" TOTAL_QTY AVG_QTY TOTAL_RISK AVG_RISK\n", | |
"bucket \n", | |
"a 600 300.000000 30 15.000000\n", | |
"b 1700 566.666667 130 43.333333" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"res = t.groupby('bucket').agg({'qty': [sum, np.mean], 'risk': [sum, np.mean]})\n", | |
"res.columns = res.columns.map('_'.join)\n", | |
"res.rename(columns={'qty_sum':'TOTAL_QTY','qty_mean':'AVG_QTY', \n", | |
" 'risk_sum':'TOTAL_RISK','risk_mean':'AVG_RISK'})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>W_AVG_QTY</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>bucket</th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>a</th>\n", | |
" <td>340.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>b</th>\n", | |
" <td>687.5</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" W_AVG_QTY\n", | |
"bucket \n", | |
"a 340.0\n", | |
"b 687.5" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"t.groupby('bucket').apply(lambda g: np.average(g.qty, weights=g.weight)).to_frame('W_AVG_QTY')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>NR</th>\n", | |
" <th>TOTAL_QTY</th>\n", | |
" <th>AVG_QTY</th>\n", | |
" <th>TOTAL_RISK</th>\n", | |
" <th>AVG_RISK</th>\n", | |
" <th>W_AVG_QTY</th>\n", | |
" <th>W_AVG_RISK</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>bucket</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>a</th>\n", | |
" <td>2</td>\n", | |
" <td>600</td>\n", | |
" <td>300.000000</td>\n", | |
" <td>30</td>\n", | |
" <td>15.000000</td>\n", | |
" <td>340.0</td>\n", | |
" <td>16.00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>b</th>\n", | |
" <td>3</td>\n", | |
" <td>1700</td>\n", | |
" <td>566.666667</td>\n", | |
" <td>130</td>\n", | |
" <td>43.333333</td>\n", | |
" <td>687.5</td>\n", | |
" <td>53.25</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" NR TOTAL_QTY AVG_QTY TOTAL_RISK AVG_RISK W_AVG_QTY \\\n", | |
"bucket \n", | |
"a 2 600 300.000000 30 15.000000 340.0 \n", | |
"b 3 1700 566.666667 130 43.333333 687.5 \n", | |
"\n", | |
" W_AVG_RISK \n", | |
"bucket \n", | |
"a 16.00 \n", | |
"b 53.25 " | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"res = t.groupby('bucket').agg({'bucket': len, 'qty': [sum, np.mean], 'risk': [sum, np.mean]})\n", | |
"res.columns = res.columns.map('_'.join)\n", | |
"res.rename(columns={'bucket_len':'NR', 'qty_sum':'TOTAL_QTY','qty_mean':'AVG_QTY', \n", | |
" 'risk_sum':'TOTAL_RISK','risk_mean':'AVG_RISK'}).join(\n", | |
" t.groupby('bucket').apply(lambda g: np.average(g.qty, weights=g.weight)).to_frame('W_AVG_QTY')).join(\n", | |
" t.groupby('bucket').apply(lambda g: np.average(g.risk, weights=g.weight)).to_frame('W_AVG_RISK')\n", | |
")\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"This renaming workaround is needed due to column name match (two sum column on level 2)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"For comparion, the corresponding Q:\n", | |
"\n", | |
" select NR: count i, \n", | |
" TOTAL_QTY: sum qty, AVG_QTY: avg qty, \n", | |
" TOTAL_RISK: sum risk, AVG_RISK: avg risk, \n", | |
" W_AVG_QTY: weight wavg qty, W_AVG_RISK: weight wavg risk \n", | |
" by bucket from t" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Expert's solution based on comment from https://github.com/pandas-dev/pandas/issues/18366#issuecomment-348800691" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>NR</th>\n", | |
" <th>TOTAL_QTY</th>\n", | |
" <th>AVG_QTY</th>\n", | |
" <th>TOTAL_RISK</th>\n", | |
" <th>AVG_RISK</th>\n", | |
" <th>W_AVG_QTY</th>\n", | |
" <th>W_AVG_RISK</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>bucket</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>a</th>\n", | |
" <td>2.0</td>\n", | |
" <td>600.0</td>\n", | |
" <td>300.000000</td>\n", | |
" <td>30.0</td>\n", | |
" <td>15.000000</td>\n", | |
" <td>340.0</td>\n", | |
" <td>16.00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>b</th>\n", | |
" <td>3.0</td>\n", | |
" <td>1700.0</td>\n", | |
" <td>566.666667</td>\n", | |
" <td>130.0</td>\n", | |
" <td>43.333333</td>\n", | |
" <td>687.5</td>\n", | |
" <td>53.25</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" NR TOTAL_QTY AVG_QTY TOTAL_RISK AVG_RISK W_AVG_QTY \\\n", | |
"bucket \n", | |
"a 2.0 600.0 300.000000 30.0 15.000000 340.0 \n", | |
"b 3.0 1700.0 566.666667 130.0 43.333333 687.5 \n", | |
"\n", | |
" W_AVG_RISK \n", | |
"bucket \n", | |
"a 16.00 \n", | |
"b 53.25 " | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"def my_agg(x):\n", | |
" data = {'NR': x.bucket.count(),\n", | |
" 'TOTAL_QTY': x.qty.sum(),\n", | |
" 'AVG_QTY': x.qty.mean(),\n", | |
" 'TOTAL_RISK': x.risk.sum(),\n", | |
" 'AVG_RISK': x.risk.mean(),\n", | |
" 'W_AVG_QTY': np.average(x.qty, weights=x.weight),\n", | |
" 'W_AVG_RISK': np.average(x.risk, weights=x.weight)\n", | |
" }\n", | |
" return pd.Series(data, index=['NR', 'TOTAL_QTY', 'AVG_QTY', 'TOTAL_RISK', \n", | |
" 'AVG_RISK', 'W_AVG_QTY', 'W_AVG_RISK'])\n", | |
" \n", | |
"t.groupby('bucket').apply(my_agg)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment