Skip to content

Instantly share code, notes, and snippets.

@BodonFerenc
Created June 17, 2018 17:54
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save BodonFerenc/015a24e629b6dd0944b27c6b64cbca35 to your computer and use it in GitHub Desktop.
Save BodonFerenc/015a24e629b6dd0944b27c6b64cbca35 to your computer and use it in GitHub Desktop.
Aggregation on multiple columns
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>bucket</th>\n",
" <th>qty</th>\n",
" <th>risk</th>\n",
" <th>weight</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>a</td>\n",
" <td>100</td>\n",
" <td>10</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>a</td>\n",
" <td>500</td>\n",
" <td>20</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>b</td>\n",
" <td>200</td>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>b</td>\n",
" <td>800</td>\n",
" <td>60</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>b</td>\n",
" <td>700</td>\n",
" <td>58</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" bucket qty risk weight\n",
"0 a 100 10 2\n",
"1 a 500 20 3\n",
"2 b 200 12 1\n",
"3 b 800 60 4\n",
"4 b 700 58 3"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"t = pd.DataFrame({'bucket':['a', 'a', 'b', 'b', 'b'], 'weight': [2, 3, 1, 4, 3], \n",
" 'qty': [100, 500, 200, 800, 700], 'risk': [10, 20, 12, 60, 58]})\n",
"t"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"we would like to see for each `bucket` \n",
" * the number of element, as column **NR**\n",
" * sum and average of `qty` and `risk`, as columns **TOTAL_QTY**/**TOTAL_RISK** and **AVG_QTY**/**AVG_RISK**\n",
" * weighted average of `qty` and `risk`, as columns **W_AVG_QTY** and **W_AVG_RISK**. For weighted average, let us use numpy function [np.average](https://docs.scipy.org/doc/numpy/reference/generated/numpy.average.html)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>NR</th>\n",
" </tr>\n",
" <tr>\n",
" <th>bucket</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" NR\n",
"bucket \n",
"a 2\n",
"b 3"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"t.groupby('bucket').agg({'bucket': len}).rename(columns= {'bucket':'NR'})"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>TOTAL_QTY</th>\n",
" <th>AVG_QTY</th>\n",
" </tr>\n",
" <tr>\n",
" <th>bucket</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td>600</td>\n",
" <td>300.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td>1700</td>\n",
" <td>566.666667</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" TOTAL_QTY AVG_QTY\n",
"bucket \n",
"a 600 300.000000\n",
"b 1700 566.666667"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"t.groupby('bucket')['qty'].agg([sum, np.mean]).rename(columns={'sum': 'TOTAL_QTY', 'mean': 'AVG_QTY'})"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>TOTAL_QTY</th>\n",
" <th>TOTAL_QTY</th>\n",
" </tr>\n",
" <tr>\n",
" <th>bucket</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td>600</td>\n",
" <td>30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td>1700</td>\n",
" <td>130</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" TOTAL_QTY TOTAL_QTY\n",
"bucket \n",
"a 600 30\n",
"b 1700 130"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"t.groupby('bucket')['qty', 'risk'].agg(sum).rename(columns={'qty': 'TOTAL_QTY', 'risk': 'TOTAL_QTY'})"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>TOTAL_QTY</th>\n",
" <th>AVG_QTY</th>\n",
" <th>TOTAL_RISK</th>\n",
" <th>AVG_RISK</th>\n",
" </tr>\n",
" <tr>\n",
" <th>bucket</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td>600</td>\n",
" <td>300.000000</td>\n",
" <td>30</td>\n",
" <td>15.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td>1700</td>\n",
" <td>566.666667</td>\n",
" <td>130</td>\n",
" <td>43.333333</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" TOTAL_QTY AVG_QTY TOTAL_RISK AVG_RISK\n",
"bucket \n",
"a 600 300.000000 30 15.000000\n",
"b 1700 566.666667 130 43.333333"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res = t.groupby('bucket').agg({'qty': [sum, np.mean], 'risk': [sum, np.mean]})\n",
"res.columns = res.columns.map('_'.join)\n",
"res.rename(columns={'qty_sum':'TOTAL_QTY','qty_mean':'AVG_QTY', \n",
" 'risk_sum':'TOTAL_RISK','risk_mean':'AVG_RISK'})"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>W_AVG_QTY</th>\n",
" </tr>\n",
" <tr>\n",
" <th>bucket</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td>340.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td>687.5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" W_AVG_QTY\n",
"bucket \n",
"a 340.0\n",
"b 687.5"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"t.groupby('bucket').apply(lambda g: np.average(g.qty, weights=g.weight)).to_frame('W_AVG_QTY')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>NR</th>\n",
" <th>TOTAL_QTY</th>\n",
" <th>AVG_QTY</th>\n",
" <th>TOTAL_RISK</th>\n",
" <th>AVG_RISK</th>\n",
" <th>W_AVG_QTY</th>\n",
" <th>W_AVG_RISK</th>\n",
" </tr>\n",
" <tr>\n",
" <th>bucket</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td>2</td>\n",
" <td>600</td>\n",
" <td>300.000000</td>\n",
" <td>30</td>\n",
" <td>15.000000</td>\n",
" <td>340.0</td>\n",
" <td>16.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td>3</td>\n",
" <td>1700</td>\n",
" <td>566.666667</td>\n",
" <td>130</td>\n",
" <td>43.333333</td>\n",
" <td>687.5</td>\n",
" <td>53.25</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" NR TOTAL_QTY AVG_QTY TOTAL_RISK AVG_RISK W_AVG_QTY \\\n",
"bucket \n",
"a 2 600 300.000000 30 15.000000 340.0 \n",
"b 3 1700 566.666667 130 43.333333 687.5 \n",
"\n",
" W_AVG_RISK \n",
"bucket \n",
"a 16.00 \n",
"b 53.25 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res = t.groupby('bucket').agg({'bucket': len, 'qty': [sum, np.mean], 'risk': [sum, np.mean]})\n",
"res.columns = res.columns.map('_'.join)\n",
"res.rename(columns={'bucket_len':'NR', 'qty_sum':'TOTAL_QTY','qty_mean':'AVG_QTY', \n",
" 'risk_sum':'TOTAL_RISK','risk_mean':'AVG_RISK'}).join(\n",
" t.groupby('bucket').apply(lambda g: np.average(g.qty, weights=g.weight)).to_frame('W_AVG_QTY')).join(\n",
" t.groupby('bucket').apply(lambda g: np.average(g.risk, weights=g.weight)).to_frame('W_AVG_RISK')\n",
")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This renaming workaround is needed due to column name match (two sum column on level 2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For comparion, the corresponding Q:\n",
"\n",
" select NR: count i, \n",
" TOTAL_QTY: sum qty, AVG_QTY: avg qty, \n",
" TOTAL_RISK: sum risk, AVG_RISK: avg risk, \n",
" W_AVG_QTY: weight wavg qty, W_AVG_RISK: weight wavg risk \n",
" by bucket from t"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Expert's solution based on comment from https://github.com/pandas-dev/pandas/issues/18366#issuecomment-348800691"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>NR</th>\n",
" <th>TOTAL_QTY</th>\n",
" <th>AVG_QTY</th>\n",
" <th>TOTAL_RISK</th>\n",
" <th>AVG_RISK</th>\n",
" <th>W_AVG_QTY</th>\n",
" <th>W_AVG_RISK</th>\n",
" </tr>\n",
" <tr>\n",
" <th>bucket</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td>2.0</td>\n",
" <td>600.0</td>\n",
" <td>300.000000</td>\n",
" <td>30.0</td>\n",
" <td>15.000000</td>\n",
" <td>340.0</td>\n",
" <td>16.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td>3.0</td>\n",
" <td>1700.0</td>\n",
" <td>566.666667</td>\n",
" <td>130.0</td>\n",
" <td>43.333333</td>\n",
" <td>687.5</td>\n",
" <td>53.25</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" NR TOTAL_QTY AVG_QTY TOTAL_RISK AVG_RISK W_AVG_QTY \\\n",
"bucket \n",
"a 2.0 600.0 300.000000 30.0 15.000000 340.0 \n",
"b 3.0 1700.0 566.666667 130.0 43.333333 687.5 \n",
"\n",
" W_AVG_RISK \n",
"bucket \n",
"a 16.00 \n",
"b 53.25 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def my_agg(x):\n",
" data = {'NR': x.bucket.count(),\n",
" 'TOTAL_QTY': x.qty.sum(),\n",
" 'AVG_QTY': x.qty.mean(),\n",
" 'TOTAL_RISK': x.risk.sum(),\n",
" 'AVG_RISK': x.risk.mean(),\n",
" 'W_AVG_QTY': np.average(x.qty, weights=x.weight),\n",
" 'W_AVG_RISK': np.average(x.risk, weights=x.weight)\n",
" }\n",
" return pd.Series(data, index=['NR', 'TOTAL_QTY', 'AVG_QTY', 'TOTAL_RISK', \n",
" 'AVG_RISK', 'W_AVG_QTY', 'W_AVG_RISK'])\n",
" \n",
"t.groupby('bucket').apply(my_agg)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment