BodonFerenc/AggregationByMultipleColumns.ipynb

## AggregationByMultipleColumns.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>bucket</th>\n",
       "      <th>qty</th>\n",
       "      <th>risk</th>\n",
       "      <th>weight</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>a</td>\n",
       "      <td>100</td>\n",
       "      <td>10</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>a</td>\n",
       "      <td>500</td>\n",
       "      <td>20</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>b</td>\n",
       "      <td>200</td>\n",
       "      <td>12</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>b</td>\n",
       "      <td>800</td>\n",
       "      <td>60</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>b</td>\n",
       "      <td>700</td>\n",
       "      <td>58</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  bucket  qty  risk  weight\n",
       "0      a  100    10       2\n",
       "1      a  500    20       3\n",
       "2      b  200    12       1\n",
       "3      b  800    60       4\n",
       "4      b  700    58       3"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t = pd.DataFrame({'bucket':['a', 'a', 'b', 'b', 'b'], 'weight': [2, 3, 1, 4, 3], \n",
    "                  'qty': [100, 500, 200, 800, 700], 'risk': [10, 20, 12, 60, 58]})\n",
    "t"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "we would like to see for each `bucket` \n",
    "   * the number of element, as column **NR**\n",
    "   * sum and average of `qty` and `risk`, as columns **TOTAL_QTY**/**TOTAL_RISK** and **AVG_QTY**/**AVG_RISK**\n",
    "   * weighted average of `qty` and `risk`, as columns **W_AVG_QTY** and **W_AVG_RISK**. For weighted average, let us use numpy function [np.average](https://docs.scipy.org/doc/numpy/reference/generated/numpy.average.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>NR</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bucket</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        NR\n",
       "bucket    \n",
       "a        2\n",
       "b        3"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t.groupby('bucket').agg({'bucket': len}).rename(columns= {'bucket':'NR'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>TOTAL_QTY</th>\n",
       "      <th>AVG_QTY</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bucket</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>600</td>\n",
       "      <td>300.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>1700</td>\n",
       "      <td>566.666667</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        TOTAL_QTY     AVG_QTY\n",
       "bucket                       \n",
       "a             600  300.000000\n",
       "b            1700  566.666667"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t.groupby('bucket')['qty'].agg([sum, np.mean]).rename(columns={'sum': 'TOTAL_QTY', 'mean': 'AVG_QTY'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>TOTAL_QTY</th>\n",
       "      <th>TOTAL_QTY</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bucket</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>600</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>1700</td>\n",
       "      <td>130</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        TOTAL_QTY  TOTAL_QTY\n",
       "bucket                      \n",
       "a             600         30\n",
       "b            1700        130"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t.groupby('bucket')['qty', 'risk'].agg(sum).rename(columns={'qty': 'TOTAL_QTY', 'risk': 'TOTAL_QTY'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>TOTAL_QTY</th>\n",
       "      <th>AVG_QTY</th>\n",
       "      <th>TOTAL_RISK</th>\n",
       "      <th>AVG_RISK</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bucket</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>600</td>\n",
       "      <td>300.000000</td>\n",
       "      <td>30</td>\n",
       "      <td>15.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>1700</td>\n",
       "      <td>566.666667</td>\n",
       "      <td>130</td>\n",
       "      <td>43.333333</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        TOTAL_QTY     AVG_QTY  TOTAL_RISK   AVG_RISK\n",
       "bucket                                              \n",
       "a             600  300.000000          30  15.000000\n",
       "b            1700  566.666667         130  43.333333"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res = t.groupby('bucket').agg({'qty': [sum, np.mean], 'risk': [sum, np.mean]})\n",
    "res.columns = res.columns.map('_'.join)\n",
    "res.rename(columns={'qty_sum':'TOTAL_QTY','qty_mean':'AVG_QTY', \n",
    "                    'risk_sum':'TOTAL_RISK','risk_mean':'AVG_RISK'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>W_AVG_QTY</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bucket</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>340.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>687.5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        W_AVG_QTY\n",
       "bucket           \n",
       "a           340.0\n",
       "b           687.5"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t.groupby('bucket').apply(lambda g: np.average(g.qty, weights=g.weight)).to_frame('W_AVG_QTY')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>NR</th>\n",
       "      <th>TOTAL_QTY</th>\n",
       "      <th>AVG_QTY</th>\n",
       "      <th>TOTAL_RISK</th>\n",
       "      <th>AVG_RISK</th>\n",
       "      <th>W_AVG_QTY</th>\n",
       "      <th>W_AVG_RISK</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bucket</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>2</td>\n",
       "      <td>600</td>\n",
       "      <td>300.000000</td>\n",
       "      <td>30</td>\n",
       "      <td>15.000000</td>\n",
       "      <td>340.0</td>\n",
       "      <td>16.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>3</td>\n",
       "      <td>1700</td>\n",
       "      <td>566.666667</td>\n",
       "      <td>130</td>\n",
       "      <td>43.333333</td>\n",
       "      <td>687.5</td>\n",
       "      <td>53.25</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        NR  TOTAL_QTY     AVG_QTY  TOTAL_RISK   AVG_RISK  W_AVG_QTY  \\\n",
       "bucket                                                                \n",
       "a        2        600  300.000000          30  15.000000      340.0   \n",
       "b        3       1700  566.666667         130  43.333333      687.5   \n",
       "\n",
       "        W_AVG_RISK  \n",
       "bucket              \n",
       "a            16.00  \n",
       "b            53.25  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res = t.groupby('bucket').agg({'bucket': len, 'qty': [sum, np.mean], 'risk': [sum, np.mean]})\n",
    "res.columns = res.columns.map('_'.join)\n",
    "res.rename(columns={'bucket_len':'NR', 'qty_sum':'TOTAL_QTY','qty_mean':'AVG_QTY', \n",
    "                    'risk_sum':'TOTAL_RISK','risk_mean':'AVG_RISK'}).join(\n",
    "    t.groupby('bucket').apply(lambda g: np.average(g.qty, weights=g.weight)).to_frame('W_AVG_QTY')).join(\n",
    "    t.groupby('bucket').apply(lambda g: np.average(g.risk, weights=g.weight)).to_frame('W_AVG_RISK')\n",
    ")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This renaming workaround is needed due to column name match (two sum column on level 2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For comparion, the corresponding Q:\n",
    "\n",
    "    select NR: count i, \n",
    "       TOTAL_QTY:  sum qty, AVG_QTY:   avg qty, \n",
    "       TOTAL_RISK: sum risk, AVG_RISK: avg risk, \n",
    "       W_AVG_QTY: weight wavg qty, W_AVG_RISK: weight wavg risk \n",
    "       by bucket from t"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Expert's solution based on comment from https://github.com/pandas-dev/pandas/issues/18366#issuecomment-348800691"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>NR</th>\n",
       "      <th>TOTAL_QTY</th>\n",
       "      <th>AVG_QTY</th>\n",
       "      <th>TOTAL_RISK</th>\n",
       "      <th>AVG_RISK</th>\n",
       "      <th>W_AVG_QTY</th>\n",
       "      <th>W_AVG_RISK</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bucket</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>2.0</td>\n",
       "      <td>600.0</td>\n",
       "      <td>300.000000</td>\n",
       "      <td>30.0</td>\n",
       "      <td>15.000000</td>\n",
       "      <td>340.0</td>\n",
       "      <td>16.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1700.0</td>\n",
       "      <td>566.666667</td>\n",
       "      <td>130.0</td>\n",
       "      <td>43.333333</td>\n",
       "      <td>687.5</td>\n",
       "      <td>53.25</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         NR  TOTAL_QTY     AVG_QTY  TOTAL_RISK   AVG_RISK  W_AVG_QTY  \\\n",
       "bucket                                                                 \n",
       "a       2.0      600.0  300.000000        30.0  15.000000      340.0   \n",
       "b       3.0     1700.0  566.666667       130.0  43.333333      687.5   \n",
       "\n",
       "        W_AVG_RISK  \n",
       "bucket              \n",
       "a            16.00  \n",
       "b            53.25  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def my_agg(x):\n",
    "    data = {'NR': x.bucket.count(),\n",
    "            'TOTAL_QTY': x.qty.sum(),\n",
    "            'AVG_QTY': x.qty.mean(),\n",
    "            'TOTAL_RISK': x.risk.sum(),\n",
    "            'AVG_RISK': x.risk.mean(),\n",
    "            'W_AVG_QTY':  np.average(x.qty, weights=x.weight),\n",
    "            'W_AVG_RISK':  np.average(x.risk, weights=x.weight)\n",
    "           }\n",
    "    return pd.Series(data, index=['NR', 'TOTAL_QTY', 'AVG_QTY', 'TOTAL_RISK', \n",
    "                                  'AVG_RISK', 'W_AVG_QTY', 'W_AVG_RISK'])\n",
    "    \n",
    "t.groupby('bucket').apply(my_agg)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>bucket</th>\n",
	" <th>qty</th>\n",
	" <th>risk</th>\n",
	" <th>weight</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>a</td>\n",
	" <td>100</td>\n",
	" <td>10</td>\n",
	" <td>2</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>a</td>\n",
	" <td>500</td>\n",
	" <td>20</td>\n",
	" <td>3</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>b</td>\n",
	" <td>200</td>\n",
	" <td>12</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>b</td>\n",
	" <td>800</td>\n",
	" <td>60</td>\n",
	" <td>4</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>b</td>\n",
	" <td>700</td>\n",
	" <td>58</td>\n",
	" <td>3</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" bucket qty risk weight\n",
	"0 a 100 10 2\n",
	"1 a 500 20 3\n",
	"2 b 200 12 1\n",
	"3 b 800 60 4\n",
	"4 b 700 58 3"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"t = pd.DataFrame({'bucket':['a', 'a', 'b', 'b', 'b'], 'weight': [2, 3, 1, 4, 3], \n",
	" 'qty': [100, 500, 200, 800, 700], 'risk': [10, 20, 12, 60, 58]})\n",
	"t"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"we would like to see for each `bucket` \n",
	" * the number of element, as column NR\n",
	" * sum and average of `qty` and `risk`, as columns TOTAL_QTY/TOTAL_RISK and AVG_QTY/AVG_RISK\n",
	" * weighted average of `qty` and `risk`, as columns W_AVG_QTY and W_AVG_RISK. For weighted average, let us use numpy function [np.average](https://docs.scipy.org/doc/numpy/reference/generated/numpy.average.html)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>NR</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>bucket</th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>a</th>\n",
	" <td>2</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>b</th>\n",
	" <td>3</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" NR\n",
	"bucket \n",
	"a 2\n",
	"b 3"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"t.groupby('bucket').agg({'bucket': len}).rename(columns= {'bucket':'NR'})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>TOTAL_QTY</th>\n",
	" <th>AVG_QTY</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>bucket</th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>a</th>\n",
	" <td>600</td>\n",
	" <td>300.000000</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>b</th>\n",
	" <td>1700</td>\n",
	" <td>566.666667</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" TOTAL_QTY AVG_QTY\n",
	"bucket \n",
	"a 600 300.000000\n",
	"b 1700 566.666667"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"t.groupby('bucket')['qty'].agg([sum, np.mean]).rename(columns={'sum': 'TOTAL_QTY', 'mean': 'AVG_QTY'})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>TOTAL_QTY</th>\n",
	" <th>TOTAL_QTY</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>bucket</th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>a</th>\n",
	" <td>600</td>\n",
	" <td>30</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>b</th>\n",
	" <td>1700</td>\n",
	" <td>130</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" TOTAL_QTY TOTAL_QTY\n",
	"bucket \n",
	"a 600 30\n",
	"b 1700 130"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"t.groupby('bucket')['qty', 'risk'].agg(sum).rename(columns={'qty': 'TOTAL_QTY', 'risk': 'TOTAL_QTY'})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>TOTAL_QTY</th>\n",
	" <th>AVG_QTY</th>\n",
	" <th>TOTAL_RISK</th>\n",
	" <th>AVG_RISK</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>bucket</th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>a</th>\n",
	" <td>600</td>\n",
	" <td>300.000000</td>\n",
	" <td>30</td>\n",
	" <td>15.000000</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>b</th>\n",
	" <td>1700</td>\n",
	" <td>566.666667</td>\n",
	" <td>130</td>\n",
	" <td>43.333333</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" TOTAL_QTY AVG_QTY TOTAL_RISK AVG_RISK\n",
	"bucket \n",
	"a 600 300.000000 30 15.000000\n",
	"b 1700 566.666667 130 43.333333"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"res = t.groupby('bucket').agg({'qty': [sum, np.mean], 'risk': [sum, np.mean]})\n",
	"res.columns = res.columns.map('_'.join)\n",
	"res.rename(columns={'qty_sum':'TOTAL_QTY','qty_mean':'AVG_QTY', \n",
	" 'risk_sum':'TOTAL_RISK','risk_mean':'AVG_RISK'})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>W_AVG_QTY</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>bucket</th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>a</th>\n",
	" <td>340.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>b</th>\n",
	" <td>687.5</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" W_AVG_QTY\n",
	"bucket \n",
	"a 340.0\n",
	"b 687.5"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"t.groupby('bucket').apply(lambda g: np.average(g.qty, weights=g.weight)).to_frame('W_AVG_QTY')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"scrolled": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>NR</th>\n",
	" <th>TOTAL_QTY</th>\n",
	" <th>AVG_QTY</th>\n",
	" <th>TOTAL_RISK</th>\n",
	" <th>AVG_RISK</th>\n",
	" <th>W_AVG_QTY</th>\n",
	" <th>W_AVG_RISK</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>bucket</th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>a</th>\n",
	" <td>2</td>\n",
	" <td>600</td>\n",
	" <td>300.000000</td>\n",
	" <td>30</td>\n",
	" <td>15.000000</td>\n",
	" <td>340.0</td>\n",
	" <td>16.00</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>b</th>\n",
	" <td>3</td>\n",
	" <td>1700</td>\n",
	" <td>566.666667</td>\n",
	" <td>130</td>\n",
	" <td>43.333333</td>\n",
	" <td>687.5</td>\n",
	" <td>53.25</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" NR TOTAL_QTY AVG_QTY TOTAL_RISK AVG_RISK W_AVG_QTY \\\n",
	"bucket \n",
	"a 2 600 300.000000 30 15.000000 340.0 \n",
	"b 3 1700 566.666667 130 43.333333 687.5 \n",
	"\n",
	" W_AVG_RISK \n",
	"bucket \n",
	"a 16.00 \n",
	"b 53.25 "
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"res = t.groupby('bucket').agg({'bucket': len, 'qty': [sum, np.mean], 'risk': [sum, np.mean]})\n",
	"res.columns = res.columns.map('_'.join)\n",
	"res.rename(columns={'bucket_len':'NR', 'qty_sum':'TOTAL_QTY','qty_mean':'AVG_QTY', \n",
	" 'risk_sum':'TOTAL_RISK','risk_mean':'AVG_RISK'}).join(\n",
	" t.groupby('bucket').apply(lambda g: np.average(g.qty, weights=g.weight)).to_frame('W_AVG_QTY')).join(\n",
	" t.groupby('bucket').apply(lambda g: np.average(g.risk, weights=g.weight)).to_frame('W_AVG_RISK')\n",
	")\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"This renaming workaround is needed due to column name match (two sum column on level 2)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"For comparion, the corresponding Q:\n",
	"\n",
	" select NR: count i, \n",
	" TOTAL_QTY: sum qty, AVG_QTY: avg qty, \n",
	" TOTAL_RISK: sum risk, AVG_RISK: avg risk, \n",
	" W_AVG_QTY: weight wavg qty, W_AVG_RISK: weight wavg risk \n",
	" by bucket from t"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Expert's solution based on comment from https://github.com/pandas-dev/pandas/issues/18366#issuecomment-348800691"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>NR</th>\n",
	" <th>TOTAL_QTY</th>\n",
	" <th>AVG_QTY</th>\n",
	" <th>TOTAL_RISK</th>\n",
	" <th>AVG_RISK</th>\n",
	" <th>W_AVG_QTY</th>\n",
	" <th>W_AVG_RISK</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>bucket</th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>a</th>\n",
	" <td>2.0</td>\n",
	" <td>600.0</td>\n",
	" <td>300.000000</td>\n",
	" <td>30.0</td>\n",
	" <td>15.000000</td>\n",
	" <td>340.0</td>\n",
	" <td>16.00</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>b</th>\n",
	" <td>3.0</td>\n",
	" <td>1700.0</td>\n",
	" <td>566.666667</td>\n",
	" <td>130.0</td>\n",
	" <td>43.333333</td>\n",
	" <td>687.5</td>\n",
	" <td>53.25</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" NR TOTAL_QTY AVG_QTY TOTAL_RISK AVG_RISK W_AVG_QTY \\\n",
	"bucket \n",
	"a 2.0 600.0 300.000000 30.0 15.000000 340.0 \n",
	"b 3.0 1700.0 566.666667 130.0 43.333333 687.5 \n",
	"\n",
	" W_AVG_RISK \n",
	"bucket \n",
	"a 16.00 \n",
	"b 53.25 "
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"def my_agg(x):\n",
	" data = {'NR': x.bucket.count(),\n",
	" 'TOTAL_QTY': x.qty.sum(),\n",
	" 'AVG_QTY': x.qty.mean(),\n",
	" 'TOTAL_RISK': x.risk.sum(),\n",
	" 'AVG_RISK': x.risk.mean(),\n",
	" 'W_AVG_QTY': np.average(x.qty, weights=x.weight),\n",
	" 'W_AVG_RISK': np.average(x.risk, weights=x.weight)\n",
	" }\n",
	" return pd.Series(data, index=['NR', 'TOTAL_QTY', 'AVG_QTY', 'TOTAL_RISK', \n",
	" 'AVG_RISK', 'W_AVG_QTY', 'W_AVG_RISK'])\n",
	" \n",
	"t.groupby('bucket').apply(my_agg)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.4"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}