Skip to content

Instantly share code, notes, and snippets.

@MaxHalford
Created August 9, 2023 17:32
Show Gist options
  • Save MaxHalford/5d3da23bbf8bf76e9f23c62a4f2539a9 to your computer and use it in GitHub Desktop.
Save MaxHalford/5d3da23bbf8bf76e9f23c62a4f2539a9 to your computer and use it in GitHub Desktop.
Metric decomposition
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Metric decomposition"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- https://docs.google.com/spreadsheets/d/1-hYJesNCMlCyANPOPeLijg1sCfv-6nREPtkHtTVEwd4/edit#gid=0\n",
"- https://observablehq.com/@carbonfact/diff"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://chat.openai.com/share/6a5e1c60-b9a2-42bb-9b23-39e61358b577"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[0.09999999999999999, 0.068]"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# denominators\n",
"D = [\n",
" [1000, 2000],\n",
" [800, 1200]\n",
"]\n",
"# shares\n",
"S = [\n",
" [dj / sum(d) for j, dj in enumerate(d)]\n",
" for i, d in enumerate(D)\n",
"]\n",
"# numerators\n",
"N = [\n",
" [150, 150],\n",
" [88, 48]\n",
"]\n",
"# ratios\n",
"R = [\n",
" [ni / di for ni, di in zip(n, d)]\n",
" for n, d in zip(N, D)\n",
"]\n",
"# shares x ratios\n",
"SR = [\n",
" [ri * si for ri, si in zip(r, s)]\n",
" for r, s in zip(R, S)\n",
"]\n",
"# global means\n",
"M = list(map(sum, SR))\n",
"M"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"-0.03199999999999999"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"inner = [\n",
" S[1][0] * (R[1][0] - R[0][0]),\n",
" S[1][1] * (R[1][1] - R[0][1]),\n",
"]\n",
"mix = [\n",
" (S[1][0] - S[0][0]) * (R[0][0] - M[0]),\n",
" (S[1][1] - S[0][1]) * (R[0][1] - M[0])\n",
"]\n",
"sum(inner) + sum(mix)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[0.09999999999999999, 0.068]"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"KPI = [\n",
" sum(ri * si for ri, si in zip(r, s))\n",
" for r, s in zip(R, S)\n",
"]\n",
"KPI"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.068"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(\n",
" KPI[0] +\n",
" sum(S[1][i] * (R[1][i] - R[0][i]) for i in range(2)) +\n",
" sum((S[1][i] - S[0][i]) * (R[0][i] - M[0]) for i in range(2))\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# denominators\n",
"D = [\n",
" [1000, 2000],\n",
" [800, 1200]\n",
"]\n",
"# shares\n",
"S = [\n",
" [dj / sum(d) for j, dj in enumerate(d)]\n",
" for i, d in enumerate(D)\n",
"]\n",
"# numerators\n",
"N = [\n",
" [150, 150],\n",
" [88, 48]\n",
"]\n",
"# ratios\n",
"R = [\n",
" [ni / di for ni, di in zip(n, d)]\n",
" for n, d in zip(N, D)\n",
"]\n",
"# shares x ratios\n",
"SR = [\n",
" [ri * si for ri, si in zip(r, s)]\n",
" for r, s in zip(R, S)\n",
"]\n",
"# global means\n",
"M = list(map(sum, SR))\n",
"# KPIs\n",
"KPI = [\n",
" sum(ri * si for ri, si in zip(r, s))\n",
" for r, s in zip(R, S)\n",
"]\n",
"# decomposed KPI\n",
"KPI[1] == (\n",
" KPI[0] +\n",
" sum(S[1][i] * (R[1][i] - R[0][i]) for i in range(2)) +\n",
" sum((S[1][i] - S[0][i]) * (R[0][i] - KPI[0]) for i in range(2))\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.068"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"M[1]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.068"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"KPI[1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>person</th>\n",
" <th>claim_type</th>\n",
" <th>date</th>\n",
" <th>year</th>\n",
" <th>amount</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>John</td>\n",
" <td>Dentist</td>\n",
" <td>2021-04-08</td>\n",
" <td>2021</td>\n",
" <td>129.66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Jane</td>\n",
" <td>Dentist</td>\n",
" <td>2021-09-03</td>\n",
" <td>2021</td>\n",
" <td>127.07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Jane</td>\n",
" <td>Physiotherapy</td>\n",
" <td>2021-02-07</td>\n",
" <td>2021</td>\n",
" <td>125.27</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Michael</td>\n",
" <td>Dentist</td>\n",
" <td>2021-12-21</td>\n",
" <td>2021</td>\n",
" <td>122.45</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Michael</td>\n",
" <td>Physiotherapy</td>\n",
" <td>2021-10-09</td>\n",
" <td>2021</td>\n",
" <td>132.82</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" person claim_type date year amount\n",
"0 John Dentist 2021-04-08 2021 129.66\n",
"1 Jane Dentist 2021-09-03 2021 127.07\n",
"2 Jane Physiotherapy 2021-02-07 2021 125.27\n",
"3 Michael Dentist 2021-12-21 2021 122.45\n",
"4 Michael Physiotherapy 2021-10-09 2021 132.82"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import random\n",
"\n",
"random.seed(42)\n",
"\n",
"# Function to generate a random cost based on the claim type and year\n",
"def generate_claim_cost(claim_type, year):\n",
" if claim_type == 'Dentist':\n",
" base_cost = 100\n",
" elif claim_type == 'Psychiatrist':\n",
" base_cost = 150\n",
" elif claim_type == 'General Physician':\n",
" base_cost = 80\n",
" elif claim_type == 'Physiotherapy':\n",
" base_cost = 120\n",
" else:\n",
" base_cost = 50\n",
"\n",
" # Adjust cost based on year\n",
" if year == 2021:\n",
" base_cost *= 1.2\n",
" elif year == 2023:\n",
" base_cost *= 1.5\n",
"\n",
" # Add some random variation\n",
" cost = random.uniform(base_cost - 20, base_cost + 20)\n",
" return round(cost, 2)\n",
"\n",
"# Generating sample data\n",
"claim_types = ['Dentist', 'Psychiatrist', 'General Physician', 'Physiotherapy']\n",
"years = [2021, 2022, 2023]\n",
"people = ['John', 'Jane', 'Michael', 'Emily', 'William', 'Emma', 'Daniel', 'Olivia', 'Lucas', 'Ava']\n",
"\n",
"data = []\n",
"for year in years:\n",
" for person in people:\n",
" num_claims = random.randint(1, 5) # Random number of claims per person per year\n",
" for _ in range(num_claims):\n",
" claim_type = random.choice(claim_types)\n",
" cost = generate_claim_cost(claim_type, year)\n",
" date = pd.to_datetime(f\"{random.randint(1, 12)}/{random.randint(1, 28)}/{year}\", format='%m/%d/%Y')\n",
" data.append([person, claim_type, date, year, cost])\n",
"\n",
"# Create the DataFrame\n",
"columns = ['person', 'claim_type', 'date', 'year', 'amount']\n",
"claims_df = pd.DataFrame(data, columns=columns)\n",
"\n",
"# Display the DataFrame\n",
"claims_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"80"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(claims_df)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"| person | claim_type | date | year | amount |\n",
"|:---------|:--------------|:--------------------|-------:|---------:|\n",
"| John | Dentist | 2021-04-08 00:00:00 | 2021 | 129.66 |\n",
"| Jane | Dentist | 2021-09-03 00:00:00 | 2021 | 127.07 |\n",
"| Jane | Physiotherapy | 2021-02-07 00:00:00 | 2021 | 125.27 |\n",
"| Michael | Dentist | 2021-12-21 00:00:00 | 2021 | 122.45 |\n",
"| Michael | Physiotherapy | 2021-10-09 00:00:00 | 2021 | 132.82 |\n"
]
}
],
"source": [
"print(claims_df.head().to_markdown(index=False))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Sums"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- Total cost\n",
"- Total footprint"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"| year | sum | diff |\n",
"|-------:|--------:|--------:|\n",
"| 2021 | 3814.54 | nan |\n",
"| 2022 | 2890.29 | -924.25 |\n",
"| 2023 | 4178.03 | 1287.74 |\n"
]
}
],
"source": [
"sums = claims_df.groupby('year')['amount'].sum()\n",
"sums = pd.DataFrame({'sum': sums, 'diff': sums - sums.shift()})\n",
"print(sums.to_markdown())"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"| year | claim_type | sum | diff |\n",
"|-------:|:------------------|--------:|--------:|\n",
"| 2021 | Dentist | 1104.42 | nan |\n",
"| 2021 | General Physician | 594.44 | nan |\n",
"| 2021 | Physiotherapy | 801.78 | nan |\n",
"| 2021 | Psychiatrist | 1313.9 | nan |\n",
"| 2022 | Dentist | 622.48 | -481.94 |\n",
"| 2022 | General Physician | 749.08 | 154.64 |\n",
"| 2022 | Physiotherapy | 339.45 | -462.33 |\n",
"| 2022 | Psychiatrist | 1179.28 | -134.62 |\n",
"| 2023 | Dentist | 1440.99 | 818.51 |\n",
"| 2023 | General Physician | 826.18 | 77.1 |\n",
"| 2023 | Physiotherapy | 1049.15 | 709.7 |\n",
"| 2023 | Psychiatrist | 861.71 | -317.57 |\n"
]
}
],
"source": [
"print(pd.DataFrame({\n",
" 'sum': (s := claims_df.groupby(['year', 'claim_type'])['amount'].sum()),\n",
" 'diff': (\n",
" s - s.groupby('claim_type').shift()\n",
" )\n",
"}).reset_index().to_markdown(index=False))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>year</th>\n",
" <th>claim_type</th>\n",
" <th>mean</th>\n",
" <th>count</th>\n",
" <th>sum</th>\n",
" <th>mean_lag</th>\n",
" <th>count_lag</th>\n",
" <th>inner</th>\n",
" <th>mix</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2021</td>\n",
" <td>Dentist</td>\n",
" <td>122.713333</td>\n",
" <td>9</td>\n",
" <td>1104.42</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2021</td>\n",
" <td>General Physician</td>\n",
" <td>99.073333</td>\n",
" <td>6</td>\n",
" <td>594.44</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2021</td>\n",
" <td>Physiotherapy</td>\n",
" <td>133.630000</td>\n",
" <td>6</td>\n",
" <td>801.78</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2021</td>\n",
" <td>Psychiatrist</td>\n",
" <td>187.700000</td>\n",
" <td>7</td>\n",
" <td>1313.90</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2022</td>\n",
" <td>Dentist</td>\n",
" <td>103.746667</td>\n",
" <td>6</td>\n",
" <td>622.48</td>\n",
" <td>122.713333</td>\n",
" <td>9.0</td>\n",
" <td>-170.700000</td>\n",
" <td>-311.240000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2022</td>\n",
" <td>General Physician</td>\n",
" <td>83.231111</td>\n",
" <td>9</td>\n",
" <td>749.08</td>\n",
" <td>99.073333</td>\n",
" <td>6.0</td>\n",
" <td>-95.053333</td>\n",
" <td>249.693333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>2022</td>\n",
" <td>Physiotherapy</td>\n",
" <td>113.150000</td>\n",
" <td>3</td>\n",
" <td>339.45</td>\n",
" <td>133.630000</td>\n",
" <td>6.0</td>\n",
" <td>-122.880000</td>\n",
" <td>-339.450000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>2022</td>\n",
" <td>Psychiatrist</td>\n",
" <td>147.410000</td>\n",
" <td>8</td>\n",
" <td>1179.28</td>\n",
" <td>187.700000</td>\n",
" <td>7.0</td>\n",
" <td>-282.030000</td>\n",
" <td>147.410000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>2023</td>\n",
" <td>Dentist</td>\n",
" <td>160.110000</td>\n",
" <td>9</td>\n",
" <td>1440.99</td>\n",
" <td>103.746667</td>\n",
" <td>6.0</td>\n",
" <td>338.180000</td>\n",
" <td>480.330000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>2023</td>\n",
" <td>General Physician</td>\n",
" <td>118.025714</td>\n",
" <td>7</td>\n",
" <td>826.18</td>\n",
" <td>83.231111</td>\n",
" <td>9.0</td>\n",
" <td>313.151429</td>\n",
" <td>-236.051429</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>2023</td>\n",
" <td>Physiotherapy</td>\n",
" <td>174.858333</td>\n",
" <td>6</td>\n",
" <td>1049.15</td>\n",
" <td>113.150000</td>\n",
" <td>3.0</td>\n",
" <td>185.125000</td>\n",
" <td>524.575000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>2023</td>\n",
" <td>Psychiatrist</td>\n",
" <td>215.427500</td>\n",
" <td>4</td>\n",
" <td>861.71</td>\n",
" <td>147.410000</td>\n",
" <td>8.0</td>\n",
" <td>544.140000</td>\n",
" <td>-861.710000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" year claim_type mean count sum mean_lag \\\n",
"0 2021 Dentist 122.713333 9 1104.42 NaN \n",
"1 2021 General Physician 99.073333 6 594.44 NaN \n",
"2 2021 Physiotherapy 133.630000 6 801.78 NaN \n",
"3 2021 Psychiatrist 187.700000 7 1313.90 NaN \n",
"4 2022 Dentist 103.746667 6 622.48 122.713333 \n",
"5 2022 General Physician 83.231111 9 749.08 99.073333 \n",
"6 2022 Physiotherapy 113.150000 3 339.45 133.630000 \n",
"7 2022 Psychiatrist 147.410000 8 1179.28 187.700000 \n",
"8 2023 Dentist 160.110000 9 1440.99 103.746667 \n",
"9 2023 General Physician 118.025714 7 826.18 83.231111 \n",
"10 2023 Physiotherapy 174.858333 6 1049.15 113.150000 \n",
"11 2023 Psychiatrist 215.427500 4 861.71 147.410000 \n",
"\n",
" count_lag inner mix \n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 9.0 -170.700000 -311.240000 \n",
"5 6.0 -95.053333 249.693333 \n",
"6 6.0 -122.880000 -339.450000 \n",
"7 7.0 -282.030000 147.410000 \n",
"8 6.0 338.180000 480.330000 \n",
"9 9.0 313.151429 -236.051429 \n",
"10 3.0 185.125000 524.575000 \n",
"11 8.0 544.140000 -861.710000 "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"metric = 'amount'\n",
"period = 'year'\n",
"dimension = 'claim_type'\n",
"\n",
"totals = (\n",
" claims_df\n",
" .groupby([period, dimension])\n",
" [metric]\n",
" .agg(['mean', 'count', 'sum'])\n",
" .reset_index()\n",
" .sort_values(period)\n",
")\n",
"\n",
"totals['mean_lag'] = totals.groupby(dimension)['mean'].shift(1)\n",
"totals['count_lag'] = totals.groupby(dimension)['count'].shift(1)\n",
"totals['inner'] = totals.eval('(mean - mean_lag) * count_lag')\n",
"totals['mix'] = totals.eval('(count - count_lag) * mean')\n",
"totals"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"| year | 0 |\n",
"|-------:|--------:|\n",
"| 2021 | 0 |\n",
"| 2022 | -924.25 |\n",
"| 2023 | 1287.74 |\n"
]
}
],
"source": [
"print(totals.groupby('year').apply(lambda x: (x.inner + x.mix).sum()).to_markdown())"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"| person | claim_type | date | year | amount | status |\n",
"|:---------|:-------------|:--------------------|-------:|---------:|:---------|\n",
"| Jane | Dentist | 2023-03-26 00:00:00 | 2023 | 136.11 | NEW |\n",
"| Megan | Dentist | 2023-06-01 00:00:00 | 2023 | 138.99 | EXISTING |\n",
"| John | Psychiatrist | 2021-10-09 00:00:00 | 2021 | 168.82 | NEW |\n",
"| Emily | Psychiatrist | 2024-10-16 00:00:00 | 2024 | 132.29 | EXISTING |\n",
"| Michael | Dentist | 2023-10-15 00:00:00 | 2023 | 145.39 | NEW |\n"
]
}
],
"source": [
"import collections\n",
"import random\n",
"import names # This library generates human names\n",
"import pandas as pd\n",
"\n",
"random.seed(42)\n",
"\n",
"# Function to generate a random cost based on the claim type and year\n",
"def generate_claim_cost(claim_type, year):\n",
" if claim_type == 'Dentist':\n",
" base_cost = 100\n",
" elif claim_type == 'Psychiatrist':\n",
" base_cost = 150\n",
"\n",
" # Adjust cost based on year\n",
" if year == 2021:\n",
" base_cost *= 1.2\n",
" elif year == 2023:\n",
" base_cost *= 1.5\n",
"\n",
" # Add some random variation\n",
" cost = random.uniform(base_cost - 20, base_cost + 20)\n",
" return round(cost, 2)\n",
"\n",
"# Generating sample data\n",
"claim_types = ['Dentist', 'Psychiatrist']\n",
"years = [2021, 2022, 2023, 2024]\n",
"people = ['John', 'Jane', 'Michael', 'Emily', 'William']\n",
"\n",
"data = []\n",
"for year in years:\n",
" new_people = (\n",
" [names.get_first_name() for _ in range(random.randint(1, 3))]\n",
" if year > 2021\n",
" else []\n",
" )\n",
" existing_people = [person for person in people if random.random() > 0.3]\n",
" people_this_year = existing_people + new_people\n",
" people.extend(new_people)\n",
"\n",
" for person in people_this_year:\n",
" num_claims = random.randint(1, 5) # Random number of claims per existing customer per year\n",
" for _ in range(num_claims):\n",
" claim_type = random.choice(claim_types)\n",
" cost = generate_claim_cost(claim_type, year)\n",
" date = pd.to_datetime(f\"{random.randint(1, 12)}/{random.randint(1, 28)}/{year}\", format='%m/%d/%Y')\n",
" data.append([person, claim_type, date, year, cost])\n",
"\n",
"# Create the DataFrame\n",
"columns = ['person', 'claim_type', 'date', 'year', 'amount']\n",
"claims_df = pd.DataFrame(data, columns=columns)\n",
"\n",
"# Indicate whether people are existing, new, or returning\n",
"years_seen = collections.defaultdict(set)\n",
"statuses = []\n",
"for claim in claims_df.to_dict(orient='records'):\n",
" years_seen[claim['person']].add(claim['year'])\n",
" if claim['year'] - 1 in years_seen[claim['person']]:\n",
" statuses.append('EXISTING')\n",
" elif any(year < claim['year'] for year in years_seen[claim['person']]):\n",
" statuses.append('RETURNING')\n",
" elif not {year for year in years_seen[claim['person']] if year != claim['year']}:\n",
" statuses.append('NEW')\n",
"\n",
"claims_df['status'] = statuses\n",
"\n",
"print(claims_df.sample(5).to_markdown(index=False))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"| year | 0 |\n",
"|-------:|--------:|\n",
"| 2021 | 0 |\n",
"| 2022 | -924.25 |\n",
"| 2023 | 1287.74 |\n"
]
}
],
"source": [
"print(totals.groupby('year').apply(lambda x: (x.inner + x.mix).sum()).to_markdown())"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"| year | sum | diff |\n",
"|-------:|--------:|---------:|\n",
"| 2021 | 1312.28 | nan |\n",
"| 2022 | 676.06 | -636.22 |\n",
"| 2023 | 5191.31 | 4515.25 |\n",
"| 2024 | 1966.73 | -3224.58 |\n"
]
}
],
"source": [
"sums = claims_df.groupby('year')['amount'].sum()\n",
"sums = pd.DataFrame({'sum': sums, 'diff': sums - sums.shift()})\n",
"print(sums.to_markdown())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Ratios\n",
"\n",
"- Cost by claim\n",
"- Cost by user\n",
"- Footprint by product\n",
"- Footprint by gram"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"| year | average | diff |\n",
"|-------:|----------:|---------:|\n",
"| 2021 | 145.809 | nan |\n",
"| 2022 | 112.677 | -33.1322 |\n",
"| 2023 | 173.044 | 60.367 |\n",
"| 2024 | 122.921 | -50.123 |\n"
]
}
],
"source": [
"averages = claims_df.groupby('year')['amount'].mean()\n",
"averages = pd.DataFrame({'average': averages, 'diff': averages - averages.shift()})\n",
"print(averages.to_markdown())"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"| year | claim_type | sum | count |\n",
"|-------:|:-------------|--------:|--------:|\n",
"| 2021 | Dentist | 614.36 | 5 |\n",
"| 2021 | Psychiatrist | 697.92 | 4 |\n",
"| 2022 | Dentist | 393.5 | 4 |\n",
"| 2022 | Psychiatrist | 282.56 | 2 |\n",
"| 2023 | Dentist | 2967.3 | 20 |\n"
]
}
],
"source": [
"metric = 'amount'\n",
"period = 'year'\n",
"dimension = 'claim_type'\n",
"\n",
"decomp = (\n",
" claims_df\n",
" .groupby([period, dimension], dropna=True)\n",
" [metric].agg(['sum', 'count'])\n",
" .reset_index()\n",
" .sort_values(period)\n",
")\n",
"print(decomp.head().to_markdown(index=False))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>year</th>\n",
" <th>claim_type</th>\n",
" <th>sum</th>\n",
" <th>count</th>\n",
" <th>mean</th>\n",
" <th>share</th>\n",
" <th>global_mean</th>\n",
" <th>mean_lag</th>\n",
" <th>share_lag</th>\n",
" <th>global_mean_lag</th>\n",
" <th>inner</th>\n",
" <th>mix</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2021</td>\n",
" <td>Dentist</td>\n",
" <td>614.36</td>\n",
" <td>5</td>\n",
" <td>122.87200</td>\n",
" <td>0.555556</td>\n",
" <td>145.808889</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2021</td>\n",
" <td>Psychiatrist</td>\n",
" <td>697.92</td>\n",
" <td>4</td>\n",
" <td>174.48000</td>\n",
" <td>0.444444</td>\n",
" <td>145.808889</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2022</td>\n",
" <td>Dentist</td>\n",
" <td>393.50</td>\n",
" <td>4</td>\n",
" <td>98.37500</td>\n",
" <td>0.666667</td>\n",
" <td>112.676667</td>\n",
" <td>122.872</td>\n",
" <td>0.555556</td>\n",
" <td>145.808889</td>\n",
" <td>-16.331333</td>\n",
" <td>-2.548543</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2022</td>\n",
" <td>Psychiatrist</td>\n",
" <td>282.56</td>\n",
" <td>2</td>\n",
" <td>141.28000</td>\n",
" <td>0.333333</td>\n",
" <td>112.676667</td>\n",
" <td>174.480</td>\n",
" <td>0.444444</td>\n",
" <td>145.808889</td>\n",
" <td>-11.066667</td>\n",
" <td>-3.185679</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2023</td>\n",
" <td>Dentist</td>\n",
" <td>2967.30</td>\n",
" <td>20</td>\n",
" <td>148.36500</td>\n",
" <td>0.666667</td>\n",
" <td>173.043667</td>\n",
" <td>98.375</td>\n",
" <td>0.666667</td>\n",
" <td>112.676667</td>\n",
" <td>33.326667</td>\n",
" <td>-0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2023</td>\n",
" <td>Psychiatrist</td>\n",
" <td>2224.01</td>\n",
" <td>10</td>\n",
" <td>222.40100</td>\n",
" <td>0.333333</td>\n",
" <td>173.043667</td>\n",
" <td>141.280</td>\n",
" <td>0.333333</td>\n",
" <td>112.676667</td>\n",
" <td>27.040333</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>2024</td>\n",
" <td>Dentist</td>\n",
" <td>781.46</td>\n",
" <td>8</td>\n",
" <td>97.68250</td>\n",
" <td>0.500000</td>\n",
" <td>122.920625</td>\n",
" <td>148.365</td>\n",
" <td>0.666667</td>\n",
" <td>173.043667</td>\n",
" <td>-25.341250</td>\n",
" <td>4.113111</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>2024</td>\n",
" <td>Psychiatrist</td>\n",
" <td>1185.27</td>\n",
" <td>8</td>\n",
" <td>148.15875</td>\n",
" <td>0.500000</td>\n",
" <td>122.920625</td>\n",
" <td>222.401</td>\n",
" <td>0.333333</td>\n",
" <td>173.043667</td>\n",
" <td>-37.121125</td>\n",
" <td>8.226222</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" year claim_type sum count mean share global_mean \\\n",
"0 2021 Dentist 614.36 5 122.87200 0.555556 145.808889 \n",
"1 2021 Psychiatrist 697.92 4 174.48000 0.444444 145.808889 \n",
"2 2022 Dentist 393.50 4 98.37500 0.666667 112.676667 \n",
"3 2022 Psychiatrist 282.56 2 141.28000 0.333333 112.676667 \n",
"4 2023 Dentist 2967.30 20 148.36500 0.666667 173.043667 \n",
"5 2023 Psychiatrist 2224.01 10 222.40100 0.333333 173.043667 \n",
"6 2024 Dentist 781.46 8 97.68250 0.500000 122.920625 \n",
"7 2024 Psychiatrist 1185.27 8 148.15875 0.500000 122.920625 \n",
"\n",
" mean_lag share_lag global_mean_lag inner mix \n",
"0 NaN NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN NaN \n",
"2 122.872 0.555556 145.808889 -16.331333 -2.548543 \n",
"3 174.480 0.444444 145.808889 -11.066667 -3.185679 \n",
"4 98.375 0.666667 112.676667 33.326667 -0.000000 \n",
"5 141.280 0.333333 112.676667 27.040333 0.000000 \n",
"6 148.365 0.666667 173.043667 -25.341250 4.113111 \n",
"7 222.401 0.333333 173.043667 -37.121125 8.226222 "
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"decomp['mean'] = decomp.eval('sum / count')\n",
"decomp['share'] = decomp['count'] / decomp.groupby('year')['count'].transform('sum')\n",
"decomp['global_mean'] = (\n",
" decomp.groupby('year')['sum'].transform('sum') /\n",
" decomp.groupby('year')['count'].transform('sum')\n",
")\n",
"decomp['mean_lag'] = decomp.groupby(dimension)['mean'].shift(1)\n",
"decomp['share_lag'] = decomp.groupby(dimension)['share'].shift(1)\n",
"decomp['global_mean_lag'] = decomp.groupby(dimension)['global_mean'].shift(1)\n",
"decomp['inner'] = decomp.eval('share * (mean - mean_lag)')\n",
"decomp['mix'] = decomp.eval('(share - share_lag) * (mean_lag - global_mean_lag)')\n",
"decomp"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"| year | 0 |\n",
"|-------:|---------:|\n",
"| 2021 | 0 |\n",
"| 2022 | -33.1322 |\n",
"| 2023 | 60.367 |\n",
"| 2024 | -50.123 |\n"
]
}
],
"source": [
"print(decomp.groupby('year').apply(lambda x: (x.inner + x.mix).sum()).to_markdown())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### DuckDB"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [
{
"ename": "CatalogException",
"evalue": "Catalog Error: Table with name \"claims\" already exists!",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mCatalogException\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/Users/max/projects/maxhalford.github.io/decomp.ipynb Cell 15\u001b[0m in \u001b[0;36m3\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/max/projects/maxhalford.github.io/decomp.ipynb#X40sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mduckdb\u001b[39;00m\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/max/projects/maxhalford.github.io/decomp.ipynb#X40sZmlsZQ%3D%3D?line=2'>3</a>\u001b[0m duckdb\u001b[39m.\u001b[39;49msql(\u001b[39m\"\u001b[39;49m\u001b[39mCREATE TABLE claims AS SELECT * FROM claims_df\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n",
"\u001b[0;31mCatalogException\u001b[0m: Catalog Error: Table with name \"claims\" already exists!"
]
}
],
"source": [
"import duckdb\n",
"\n",
"duckdb.sql(\"CREATE TABLE claims AS SELECT * FROM claims_df\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment