Skip to content

Instantly share code, notes, and snippets.

@walkerh
Last active March 30, 2022 16:22
Show Gist options
  • Save walkerh/a697f0d530a8836c2daa8d28809c7892 to your computer and use it in GitHub Desktop.
Save walkerh/a697f0d530a8836c2daa8d28809c7892 to your computer and use it in GitHub Desktop.
Combining Aggregation With Column Update
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "357ab29a",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "49cc1fbf",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>m</th>\n",
" <td>a</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>n</th>\n",
" <td>a</td>\n",
" <td>2</td>\n",
" <td>b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>o</th>\n",
" <td>a</td>\n",
" <td>3</td>\n",
" <td>c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>p</th>\n",
" <td>a</td>\n",
" <td>4</td>\n",
" <td>d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>q</th>\n",
" <td>b</td>\n",
" <td>1</td>\n",
" <td>e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>r</th>\n",
" <td>b</td>\n",
" <td>2</td>\n",
" <td>f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>s</th>\n",
" <td>b</td>\n",
" <td>3</td>\n",
" <td>g</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C\n",
"m a 1 a\n",
"n a 2 b\n",
"o a 3 c\n",
"p a 4 d\n",
"q b 1 e\n",
"r b 2 f\n",
"s b 3 g"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df0 = pd.DataFrame(\n",
" dict(\n",
" A=iter(\"aaaabbb\"),\n",
" B=[1, 2, 3, 4, 1, 2, 3],\n",
" C=iter(\"abcdefg\"),\n",
" ),\n",
" index=iter(\"mnopqrs\"),\n",
")\n",
"# Note:\n",
"# - the custom index\n",
"# - C has a unique value for every row\n",
"df0"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b1212a70",
"metadata": {},
"outputs": [],
"source": [
"g = df0.groupby(\"A\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "49a8f037",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"A\n",
"a 4\n",
"b 3\n",
"Name: B, dtype: int64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"m = g[\"B\"].max()\n",
"m"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "471f717c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"pandas.core.series.Series"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(m)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "fd879d6a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>B_max</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>m</th>\n",
" <td>a</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>n</th>\n",
" <td>a</td>\n",
" <td>2</td>\n",
" <td>b</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>o</th>\n",
" <td>a</td>\n",
" <td>3</td>\n",
" <td>c</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>p</th>\n",
" <td>a</td>\n",
" <td>4</td>\n",
" <td>d</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>q</th>\n",
" <td>b</td>\n",
" <td>1</td>\n",
" <td>e</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>r</th>\n",
" <td>b</td>\n",
" <td>2</td>\n",
" <td>f</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>s</th>\n",
" <td>b</td>\n",
" <td>3</td>\n",
" <td>g</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C B_max\n",
"m a 1 a 4\n",
"n a 2 b 4\n",
"o a 3 c 4\n",
"p a 4 d 4\n",
"q b 1 e 3\n",
"r b 2 f 3\n",
"s b 3 g 3"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1 = df0.join(m, on=\"A\", rsuffix=\"_max\")\n",
"df1"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "eb00c973",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>B_max</th>\n",
" <th>must_change</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>m</th>\n",
" <td>a</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" <td>4</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>n</th>\n",
" <td>a</td>\n",
" <td>2</td>\n",
" <td>b</td>\n",
" <td>4</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>o</th>\n",
" <td>a</td>\n",
" <td>3</td>\n",
" <td>c</td>\n",
" <td>4</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>p</th>\n",
" <td>a</td>\n",
" <td>4</td>\n",
" <td>d</td>\n",
" <td>4</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>q</th>\n",
" <td>b</td>\n",
" <td>1</td>\n",
" <td>e</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>r</th>\n",
" <td>b</td>\n",
" <td>2</td>\n",
" <td>f</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>s</th>\n",
" <td>b</td>\n",
" <td>3</td>\n",
" <td>g</td>\n",
" <td>3</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C B_max must_change\n",
"m a 1 a 4 False\n",
"n a 2 b 4 False\n",
"o a 3 c 4 False\n",
"p a 4 d 4 True\n",
"q b 1 e 3 False\n",
"r b 2 f 3 False\n",
"s b 3 g 3 True"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1[\"must_change\"] = df1.B == df1.B_max\n",
"df1"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "e6c78a64",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>C</th>\n",
" <th>must_change</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>m</th>\n",
" <td>a</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>n</th>\n",
" <td>b</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>o</th>\n",
" <td>c</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>p</th>\n",
" <td>d</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>q</th>\n",
" <td>e</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>r</th>\n",
" <td>f</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>s</th>\n",
" <td>g</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" C must_change\n",
"m a False\n",
"n b False\n",
"o c False\n",
"p d True\n",
"q e False\n",
"r f False\n",
"s g True"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sub = df1[[\"C\", \"must_change\"]]\n",
"sub"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "45659bd6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('m', 'a', False),\n",
" ('n', 'b', False),\n",
" ('o', 'c', False),\n",
" ('p', 'd', True),\n",
" ('q', 'e', False),\n",
" ('r', 'f', False),\n",
" ('s', 'g', True)]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[(i, value, must_change) for i, value, must_change in sub.to_records()]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "3b462905",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['a', 'b', 'c', 'y', 'e', 'f', 'y']"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[\"y\" if must_change else value for i, value, must_change in sub.to_records()]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "dc40cf69",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>B_max</th>\n",
" <th>must_change</th>\n",
" <th>C_new</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>m</th>\n",
" <td>a</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" <td>4</td>\n",
" <td>False</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>n</th>\n",
" <td>a</td>\n",
" <td>2</td>\n",
" <td>b</td>\n",
" <td>4</td>\n",
" <td>False</td>\n",
" <td>b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>o</th>\n",
" <td>a</td>\n",
" <td>3</td>\n",
" <td>c</td>\n",
" <td>4</td>\n",
" <td>False</td>\n",
" <td>c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>p</th>\n",
" <td>a</td>\n",
" <td>4</td>\n",
" <td>d</td>\n",
" <td>4</td>\n",
" <td>True</td>\n",
" <td>y</td>\n",
" </tr>\n",
" <tr>\n",
" <th>q</th>\n",
" <td>b</td>\n",
" <td>1</td>\n",
" <td>e</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>r</th>\n",
" <td>b</td>\n",
" <td>2</td>\n",
" <td>f</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>s</th>\n",
" <td>b</td>\n",
" <td>3</td>\n",
" <td>g</td>\n",
" <td>3</td>\n",
" <td>True</td>\n",
" <td>y</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C B_max must_change C_new\n",
"m a 1 a 4 False a\n",
"n a 2 b 4 False b\n",
"o a 3 c 4 False c\n",
"p a 4 d 4 True y\n",
"q b 1 e 3 False e\n",
"r b 2 f 3 False f\n",
"s b 3 g 3 True y"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Use a Series to collect the results of a generator expression.\n",
"# That way we don't waste memory constructing a list object.\n",
"# Using a Series requires us to handle the possibility of a non-trivial index in the original DataFrame.\n",
"df1[\"C_new\"] = pd.Series(\n",
" (\"y\" if must_change else value for _, value, must_change in sub.to_records()),\n",
" index=sub.index,\n",
")\n",
"df1"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment