Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jorisvandenbossche/38bf6bb776092bc6fda2b2967b85b43d to your computer and use it in GitHub Desktop.
Save jorisvandenbossche/38bf6bb776092bc6fda2b2967b85b43d to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## DataFrame with non-numeric key"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame({\"key\": ['val1', 'val1', 'val2'], \"values\": [1, 2, 3]})"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>key</th>\n",
" <th>values</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>val1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>val1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>val2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" key values\n",
"0 val1 1\n",
"1 val1 2\n",
"2 val2 3"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Key as column"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On master (after https://github.com/pandas-dev/pandas/pull/38737):"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">val1</th>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val2</th>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" values\n",
"key \n",
"val1 0 1.0\n",
" 1 3.0\n",
"val2 2 3.0"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(\"key\").rolling(2, min_periods=1).sum()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Previous versions:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.0.5</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">val1</th>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val2</th>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               <table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.1.5</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">val1</th>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val2</th>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               <table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.2.0</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">val1</th>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val2</th>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display_results(\"string-column-group_keys=True\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"So here, the key column becomes an index level (and is prepended to the existing index)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"With `group_keys=False`:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">val1</th>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val2</th>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" values\n",
"key \n",
"val1 0 1.0\n",
" 1 3.0\n",
"val2 2 3.0"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(\"key\", group_keys=False).rolling(2, min_periods=1).sum()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.0.5</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               <table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.1.5</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">val1</th>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val2</th>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               <table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.2.0</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">val1</th>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val2</th>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display_results(\"string-column-group_keys=False\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Previously (but only in 1.0.x), this ensured that the group columns were not prepended, but then that also means they are lost."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If we compare this to other groupby methods:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" values\n",
"0 1\n",
"1 2\n",
"2 3"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(\"key\").transform(lambda x: x)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" values\n",
"0 1.0\n",
"1 3.0\n",
"2 3.0"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(\"key\").transform(lambda x: x.rolling(2, min_periods=1).sum())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"def f(group):\n",
" print(group)\n",
" return group"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" key values\n",
"0 val1 1\n",
"1 val1 2\n",
" key values\n",
"2 val2 3\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>key</th>\n",
" <th>values</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>val1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>val1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>val2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" key values\n",
"0 val1 1\n",
"1 val1 2\n",
"2 val2 3"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(\"key\").apply(f)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" values\n",
"0 1.0\n",
"1 3.0\n",
"2 3.0"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(\"key\").apply(lambda x: x.rolling(2, min_periods=1).sum())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Key as index"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"df2 = df.set_index(\"key\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Master (after https://github.com/pandas-dev/pandas/pull/38737):"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">val1</th>\n",
" <th>val1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val2</th>\n",
" <th>val2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" values\n",
"key key \n",
"val1 val1 1.0\n",
" val1 3.0\n",
"val2 val2 3.0"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.groupby(\"key\").rolling(2, min_periods=1).sum()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.0.5</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">val1</th>\n",
" <th>val1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val2</th>\n",
" <th>val2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               <table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.1.5</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               <table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.2.0</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display_results(\"string-index-group_keys=True\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"So here, again the keys are prepended as index levels. But now (on master) the keys are not dropped (like it is the case for the columns), resulting in duplicated index levels.\n",
"\n",
"Somewhat accidentally, this was changed in 1.1.5 / 1.2.0."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"With `group_keys=False`, we could already achieve this behaviour in 1.0.x as well, but it no longer works on master:"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">val1</th>\n",
" <th>val1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val2</th>\n",
" <th>val2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" values\n",
"key key \n",
"val1 val1 1.0\n",
" val1 3.0\n",
"val2 val2 3.0"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.groupby(\"key\", group_keys=False).rolling(2, min_periods=1).sum()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.0.5</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               <table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.1.5</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               <table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.2.0</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display_results(\"string-index-group_keys=False\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If we compare this to other groupby methods:"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val2</th>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" values\n",
"key \n",
"val1 1\n",
"val1 2\n",
"val2 3"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.groupby(\"key\").transform(lambda x: x)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" values\n",
"key \n",
"val1 1.0\n",
"val1 3.0\n",
"val2 3.0"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.groupby(\"key\").transform(lambda x: x.rolling(2, min_periods=1).sum())"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"def f(group):\n",
" print(group)\n",
" return group"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" values\n",
"key \n",
"val1 1\n",
"val1 2\n",
" values\n",
"key \n",
"val2 3\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val2</th>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" values\n",
"key \n",
"val1 1\n",
"val1 2\n",
"val2 3"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.groupby(\"key\").apply(f)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>val2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" values\n",
"key \n",
"val1 1.0\n",
"val1 3.0\n",
"val2 3.0"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.groupby(\"key\").apply(lambda x: x.rolling(2, min_periods=1).sum())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## DataFrame with numeric key"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame({\"key\": [1, 1, 2], \"values\": [1, 2, 3]})"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>key</th>\n",
" <th>values</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" key values\n",
"0 1 1\n",
"1 1 2\n",
"2 2 3"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Key as column"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On master (after https://github.com/pandas-dev/pandas/pull/38737):"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>key</th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">1</th>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <th>2</th>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" key values\n",
"key \n",
"1 0 1.0 1.0\n",
" 1 2.0 3.0\n",
"2 2 2.0 3.0"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(\"key\").rolling(2, min_periods=1).sum()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Previous versions:"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.0.5</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>key</th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">1</th>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <th>2</th>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               <table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.1.5</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>key</th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">1</th>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <th>2</th>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               <table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.2.0</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>key</th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">1</th>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <th>2</th>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display_results(\"numeric-column-group_keys=True\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"So here, the key column becomes an index level (and is prepended to the existing index), but the key column is *also* kept in the dataframe which is \"rolled\"."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"With `group_keys=False`:"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>key</th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">1</th>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <th>2</th>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" key values\n",
"key \n",
"1 0 1.0 1.0\n",
" 1 2.0 3.0\n",
"2 2 2.0 3.0"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(\"key\", group_keys=False).rolling(2, min_periods=1).sum()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.0.5</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>key</th>\n",
" <th>values</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               <table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.1.5</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>key</th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">1</th>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <th>2</th>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               <table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.2.0</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>key</th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">1</th>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <th>2</th>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display_results(\"numeric-column-group_keys=False\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Previously (but only in 1.0.x), this ensured that the group columns were not prepended, but then that also means they are lost."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If we compare this to other groupby methods:"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" values\n",
"0 1\n",
"1 2\n",
"2 3"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(\"key\").transform(lambda x: x)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" values\n",
"0 1.0\n",
"1 3.0\n",
"2 3.0"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(\"key\").transform(lambda x: x.rolling(2, min_periods=1).sum())"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"def f(group):\n",
" print(group)\n",
" return group"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" key values\n",
"0 1 1\n",
"1 1 2\n",
" key values\n",
"2 2 3\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>key</th>\n",
" <th>values</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" key values\n",
"0 1 1\n",
"1 1 2\n",
"2 2 3"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(\"key\").apply(f)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>key</th>\n",
" <th>values</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" key values\n",
"0 1.0 1.0\n",
"1 2.0 3.0\n",
"2 2.0 3.0"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(\"key\").apply(lambda x: x.rolling(2, min_periods=1).sum())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Key as index"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"df2 = df.set_index(\"key\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Master (after https://github.com/pandas-dev/pandas/pull/38737):"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">1</th>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" values\n",
"key key \n",
"1 1 1.0\n",
" 1 3.0\n",
"2 2 3.0"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.groupby(\"key\").rolling(2, min_periods=1).sum()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.0.5</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">1</th>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               <table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.1.5</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               <table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.2.0</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display_results(\"numeric-index-group_keys=True\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"So here, again the keys are prepended as index levels. But now (on master) the keys are not dropped (like it is the case for the columns), resulting in duplicated index levels.\n",
"\n",
"Somewhat accidentally, this was changed in 1.1.5 / 1.2.0."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"With `group_keys=False`, we could already achieve this behaviour in 1.0.x as well, but it no longer works on master:"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">1</th>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" values\n",
"key key \n",
"1 1 1.0\n",
" 1 3.0\n",
"2 2 3.0"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.groupby(\"key\", group_keys=False).rolling(2, min_periods=1).sum()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.0.5</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               <table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.1.5</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               <table style='display:inline' border=\"1\" class=\"dataframe\"><caption>1.2.0</caption>\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>               "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display_results(\"numeric-index-group_keys=False\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If we compare this to other groupby methods:"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" values\n",
"key \n",
"1 1\n",
"1 2\n",
"2 3"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.groupby(\"key\").transform(lambda x: x)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" values\n",
"key \n",
"1 1.0\n",
"1 3.0\n",
"2 3.0"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.groupby(\"key\").transform(lambda x: x.rolling(2, min_periods=1).sum())"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"def f(group):\n",
" print(group)\n",
" return group"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" values\n",
"key \n",
"1 1\n",
"1 2\n",
" values\n",
"key \n",
"2 3\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" values\n",
"key \n",
"1 1\n",
"1 2\n",
"2 3"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.groupby(\"key\").apply(f)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>values</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" values\n",
"key \n",
"1 1.0\n",
"1 3.0\n",
"2 3.0"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.groupby(\"key\").apply(lambda x: x.rolling(2, min_periods=1).sum())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Function to display side by side"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
"def display_side_by_side(dfs: list, captions: list):\n",
" \"\"\"Display tables side by side to save vertical space\"\"\"\n",
" output = \"\"\n",
"\n",
" for df, caption in zip(dfs, captions):\n",
" df_html = df.to_html()\n",
" df_html = df_html.replace('class=\"dataframe\">', f'class=\"dataframe\"><caption>{caption}</caption>')\n",
" output += df_html\n",
" output += \"\\xa0\\xa0\\xa0\"*5\n",
" output = output.replace(\"<table\", \"<table style='display:inline'\")\n",
" display(HTML(output))\n",
"\n",
"\n",
"def display_results(name):\n",
" dfs = []\n",
" for version in [\"1.0.5\", \"1.1.5\", \"1.2.0\"]:\n",
" results = pd.read_pickle(f\"results-pandas-{version}.pkl\")\n",
" dfs.append(results[name])\n",
" display_side_by_side(dfs, [\"1.0.5\", \"1.1.5\", \"1.2.0\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Getting results with different pandas versions"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame({\"key\": ['val1', 'val1', 'val2'], \"values\": [1, 2, 3]})\n",
"df2 = df.set_index(\"key\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"results = {\n",
" \"string-column-group_keys=True\": df.groupby(\"key\").rolling(2, min_periods=1).sum(),\n",
" \"string-column-group_keys=False\": df.groupby(\"key\", group_keys=False).rolling(2, min_periods=1).sum(),\n",
" \"string-index-group_keys=True\": df2.groupby(\"key\").rolling(2, min_periods=1).sum(),\n",
" \"string-index-group_keys=False\": df2.groupby(\"key\", group_keys=False).rolling(2, min_periods=1).sum(),\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame({\"key\": [1, 1, 2], \"values\": [1, 2, 3]})\n",
"df2 = df.set_index(\"key\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"results.update({\n",
" \"numeric-column-group_keys=True\": df.groupby(\"key\").rolling(2, min_periods=1).sum(),\n",
" \"numeric-column-group_keys=False\": df.groupby(\"key\", group_keys=False).rolling(2, min_periods=1).sum(),\n",
" \"numeric-index-group_keys=True\": df2.groupby(\"key\").rolling(2, min_periods=1).sum(),\n",
" \"numeric-index-group_keys=False\": df2.groupby(\"key\", group_keys=False).rolling(2, min_periods=1).sum(),\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"pd.to_pickle(results, f\"results-pandas-{pd.__version__}.pkl\", protocol=4)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (dev)",
"language": "python",
"name": "dev"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment