Skip to content

Instantly share code, notes, and snippets.

@gitgithan
Last active April 25, 2024 03:08
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save gitgithan/0ba595e3ef9cf8fab7deeb7b8b533ba3 to your computer and use it in GitHub Desktop.
Save gitgithan/0ba595e3ef9cf8fab7deeb7b8b533ba3 to your computer and use it in GitHub Desktop.
Exploring stack, melt, pivot_table
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from IPython.core.interactiveshell import InteractiveShell\n",
"InteractiveShell.ast_node_interactivity = \"all\"\n",
"\n",
"import pandas as pd\n",
"\n",
"#----------------Transformation pipelines--------------------\n",
"#1. df-->stack-->reset_index-->pivot-->df\n",
"#2. df-->stack-->convert to dataframe-->pivot-->df\n",
"#3. df-->melt-->add index-->pivot-->df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. df-->stack-->reset_index-->pivot-->df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"***********************************dataframe************************************\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>cat</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dog</th>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" weight height\n",
"cat 0 1\n",
"dog 2 3"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"*******************************stacked dataframe********************************\n"
]
},
{
"data": {
"text/plain": [
"cat weight 0\n",
" height 1\n",
"dog weight 2\n",
" height 3\n",
"dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"*********************stacked dataframe with index in column*********************\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>level_1</th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>cat</th>\n",
" <td>weight</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>cat</th>\n",
" <td>height</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dog</th>\n",
" <td>weight</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dog</th>\n",
" <td>height</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" level_1 0\n",
"cat weight 0\n",
"cat height 1\n",
"dog weight 2\n",
"dog height 3"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"*****pivot_table recovered original dataframe (with extra name for columns)*****\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>height</th>\n",
" <th>weight</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>cat</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dog</th>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" height weight\n",
"cat 1 0\n",
"dog 3 2"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df= pd.DataFrame([[0, 1], [2, 3]],index=['cat', 'dog'],columns=['weight', 'height'])\n",
"\n",
"print('{:*^80}'.format('dataframe'))\n",
"df\n",
"\n",
"print('{:*^80}'.format('stacked dataframe'))\n",
"df.stack()\n",
"\n",
"print('{:*^80}'.format('stacked dataframe with index in column'))\n",
"df.stack().reset_index(level = 1) #AttributeError: 'Series' object has no attribute 'pivot_table' , so must convert to dataframe before pivot_table\n",
"stacked = df.stack().reset_index(level = 1) \n",
"\n",
"print('{:*^80}'.format('pivot_table recovered original dataframe (with extra name for columns)'))\n",
"recovered_df1 = df.stack().reset_index(level = 1).pivot_table(index = stacked.index, columns = 'level_1',values = 0) #pivot_table orders columns alphabetically,specifying values parameter prevents creation of useless multiindex column \n",
"recovered_df1.columns.name = None #remove 'level_1' column.name\n",
"recovered_df1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. df-->stack-->convert to dataframe-->pivot-->df "
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"pandas.core.indexes.base.Index"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(stacked_df.index.get_level_values(0))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"***********************************dataframe************************************\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>cat</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dog</th>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" weight height\n",
"cat 0 1\n",
"dog 2 3"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"**************stack and convert to dataframe to expose pivot_table**************\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">cat</th>\n",
" <th>weight</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>height</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">dog</th>\n",
" <th>weight</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>height</th>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0\n",
"cat weight 0\n",
" height 1\n",
"dog weight 2\n",
" height 3"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"***************rather than unstack, pivot_table achieves the same***************\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>height</th>\n",
" <th>weight</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>cat</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dog</th>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" height weight\n",
"cat 1 0\n",
"dog 3 2"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('{:*^80}'.format('dataframe'))\n",
"df\n",
"print('{:*^80}'.format('stack and convert to dataframe to expose pivot_table'))\n",
"stacked_df = pd.DataFrame(df.stack())\n",
"stacked_df\n",
"\n",
"print('{:*^80}'.format('rather than unstack, pivot_table achieves the same'))\n",
"idx_lvl0, idx_lvl1 = stacked_df.index.get_level_values(0), stacked_df.index.get_level_values(1)\n",
"recovered_df2 = stacked_df.pivot_table(index=idx_lvl0,columns = idx_lvl1,values = 0)\n",
"recovered_df2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. df-->melt-->add index-->pivot-->df "
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"***********************************dataframe************************************\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>cat</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dog</th>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" weight height\n",
"cat 0 1\n",
"dog 2 3"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"************************melting loses index information*************************\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>variable</th>\n",
" <th>value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>weight</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>weight</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>height</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>height</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" variable value\n",
"0 weight 0\n",
"1 weight 2\n",
"2 height 1\n",
"3 height 3"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"*****************************manually enrich index******************************\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>variable</th>\n",
" <th>value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>cat</th>\n",
" <td>weight</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dog</th>\n",
" <td>weight</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>cat</th>\n",
" <td>height</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dog</th>\n",
" <td>height</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" variable value\n",
"cat weight 0\n",
"dog weight 2\n",
"cat height 1\n",
"dog height 3"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"*****pivot_table recovered original dataframe (with extra name for columns)*****\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>height</th>\n",
" <th>weight</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>cat</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dog</th>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" height weight\n",
"cat 1 0\n",
"dog 3 2"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('{:*^80}'.format('dataframe'))\n",
"df\n",
"print('{:*^80}'.format('melting loses index information'))\n",
"melted = df.melt() #melt appends columns into new \"variable\" column, while stack adds columns to new inner index layer (same information end up different places)\n",
"melted\n",
"\n",
"print('{:*^80}'.format('manually enrich index')) # until this is solved: https://github.com/pandas-dev/pandas/issues/17440\n",
"melted.index = ['cat','dog']*2 #list(df.index)*len(df.columns) for more generalizable index generation\n",
"melted \n",
"\n",
"print('{:*^80}'.format('pivot_table recovered original dataframe (with extra name for columns)'))\n",
"recovered_df3 = melted.pivot_table(index = melted.index, columns = 'variable',values = 'value')\n",
"recovered_df3.columns.name=None #remove 'variable' column.name\n",
"recovered_df3\n",
"\n",
"#melting loses index while pivot_table requires index parameter"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment