Skip to content

Instantly share code, notes, and snippets.

@krassowski
Created January 22, 2019 23:20
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save krassowski/0259a2cd2ba774ccd9f69bbcc3187fbf to your computer and use it in GitHub Desktop.
Save krassowski/0259a2cd2ba774ccd9f69bbcc3187fbf to your computer and use it in GitHub Desktop.
Performance and peak-memory evaluation of some explode implementations, see https://stackoverflow.com/q/12680754
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def generate_df(rows_before=5, list_size=10):\n",
" rows = range(rows_before)\n",
" return pd.DataFrame({\n",
" 'var1': rows,\n",
" 'var2': [list(range(list_size)) for _ in rows]\n",
" })"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def explode_concatenate(df, col, fill_value=''):\n",
" \"\"\"CC-BY-SA 3.0 MaxU, https://stackoverflow.com/a/40449726, adapted\"\"\"\n",
" lst_cols = [col]\n",
" \n",
" # all columns except `lst_cols`\n",
" idx_cols = df.columns.difference(lst_cols)\n",
"\n",
" # calculate lengths of lists\n",
" lens = df[lst_cols[0]].str.len()\n",
"\n",
" res = pd.DataFrame({\n",
" col:np.repeat(df[col].values, lens)\n",
" for col in idx_cols\n",
" }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \\\n",
" .loc[:, df.columns]\n",
"\n",
" return res\n",
" \n",
"def explode_dict(df, column: str):\n",
" data = []\n",
" for row in df.itertuples(index=False):\n",
" base = row._asdict()\n",
" for entry in base.pop(column):\n",
" data.append({column: entry, **base})\n",
" res = pd.DataFrame(data)\n",
" return res\n",
"\n",
"def explode_list(df, column: str):\n",
" data = []\n",
" columns = list(df.columns)\n",
" to_explode_index = columns.index(column)\n",
" columns.remove(column)\n",
" columns = [column, *columns]\n",
" for row in df.itertuples(index=False):\n",
" base = [*row[:to_explode_index], *row[to_explode_index + 1:]]\n",
" for entry in row[to_explode_index]:\n",
" data.append([entry, *base])\n",
" res = pd.DataFrame(data, columns=columns)[df.columns]\n",
" return res\n",
"\n",
"def explode_stack(df, column):\n",
" \"\"\"CC-BY-SA 3.0 DMulligan, https://stackoverflow.com/a/28182629, adapted\"\"\"\n",
" other = df.columns.difference([column])[0]\n",
" index = df[other]\n",
" res = pd.DataFrame(df[column].tolist(), index=index).stack().reset_index()[[0, other]]\n",
" res.columns = [column, other]\n",
" return res\n",
"\n",
"def explode_accepted(df, column):\n",
" \"\"\"CC-BY-SA 3.0 Chang She, https://stackoverflow.com/a/12681217, adapted\"\"\"\n",
" other = df.columns.difference([column])[0]\n",
" res = pd.concat([\n",
" pd.Series(row[other], row[column]) \n",
" for _, row in df.iterrows()\n",
" ]).reset_index()\n",
" res.columns = [column, other]\n",
" return res\n",
"\n",
"def tidy_split(df, column, keep=False):\n",
" \"\"\"CC-BY-SA 3.0, Daniel Himmelstein, https://stackoverflow.com/a/39946744, adapted\"\"\"\n",
" indexes = list()\n",
" new_values = list()\n",
" df = df.dropna(subset=[column])\n",
" for i, presplit in enumerate(df[column]):\n",
" values = presplit\n",
" if keep and len(values) > 1:\n",
" indexes.append(i)\n",
" new_values.append(presplit)\n",
" for value in values:\n",
" indexes.append(i)\n",
" new_values.append(value)\n",
" new_df = df.iloc[indexes, :].copy()\n",
" new_df[column] = new_values\n",
" return new_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Small demo:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>[1, 2, 3]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>[1, 2]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a b c\n",
"0 1 3 [1, 2, 3]\n",
"1 2 4 [1, 2]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d = pd.DataFrame({\"a\": [1, 2], \"b\": [3, 4], 'c': [[1, 2, 3], [1, 2]]})\n",
"d"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"functions = [explode_concatenate, tidy_split, explode_accepted, explode_stack, explode_dict, explode_list]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<function explode_concatenate at 0x7fd6dc26f840>\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a b c\n",
"0 1 3 1\n",
"1 1 3 2\n",
"2 1 3 3\n",
"3 2 4 1\n",
"4 2 4 2"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<function tidy_split at 0x7fd6dc2700d0>\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a b c\n",
"0 1 3 1\n",
"0 1 3 2\n",
"0 1 3 3\n",
"1 2 4 1\n",
"1 2 4 2"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<function explode_accepted at 0x7fd6dc26f620>\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>c</th>\n",
" <th>a</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" c a\n",
"0 1 1\n",
"1 2 1\n",
"2 3 1\n",
"3 1 2\n",
"4 2 2"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<function explode_stack at 0x7fd6dc26fae8>\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>c</th>\n",
" <th>a</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1.0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" c a\n",
"0 1.0 1\n",
"1 2.0 1\n",
"2 3.0 1\n",
"3 1.0 2\n",
"4 2.0 2"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<function explode_dict at 0x7fd6dc26f6a8>\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a b c\n",
"0 1 3 1\n",
"1 1 3 2\n",
"2 1 3 3\n",
"3 2 4 1\n",
"4 2 4 2"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<function explode_list at 0x7fd6dc26f7b8>\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a b c\n",
"0 1 3 1\n",
"1 1 3 2\n",
"2 1 3 3\n",
"3 2 4 1\n",
"4 2 4 2"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for func in functions:\n",
" print(func)\n",
" display(func(d, 'c'))"
]
},
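{
"cell_type": "markdown",
"metadata": {},
"source": [
"Two of the functions (`explode_stack` and `explode_accepted`) do not support data frames with additional columns: they keep only the exploded column and one other column, so column `b` is missing from their output above. Moreover, `explode_stack` changes the type of the exploded values (floats instead of integers)."
]
},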
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"not_multi_column = ['stack', 'accepted']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Benchmarking"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"I will run the functions with varying sizes of lists and data frames:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"cases = {\n",
" # case name: (data frame length, list size)\n",
" 'list_much_smaller': (50000, 5),\n",
" 'list_smaller': (5000, 50),\n",
" 'balanced': (500, 500),\n",
" 'list_bigger': (50, 5000),\n",
" 'list_much_bigger': (5, 50000)\n",
"}"
]
},
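{
"cell_type": "markdown",
"metadata": {},
"source": [
"I will also record memory usage, aiming for about 150 readings per function run (the default `memory_record_frequency` of the `benchmark` function below); the peak of these readings is reported."
]
},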
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from memory_profiler import memory_usage"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from copy import copy\n",
"\n",
"\n",
"def benchmark(functions, cases, memory_record_frequency=150):\n",
" measurments = []\n",
" for case, (n_rows, list_len) in cases.items():\n",
" df = generate_df(n_rows, list_len)\n",
" for function in functions:\n",
" time_result = %timeit -o -r 50 function(copy(df), 'var2') \n",
" for run in time_result.all_runs:\n",
" memory = memory_usage((function, [copy(df), 'var2']), interval=time_result.average / memory_record_frequency)\n",
" measurments.append({\n",
" 'case': case,\n",
" 'time': run / time_result.loops,\n",
" 'function': function.__name__.replace('explode_', ''),\n",
" 'peak_memory': max(memory)\n",
" })\n",
" return measurments"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"75.8 ms ± 2.39 ms per loop (mean ± std. dev. of 50 runs, 10 loops each)\n",
"128 ms ± 14.8 ms per loop (mean ± std. dev. of 50 runs, 10 loops each)\n",
"15.3 s ± 1.66 s per loop (mean ± std. dev. of 50 runs, 1 loop each)\n",
"59.9 ms ± 4.84 ms per loop (mean ± std. dev. of 50 runs, 10 loops each)\n",
"415 ms ± 36.3 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)\n",
"166 ms ± 12.7 ms per loop (mean ± std. dev. of 50 runs, 10 loops each)\n",
"25.4 ms ± 2.44 ms per loop (mean ± std. dev. of 50 runs, 10 loops each)\n",
"111 ms ± 6.39 ms per loop (mean ± std. dev. of 50 runs, 10 loops each)\n",
"1.51 s ± 116 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)\n",
"54.2 ms ± 2.98 ms per loop (mean ± std. dev. of 50 runs, 10 loops each)\n",
"301 ms ± 7.06 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)\n",
"129 ms ± 15.4 ms per loop (mean ± std. dev. of 50 runs, 10 loops each)\n",
"18.2 ms ± 245 µs per loop (mean ± std. dev. of 50 runs, 100 loops each)\n",
"169 ms ± 83.5 ms per loop (mean ± std. dev. of 50 runs, 10 loops each)\n",
"446 ms ± 56 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)\n",
"239 ms ± 38.6 ms per loop (mean ± std. dev. of 50 runs, 10 loops each)\n",
"938 ms ± 137 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)\n",
"391 ms ± 64.7 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)\n",
"41.3 ms ± 6.51 ms per loop (mean ± std. dev. of 50 runs, 10 loops each)\n",
"371 ms ± 81.4 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)\n",
"128 ms ± 22.3 ms per loop (mean ± std. dev. of 50 runs, 10 loops each)\n",
"915 ms ± 197 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)\n",
"1 s ± 151 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)\n",
"414 ms ± 58.1 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)\n",
"57.4 ms ± 8.79 ms per loop (mean ± std. dev. of 50 runs, 10 loops each)\n",
"375 ms ± 73.7 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)\n",
"70.1 ms ± 12.8 ms per loop (mean ± std. dev. of 50 runs, 10 loops each)\n",
"7.48 s ± 626 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)\n",
"1.08 s ± 169 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)\n",
"416 ms ± 66.5 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)\n"
]
}
],
"source": [
"m = pd.DataFrame(benchmark(functions, cases))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>case</th>\n",
" <th>function</th>\n",
" <th>peak_memory</th>\n",
" <th>time</th>\n",
" <th>multi_column</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>list_much_smaller</td>\n",
" <td>concatenate</td>\n",
" <td>122.898438</td>\n",
" <td>0.088612</td>\n",
" <td>Supports multiple columns</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>list_much_smaller</td>\n",
" <td>concatenate</td>\n",
" <td>122.835938</td>\n",
" <td>0.085024</td>\n",
" <td>Supports multiple columns</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>list_much_smaller</td>\n",
" <td>concatenate</td>\n",
" <td>121.078125</td>\n",
" <td>0.075090</td>\n",
" <td>Supports multiple columns</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>list_much_smaller</td>\n",
" <td>concatenate</td>\n",
" <td>122.957031</td>\n",
" <td>0.075017</td>\n",
" <td>Supports multiple columns</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>list_much_smaller</td>\n",
" <td>concatenate</td>\n",
" <td>121.414062</td>\n",
" <td>0.076088</td>\n",
" <td>Supports multiple columns</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" case function peak_memory time \\\n",
"0 list_much_smaller concatenate 122.898438 0.088612 \n",
"1 list_much_smaller concatenate 122.835938 0.085024 \n",
"2 list_much_smaller concatenate 121.078125 0.075090 \n",
"3 list_much_smaller concatenate 122.957031 0.075017 \n",
"4 list_much_smaller concatenate 121.414062 0.076088 \n",
"\n",
" multi_column \n",
"0 Supports multiple columns \n",
"1 Supports multiple columns \n",
"2 Supports multiple columns \n",
"3 Supports multiple columns \n",
"4 Supports multiple columns "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"m['multi_column'] = m.function.apply(\n",
" lambda f: (\n",
" 'Supports multiple columns'\n",
" if f not in not_multi_column else\n",
" 'Only one column suppported'\n",
" )\n",
")\n",
"m.case = pd.Categorical(m.case, categories=cases.keys(), ordered=True)\n",
"m.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Plotting"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"from plotnine import *"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 1500x500 with 5 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"<ggplot: (8785082253786)>"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"g = (\n",
" ggplot(m, aes('function', y='time', color='multi_column'))\n",
" + facet_wrap(' ~ case', scales='free', ncol=5)\n",
" + geom_boxplot()\n",
" + theme(axis_text_x=element_text(angle=45, hjust=1), figure_size=(15, 5), legend_position='top')\n",
" + scale_y_continuous(trans='log2')\n",
" + labs(color='')\n",
")\n",
"g"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 1500x500 with 5 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"<ggplot: (8785046244970)>"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"g = (\n",
" ggplot(m, aes('function', y='peak_memory', color='multi_column'))\n",
" + facet_wrap(' ~ case', scales='free', ncol=5)\n",
" + geom_boxplot()\n",
" + theme(axis_text_x=element_text(angle=45, hjust=1), figure_size=(15, 5), legend_position='top')\n",
" + scale_y_continuous(trans='log2')\n",
" + labs(color='')\n",
" + ylab('Peak memory usage')\n",
")\n",
"g"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment