-
-
Save koaning/5a0f3f27164859c42da5f20148ef3856 to your computer and use it in GitHub Desktop.
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "8f6a37d0-4cb5-4945-b9ed-8218fe015364", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import polars as pl" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "6f515f33-6025-4105-bfab-ab57c5908771", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'0.10.14'" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pl.__version__" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "8ea75603-fa54-4885-8359-a10ededbf6eb", | |
"metadata": {}, | |
"source": [ | |
"Let's do some stuff with a dataset! \n", | |
"\n", | |
"## Benchmark 1: Polars" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "c3f1e845-6abb-4b53-82e0-53607dae59a9", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 3.65 s, sys: 579 ms, total: 4.23 s\n", | |
"Wall time: 679 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time \n", | |
"\n", | |
"df = pl.read_csv(\"wowah_data.csv\", parse_dates=False, n_threads=10)\n", | |
"df.columns = [c.replace(\" \", \"\") for c in df.columns]\n", | |
"df = df.lazy()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "ec620387-5e2b-4c0a-a55f-31c7eb525799", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def set_types(dataf):\n", | |
" return (dataf\n", | |
" .with_columns([\n", | |
" pl.col(\"timestamp\").str.strptime(pl.Datetime, fmt=\"%m/%d/%y %H:%M:%S\"),\n", | |
" pl.col(\"guild\") != -1,\n", | |
" ]))\n", | |
"\n", | |
"def sessionize(dataf, threshold=1_000_000):\n", | |
" return (dataf\n", | |
" .sort([\"char\", \"timestamp\"])\n", | |
" .with_columns([\n", | |
" (pl.col(\"timestamp\").diff().cast(pl.Int64) > threshold).fill_null(True).alias(\"ts_diff\"),\n", | |
" (pl.col(\"char\").diff() != 0).fill_null(True).alias(\"char_diff\"),\n", | |
" ])\n", | |
" .with_columns([\n", | |
" (pl.col(\"ts_diff\") | pl.col(\"char_diff\")).alias(\"new_session_mark\")\n", | |
" ])\n", | |
" .with_columns([\n", | |
" pl.col(\"new_session_mark\").cumsum().alias(\"session\")\n", | |
" ])\n", | |
" .drop(['char_diff', 'ts_diff', 'new_session_mark']))\n", | |
"\n", | |
"def add_features(dataf):\n", | |
" return (dataf\n", | |
" .with_columns([\n", | |
" pl.lit(1).alias(\"one\")\n", | |
" ])\n", | |
" .with_columns([\n", | |
" pl.col(\"one\").count().over(\"session\").alias(\"session_length\"),\n", | |
" pl.col(\"session\").n_unique().over(\"char\").alias(\"n_sessions\")\n", | |
" ]))\n", | |
"\n", | |
"def remove_bots(dataf, max_session_hours=24):\n", | |
" n_rows = max_session_hours*6\n", | |
" return (dataf\n", | |
" .filter(pl.col(\"session_length\").max().over(\"char\") < n_rows))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "6331b005-7641-4a0e-bff9-18f72fd09a1e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(10826734, 7)" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.collect().shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "480bba8e-bd69-46db-928f-3f562dc7b11e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 25.4 s, sys: 1.1 s, total: 26.5 s\n", | |
"Wall time: 7.44 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1 \"class=\"dataframe \">\n", | |
"<thead>\n", | |
"<tr>\n", | |
"<th>\n", | |
"char\n", | |
"</th>\n", | |
"<th>\n", | |
"level\n", | |
"</th>\n", | |
"<th>\n", | |
"race\n", | |
"</th>\n", | |
"<th>\n", | |
"charclass\n", | |
"</th>\n", | |
"<th>\n", | |
"zone\n", | |
"</th>\n", | |
"<th>\n", | |
"guild\n", | |
"</th>\n", | |
"<th>\n", | |
"timestamp\n", | |
"</th>\n", | |
"<th>\n", | |
"session\n", | |
"</th>\n", | |
"<th>\n", | |
"one\n", | |
"</th>\n", | |
"<th>\n", | |
"session_length\n", | |
"</th>\n", | |
"<th>\n", | |
"n_sessions\n", | |
"</th>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"i64\n", | |
"</td>\n", | |
"<td>\n", | |
"i64\n", | |
"</td>\n", | |
"<td>\n", | |
"str\n", | |
"</td>\n", | |
"<td>\n", | |
"str\n", | |
"</td>\n", | |
"<td>\n", | |
"str\n", | |
"</td>\n", | |
"<td>\n", | |
"bool\n", | |
"</td>\n", | |
"<td>\n", | |
"datetime\n", | |
"</td>\n", | |
"<td>\n", | |
"u32\n", | |
"</td>\n", | |
"<td>\n", | |
"i32\n", | |
"</td>\n", | |
"<td>\n", | |
"u32\n", | |
"</td>\n", | |
"<td>\n", | |
"u32\n", | |
"</td>\n", | |
"</tr>\n", | |
"</thead>\n", | |
"<tbody>\n", | |
"<tr>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"<td>\n", | |
"18\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Orc\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Shaman\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"The Barrens\"\n", | |
"</td>\n", | |
"<td>\n", | |
"true\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-12-03 10:41:47\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"7\n", | |
"</td>\n", | |
"<td>\n", | |
"54\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Orc\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Hunter\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Feralas\"\n", | |
"</td>\n", | |
"<td>\n", | |
"false\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-01-15 21:47:09\n", | |
"</td>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"17\n", | |
"</td>\n", | |
"<td>\n", | |
"186\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"7\n", | |
"</td>\n", | |
"<td>\n", | |
"54\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Orc\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Hunter\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Un'Goro Crater\"\n", | |
"</td>\n", | |
"<td>\n", | |
"false\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-01-15 21:56:54\n", | |
"</td>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"17\n", | |
"</td>\n", | |
"<td>\n", | |
"186\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"7\n", | |
"</td>\n", | |
"<td>\n", | |
"54\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Orc\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Hunter\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"The Barrens\"\n", | |
"</td>\n", | |
"<td>\n", | |
"false\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-01-15 22:07:23\n", | |
"</td>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"17\n", | |
"</td>\n", | |
"<td>\n", | |
"186\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"7\n", | |
"</td>\n", | |
"<td>\n", | |
"54\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Orc\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Hunter\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Badlands\"\n", | |
"</td>\n", | |
"<td>\n", | |
"false\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-01-15 22:17:08\n", | |
"</td>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"17\n", | |
"</td>\n", | |
"<td>\n", | |
"186\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"7\n", | |
"</td>\n", | |
"<td>\n", | |
"54\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Orc\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Hunter\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Badlands\"\n", | |
"</td>\n", | |
"<td>\n", | |
"false\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-01-15 22:26:52\n", | |
"</td>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"17\n", | |
"</td>\n", | |
"<td>\n", | |
"186\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"7\n", | |
"</td>\n", | |
"<td>\n", | |
"54\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Orc\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Hunter\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Badlands\"\n", | |
"</td>\n", | |
"<td>\n", | |
"false\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-01-15 22:37:25\n", | |
"</td>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"17\n", | |
"</td>\n", | |
"<td>\n", | |
"186\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"7\n", | |
"</td>\n", | |
"<td>\n", | |
"54\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Orc\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Hunter\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Swamp of Sorrows\"\n", | |
"</td>\n", | |
"<td>\n", | |
"true\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-01-15 22:47:10\n", | |
"</td>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"17\n", | |
"</td>\n", | |
"<td>\n", | |
"186\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"7\n", | |
"</td>\n", | |
"<td>\n", | |
"54\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Orc\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Hunter\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"The Temple of Atal'Hakkar\"\n", | |
"</td>\n", | |
"<td>\n", | |
"true\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-01-15 22:56:53\n", | |
"</td>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"17\n", | |
"</td>\n", | |
"<td>\n", | |
"186\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"7\n", | |
"</td>\n", | |
"<td>\n", | |
"54\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Orc\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Hunter\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"The Temple of Atal'Hakkar\"\n", | |
"</td>\n", | |
"<td>\n", | |
"true\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-01-15 23:07:25\n", | |
"</td>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"17\n", | |
"</td>\n", | |
"<td>\n", | |
"186\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"7\n", | |
"</td>\n", | |
"<td>\n", | |
"54\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Orc\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Hunter\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"The Temple of Atal'Hakkar\"\n", | |
"</td>\n", | |
"<td>\n", | |
"true\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-01-15 23:17:09\n", | |
"</td>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"17\n", | |
"</td>\n", | |
"<td>\n", | |
"186\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"7\n", | |
"</td>\n", | |
"<td>\n", | |
"55\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Orc\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Hunter\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"The Temple of Atal'Hakkar\"\n", | |
"</td>\n", | |
"<td>\n", | |
"true\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-01-15 23:26:53\n", | |
"</td>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"17\n", | |
"</td>\n", | |
"<td>\n", | |
"186\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"...\n", | |
"</td>\n", | |
"<td>\n", | |
"...\n", | |
"</td>\n", | |
"<td>\n", | |
"...\n", | |
"</td>\n", | |
"<td>\n", | |
"...\n", | |
"</td>\n", | |
"<td>\n", | |
"...\n", | |
"</td>\n", | |
"<td>\n", | |
"...\n", | |
"</td>\n", | |
"<td>\n", | |
"...\n", | |
"</td>\n", | |
"<td>\n", | |
"...\n", | |
"</td>\n", | |
"<td>\n", | |
"...\n", | |
"</td>\n", | |
"<td>\n", | |
"...\n", | |
"</td>\n", | |
"<td>\n", | |
"...\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"90575\n", | |
"</td>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Orc\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Hunter\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Orgrimmar\"\n", | |
"</td>\n", | |
"<td>\n", | |
"false\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-12-31 21:14:51\n", | |
"</td>\n", | |
"<td>\n", | |
"1733212\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"90576\n", | |
"</td>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Blood Elf\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Warlock\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Eversong Woods\"\n", | |
"</td>\n", | |
"<td>\n", | |
"false\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-12-31 22:06:58\n", | |
"</td>\n", | |
"<td>\n", | |
"1733213\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"4\n", | |
"</td>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"90576\n", | |
"</td>\n", | |
"<td>\n", | |
"3\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Blood Elf\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Warlock\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Eversong Woods\"\n", | |
"</td>\n", | |
"<td>\n", | |
"false\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-12-31 22:17:35\n", | |
"</td>\n", | |
"<td>\n", | |
"1733213\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"4\n", | |
"</td>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"90576\n", | |
"</td>\n", | |
"<td>\n", | |
"3\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Blood Elf\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Warlock\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Eversong Woods\"\n", | |
"</td>\n", | |
"<td>\n", | |
"false\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-12-31 22:32:52\n", | |
"</td>\n", | |
"<td>\n", | |
"1733213\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"4\n", | |
"</td>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"90576\n", | |
"</td>\n", | |
"<td>\n", | |
"4\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Blood Elf\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Warlock\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Eversong Woods\"\n", | |
"</td>\n", | |
"<td>\n", | |
"false\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-12-31 22:47:54\n", | |
"</td>\n", | |
"<td>\n", | |
"1733213\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"4\n", | |
"</td>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"90576\n", | |
"</td>\n", | |
"<td>\n", | |
"5\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Blood Elf\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Warlock\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Eversong Woods\"\n", | |
"</td>\n", | |
"<td>\n", | |
"false\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-12-31 23:07:13\n", | |
"</td>\n", | |
"<td>\n", | |
"1733214\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"90577\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Blood Elf\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Warlock\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Eversong Woods\"\n", | |
"</td>\n", | |
"<td>\n", | |
"false\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-12-31 22:17:35\n", | |
"</td>\n", | |
"<td>\n", | |
"1733215\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"3\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"90577\n", | |
"</td>\n", | |
"<td>\n", | |
"2\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Blood Elf\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Warlock\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Eversong Woods\"\n", | |
"</td>\n", | |
"<td>\n", | |
"false\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-12-31 22:32:52\n", | |
"</td>\n", | |
"<td>\n", | |
"1733215\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"3\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"90577\n", | |
"</td>\n", | |
"<td>\n", | |
"3\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Blood Elf\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Warlock\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Eversong Woods\"\n", | |
"</td>\n", | |
"<td>\n", | |
"false\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-12-31 22:47:54\n", | |
"</td>\n", | |
"<td>\n", | |
"1733215\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"3\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"90578\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Blood Elf\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Paladin\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Eversong Woods\"\n", | |
"</td>\n", | |
"<td>\n", | |
"false\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-12-31 22:32:52\n", | |
"</td>\n", | |
"<td>\n", | |
"1733216\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"90579\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Orc\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Warrior\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Durotar\"\n", | |
"</td>\n", | |
"<td>\n", | |
"false\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-12-31 22:44:45\n", | |
"</td>\n", | |
"<td>\n", | |
"1733217\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
"<td>\n", | |
"90580\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Tauren\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Warrior\"\n", | |
"</td>\n", | |
"<td>\n", | |
"\"Mulgore\"\n", | |
"</td>\n", | |
"<td>\n", | |
"false\n", | |
"</td>\n", | |
"<td>\n", | |
"2008-12-31 23:15:20\n", | |
"</td>\n", | |
"<td>\n", | |
"1733218\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"<td>\n", | |
"1\n", | |
"</td>\n", | |
"</tr>\n", | |
"</tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
"shape: (10700792, 11)\n", | |
"┌───────┬───────┬───────────┬───────────┬─────┬─────────┬─────┬────────────────┬────────────┐\n", | |
"│ char ┆ level ┆ race ┆ charclass ┆ ... ┆ session ┆ one ┆ session_length ┆ n_sessions │\n", | |
"│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", | |
"│ i64 ┆ i64 ┆ str ┆ str ┆ ┆ u32 ┆ i32 ┆ u32 ┆ u32 │\n", | |
"╞═══════╪═══════╪═══════════╪═══════════╪═════╪═════════╪═════╪════════════════╪════════════╡\n", | |
"│ 2 ┆ 18 ┆ Orc ┆ Shaman ┆ ... ┆ 1 ┆ 1 ┆ 1 ┆ 1 │\n", | |
"├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤\n", | |
"│ 7 ┆ 54 ┆ Orc ┆ Hunter ┆ ... ┆ 2 ┆ 1 ┆ 17 ┆ 186 │\n", | |
"├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤\n", | |
"│ 7 ┆ 54 ┆ Orc ┆ Hunter ┆ ... ┆ 2 ┆ 1 ┆ 17 ┆ 186 │\n", | |
"├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤\n", | |
"│ 7 ┆ 54 ┆ Orc ┆ Hunter ┆ ... ┆ 2 ┆ 1 ┆ 17 ┆ 186 │\n", | |
"├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤\n", | |
"│ ... ┆ ... ┆ ... ┆ ... ┆ ... ┆ ... ┆ ... ┆ ... ┆ ... │\n", | |
"├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤\n", | |
"│ 90577 ┆ 2 ┆ Blood Elf ┆ Warlock ┆ ... ┆ 1733215 ┆ 1 ┆ 3 ┆ 1 │\n", | |
"├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤\n", | |
"│ 90577 ┆ 3 ┆ Blood Elf ┆ Warlock ┆ ... ┆ 1733215 ┆ 1 ┆ 3 ┆ 1 │\n", | |
"├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤\n", | |
"│ 90578 ┆ 1 ┆ Blood Elf ┆ Paladin ┆ ... ┆ 1733216 ┆ 1 ┆ 1 ┆ 1 │\n", | |
"├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤\n", | |
"│ 90579 ┆ 1 ┆ Orc ┆ Warrior ┆ ... ┆ 1733217 ┆ 1 ┆ 1 ┆ 1 │\n", | |
"├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤\n", | |
"│ 90580 ┆ 1 ┆ Tauren ┆ Warrior ┆ ... ┆ 1733218 ┆ 1 ┆ 1 ┆ 1 │\n", | |
"└───────┴───────┴───────────┴───────────┴─────┴─────────┴─────┴────────────────┴────────────┘" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"\n", | |
"(df\n", | |
" .pipe(set_types)\n", | |
" .pipe(sessionize)\n", | |
" .pipe(add_features)\n", | |
" .pipe(remove_bots)\n", | |
" .collect())" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "acba1aff-aebe-4f5e-8934-1d6d0bfc0bb1", | |
"metadata": {}, | |
"source": [ | |
"## Benchmark 2: Pandas" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "c89eb86c-de5a-4910-b556-cf790beea5d8", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "f3443796-53f5-4fc1-a52f-e91747f15936", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 4.89 s, sys: 424 ms, total: 5.32 s\n", | |
"Wall time: 5.32 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"\n", | |
"df = pd.read_csv(\"wowah_data.csv\")\n", | |
"df.columns = [c.replace(\" \", \"\") for c in df.columns]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"id": "ee2571a1-30ed-4434-93d1-0b49292e0e40", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>char</th>\n", | |
" <th>level</th>\n", | |
" <th>race</th>\n", | |
" <th>charclass</th>\n", | |
" <th>zone</th>\n", | |
" <th>guild</th>\n", | |
" <th>timestamp</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>59425</td>\n", | |
" <td>1</td>\n", | |
" <td>Orc</td>\n", | |
" <td>Rogue</td>\n", | |
" <td>Orgrimmar</td>\n", | |
" <td>165</td>\n", | |
" <td>01/01/08 00:02:04</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>65494</td>\n", | |
" <td>9</td>\n", | |
" <td>Orc</td>\n", | |
" <td>Hunter</td>\n", | |
" <td>Durotar</td>\n", | |
" <td>-1</td>\n", | |
" <td>01/01/08 00:02:04</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>65325</td>\n", | |
" <td>14</td>\n", | |
" <td>Orc</td>\n", | |
" <td>Warrior</td>\n", | |
" <td>Ghostlands</td>\n", | |
" <td>-1</td>\n", | |
" <td>01/01/08 00:02:04</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>65490</td>\n", | |
" <td>18</td>\n", | |
" <td>Orc</td>\n", | |
" <td>Hunter</td>\n", | |
" <td>Ghostlands</td>\n", | |
" <td>-1</td>\n", | |
" <td>01/01/08 00:02:04</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2288</td>\n", | |
" <td>60</td>\n", | |
" <td>Orc</td>\n", | |
" <td>Hunter</td>\n", | |
" <td>Hellfire Peninsula</td>\n", | |
" <td>-1</td>\n", | |
" <td>01/01/08 00:02:09</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10826729</th>\n", | |
" <td>86766</td>\n", | |
" <td>80</td>\n", | |
" <td>Blood Elf</td>\n", | |
" <td>Death Knight</td>\n", | |
" <td>Halls of Lightning</td>\n", | |
" <td>101</td>\n", | |
" <td>12/31/08 23:50:18</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10826730</th>\n", | |
" <td>86497</td>\n", | |
" <td>77</td>\n", | |
" <td>Blood Elf</td>\n", | |
" <td>Death Knight</td>\n", | |
" <td>The Storm Peaks</td>\n", | |
" <td>358</td>\n", | |
" <td>12/31/08 23:50:18</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10826731</th>\n", | |
" <td>34893</td>\n", | |
" <td>80</td>\n", | |
" <td>Blood Elf</td>\n", | |
" <td>Death Knight</td>\n", | |
" <td>The Storm Peaks</td>\n", | |
" <td>189</td>\n", | |
" <td>12/31/08 23:50:18</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10826732</th>\n", | |
" <td>86881</td>\n", | |
" <td>80</td>\n", | |
" <td>Blood Elf</td>\n", | |
" <td>Death Knight</td>\n", | |
" <td>Dragonblight</td>\n", | |
" <td>478</td>\n", | |
" <td>12/31/08 23:50:18</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10826733</th>\n", | |
" <td>86457</td>\n", | |
" <td>80</td>\n", | |
" <td>Blood Elf</td>\n", | |
" <td>Death Knight</td>\n", | |
" <td>Dragonblight</td>\n", | |
" <td>204</td>\n", | |
" <td>12/31/08 23:50:18</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>10826734 rows × 7 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" char level race charclass zone guild \\\n", | |
"0 59425 1 Orc Rogue Orgrimmar 165 \n", | |
"1 65494 9 Orc Hunter Durotar -1 \n", | |
"2 65325 14 Orc Warrior Ghostlands -1 \n", | |
"3 65490 18 Orc Hunter Ghostlands -1 \n", | |
"4 2288 60 Orc Hunter Hellfire Peninsula -1 \n", | |
"... ... ... ... ... ... ... \n", | |
"10826729 86766 80 Blood Elf Death Knight Halls of Lightning 101 \n", | |
"10826730 86497 77 Blood Elf Death Knight The Storm Peaks 358 \n", | |
"10826731 34893 80 Blood Elf Death Knight The Storm Peaks 189 \n", | |
"10826732 86881 80 Blood Elf Death Knight Dragonblight 478 \n", | |
"10826733 86457 80 Blood Elf Death Knight Dragonblight 204 \n", | |
"\n", | |
" timestamp \n", | |
"0 01/01/08 00:02:04 \n", | |
"1 01/01/08 00:02:04 \n", | |
"2 01/01/08 00:02:04 \n", | |
"3 01/01/08 00:02:04 \n", | |
"4 01/01/08 00:02:09 \n", | |
"... ... \n", | |
"10826729 12/31/08 23:50:18 \n", | |
"10826730 12/31/08 23:50:18 \n", | |
"10826731 12/31/08 23:50:18 \n", | |
"10826732 12/31/08 23:50:18 \n", | |
"10826733 12/31/08 23:50:18 \n", | |
"\n", | |
"[10826734 rows x 7 columns]" | |
] | |
}, | |
"execution_count": 17, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 68, | |
"id": "1b66e567-134b-4f06-8b66-65572fc39e2a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def set_types(dataf):\n", | |
" return (dataf\n", | |
" .assign(timestamp=lambda d: pd.to_datetime(d['timestamp'], format=\"%m/%d/%y %H:%M:%S\"),\n", | |
" guild=lambda d: d['guild'] != -1))\n", | |
" \n", | |
"def sessionize(dataf, threshold=60*10):\n", | |
" return (dataf\n", | |
" .sort_values([\"char\", \"timestamp\"])\n", | |
" .assign(ts_diff=lambda d: (d['timestamp'] - d['timestamp'].shift()).dt.seconds > threshold,\n", | |
" char_diff=lambda d: (d['char'].diff() != 0),\n", | |
" new_session_mark=lambda d: d['ts_diff'] | d['char_diff'],\n", | |
" session=lambda d: d['new_session_mark'].fillna(0).cumsum())\n", | |
" .drop(columns=['char_diff', 'ts_diff', 'new_session_mark']))\n", | |
"\n", | |
"def add_features(dataf):\n", | |
" return (dataf\n", | |
" .assign(session_length=lambda d: d.groupby('session')['char'].transform(lambda d: d.count()))\n", | |
" .assign(n_sessions=lambda d: d.groupby('char')['session'].transform(lambda d: d.nunique())))\n", | |
"\n", | |
"def remove_bots(dataf, max_session_hours=24):\n", | |
" n_rows = max_session_hours*6\n", | |
" return (dataf\n", | |
" .assign(max_sess_len=lambda d: d.groupby('char')['session_length'].transform(lambda d: d.max()))\n", | |
" .loc[lambda d: d[\"max_sess_len\"] < n_rows]\n", | |
" .drop(columns=[\"max_sess_len\"]))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 69, | |
"id": "9b700847-9fdd-4ffc-8af8-2ff09b7ef665", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 14.5 s, sys: 1.11 s, total: 15.6 s\n", | |
"Wall time: 15.6 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time \n", | |
"\n", | |
"dataf = df.pipe(set_types).pipe(sessionize)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 70, | |
"id": "c22d6fab-4c7e-4623-803a-28ebc70bb63d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 9min 15s, sys: 8.03 s, total: 9min 24s\n", | |
"Wall time: 9min 23s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"\n", | |
"final = dataf.pipe(add_features).pipe(remove_bots)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "2d538fca-44bc-4e8c-97e0-168db0a3bb5f", | |
"metadata": {}, | |
"source": [ | |
"## The Results?\n", | |
"\n", | |
"- polars `8s`\n", | |
"- pandas `9m 44s`" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "42513dcd-60c4-4df1-aa9a-beb59cdd76b4", | |
"metadata": {}, | |
"source": [ | |
"It's not a perfect benchmark, and it depends a bit on how on measures ... but a rough speedup factor is:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "b3957b28-55bb-40f1-8d02-ca1acfd60b08", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"73.0" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"(9*60+44)/8" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "c9ad81bb-8156-4da1-af34-00fc9e29a313", | |
"metadata": {}, | |
"source": [ | |
"Not bad!" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
While I am a fan of Calmcode and accept that Polars is faster in many cases, I think this benchmark is selling Pandas short. The main reason the Pandas code is slow is because it's using GroupBy.transform
, which is actually slow Python for-loop over the groups under the hood. Changing those functions to use the Pandas built-ins (like GroupBy.count
or GroupBy.nunique
), I get the benchmarks
polars (0.16.16) 0m 4.5s
pandas (1.5.3) 0m 6.5s
def add_features(dataf):
return (dataf
.assign(session_length=lambda d: d.groupby('session')['char'].count())
.assign(n_sessions=lambda d: d.groupby('char')['session'].nunique()))
def remove_bots(dataf, max_session_hours=24):
n_rows = max_session_hours*6
return (dataf
.assign(max_sess_len=lambda d: d.groupby('char')['session_length'].max())
.loc[lambda d: d["max_sess_len"] < n_rows]
.drop(columns=["max_sess_len"]))
TL;DR: Avoid using transform, apply, aggregate
in performant Pandas code!
This is a fair comment! I responded on the calmcode repo as well, will dive into this a bit and run a proper benchmark on my end. Something is telling me the pandas 2.0 version might also speed things up.
I'm looking at the numbers now, and while it's totally fair to nuance my numbers ... I just want to check on a few details.
The new add_features
and remove_bots
pandas functions run in 2.95 seconds on my machine. But! The set_types
and sessionnize
functions take 14.9s and reading in the .csv file takes 5.72s. That's a total of 23.57s for pandas. I also re-ran polars, which became faster in the meantime and ran in ~2.1s.
So it's certainly not a 70x speedup, but it seems like I'm still looking at a 5.72 + 14.9 + 2.95)/2.1 = 11.2x speedup. Or, have I skipped a step? Figured I'd check.
On more diving deeper, I should immediately mention that the polars runtime does fluctuate between 2s-5s. So maybe it's more like a 4x-11x speedup.
I'm also not 100% sure if your method calculates the same thing as polars. Just to check, your method:
def add_features(dataf):
return (dataf
.assign(session_length=lambda d: d.groupby('session')['char'].count())
.assign(n_sessions=lambda d: d.groupby('char')['session'].nunique()))
When you call d.groupby('session')['char'].count()
you get a series back that has a different length than the original dataframe.
dataf = df.pipe(set_types).pipe(sessionize)
dataf.groupby("session")["char"].count().shape, dataf.shape
# ((5954828,), (10826734, 8))
This is confirmed when looking at the output of the function.
Figured I'd stop here and check for feedback, it might certainly be that I'm skipping over something, and I'll gladly hear it if that is the case.
Ah you're right, thanks! This is a bit more tricky than I thought.
I overlooked that the pandas.DataFrame.transform
returns a DataFrame with the same dimension as the input DataFrame, so your original code avoids having to do a join, while my first revision above misses that.
Here's another revision that uses join
instead of assign
. This fixes these issues and is about an order magnitude faster than the original! I added a compare method to make sure the new code and the original code have identical outputs!
I'm getting these benchmarks for the full pipeline (set types, sessionize, add features, and remove bots) in Polars and Pandas (after loading both into a df):
polars: 4.23 s
pandas: 23.5 s
So I agree with you that Polars is almost an order of magnitude faster here, but at least it's not two orders! 😄
(Also minor note, not important for the benchmark: the sessionize default thresholds are set differently for Polars vs Pandas.)
import pandas as pd
%%time
pandas_df = pd.read_csv("wowah_data.csv")
pandas_df.columns = [c.replace(" ", "") for c in pandas_df.columns]
# CPU times: user 5.8 s, sys: 1.21 s, total: 7 s
def set_types(dataf):
return (dataf
.assign(timestamp=lambda d: pd.to_datetime(d['timestamp'], format="%m/%d/%y %H:%M:%S"),
guild=lambda d: d['guild'] != -1))
def sessionize(dataf, threshold=60*10):
return (dataf
.sort_values(["char", "timestamp"])
.assign(ts_diff=lambda d: (d['timestamp'] - d['timestamp'].shift()).dt.seconds > threshold,
char_diff=lambda d: (d['char'].diff() != 0),
new_session_mark=lambda d: d['ts_diff'] | d['char_diff'],
session=lambda d: d['new_session_mark'].fillna(0).cumsum())
.drop(columns=['char_diff', 'ts_diff', 'new_session_mark']))
def add_features(dataf):
return (dataf
.assign(session_length=lambda d: d.groupby('session')['char'].transform(lambda d: d.count()))
.assign(n_sessions=lambda d: d.groupby('char')['session'].transform(lambda d: d.nunique())))
def remove_bots(dataf, max_session_hours=24):
n_rows = max_session_hours*6
return (dataf
.assign(max_sess_len=lambda d: d.groupby('char')['session_length'].transform(lambda d: d.max()))
.loc[lambda d: d["max_sess_len"] < n_rows]
.drop(columns=["max_sess_len"]))
def add_features_fast(dataf):
return (dataf
.join(
dataf.groupby('session')['char'].count().rename("session_length"),
on="session")
.join(
dataf.groupby('char')['session'].nunique().rename("n_sessions"),
on="char"))
def remove_bots_fast(dataf, max_session_hours=24):
n_rows = max_session_hours*6
return (dataf
.join(
dataf.groupby('char')['session_length'].max().rename("max_sess_len"),
on="char")
.loc[lambda d: d["max_sess_len"] < n_rows]
.drop(columns=["max_sess_len"]))
%%time
pandas_slow = pandas_df.pipe(set_types).pipe(sessionize).pipe(add_features).pipe(remove_bots)
# CPU times: user 7min 59s, sys: 10.7 s, total: 8min 9s
%%time
pandas_fast = pandas_df.pipe(set_types).pipe(sessionize).pipe(add_features_fast).pipe(remove_bots_fast)
# CPU times: user 20.7 s, sys: 2.91 s, total: 23.6 s
assert pandas_fast.compare(pandas_slow).empty
assert (pandas_fast == pandas_slow).all().all()
polars (0.16.16) 4.9s
pandas (1.5.3) 28.2s
pandas (2.0.0rc1) throws an error: <class 'numpy.intc'>
my code for comparison is at https://github.com/wgong/py4kids/blob/master/lesson-14.6-polars/polars-cookbook/cookbook/pandas_vs_polars.py#L532
I would not have expected .transform()
to be slower than .join()
!
But yeah, I'll also poke around some more here, but thanks for the reply!
Thank you for a great tutorial on polars - https://calmcode.io/polars/introduction.html
On my HP laptop (AMD Ryzen 5 2.10 GHz 16 GB Ram),
I see a much bigger speedup by 137
Go Polars