Skip to content

Instantly share code, notes, and snippets.

@jcorbin
Created March 20, 2021 21:52
Show Gist options
  • Save jcorbin/82fb7dd9e737f6cd3b2c3a5a718f9b0d to your computer and use it in GitHub Desktop.
Save jcorbin/82fb7dd9e737f6cd3b2c3a5a718f9b0d to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "crazy-pickup",
"metadata": {},
"source": [
"# Data Acquisition\n",
"\n",
"Collect datasets with something like:\n",
"```shell\n",
"$ for i in $(seq N); do\n",
"> time CMD <INPUT >/dev/null\n",
"> done |& tee SAMPLE\n",
"```\n",
"\n",
"NOTE: the parser below is fitted to the output of zsh's `time` builtin (e.g. `CMD  16.80s user 0.37s system 1040% cpu 1.651 total`); YMMV with other implementations."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "racial-controversy",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"def parsetimes(filename, asns=False):\n",
"    '''Parse a file of zsh `time` reports into a DataFrame, one row per run.\n",
"\n",
"    Each report line is expected to look like::\n",
"\n",
"        CMD  16.80s user 0.37s system 1040% cpu 1.651 total\n",
"\n",
"    Lines that do not match (blank lines, stray stderr output) are dropped.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    filename : str or path-like\n",
"        Sample file to read.\n",
"    asns : bool, default False\n",
"        When True, return usertime/systime/walltime as int64 nanosecond\n",
"        counts instead of timedelta values.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        Columns cmd, usertime, systime, cpupct (float, percent sign removed)\n",
"        and walltime, indexed 0..n-1 over the matching lines.\n",
"    '''\n",
"    with open(filename) as f:\n",
"        # dtype=object keeps the .str accessor usable even for an empty file\n",
"        lines = pd.Series(list(f), dtype='object')\n",
"\n",
"    dat = lines.str.extract(r'''(?x)\n",
"        (?P<cmd>.+)\n",
"        \\s+\n",
"        (?P<usertime>[^\\s]+) \\s+ user\n",
"        \\s+\n",
"        (?P<systime>[^\\s]+) \\s+ system\n",
"        \\s+\n",
"        (?P<cpupct>[^\\s]+) \\s+ cpu\n",
"        \\s+\n",
"        (?P<walltime>[^\\s]+) \\s+ total\n",
"        ''')\n",
"\n",
"    # non-matching lines extract as all-NaN rows; keep only real reports\n",
"    dat = dat.dropna(how='all').reset_index(drop=True)\n",
"\n",
"    def wall_seconds(text):\n",
"        # zsh prints elapsed time as [hh:][mm:]ss.ttt; fold fields into seconds\n",
"        total = 0.0\n",
"        for field in text.split(':'):\n",
"            total = total * 60 + float(field)\n",
"        return total\n",
"\n",
"    dat['usertime'] = pd.to_timedelta(dat['usertime'])\n",
"    dat['systime'] = pd.to_timedelta(dat['systime'])\n",
"    dat['walltime'] = pd.to_timedelta(\n",
"        dat['walltime'].map(wall_seconds).astype('float'), unit='s')\n",
"    dat['cpupct'] = dat['cpupct'].str.strip('%').astype('float')\n",
"\n",
"    if asns:\n",
"        # explicit int64: plain 'int' is platform-dependent (int32 on some\n",
"        # platforms), which cannot hold nanosecond counts\n",
"        dat['usertime'] = dat['usertime'].astype('int64')\n",
"        dat['systime'] = dat['systime'].astype('int64')\n",
"        dat['walltime'] = dat['walltime'].astype('int64')\n",
"\n",
"    return dat"
]
},
{
"cell_type": "markdown",
"id": "recovered-buyer",
"metadata": {},
"source": [
"# Simplistic before/after dataset loading and inspection"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "graphic-synthetic",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"./parago.old < kjvbible_x100.txt > /dev/null \n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>usertime</th>\n",
" <th>systime</th>\n",
" <th>cpupct</th>\n",
" <th>walltime</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>16800000000</td>\n",
" <td>370000000</td>\n",
" <td>1040.0</td>\n",
" <td>1651000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>17100000000</td>\n",
" <td>380000000</td>\n",
" <td>1021.0</td>\n",
" <td>1712000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17300000000</td>\n",
" <td>390000000</td>\n",
" <td>1058.0</td>\n",
" <td>1670000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>17760000000</td>\n",
" <td>390000000</td>\n",
" <td>1071.0</td>\n",
" <td>1693000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>17970000000</td>\n",
" <td>400000000</td>\n",
" <td>1061.0</td>\n",
" <td>1730000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>17960000000</td>\n",
" <td>400000000</td>\n",
" <td>1057.0</td>\n",
" <td>1736000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>18340000000</td>\n",
" <td>400000000</td>\n",
" <td>1057.0</td>\n",
" <td>1773000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>19460000000</td>\n",
" <td>440000000</td>\n",
" <td>1053.0</td>\n",
" <td>1888000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>19600000000</td>\n",
" <td>430000000</td>\n",
" <td>1065.0</td>\n",
" <td>1880000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>19330000000</td>\n",
" <td>420000000</td>\n",
" <td>1075.0</td>\n",
" <td>1837000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" usertime systime cpupct walltime\n",
"0 16800000000 370000000 1040.0 1651000000\n",
"1 17100000000 380000000 1021.0 1712000000\n",
"2 17300000000 390000000 1058.0 1670000000\n",
"3 17760000000 390000000 1071.0 1693000000\n",
"4 17970000000 400000000 1061.0 1730000000\n",
"5 17960000000 400000000 1057.0 1736000000\n",
"6 18340000000 400000000 1057.0 1773000000\n",
"7 19460000000 440000000 1053.0 1888000000\n",
"8 19600000000 430000000 1065.0 1880000000\n",
"9 19330000000 420000000 1075.0 1837000000"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"old = parsetimes('old', asns=True)\n",
"\n",
"old_cmd = old.pop('cmd').value_counts()\n",
"assert len(old_cmd) == 1, 'old cmd must be unique'\n",
"\n",
"print(old_cmd.index[0])\n",
"old"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "practical-shift",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"./parago.new < kjvbible_x100.txt > /dev/null \n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>usertime</th>\n",
" <th>systime</th>\n",
" <th>cpupct</th>\n",
" <th>walltime</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>11870000000</td>\n",
" <td>150000000</td>\n",
" <td>1021.0</td>\n",
" <td>1176000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>11710000000</td>\n",
" <td>140000000</td>\n",
" <td>1032.0</td>\n",
" <td>1148000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>11710000000</td>\n",
" <td>130000000</td>\n",
" <td>1090.0</td>\n",
" <td>1087000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>11830000000</td>\n",
" <td>130000000</td>\n",
" <td>1057.0</td>\n",
" <td>1132000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>12010000000</td>\n",
" <td>140000000</td>\n",
" <td>1104.0</td>\n",
" <td>1100000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>12180000000</td>\n",
" <td>140000000</td>\n",
" <td>1106.0</td>\n",
" <td>1114000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>12350000000</td>\n",
" <td>140000000</td>\n",
" <td>1041.0</td>\n",
" <td>1199000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>12370000000</td>\n",
" <td>140000000</td>\n",
" <td>1104.0</td>\n",
" <td>1132000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>12500000000</td>\n",
" <td>140000000</td>\n",
" <td>1095.0</td>\n",
" <td>1154000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>13000000000</td>\n",
" <td>140000000</td>\n",
" <td>1097.0</td>\n",
" <td>1198000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" usertime systime cpupct walltime\n",
"0 11870000000 150000000 1021.0 1176000000\n",
"1 11710000000 140000000 1032.0 1148000000\n",
"2 11710000000 130000000 1090.0 1087000000\n",
"3 11830000000 130000000 1057.0 1132000000\n",
"4 12010000000 140000000 1104.0 1100000000\n",
"5 12180000000 140000000 1106.0 1114000000\n",
"6 12350000000 140000000 1041.0 1199000000\n",
"7 12370000000 140000000 1104.0 1132000000\n",
"8 12500000000 140000000 1095.0 1154000000\n",
"9 13000000000 140000000 1097.0 1198000000"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new = parsetimes('new', asns=True)\n",
"\n",
"new_cmd = new.pop('cmd').value_counts()\n",
"assert len(new_cmd) == 1, 'new cmd must be unique'\n",
"\n",
"print(new_cmd.index[0])\n",
"new"
]
},
{
"cell_type": "markdown",
"id": "independent-capability",
"metadata": {},
"source": [
"# Median ratio, read: \"The median New run measures X% of the median Old run\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "radio-signal",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"usertime 67.325355\n",
"systime 35.000000\n",
"cpupct 103.309693\n",
"walltime 65.781881\n",
"dtype: float64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new.median() / old.median() * 100"
]
},
{
"cell_type": "markdown",
"id": "personal-formation",
"metadata": {},
"source": [
"# Min ratio, read: \"Best New is X times Old Best\" (X is a fraction; multiply by 100 for percent)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "second-issue",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"usertime 0.697024\n",
"systime 0.351351\n",
"cpupct 1.000000\n",
"walltime 0.658389\n",
"dtype: float64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new.min() / old.min()"
]
},
{
"cell_type": "markdown",
"id": "julian-prevention",
"metadata": {},
"source": [
"# Variance Ratio, read: \"New Variance is X times Old Variance\" (X is a fraction; multiply by 100 for percent)\n",
"\n",
"NOTE: using the interquartile range (IQR) as the variance metric."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "gross-willow",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"usertime 0.314843\n",
"systime 0.000000\n",
"cpupct 5.725000\n",
"walltime 0.421907\n",
"dtype: float64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_iqr = new.quantile(0.75) - new.quantile(0.25)\n",
"old_iqr = old.quantile(0.75) - old.quantile(0.25)\n",
"new_iqr / old_iqr"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@jcorbin
Copy link
Author

jcorbin commented Mar 20, 2021

A trace from the old program:
image

Notes:

  • experienced around 20 GCs in around 1.4s
  • heap high-water mark (HWM) around 61.4MB
  • the secondary rows that appear to be nearly solid yellow/brown-ish are syscalls

@jcorbin
Copy link
Author

jcorbin commented Mar 20, 2021

A trace from the new program:
image

Notes:

  • after several GCs during a warm up phase, the heap remains stable, and no more GCs happen
  • HWM heap around 43.1MB
  • syscalls are much less rampant

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment