Skip to content

Instantly share code, notes, and snippets.

@bfarzin
Created March 5, 2019 18:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bfarzin/6103511f51844e35138c52c6ea3a424d to your computer and use it in GitHub Desktop.
Save bfarzin/6103511f51844e35138c52c6ea3a424d to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Find max model / batch_size combo for GPU Ram"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"%autoreload 2\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from fastai.vision import *\n",
"from fastai.callbacks.mem import *\n",
"from ipyexperiments import *\n",
"from ipyexperiments.utils.mem import *"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"seed = 42\n",
"\n",
"# python RNG\n",
"random.seed(seed)\n",
"\n",
"# pytorch RNGs\n",
"import torch\n",
"torch.manual_seed(seed)\n",
"torch.backends.cudnn.deterministic = True\n",
"if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)\n",
"\n",
"# numpy RNG\n",
"import numpy as np\n",
"np.random.seed(seed)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PosixPath('/home/farzin/.fastai/data/oxford-iiit-pet')"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"path = untar_data(URLs.PETS); path"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[PosixPath('/home/farzin/.fastai/data/oxford-iiit-pet/images/leonberger_172.jpg'),\n",
" PosixPath('/home/farzin/.fastai/data/oxford-iiit-pet/images/British_Shorthair_100.jpg'),\n",
" PosixPath('/home/farzin/.fastai/data/oxford-iiit-pet/images/american_bulldog_32.jpg'),\n",
" PosixPath('/home/farzin/.fastai/data/oxford-iiit-pet/images/pomeranian_127.jpg'),\n",
" PosixPath('/home/farzin/.fastai/data/oxford-iiit-pet/images/Bengal_106.jpg')]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"path_img = path/'images'\n",
"\n",
"fnames = get_image_files(path_img)\n",
"fnames[:5]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"pat = r'/([^/]+)_\\d+.jpg$'"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"bs = 64\n",
"data = ImageDataBunch.from_name_re(path_img, fnames, pat, ds_tfms=get_transforms(), size=224, bs=bs\n",
" ).normalize(imagenet_stats)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from fastai.callback import Callback\n",
"\n",
"class FitNBatch(Callback):\n",
" def __init__(self, n_batch:int=2): \n",
" self.stop,self.n_batch = False,n_batch\n",
" \n",
" def on_batch_end(self, iteration, **kwargs):\n",
" if iteration >= self.n_batch: return {'stop_epoch': True, 'stop_training': True}"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def test_from_max_bs(learn,max_bs:int=1024,r_factor:float=0.80,verbose:bool=True,memtrace=None)->int:\n",
" test_dict = dict()\n",
" bs = max_bs\n",
" cant_fit = True\n",
" if memtrace is None: memtrace = GPUMemTrace()\n",
" while cant_fit:\n",
" try:\n",
" learn.data.batch_size = bs\n",
" learn.fit(1)\n",
" cant_fit = False\n",
" out = bs\n",
" except RuntimeError as e:\n",
" if verbose:\n",
" print(e)\n",
" print(f'tried bs:{bs}')\n",
" if verbose: memtrace.report(f'bs:{bs}')\n",
" test_dict[bs]=tuple(memtrace.data())\n",
" bs = int(bs*r_factor)\n",
" if bs<2: \n",
" cant_fit=False\n",
" out=None\n",
" \n",
" return out,test_dict"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import fastai\n",
"import fastprogress\n",
"\n",
"def disable_progress():\n",
" fastprogress.fastprogress.NO_BAR = True\n",
" master_bar, progress_bar = fastprogress.force_console_behavior()\n",
" fastai.basic_train.master_bar, fastai.basic_train.progress_bar = master_bar, progress_bar\n",
" \n",
"def enable_progress():\n",
" fastai.basic_train.master_bar, fastai.basic_train.progress_bar = fastprogress.master_bar, fastprogress.progress_bar"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"## proto do_the_magic()\n",
"def do_the_magic(learn,max_bs=32,verbose=True,memtrace=None):\n",
" disable_progress()\n",
" defaults.extra_callbacks = [FitNBatch()]\n",
" bs_test_data = test_from_max_bs(learn,max_bs=max_bs,verbose=verbose,memtrace=memtrace)\n",
" enable_progress()\n",
" defaults.extra_callbacks = None\n",
" return bs_test_data"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"res_models = [models.resnet18,models.resnet34,models.resnet50,models.resnet101,models.resnet152]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"*** Experiment started with the Pytorch backend\n",
"Device: ID 0, TITAN Xp (12194 RAM)\n",
"\n",
"・ RAM: △Consumed △Peaked Used Total | Exec time 0:00:04.665\n",
"・ CPU: 0 0 2,317 MB |\n",
"・ GPU: 9,034 0 10,194 MB |\n"
]
}
],
"source": [
"exp1 = IPyExperimentsPytorch(exp_enable=False)\n",
"x1 = gpu_mem_leave_free_mbs(2000)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch train_loss valid_loss time \n",
"epoch train_loss valid_loss time \n",
"epoch train_loss valid_loss time \n",
"epoch train_loss valid_loss time \n",
"epoch train_loss valid_loss time \n",
"epoch train_loss valid_loss time \n",
"epoch train_loss valid_loss time \n",
"epoch train_loss valid_loss time \n",
"epoch train_loss valid_loss time \n",
"epoch train_loss valid_loss time \n",
"epoch train_loss valid_loss time \n",
"epoch train_loss valid_loss time \n",
"1 3.716801 4.486217 00:03 \n"
]
},
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"・ RAM: △Consumed △Peaked Used Total | Exec time 0:00:39.274\n",
"・ CPU: 2 0 2,765 MB |\n",
"・ GPU: 1,168 828 11,362 MB |\n"
]
}
],
"source": [
"arch = res_models[-1]\n",
"learn = create_cnn(data, arch)\n",
"learn.unfreeze()\n",
"arch.__name__\n",
"\n",
"memtrace = GPUMemTrace('RunningTest')\n",
"max_bs,tested_bs_data = do_the_magic(learn,max_bs=64,verbose=False,memtrace=memtrace)\n",
"max_bs"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"・ RAM: △Consumed △Peaked Used Total | Exec time 0:00:00.001\n",
"・ CPU: 0 0 2,765 MB |\n",
"・ GPU: 0 0 11,362 MB |\n"
]
}
],
"source": [
"memtrace.report()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"・ RAM: △Consumed △Peaked Used Total | Exec time 0:00:00.126\n",
"・ CPU: 0 0 2,765 MB |\n",
"・ GPU: -10,184 0 1,178 MB |\n",
"Error in callback <function CellLogger.post_run_cell at 0x7fea54a75ea0> (for post_run_cell):\n"
]
},
{
"ename": "AttributeError",
"evalue": "'NoneType' object has no attribute 'gpu_ram_used'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m~/anaconda3/envs/fastaiv1_dev/lib/python3.7/site-packages/backcall/backcall.py\u001b[0m in \u001b[0;36madapted\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 102\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[0;31m# print(args, kwargs, unmatched_pos, cut_positional, unmatched_kw)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 104\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 105\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 106\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0madapted\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/fastaiv1_dev/lib/python3.7/site-packages/ipyexperiments/cell_logger.py\u001b[0m in \u001b[0;36mpost_run_cell\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackend\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m'cpu'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 160\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgpu_mem_used_new\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgpu_ram_used\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 161\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[0;31m# delta_used is the difference between current used mem and used mem at the start\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'gpu_ram_used'"
]
}
],
"source": [
"del learn\n",
"del x1\n",
"del exp1"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>delta_used</th>\n",
" <th>delta_peaked</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>64</th>\n",
" <td>1568</td>\n",
" <td>74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>51</th>\n",
" <td>1718</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>1700</td>\n",
" <td>18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>1724</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>1702</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>1726</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>1730</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1734</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1730</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1732</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1734</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>906</td>\n",
" <td>828</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" delta_used delta_peaked\n",
"64 1568 74\n",
"51 1718 0\n",
"40 1700 18\n",
"32 1724 4\n",
"25 1702 26\n",
"20 1726 4\n",
"16 1730 0\n",
"12 1734 0\n",
"9 1730 4\n",
"7 1732 2\n",
"5 1734 0\n",
"4 906 828"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_df = pd.DataFrame(tested_bs_data).transpose()\n",
"test_df.columns = ['delta_used','delta_peaked']\n",
"test_df"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"*** Experiment started with the Pytorch backend\n",
"Device: ID 0, TITAN Xp (12194 RAM)\n",
"\n",
"・ RAM: △Consumed △Peaked Used Total | Exec time 0:00:03.767\n",
"・ CPU: 0 0 2,766 MB |\n",
"・ GPU: 7,016 0 8,194 MB |\n"
]
}
],
"source": [
"exp2 = IPyExperimentsPytorch(exp_enable=False)\n",
"x1 = gpu_mem_leave_free_mbs(4000)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch train_loss valid_loss time \n",
"epoch train_loss valid_loss time \n",
"epoch train_loss valid_loss time \n",
"epoch train_loss valid_loss time \n",
"epoch train_loss valid_loss time \n",
"epoch train_loss valid_loss time \n",
"epoch train_loss valid_loss time \n",
"epoch train_loss valid_loss time \n",
"1 3.195156 4.384258 00:05 \n"
]
},
{
"data": {
"text/plain": [
"12"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"・ RAM: △Consumed △Peaked Used Total | Exec time 0:00:35.294\n",
"・ CPU: 1 0 2,773 MB |\n",
"・ GPU: 1,724 2,272 9,918 MB |\n"
]
}
],
"source": [
"arch = res_models[-1]\n",
"learn = create_cnn(data, arch)\n",
"learn.unfreeze()\n",
"arch.__name__\n",
"\n",
"max_bs,tested_bs_data = do_the_magic(learn,max_bs=64,verbose=False)\n",
"max_bs"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"・ RAM: △Consumed △Peaked Used Total | Exec time 0:00:00.008\n",
"・ CPU: 0 0 2,772 MB |\n",
"・ GPU: -8,740 0 1,178 MB |\n"
]
}
],
"source": [
"del learn\n",
"del x1\n",
"del exp2"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>delta_used</th>\n",
" <th>delta_peaked</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>64</th>\n",
" <td>3734</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>51</th>\n",
" <td>3730</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>3732</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>3724</td>\n",
" <td>16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>3738</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>3746</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>3738</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1474</td>\n",
" <td>2272</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" delta_used delta_peaked\n",
"64 3734 2\n",
"51 3730 6\n",
"40 3732 8\n",
"32 3724 16\n",
"25 3738 2\n",
"20 3746 0\n",
"16 3738 8\n",
"12 1474 2272"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_df = pd.DataFrame(tested_bs_data).transpose()\n",
"test_df.columns = ['delta_used','delta_peaked']\n",
"test_df"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/farzin/fast_ai/course-v3/nbs/dl1\r\n"
]
}
],
"source": [
"!pwd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7 fasta.ai1 DEV",
"language": "python",
"name": "fastai1_dev"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment