@khanrc
Created January 26, 2018 06:05
DataParallel module leaks memory
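A minimal repro: with Python's cyclic garbage collector disabled, gc.collect() finds a constant 118 unreachable objects after every training iteration of a DataParallel-wrapped model, suggesting that the DataParallel forward pass creates reference cycles (Python 2 kernel, Variable-era PyTorch).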
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"from torch.autograd import Variable\n",
"from torch.utils.data import Dataset, sampler\n",
"from torchvision import datasets, transforms\n",
"import gc"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"train_ds = datasets.MNIST('./data', train=True, download=True, transform=transforms.ToTensor())\n",
"test_ds = datasets.MNIST('./data', train=False, transform=transforms.ToTensor())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"train_loader = torch.utils.data.DataLoader(train_ds, batch_size=128, shuffle=True)\n",
"test_loader = torch.utils.data.DataLoader(test_ds, batch_size=128, shuffle=False)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"class CNN(nn.Module):\n",
" def __init__(self):\n",
" super(CNN, self).__init__()\n",
"\n",
" # 28x28\n",
" self.conv1 = nn.Conv2d( 1, 32, 3, 1, 1) # 14x14\n",
" self.conv2 = nn.Conv2d(32, 32, 3, 1, 1) # 7x7\n",
" self.conv3 = nn.Conv2d(32, 32, 3, 1, 1) # 4x4\n",
" self.linear = nn.Linear(4*4*32, 10)\n",
" \n",
" def forward(self, x):\n",
" out = self.conv1(x)\n",
" out = F.max_pool2d(out, 2)\n",
" out = F.relu(out, True)\n",
" \n",
" out = self.conv2(out)\n",
" out = F.max_pool2d(out, 2)\n",
" out = F.relu(out, True)\n",
" \n",
" out = self.conv3(out)\n",
" out = F.max_pool2d(out, 2, padding=[1, 1])\n",
" out = F.relu(out, True)\n",
" \n",
" out = out.view(out.size(0), -1) # flatten\n",
" out = self.linear(out)\n",
" \n",
" return out"
]
},
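{
"cell_type": "markdown",
"metadata": {},
"source": [
"Added sanity check (not in the original run): a dummy CPU forward pass to confirm the conv/pool stack reduces 28x28 to 4x4, matching the 4*4*32 input size of the linear layer."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# random batch of two 1x28x28 images\n",
"x = Variable(torch.randn(2, 1, 28, 28))\n",
"print CNN()(x).size()  # expected: torch.Size([2, 10])"
]
},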
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4L"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.cuda.device_count()"
]
},
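{
"cell_type": "markdown",
"metadata": {},
"source": [
"Four GPUs are visible (the L suffix is just a Python 2 long), so the DataParallel wrapper below replicates the model across all four devices on each forward pass."
]
},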
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"model = CNN()\n",
"model = torch.nn.DataParallel(model)\n",
"model = model.cuda()"
]
},
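{
"cell_type": "markdown",
"metadata": {},
"source": [
"DataParallel scatters each input batch across the visible GPUs, replicates the module, runs the replicas in parallel, and gathers the outputs on the first device. The next cell is an added control experiment, a sketch not in the original gist: compare how many cyclic objects gc.collect() finds after a forward pass through a plain model versus the DataParallel-wrapped one."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plain_model = CNN().cuda()  # same architecture, no DataParallel wrapper\n",
"gc.disable()\n",
"gc.collect()  # flush any pre-existing cyclic garbage first\n",
"\n",
"x = Variable(torch.randn(128, 1, 28, 28).cuda())\n",
"_ = plain_model(x)\n",
"print \"plain model, collected:\", gc.collect()         # expected to be ~0\n",
"_ = model(x)\n",
"print \"DataParallel model, collected:\", gc.collect()  # nonzero if the wrapper creates cycles"
]
},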
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def train(epoch):\n",
" model.train()\n",
" cc = 0\n",
" for batch_idx, (data, target) in enumerate(train_loader):\n",
" data, target = data.cuda(), target.cuda()\n",
" data, target = Variable(data), Variable(target)\n",
" optimizer.zero_grad()\n",
" output = model(data)\n",
" loss = criterion(output, target)\n",
" loss.backward()\n",
" optimizer.step()\n",
" if batch_idx % 100 == 0:\n",
" print('Train Epoch: {} [{}/{} ({:.0f}%)]\\tLoss: {:.6f}'.format(\n",
" epoch, batch_idx * len(data), len(train_loader.dataset), \n",
" 100. * batch_idx / len(train_loader), loss.data[0]))\n",
" \n",
" collected = gc.collect()\n",
" cc += collected\n",
" print \"collected\", collected\n",
" print gc.garbage\n",
" if batch_idx == 5:\n",
" break\n",
" print\n",
" \n",
" print \"total collected\", cc\n",
"\n",
"def test():\n",
" model.eval()\n",
" test_loss = 0\n",
" correct = 0\n",
" for data, target in test_loader:\n",
" data, target = data.cuda(), target.cuda()\n",
" data, target = Variable(data, volatile=True), Variable(target)\n",
" output = model(data)\n",
" test_loss += criterion(output, target).data[0] # sum up batch loss\n",
" pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability\n",
" correct += pred.eq(target.data.view_as(pred)).cpu().sum()\n",
"\n",
" test_loss /= len(test_loader.dataset)\n",
" print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\\n'.format(\n",
" test_loss, correct, len(test_loader.dataset), \n",
" 100. * correct / len(test_loader.dataset)))"
]
},
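{
"cell_type": "markdown",
"metadata": {},
"source": [
"With automatic collection disabled (next cell), the gc.collect() call inside train() isolates how much cyclic garbage each iteration creates; the run below reports a constant 118 objects per batch. To inspect what those objects are, gc.DEBUG_SAVEALL can divert collected objects into gc.garbage so their types can be tallied. An added inspection sketch, not part of the original run:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import collections\n",
"\n",
"gc.set_debug(gc.DEBUG_SAVEALL)  # keep collected objects in gc.garbage instead of freeing them\n",
"x = Variable(torch.randn(128, 1, 28, 28).cuda())\n",
"_ = model(x)                    # one forward pass through the DataParallel model\n",
"gc.collect()\n",
"print collections.Counter(type(o).__name__ for o in gc.garbage).most_common(10)\n",
"gc.set_debug(0)\n",
"del gc.garbage[:]               # release the saved objects"
]
},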
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Epoch: 0 [0/60000 (0%)]\tLoss: 2.299755\n",
"collected 118\n",
"[]\n",
"\n",
"collected 118\n",
"[]\n",
"\n",
"collected 118\n",
"[]\n",
"\n",
"collected 118\n",
"[]\n",
"\n",
"collected 118\n",
"[]\n",
"\n",
"collected 118\n",
"[]\n",
"total collected 708\n"
]
}
],
"source": [
"n_epochs = 1\n",
"optimizer = torch.optim.Adam(model.parameters())\n",
"criterion = nn.CrossEntropyLoss().cuda()\n",
"\n",
"# gc.set_debug(gc.DEBUG_LEAK)\n",
"gc.set_debug(0)\n",
"gc.disable()\n",
"\n",
"for epoch in range(n_epochs):\n",
" train(epoch)\n",
"# test()"
]
}
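,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Takeaway: each training iteration leaves a constant 118 objects that only the cyclic collector can reclaim. With collection disabled they accumulate without bound; with it enabled they are reclaimed only when a collection pass runs, so any memory held by the cycles is released late. This delayed release is presumably the leak the title refers to."
]
}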
],
"metadata": {
"kernelspec": {
"display_name": "Python 2 - tf.latest",
"language": "python",
"name": "python2-tf-latest"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}