feevos/gluon_accuracy_example_multi_gpu.ipynb

## gluon_accuracy_example_multi_gpu.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "import numpy as np\n",
    "import mxnet as mx\n",
    "from mxnet import nd, autograd, gluon\n",
    "from time import time\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "mx.random.seed(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size = 256\n",
    "num_inputs = 784\n",
    "num_outputs = 10\n",
    "num_gpus = 1\n",
    "learning_rate = .1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "ctx = [mx.gpu(i) for i in range(num_gpus)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def transform(data, label):\n",
    "    return nd.transpose(data.astype(np.float32), (2,0,1))/255, label.astype(np.float32)\n",
    "\n",
    "train_data = gluon.data.DataLoader(gluon.data.vision.MNIST(train=True, transform=transform),\n",
    "                                   batch_size, shuffle=True, num_workers=4,last_batch='discard')\n",
    "test_data = gluon.data.DataLoader(gluon.data.vision.MNIST(train=False, transform=transform),\n",
    "                                  batch_size, shuffle=False, num_workers=4,last_batch='discard')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_fc = 512\n",
    "net = gluon.nn.Sequential()\n",
    "with net.name_scope():\n",
    "    net.add(gluon.nn.Conv2D(channels=20, kernel_size=5, activation='relu'))\n",
    "    net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2))\n",
    "    net.add(gluon.nn.Conv2D(channels=50, kernel_size=5, activation='relu'))\n",
    "    net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2))\n",
    "    # The Flatten layer collapses all axis, except the first one, into one axis.\n",
    "    net.add(gluon.nn.Flatten())\n",
    "    net.add(gluon.nn.Dense(num_fc, activation=\"relu\"))\n",
    "    net.add(gluon.nn.Dense(num_outputs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "net.initialize(mx.init.Xavier(magnitude=2.24), force_reinit=True, ctx=ctx)\n",
    "net.hybridize(static_alloc=True,static_shape=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': learning_rate})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# train_data_l and test_data_l are redundant since you've defined your dataloader objects above"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load training data into GPUs, each data_l contains arrays deployed to gpu1/2/3/4\n",
    "# there will be 235 loop iterations\n",
    "# train_data_l = []\n",
    "# train_label_l = []\n",
    "# for data,label in train_data:\n",
    "#     train_data_l.append(gluon.utils.split_and_load(data, ctx))\n",
    "#     train_label_l.append(gluon.utils.split_and_load(label, ctx))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load test data inro GPUs\n",
    "# test_data_l = []\n",
    "# test_label_l = []\n",
    "# for data,label in test_data:\n",
    "#     test_data_l.append(gluon.utils.split_and_load(data, ctx))\n",
    "#     test_label_l.append(gluon.utils.split_and_load(label, ctx))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "data_l - List with 235 elements, each element of data_l is\n",
    "List of 4 elements, each of these 4 elems is\n",
    "NDArray of shape: (64, 1, 28, 28)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Default accuracy function (this only works on one GPU and won't work for ctx = [gpu(0), gpu(1),])\n",
    "def evaluate_accuracy(data_iterator, net):\n",
    "    acc = mx.metric.Accuracy()\n",
    "    for i, (data, label) in enumerate(data_iterator):\n",
    "        data = data.as_in_context(ctx)\n",
    "        label = label.as_in_context(ctx)\n",
    "        output = net(data)\n",
    "        predictions = nd.argmax(output, axis=1)\n",
    "        acc.update(preds=predictions, labels=label)\n",
    "    return acc.get()[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Suggession by feevos: runs\n",
    "def eval_acc_feevos1(net, _data_generator):\n",
    "    acc = mx.metric.Accuracy() # Single accuracy \n",
    "    for i, (tdata, tlabel) in enumerate(_data_generator):\n",
    "        data = tdata.as_in_context(mx.gpu(0))\n",
    "        label = nd.array(tlabel) # keep this in cpu context, since this is already done inside the definition of Accuracy\n",
    "        pred = nd.argmax(net(data),axis=1).as_in_context(mx.cpu())\n",
    "        acc.update(preds=pred,labels=label)\n",
    "    return (acc.get()[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Suggession by feevos: runs\n",
    "def eval_acc_feevos2(net, _data_generator):\n",
    "    acc = mx.metric.Accuracy() # Single accuracy \n",
    "    for i, (tdata, tlabel) in enumerate(_data_generator):\n",
    "        data = gluon.utils.split_and_load(tdata, ctx)\n",
    "        label = nd.array(tlabel) # keep this in cpu context, since this is already done inside the definition of Accuracy   \n",
    "        # Perform inference on each separate GPU \n",
    "        pred = [nd.argmax(net(X),axis=1).as_in_context(mx.cpu()) for X in data]\n",
    "        pred = nd.concat(*pred,dim=0) # Collect results\n",
    "        \n",
    "        acc.update(preds=pred,labels=label) # update single accuracy\n",
    "\n",
    "    return (acc.get()[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This works, but ugly, slow and requires loading labels into GPUs, which is redundant!\n",
    "# As we see below accuracy calculation adds ~20 seconds into epoch time\n",
    "# See more at: https://discuss.mxnet.io/t/evaluate-accuracy-on-multi-gpu-machine/1972\n",
    "def eval_acc(net, data_l, label_l):\n",
    "    acc = [mx.metric.Accuracy() for i in range(num_gpus)]\n",
    "    for i, (data, label) in enumerate(zip(data_l, label_l)): # loop on 235 batches\n",
    "        D=[data[n].as_in_context(mx.gpu(n)) for n in range(0,num_gpus)]\n",
    "        L=[label[n].as_in_context(mx.gpu(n)) for n in range(0,num_gpus)]\n",
    "        P = [nd.argmax(net(d), axis=1) for d in D]\n",
    "        [a.update(preds=p, labels=l) for p, a, l in zip(P, acc, L)]\n",
    "    return sum([a.get()[1] for a in acc])/num_gpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 0: Loss: 0.103, train_accuracy 0.977, test_accuracy 0.979, Time 3.7 sec\n",
      "Epoch 1: Loss: 0.071, train_accuracy 0.977, test_accuracy 0.979, Time 2.9 sec\n",
      "Epoch 2: Loss: 0.057, train_accuracy 0.977, test_accuracy 0.979, Time 2.9 sec\n",
      "Epoch 3: Loss: 0.049, train_accuracy 0.977, test_accuracy 0.979, Time 3.2 sec\n",
      "Epoch 4: Loss: 0.042, train_accuracy 0.977, test_accuracy 0.979, Time 3.1 sec\n",
      "Epoch 5: Loss: 0.036, train_accuracy 0.992, test_accuracy 0.990, Time 3.3 sec\n",
      "Epoch 6: Loss: 0.031, train_accuracy 0.992, test_accuracy 0.990, Time 2.6 sec\n",
      "Epoch 7: Loss: 0.028, train_accuracy 0.992, test_accuracy 0.990, Time 2.7 sec\n",
      "Epoch 8: Loss: 0.025, train_accuracy 0.992, test_accuracy 0.990, Time 2.6 sec\n",
      "Epoch 9: Loss: 0.023, train_accuracy 0.992, test_accuracy 0.990, Time 2.7 sec\n",
      "Epoch 10: Loss: 0.021, train_accuracy 0.995, test_accuracy 0.990, Time 3.4 sec\n"
     ]
    }
   ],
   "source": [
    "epochs = 11\n",
    "smoothing_constant = .01\n",
    "test_acc = train_acc = 0\n",
    "\n",
    "for e in range(epochs):\n",
    "    train_loss = 0.\n",
    "    tic = time()\n",
    "    c=1\n",
    "    for data, label in train_data: # read the batch (batch_size rows) from train_data, see batch_size in DataLoader\n",
    "        data_list = gluon.utils.split_and_load(data, ctx) # split batch_size into num_gpu devices\n",
    "        label_list = gluon.utils.split_and_load(label, ctx)\n",
    "\n",
    "        with autograd.record():\n",
    "            losses = [softmax_cross_entropy(net(X), y)\n",
    "                      for X, y in zip(data_list, label_list)]\n",
    "        for l in losses:\n",
    "            l.backward()\n",
    "\n",
    "        trainer.step(batch_size)\n",
    "        # Sum losses over all devices\n",
    "        train_loss += sum([l.sum().asscalar() for l in losses])\n",
    "        \n",
    "    if (e % 5 == 0): # calculate accuracy every 5th epoch\n",
    "        test_acc = eval_acc(net, test_data_l, test_label_l) #eval_acc_cpu(net, test_data_l, test_label_l)\n",
    "        train_acc = eval_acc(net, train_data_l, train_label_l) #eval_acc_cpu(net, train_data_l, train_label_l)\n",
    "    \n",
    "    print(\"Epoch %d: Loss: %.3f, train_accuracy %.3f, test_accuracy %.3f, Time %.1f sec\" % \n",
    "          (e, train_loss/len(train_data)/batch_size, train_acc, test_acc, time()-tic))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "net.save_params(\"models/cnn_4gpu_mnist.par\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 0: Loss: 0.012, train_accuracy 0.992, test_accuracy 0.985, Time 5.4 sec\n",
      "Epoch 1: Loss: 0.011, train_accuracy 0.992, test_accuracy 0.985, Time 2.5 sec\n",
      "Epoch 2: Loss: 0.011, train_accuracy 0.992, test_accuracy 0.985, Time 2.8 sec\n",
      "Epoch 3: Loss: 0.009, train_accuracy 0.992, test_accuracy 0.985, Time 2.5 sec\n",
      "Epoch 4: Loss: 0.008, train_accuracy 0.992, test_accuracy 0.985, Time 2.6 sec\n",
      "Epoch 5: Loss: 0.008, train_accuracy 0.998, test_accuracy 0.991, Time 5.4 sec\n",
      "Epoch 6: Loss: 0.007, train_accuracy 0.998, test_accuracy 0.991, Time 2.6 sec\n"
     ]
    }
   ],
   "source": [
    "epochs = 7\n",
    "smoothing_constant = .01\n",
    "test_acc = train_acc = 0\n",
    "\n",
    "for e in range(epochs):\n",
    "    train_loss = 0.\n",
    "    tic = time()\n",
    "    c=1\n",
    "    for data, label in train_data: # read the batch (batch_size rows) from train_data, see batch_size in DataLoader\n",
    "        data_list = gluon.utils.split_and_load(data, ctx) # split batch_size into num_gpu devices\n",
    "        label_list = gluon.utils.split_and_load(label, ctx)\n",
    "\n",
    "        with autograd.record():\n",
    "            losses = [softmax_cross_entropy(net(X), y)\n",
    "                      for X, y in zip(data_list, label_list)]\n",
    "        for l in losses:\n",
    "            l.backward()\n",
    "\n",
    "        trainer.step(batch_size)\n",
    "        # Sum losses over all devices\n",
    "        train_loss += sum([l.sum().asscalar() for l in losses])\n",
    "        \n",
    "    if (e % 5 == 0): # calculate accuracy every 5th epoch\n",
    "        test_acc = eval_acc_feevos2(net, test_data) #eval_acc_cpu(net, test_data_l, test_label_l)\n",
    "        train_acc = eval_acc_feevos2(net, train_data) #eval_acc_cpu(net, train_data_l, train_label_l)\n",
    "    \n",
    "    print(\"Epoch %d: Loss: %.3f, train_accuracy %.3f, test_accuracy %.3f, Time %.1f sec\" % \n",
    "          (e, train_loss/len(train_data)/batch_size, train_acc, test_acc, time()-tic))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9901842948717948"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "eval_acc_feevos2(net,test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"from __future__ import print_function\n",
	"import numpy as np\n",
	"import mxnet as mx\n",
	"from mxnet import nd, autograd, gluon\n",
	"from time import time\n",
	"import warnings\n",
	"warnings.filterwarnings('ignore')\n",
	"mx.random.seed(1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"batch_size = 256\n",
	"num_inputs = 784\n",
	"num_outputs = 10\n",
	"num_gpus = 1\n",
	"learning_rate = .1"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"ctx = [mx.gpu(i) for i in range(num_gpus)]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [],
	"source": [
	"def transform(data, label):\n",
	" return nd.transpose(data.astype(np.float32), (2,0,1))/255, label.astype(np.float32)\n",
	"\n",
	"train_data = gluon.data.DataLoader(gluon.data.vision.MNIST(train=True, transform=transform),\n",
	" batch_size, shuffle=True, num_workers=4,last_batch='discard')\n",
	"test_data = gluon.data.DataLoader(gluon.data.vision.MNIST(train=False, transform=transform),\n",
	" batch_size, shuffle=False, num_workers=4,last_batch='discard')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [],
	"source": [
	"num_fc = 512\n",
	"net = gluon.nn.Sequential()\n",
	"with net.name_scope():\n",
	" net.add(gluon.nn.Conv2D(channels=20, kernel_size=5, activation='relu'))\n",
	" net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2))\n",
	" net.add(gluon.nn.Conv2D(channels=50, kernel_size=5, activation='relu'))\n",
	" net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2))\n",
	" # The Flatten layer collapses all axis, except the first one, into one axis.\n",
	" net.add(gluon.nn.Flatten())\n",
	" net.add(gluon.nn.Dense(num_fc, activation=\"relu\"))\n",
	" net.add(gluon.nn.Dense(num_outputs))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [],
	"source": [
	"net.initialize(mx.init.Xavier(magnitude=2.24), force_reinit=True, ctx=ctx)\n",
	"net.hybridize(static_alloc=True,static_shape=True)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {},
	"outputs": [],
	"source": [
	"softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {},
	"outputs": [],
	"source": [
	"trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': learning_rate})"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# train_data_l and test_data_l are redundant since you've defined your dataloader objects above"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Load training data into GPUs, each data_l contains arrays deployed to gpu1/2/3/4\n",
	"# there will be 235 loop iterations\n",
	"# train_data_l = []\n",
	"# train_label_l = []\n",
	"# for data,label in train_data:\n",
	"# train_data_l.append(gluon.utils.split_and_load(data, ctx))\n",
	"# train_label_l.append(gluon.utils.split_and_load(label, ctx))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Load test data inro GPUs\n",
	"# test_data_l = []\n",
	"# test_label_l = []\n",
	"# for data,label in test_data:\n",
	"# test_data_l.append(gluon.utils.split_and_load(data, ctx))\n",
	"# test_label_l.append(gluon.utils.split_and_load(label, ctx))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"data_l - List with 235 elements, each element of data_l is\n",
	"List of 4 elements, each of these 4 elems is\n",
	"NDArray of shape: (64, 1, 28, 28)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Default accuracy function (this only works on one GPU and won't work for ctx = [gpu(0), gpu(1),])\n",
	"def evaluate_accuracy(data_iterator, net):\n",
	" acc = mx.metric.Accuracy()\n",
	" for i, (data, label) in enumerate(data_iterator):\n",
	" data = data.as_in_context(ctx)\n",
	" label = label.as_in_context(ctx)\n",
	" output = net(data)\n",
	" predictions = nd.argmax(output, axis=1)\n",
	" acc.update(preds=predictions, labels=label)\n",
	" return acc.get()[1]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Suggession by feevos: runs\n",
	"def eval_acc_feevos1(net, _data_generator):\n",
	" acc = mx.metric.Accuracy() # Single accuracy \n",
	" for i, (tdata, tlabel) in enumerate(_data_generator):\n",
	" data = tdata.as_in_context(mx.gpu(0))\n",
	" label = nd.array(tlabel) # keep this in cpu context, since this is already done inside the definition of Accuracy\n",
	" pred = nd.argmax(net(data),axis=1).as_in_context(mx.cpu())\n",
	" acc.update(preds=pred,labels=label)\n",
	" return (acc.get()[1])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Suggession by feevos: runs\n",
	"def eval_acc_feevos2(net, _data_generator):\n",
	" acc = mx.metric.Accuracy() # Single accuracy \n",
	" for i, (tdata, tlabel) in enumerate(_data_generator):\n",
	" data = gluon.utils.split_and_load(tdata, ctx)\n",
	" label = nd.array(tlabel) # keep this in cpu context, since this is already done inside the definition of Accuracy \n",
	" # Perform inference on each separate GPU \n",
	" pred = [nd.argmax(net(X),axis=1).as_in_context(mx.cpu()) for X in data]\n",
	" pred = nd.concat(*pred,dim=0) # Collect results\n",
	" \n",
	" acc.update(preds=pred,labels=label) # update single accuracy\n",
	"\n",
	" return (acc.get()[1])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [],
	"source": [
	"# This works, but ugly, slow and requires loading labels into GPUs, which is redundant!\n",
	"# As we see below accuracy calculation adds ~20 seconds into epoch time\n",
	"# See more at: https://discuss.mxnet.io/t/evaluate-accuracy-on-multi-gpu-machine/1972\n",
	"def eval_acc(net, data_l, label_l):\n",
	" acc = [mx.metric.Accuracy() for i in range(num_gpus)]\n",
	" for i, (data, label) in enumerate(zip(data_l, label_l)): # loop on 235 batches\n",
	" D=[data[n].as_in_context(mx.gpu(n)) for n in range(0,num_gpus)]\n",
	" L=[label[n].as_in_context(mx.gpu(n)) for n in range(0,num_gpus)]\n",
	" P = [nd.argmax(net(d), axis=1) for d in D]\n",
	" [a.update(preds=p, labels=l) for p, a, l in zip(P, acc, L)]\n",
	" return sum([a.get()[1] for a in acc])/num_gpus"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Epoch 0: Loss: 0.103, train_accuracy 0.977, test_accuracy 0.979, Time 3.7 sec\n",
	"Epoch 1: Loss: 0.071, train_accuracy 0.977, test_accuracy 0.979, Time 2.9 sec\n",
	"Epoch 2: Loss: 0.057, train_accuracy 0.977, test_accuracy 0.979, Time 2.9 sec\n",
	"Epoch 3: Loss: 0.049, train_accuracy 0.977, test_accuracy 0.979, Time 3.2 sec\n",
	"Epoch 4: Loss: 0.042, train_accuracy 0.977, test_accuracy 0.979, Time 3.1 sec\n",
	"Epoch 5: Loss: 0.036, train_accuracy 0.992, test_accuracy 0.990, Time 3.3 sec\n",
	"Epoch 6: Loss: 0.031, train_accuracy 0.992, test_accuracy 0.990, Time 2.6 sec\n",
	"Epoch 7: Loss: 0.028, train_accuracy 0.992, test_accuracy 0.990, Time 2.7 sec\n",
	"Epoch 8: Loss: 0.025, train_accuracy 0.992, test_accuracy 0.990, Time 2.6 sec\n",
	"Epoch 9: Loss: 0.023, train_accuracy 0.992, test_accuracy 0.990, Time 2.7 sec\n",
	"Epoch 10: Loss: 0.021, train_accuracy 0.995, test_accuracy 0.990, Time 3.4 sec\n"
	]
	}
	],
	"source": [
	"epochs = 11\n",
	"smoothing_constant = .01\n",
	"test_acc = train_acc = 0\n",
	"\n",
	"for e in range(epochs):\n",
	" train_loss = 0.\n",
	" tic = time()\n",
	" c=1\n",
	" for data, label in train_data: # read the batch (batch_size rows) from train_data, see batch_size in DataLoader\n",
	" data_list = gluon.utils.split_and_load(data, ctx) # split batch_size into num_gpu devices\n",
	" label_list = gluon.utils.split_and_load(label, ctx)\n",
	"\n",
	" with autograd.record():\n",
	" losses = [softmax_cross_entropy(net(X), y)\n",
	" for X, y in zip(data_list, label_list)]\n",
	" for l in losses:\n",
	" l.backward()\n",
	"\n",
	" trainer.step(batch_size)\n",
	" # Sum losses over all devices\n",
	" train_loss += sum([l.sum().asscalar() for l in losses])\n",
	" \n",
	" if (e % 5 == 0): # calculate accuracy every 5th epoch\n",
	" test_acc = eval_acc(net, test_data_l, test_label_l) #eval_acc_cpu(net, test_data_l, test_label_l)\n",
	" train_acc = eval_acc(net, train_data_l, train_label_l) #eval_acc_cpu(net, train_data_l, train_label_l)\n",
	" \n",
	" print(\"Epoch %d: Loss: %.3f, train_accuracy %.3f, test_accuracy %.3f, Time %.1f sec\" % \n",
	" (e, train_loss/len(train_data)/batch_size, train_acc, test_acc, time()-tic))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"net.save_params(\"models/cnn_4gpu_mnist.par\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Epoch 0: Loss: 0.012, train_accuracy 0.992, test_accuracy 0.985, Time 5.4 sec\n",
	"Epoch 1: Loss: 0.011, train_accuracy 0.992, test_accuracy 0.985, Time 2.5 sec\n",
	"Epoch 2: Loss: 0.011, train_accuracy 0.992, test_accuracy 0.985, Time 2.8 sec\n",
	"Epoch 3: Loss: 0.009, train_accuracy 0.992, test_accuracy 0.985, Time 2.5 sec\n",
	"Epoch 4: Loss: 0.008, train_accuracy 0.992, test_accuracy 0.985, Time 2.6 sec\n",
	"Epoch 5: Loss: 0.008, train_accuracy 0.998, test_accuracy 0.991, Time 5.4 sec\n",
	"Epoch 6: Loss: 0.007, train_accuracy 0.998, test_accuracy 0.991, Time 2.6 sec\n"
	]
	}
	],
	"source": [
	"epochs = 7\n",
	"smoothing_constant = .01\n",
	"test_acc = train_acc = 0\n",
	"\n",
	"for e in range(epochs):\n",
	" train_loss = 0.\n",
	" tic = time()\n",
	" c=1\n",
	" for data, label in train_data: # read the batch (batch_size rows) from train_data, see batch_size in DataLoader\n",
	" data_list = gluon.utils.split_and_load(data, ctx) # split batch_size into num_gpu devices\n",
	" label_list = gluon.utils.split_and_load(label, ctx)\n",
	"\n",
	" with autograd.record():\n",
	" losses = [softmax_cross_entropy(net(X), y)\n",
	" for X, y in zip(data_list, label_list)]\n",
	" for l in losses:\n",
	" l.backward()\n",
	"\n",
	" trainer.step(batch_size)\n",
	" # Sum losses over all devices\n",
	" train_loss += sum([l.sum().asscalar() for l in losses])\n",
	" \n",
	" if (e % 5 == 0): # calculate accuracy every 5th epoch\n",
	" test_acc = eval_acc_feevos2(net, test_data) #eval_acc_cpu(net, test_data_l, test_label_l)\n",
	" train_acc = eval_acc_feevos2(net, train_data) #eval_acc_cpu(net, train_data_l, train_label_l)\n",
	" \n",
	" print(\"Epoch %d: Loss: %.3f, train_accuracy %.3f, test_accuracy %.3f, Time %.1f sec\" % \n",
	" (e, train_loss/len(train_data)/batch_size, train_acc, test_acc, time()-tic))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.9901842948717948"
	]
	},
	"execution_count": 21,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"eval_acc_feevos2(net,test_data)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}