
@mohanr
Last active May 26, 2022 05:07
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "9229d8fd-f0a9-4d66-b7c0-8700a7076202",
"metadata": {},
"outputs": [],
"source": [
"%%writefile task.py\n",
"\n",
"import os\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"from tensorflow import keras\n",
"import time\n",
"\n",
"# At the begining of the program\n",
"distribution = tf.distribute.MultiWorkerMirroredStrategy()\n",
"\n",
"resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()\n",
"print(\"Starting task {}{}\".format(resolver.task_type, resolver.task_id))\n",
"\n",
"# Only worker #0 will write checkpoints and log to TensorBoard\n",
"if resolver.task_id == 0:\n",
" root_logdir = os.path.join(os.curdir, # os.getcwd(), #os.curdir,\n",
" \"task_logs/\",\n",
" \"psworker_logs\"\n",
" )\n",
" run_id = time.strftime(\"run_%Y_%m_%d-%H_%M_%S\")\n",
" run_dir = os.path.join(root_logdir, run_id)\n",
" callbacks = [keras.callbacks.TensorBoard(run_dir),\n",
" keras.callbacks.ModelCheckpoint(\"psworker_model.h5\",\n",
" save_best_only=True\n",
" ),\n",
" ]\n",
"else:\n",
" callbacks = []\n",
"\n",
"# load and prepare the MNIST dataset\n",
"(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.mnist.load_data()\n",
"X_train_full = X_train_full[..., np.newaxis] / 255.\n",
"X_valid, X_train = X_train_full[:5000], X_train_full[5000:]\n",
"y_valid, y_train = y_train_full[:5000], y_train_full[5000:]\n",
"\n",
"with distribution.scope():\n",
" model = keras.models.Sequential([\n",
" keras.layers.Conv2D(filters=64, kernel_size=7, activation=\"relu\",\n",
" padding=\"same\", input_shape=[28, 28, 1]\n",
" ), # (None, 28, 28, 64)\n",
" keras.layers.MaxPooling2D(pool_size=2), # (None, 14, 14, 64)\n",
"\n",
" keras.layers.Conv2D(filters=128, kernel_size=3, activation=\"relu\",\n",
" padding=\"same\"\n",
" ), # (None, 14, 14, 128)\n",
" keras.layers.Conv2D(filters=128, kernel_size=3, activation=\"relu\",\n",
" padding=\"same\"\n",
" ),\n",
" keras.layers.MaxPooling2D(pool_size=2), # (None, 7, 7, 128)\n",
"\n",
" keras.layers.Flatten(), # (None, 6272)\n",
" keras.layers.Dense(units=64, activation='relu'), # (None, 64)\n",
" keras.layers.Dropout(0.5),\n",
" keras.layers.Dense(units=10, activation=\"softmax\"),\n",
" # (None, 10)\n",
" ])\n",
" model.compile(loss=\"sparse_categorical_crossentropy\",\n",
" optimizer=keras.optimizers.SGD(learning_rate=1e-2),\n",
" metrics=[\"accuracy\"]\n",
" )\n",
"\n",
"model.fit(X_train, y_train, validation_data=(X_valid, y_valid),\n",
" epochs=10, callbacks=callbacks\n",
" )\n"
]
},
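{
"cell_type": "markdown",
"id": "evaluate-checkpoint-note",
"metadata": {},
"source": [
"The test split loaded in `task.py` is never used during training. A minimal sketch of evaluating the checkpoint written by worker #0 is shown in the next cell, assuming training has finished and `psworker_model.h5` exists in the current directory."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "evaluate-checkpoint-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: evaluate the checkpoint written by worker #0 (assumes psworker_model.h5 exists)\n",
"import numpy as np\n",
"from tensorflow import keras\n",
"\n",
"# Reload the held-out MNIST test split and apply the same preprocessing as task.py\n",
"(_, _), (X_test, y_test) = keras.datasets.mnist.load_data()\n",
"X_test = X_test[..., np.newaxis] / 255.\n",
"\n",
"restored_model = keras.models.load_model(\"psworker_model.h5\")\n",
"test_loss, test_acc = restored_model.evaluate(X_test, y_test)\n",
"print(\"test loss:\", test_loss, \"test accuracy:\", test_acc)"
]
},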
{
"cell_type": "code",
"execution_count": 13,
"id": "8cd842e9-4755-4c75-acd7-f88dfb9f81df",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-05-25 07:44:47.504469: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.504765: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.515766: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.515872: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.517581: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.517654: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.519971: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2022-05-25 07:44:47.520006: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2022-05-25 07:44:47.520607: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.520636: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.522471: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.522487: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.524314: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.524329: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.172215: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.173450: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.174346: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.175074: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13823 MB memory: -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5\n",
"2022-05-25 07:44:48.176202: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.177006: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.177751: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.178593: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.179372: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.180067: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:worker/replica:0/task:0/device:GPU:0 with 13823 MB memory: -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5\n",
"2022-05-25 07:44:48.184702: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.185967: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.186076: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> 127.0.0.1:9901, 1 -> 127.0.0.1:9902}\n",
"2022-05-25 07:44:48.186390: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:437] Started server with target: grpc://127.0.0.1:9901\n",
"2022-05-25 07:44:48.187072: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.187988: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13823 MB memory: -> device: 0, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5\n",
"2022-05-25 07:44:48.190367: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.191235: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.192153: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.192898: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.193639: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.194373: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:worker/replica:0/task:1/device:GPU:0 with 13823 MB memory: -> device: 0, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5\n",
"2022-05-25 07:44:48.199645: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> 127.0.0.1:9901, 1 -> 127.0.0.1:9902}\n",
"2022-05-25 07:44:48.200143: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:437] Started server with target: grpc://127.0.0.1:9902\n",
"2022-05-25 07:44:49.314752: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Did not find a shardable source, walked to a node which is not a dataset: name: \"FlatMapDataset/_9\"\n",
"op: \"FlatMapDataset\"\n",
"input: \"PrefetchDataset/_8\"\n",
"attr {\n",
" key: \"Targuments\"\n",
" value {\n",
" list {\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"_cardinality\"\n",
" value {\n",
" i: -2\n",
" }\n",
"}\n",
"attr {\n",
" key: \"f\"\n",
" value {\n",
" func {\n",
" name: \"__inference_Dataset_flat_map_slice_batch_indices_308\"\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"metadata\"\n",
" value {\n",
" s: \"\\n\\020FlatMapDataset:4\"\n",
" }\n",
"}\n",
"attr {\n",
" key: \"output_shapes\"\n",
" value {\n",
" list {\n",
" shape {\n",
" dim {\n",
" size: -1\n",
" }\n",
" }\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"output_types\"\n",
" value {\n",
" list {\n",
" type: DT_INT64\n",
" }\n",
" }\n",
"}\n",
"experimental_type {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_DATASET\n",
" args {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_TENSOR\n",
" args {\n",
" type_id: TFT_INT64\n",
" }\n",
" }\n",
" }\n",
" }\n",
" args {\n",
" type_id: TFT_DATASET\n",
" args {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_TENSOR\n",
" args {\n",
" type_id: TFT_INT64\n",
" }\n",
" }\n",
" }\n",
" }\n",
"}\n",
". Consider either turning off auto-sharding or switching the auto_shard_policy to DATA to shard this dataset. You can do this by creating a new `tf.data.Options()` object then setting `options.experimental_distribute.auto_shard_policy = AutoShardPolicy.DATA` before applying the options object to the dataset via `dataset.with_options(options)`.\n",
"2022-05-25 07:44:49.316088: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Did not find a shardable source, walked to a node which is not a dataset: name: \"FlatMapDataset/_9\"\n",
"op: \"FlatMapDataset\"\n",
"input: \"PrefetchDataset/_8\"\n",
"attr {\n",
" key: \"Targuments\"\n",
" value {\n",
" list {\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"_cardinality\"\n",
" value {\n",
" i: -2\n",
" }\n",
"}\n",
"attr {\n",
" key: \"f\"\n",
" value {\n",
" func {\n",
" name: \"__inference_Dataset_flat_map_slice_batch_indices_292\"\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"metadata\"\n",
" value {\n",
" s: \"\\n\\020FlatMapDataset:4\"\n",
" }\n",
"}\n",
"attr {\n",
" key: \"output_shapes\"\n",
" value {\n",
" list {\n",
" shape {\n",
" dim {\n",
" size: -1\n",
" }\n",
" }\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"output_types\"\n",
" value {\n",
" list {\n",
" type: DT_INT64\n",
" }\n",
" }\n",
"}\n",
"experimental_type {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_DATASET\n",
" args {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_TENSOR\n",
" args {\n",
" type_id: TFT_INT64\n",
" }\n",
" }\n",
" }\n",
" }\n",
" args {\n",
" type_id: TFT_DATASET\n",
" args {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_TENSOR\n",
" args {\n",
" type_id: TFT_INT64\n",
" }\n",
" }\n",
" }\n",
" }\n",
"}\n",
". Consider either turning off auto-sharding or switching the auto_shard_policy to DATA to shard this dataset. You can do this by creating a new `tf.data.Options()` object then setting `options.experimental_distribute.auto_shard_policy = AutoShardPolicy.DATA` before applying the options object to the dataset via `dataset.with_options(options)`.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting task worker1\n",
"Epoch 1/10\n",
"Starting task worker0\n",
"Epoch 1/10\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-05-25 07:44:52.699962: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8200\n",
"2022-05-25 07:44:52.706704: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8200\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"1719/1719 [==============================] - ETA: 0s - loss: 0.7568 - accuracy: 0.7579"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-05-25 07:45:23.092390: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Did not find a shardable source, walked to a node which is not a dataset: name: \"FlatMapDataset/_9\"\n",
"op: \"FlatMapDataset\"\n",
"input: \"PrefetchDataset/_8\"\n",
"attr {\n",
" key: \"Targuments\"\n",
" value {\n",
" list {\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"_cardinality\"\n",
" value {\n",
" i: -2\n",
" }\n",
"}\n",
"attr {\n",
" key: \"f\"\n",
" value {\n",
" func {\n",
" name: \"__inference_Dataset_flat_map_slice_batch_indices_20246\"\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"metadata\"\n",
" value {\n",
" s: \"\\n\\021FlatMapDataset:31\"\n",
" }\n",
"}\n",
"attr {\n",
" key: \"output_shapes\"\n",
" value {\n",
" list {\n",
" shape {\n",
" dim {\n",
" size: -1\n",
" }\n",
" }\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"output_types\"\n",
" value {\n",
" list {\n",
" type: DT_INT64\n",
" }\n",
" }\n",
"}\n",
"experimental_type {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_DATASET\n",
" args {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_TENSOR\n",
" args {\n",
" type_id: TFT_INT64\n",
" }\n",
" }\n",
" }\n",
" }\n",
" args {\n",
" type_id: TFT_DATASET\n",
" args {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_TENSOR\n",
" args {\n",
" type_id: TFT_INT64\n",
" }\n",
" }\n",
" }\n",
" }\n",
"}\n",
". Consider either turning off auto-sharding or switching the auto_shard_policy to DATA to shard this dataset. You can do this by creating a new `tf.data.Options()` object then setting `options.experimental_distribute.auto_shard_policy = AutoShardPolicy.DATA` before applying the options object to the dataset via `dataset.with_options(options)`.\n",
"2022-05-25 07:45:23.114419: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Did not find a shardable source, walked to a node which is not a dataset: name: \"FlatMapDataset/_9\"\n",
"op: \"FlatMapDataset\"\n",
"input: \"PrefetchDataset/_8\"\n",
"attr {\n",
" key: \"Targuments\"\n",
" value {\n",
" list {\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"_cardinality\"\n",
" value {\n",
" i: -2\n",
" }\n",
"}\n",
"attr {\n",
" key: \"f\"\n",
" value {\n",
" func {\n",
" name: \"__inference_Dataset_flat_map_slice_batch_indices_20288\"\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"metadata\"\n",
" value {\n",
" s: \"\\n\\021FlatMapDataset:31\"\n",
" }\n",
"}\n",
"attr {\n",
" key: \"output_shapes\"\n",
" value {\n",
" list {\n",
" shape {\n",
" dim {\n",
" size: -1\n",
" }\n",
" }\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"output_types\"\n",
" value {\n",
" list {\n",
" type: DT_INT64\n",
" }\n",
" }\n",
"}\n",
"experimental_type {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_DATASET\n",
" args {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_TENSOR\n",
" args {\n",
" type_id: TFT_INT64\n",
" }\n",
" }\n",
" }\n",
" }\n",
" args {\n",
" type_id: TFT_DATASET\n",
" args {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_TENSOR\n",
" args {\n",
" type_id: TFT_INT64\n",
" }\n",
" }\n",
" }\n",
" }\n",
"}\n",
". Consider either turning off auto-sharding or switching the auto_shard_policy to DATA to shard this dataset. You can do this by creating a new `tf.data.Options()` object then setting `options.experimental_distribute.auto_shard_policy = AutoShardPolicy.DATA` before applying the options object to the dataset via `dataset.with_options(options)`.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"1719/1719 [==============================] - 36s 18ms/step - loss: 0.7568 - accuracy: 0.7579 - val_loss: 0.1375 - val_accuracy: 0.9602\n",
"1719/1719 [==============================] - 36s 18ms/step - loss: 0.7568 - accuracy: 0.7579 - val_loss: 0.1375 - val_accuracy: 0.9602\n",
"Epoch 2/10\n",
"Epoch 2/10\n",
"1719/1719 [==============================] - 29s 17ms/step - loss: 0.2274 - accuracy: 0.9339 - val_loss: 0.0958 - val_accuracy: 0.9726\n",
"1719/1719 [==============================] - 31s 18ms/step - loss: 0.2274 - accuracy: 0.9339 - val_loss: 0.0958 - val_accuracy: 0.9726\n",
"Epoch 3/10\n",
"Epoch 3/10\n",
"1719/1719 [==============================] - 30s 17ms/step - loss: 0.1644 - accuracy: 0.9526 - val_loss: 0.0760 - val_accuracy: 0.9768\n",
"1719/1719 [==============================] - 31s 18ms/step - loss: 0.1644 - accuracy: 0.9526 - val_loss: 0.0760 - val_accuracy: 0.9768\n",
"Epoch 4/10\n",
"Epoch 4/10\n",
"1719/1719 [==============================] - 30s 17ms/step - loss: 0.1326 - accuracy: 0.9613 - val_loss: 0.0631 - val_accuracy: 0.9798\n",
"1719/1719 [==============================] - 31s 18ms/step - loss: 0.1326 - accuracy: 0.9613 - val_loss: 0.0631 - val_accuracy: 0.9798\n",
"Epoch 5/10\n",
"Epoch 5/10\n",
"1719/1719 [==============================] - 30s 17ms/step - loss: 0.1122 - accuracy: 0.9670 - val_loss: 0.0586 - val_accuracy: 0.9838\n",
"1719/1719 [==============================] - 30s 17ms/step - loss: 0.1122 - accuracy: 0.9670 - val_loss: 0.0586 - val_accuracy: 0.9838\n",
"Epoch 6/10\n",
"Epoch 6/10\n",
" 443/1719 [======>.......................] - ETA: 21s - loss: 0.0998 - accuracy: 0.9720"
]
}
],
"source": [
"import subprocess\n",
"import json\n",
"import os\n",
"\n",
"# tf_config_str = os.environ.get('TF_CONFIG')\n",
"# tf_config_dict = json.loads(tf_config_str)\n",
"#\n",
"# # Convert back to string just for pretty printing\n",
"# print(json.dumps(tf_config_dict, indent=2))\n",
"\n",
"import os\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"from tensorflow import keras\n",
"import time\n",
"\n",
"gpus = tf.config.experimental.list_physical_devices('GPU')\n",
"if gpus:\n",
" try:\n",
" for gpu in gpus:\n",
" tf.config.experimental.set_memory_growth(gpu, True)\n",
" except RuntimeError as e:\n",
" print(e)\n",
"# The cluster spec is a dictionary with one key per job,\n",
"# and the values are lists of task addresses (IP:port)\n",
"cluster_spec = { \"worker\":[\"127.0.0.1:9901\",\n",
" \"127.0.0.1:9902\"]\n",
" }\n",
"\n",
"# set the TF_CONFIG environment variable before starting TensorFlow\n",
"# JSON-encoded dictionary containing a cluster specification (under the \"cluster\" key)\n",
"# and the type and index of the current task (under the \"task\" key)\n",
"for index, worker_address in enumerate( cluster_spec[\"worker\"] ):\n",
" \n",
" os.environ['CUDA_VISIBLE_DEVICES']=str(index)\n",
" os.environ[\"TF_CONFIG\"] = json.dumps( { \"cluster\":cluster_spec,\n",
" \"task\":{\"type\":\"worker\",\n",
" \"index\": index}\n",
" } )\n",
" subprocess.Popen( \"python /home/jupyter/task.py\",\n",
" shell = True)"
]
},
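{
"cell_type": "markdown",
"id": "auto-shard-policy-note",
"metadata": {},
"source": [
"The grappler warnings in the output above suggest switching the auto-shard policy to `DATA`. A minimal sketch of that change for `task.py` follows, assuming the same `X_train`/`y_train` arrays, `model` and `callbacks` defined there; the batch size of 32 is only an example."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "auto-shard-policy-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: shard the input by DATA to silence the AUTO-sharding warning\n",
"import tensorflow as tf\n",
"\n",
"options = tf.data.Options()\n",
"options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA\n",
"\n",
"train_set = (tf.data.Dataset.from_tensor_slices((X_train, y_train))\n",
"             .shuffle(10_000)\n",
"             .batch(32)\n",
"             .with_options(options))\n",
"\n",
"# model.fit(train_set, validation_data=(X_valid, y_valid), epochs=10, callbacks=callbacks)"
]
},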
{
"cell_type": "code",
"execution_count": 16,
"id": "c610ad33-a7c9-4694-a6b1-55bb3d03d488",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"cluster\": {\n",
" \"worker\": [\n",
" \"127.0.0.1:9901\",\n",
" \"127.0.0.1:9902\"\n",
" ]\n",
" },\n",
" \"task\": {\n",
" \"type\": \"worker\",\n",
" \"index\": 1\n",
" }\n",
"}\n"
]
}
],
"source": [
"import os\n",
"\n",
"tf_config_str = os.environ.get('TF_CONFIG')\n",
"tf_config_dict = json.loads(tf_config_str)\n",
"\n",
"# Convert back to string just for pretty printing\n",
"print(json.dumps(tf_config_dict, indent=2))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "97a2fa5b-aec5-4e7d-8fb2-956ef018905d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"environment": {
"kernel": "python3",
"name": "tf2-gpu.2-8.m92",
"type": "gcloud",
"uri": "gcr.io/deeplearning-platform-release/tf2-gpu.2-8:m92"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}