
@mohanr
Last active May 26, 2022 05:07
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "9229d8fd-f0a9-4d66-b7c0-8700a7076202",
"metadata": {},
"outputs": [],
"source": [
"%%writefile task.py\n",
"\n",
"import os\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"from tensorflow import keras\n",
"import time\n",
"\n",
"# At the begining of the program\n",
"distribution = tf.distribute.MultiWorkerMirroredStrategy()\n",
"\n",
"resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()\n",
"print(\"Starting task {}{}\".format(resolver.task_type, resolver.task_id))\n",
"\n",
"# Only worker #0 will write checkpoints and log to TensorBoard\n",
"if resolver.task_id == 0:\n",
" root_logdir = os.path.join(os.curdir, # os.getcwd(), #os.curdir,\n",
" \"task_logs/\",\n",
" \"psworker_logs\"\n",
" )\n",
" run_id = time.strftime(\"run_%Y_%m_%d-%H_%M_%S\")\n",
" run_dir = os.path.join(root_logdir, run_id)\n",
" callbacks = [keras.callbacks.TensorBoard(run_dir),\n",
" keras.callbacks.ModelCheckpoint(\"psworker_model.h5\",\n",
" save_best_only=True\n",
" ),\n",
" ]\n",
"else:\n",
" callbacks = []\n",
"\n",
"# load and prepare the MNIST dataset\n",
"(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.mnist.load_data()\n",
"X_train_full = X_train_full[..., np.newaxis] / 255.\n",
"X_valid, X_train = X_train_full[:5000], X_train_full[5000:]\n",
"y_valid, y_train = y_train_full[:5000], y_train_full[5000:]\n",
"\n",
"with distribution.scope():\n",
" model = keras.models.Sequential([\n",
" keras.layers.Conv2D(filters=64, kernel_size=7, activation=\"relu\",\n",
" padding=\"same\", input_shape=[28, 28, 1]\n",
" ), # (None, 28, 28, 64)\n",
" keras.layers.MaxPooling2D(pool_size=2), # (None, 14, 14, 64)\n",
"\n",
" keras.layers.Conv2D(filters=128, kernel_size=3, activation=\"relu\",\n",
" padding=\"same\"\n",
" ), # (None, 14, 14, 128)\n",
" keras.layers.Conv2D(filters=128, kernel_size=3, activation=\"relu\",\n",
" padding=\"same\"\n",
" ),\n",
" keras.layers.MaxPooling2D(pool_size=2), # (None, 7, 7, 128)\n",
"\n",
" keras.layers.Flatten(), # (None, 6272)\n",
" keras.layers.Dense(units=64, activation='relu'), # (None, 64)\n",
" keras.layers.Dropout(0.5),\n",
" keras.layers.Dense(units=10, activation=\"softmax\"),\n",
" # (None, 10)\n",
" ])\n",
" model.compile(loss=\"sparse_categorical_crossentropy\",\n",
" optimizer=keras.optimizers.SGD(learning_rate=1e-2),\n",
" metrics=[\"accuracy\"]\n",
" )\n",
"\n",
"model.fit(X_train, y_train, validation_data=(X_valid, y_valid),\n",
" epochs=10, callbacks=callbacks\n",
" )\n"
]
},
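{
"cell_type": "markdown",
"id": "evaluate-checkpoint-note",
"metadata": {},
"source": [
"The test split loaded in `task.py` is never used during training. A minimal sketch of evaluating the checkpoint written by worker #0 is shown in the next cell, assuming training has finished and `psworker_model.h5` exists in the current directory."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "evaluate-checkpoint-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: evaluate the checkpoint written by worker #0 (assumes psworker_model.h5 exists)\n",
"import numpy as np\n",
"from tensorflow import keras\n",
"\n",
"# Reload the held-out MNIST test split and apply the same preprocessing as task.py\n",
"(_, _), (X_test, y_test) = keras.datasets.mnist.load_data()\n",
"X_test = X_test[..., np.newaxis] / 255.\n",
"\n",
"restored_model = keras.models.load_model(\"psworker_model.h5\")\n",
"test_loss, test_acc = restored_model.evaluate(X_test, y_test)\n",
"print(\"test loss:\", test_loss, \"test accuracy:\", test_acc)"
]
},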
{
"cell_type": "code",
"execution_count": 13,
"id": "8cd842e9-4755-4c75-acd7-f88dfb9f81df",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-05-25 07:44:47.504469: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.504765: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.515766: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.515872: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.517581: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.517654: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.519971: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2022-05-25 07:44:47.520006: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2022-05-25 07:44:47.520607: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.520636: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.522471: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.522487: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.524314: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:47.524329: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.172215: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.173450: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.174346: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.175074: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13823 MB memory: -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5\n",
"2022-05-25 07:44:48.176202: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.177006: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.177751: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.178593: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.179372: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.180067: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:worker/replica:0/task:0/device:GPU:0 with 13823 MB memory: -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5\n",
"2022-05-25 07:44:48.184702: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.185967: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.186076: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> 127.0.0.1:9901, 1 -> 127.0.0.1:9902}\n",
"2022-05-25 07:44:48.186390: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:437] Started server with target: grpc://127.0.0.1:9901\n",
"2022-05-25 07:44:48.187072: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.187988: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13823 MB memory: -> device: 0, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5\n",
"2022-05-25 07:44:48.190367: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.191235: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.192153: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.192898: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.193639: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-05-25 07:44:48.194373: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:worker/replica:0/task:1/device:GPU:0 with 13823 MB memory: -> device: 0, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5\n",
"2022-05-25 07:44:48.199645: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> 127.0.0.1:9901, 1 -> 127.0.0.1:9902}\n",
"2022-05-25 07:44:48.200143: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:437] Started server with target: grpc://127.0.0.1:9902\n",
"2022-05-25 07:44:49.314752: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Did not find a shardable source, walked to a node which is not a dataset: name: \"FlatMapDataset/_9\"\n",
"op: \"FlatMapDataset\"\n",
"input: \"PrefetchDataset/_8\"\n",
"attr {\n",
" key: \"Targuments\"\n",
" value {\n",
" list {\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"_cardinality\"\n",
" value {\n",
" i: -2\n",
" }\n",
"}\n",
"attr {\n",
" key: \"f\"\n",
" value {\n",
" func {\n",
" name: \"__inference_Dataset_flat_map_slice_batch_indices_308\"\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"metadata\"\n",
" value {\n",
" s: \"\\n\\020FlatMapDataset:4\"\n",
" }\n",
"}\n",
"attr {\n",
" key: \"output_shapes\"\n",
" value {\n",
" list {\n",
" shape {\n",
" dim {\n",
" size: -1\n",
" }\n",
" }\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"output_types\"\n",
" value {\n",
" list {\n",
" type: DT_INT64\n",
" }\n",
" }\n",
"}\n",
"experimental_type {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_DATASET\n",
" args {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_TENSOR\n",
" args {\n",
" type_id: TFT_INT64\n",
" }\n",
" }\n",
" }\n",
" }\n",
" args {\n",
" type_id: TFT_DATASET\n",
" args {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_TENSOR\n",
" args {\n",
" type_id: TFT_INT64\n",
" }\n",
" }\n",
" }\n",
" }\n",
"}\n",
". Consider either turning off auto-sharding or switching the auto_shard_policy to DATA to shard this dataset. You can do this by creating a new `tf.data.Options()` object then setting `options.experimental_distribute.auto_shard_policy = AutoShardPolicy.DATA` before applying the options object to the dataset via `dataset.with_options(options)`.\n",
"2022-05-25 07:44:49.316088: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Did not find a shardable source, walked to a node which is not a dataset: name: \"FlatMapDataset/_9\"\n",
"op: \"FlatMapDataset\"\n",
"input: \"PrefetchDataset/_8\"\n",
"attr {\n",
" key: \"Targuments\"\n",
" value {\n",
" list {\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"_cardinality\"\n",
" value {\n",
" i: -2\n",
" }\n",
"}\n",
"attr {\n",
" key: \"f\"\n",
" value {\n",
" func {\n",
" name: \"__inference_Dataset_flat_map_slice_batch_indices_292\"\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"metadata\"\n",
" value {\n",
" s: \"\\n\\020FlatMapDataset:4\"\n",
" }\n",
"}\n",
"attr {\n",
" key: \"output_shapes\"\n",
" value {\n",
" list {\n",
" shape {\n",
" dim {\n",
" size: -1\n",
" }\n",
" }\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"output_types\"\n",
" value {\n",
" list {\n",
" type: DT_INT64\n",
" }\n",
" }\n",
"}\n",
"experimental_type {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_DATASET\n",
" args {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_TENSOR\n",
" args {\n",
" type_id: TFT_INT64\n",
" }\n",
" }\n",
" }\n",
" }\n",
" args {\n",
" type_id: TFT_DATASET\n",
" args {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_TENSOR\n",
" args {\n",
" type_id: TFT_INT64\n",
" }\n",
" }\n",
" }\n",
" }\n",
"}\n",
". Consider either turning off auto-sharding or switching the auto_shard_policy to DATA to shard this dataset. You can do this by creating a new `tf.data.Options()` object then setting `options.experimental_distribute.auto_shard_policy = AutoShardPolicy.DATA` before applying the options object to the dataset via `dataset.with_options(options)`.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting task worker1\n",
"Epoch 1/10\n",
"Starting task worker0\n",
"Epoch 1/10\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-05-25 07:44:52.699962: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8200\n",
"2022-05-25 07:44:52.706704: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8200\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"1719/1719 [==============================] - ETA: 0s - loss: 0.7568 - accuracy: 0.7579"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-05-25 07:45:23.092390: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Did not find a shardable source, walked to a node which is not a dataset: name: \"FlatMapDataset/_9\"\n",
"op: \"FlatMapDataset\"\n",
"input: \"PrefetchDataset/_8\"\n",
"attr {\n",
" key: \"Targuments\"\n",
" value {\n",
" list {\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"_cardinality\"\n",
" value {\n",
" i: -2\n",
" }\n",
"}\n",
"attr {\n",
" key: \"f\"\n",
" value {\n",
" func {\n",
" name: \"__inference_Dataset_flat_map_slice_batch_indices_20246\"\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"metadata\"\n",
" value {\n",
" s: \"\\n\\021FlatMapDataset:31\"\n",
" }\n",
"}\n",
"attr {\n",
" key: \"output_shapes\"\n",
" value {\n",
" list {\n",
" shape {\n",
" dim {\n",
" size: -1\n",
" }\n",
" }\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"output_types\"\n",
" value {\n",
" list {\n",
" type: DT_INT64\n",
" }\n",
" }\n",
"}\n",
"experimental_type {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_DATASET\n",
" args {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_TENSOR\n",
" args {\n",
" type_id: TFT_INT64\n",
" }\n",
" }\n",
" }\n",
" }\n",
" args {\n",
" type_id: TFT_DATASET\n",
" args {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_TENSOR\n",
" args {\n",
" type_id: TFT_INT64\n",
" }\n",
" }\n",
" }\n",
" }\n",
"}\n",
". Consider either turning off auto-sharding or switching the auto_shard_policy to DATA to shard this dataset. You can do this by creating a new `tf.data.Options()` object then setting `options.experimental_distribute.auto_shard_policy = AutoShardPolicy.DATA` before applying the options object to the dataset via `dataset.with_options(options)`.\n",
"2022-05-25 07:45:23.114419: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Did not find a shardable source, walked to a node which is not a dataset: name: \"FlatMapDataset/_9\"\n",
"op: \"FlatMapDataset\"\n",
"input: \"PrefetchDataset/_8\"\n",
"attr {\n",
" key: \"Targuments\"\n",
" value {\n",
" list {\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"_cardinality\"\n",
" value {\n",
" i: -2\n",
" }\n",
"}\n",
"attr {\n",
" key: \"f\"\n",
" value {\n",
" func {\n",
" name: \"__inference_Dataset_flat_map_slice_batch_indices_20288\"\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"metadata\"\n",
" value {\n",
" s: \"\\n\\021FlatMapDataset:31\"\n",
" }\n",
"}\n",
"attr {\n",
" key: \"output_shapes\"\n",
" value {\n",
" list {\n",
" shape {\n",
" dim {\n",
" size: -1\n",
" }\n",
" }\n",
" }\n",
" }\n",
"}\n",
"attr {\n",
" key: \"output_types\"\n",
" value {\n",
" list {\n",
" type: DT_INT64\n",
" }\n",
" }\n",
"}\n",
"experimental_type {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_DATASET\n",
" args {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_TENSOR\n",
" args {\n",
" type_id: TFT_INT64\n",
" }\n",
" }\n",
" }\n",
" }\n",
" args {\n",
" type_id: TFT_DATASET\n",
" args {\n",
" type_id: TFT_PRODUCT\n",
" args {\n",
" type_id: TFT_TENSOR\n",
" args {\n",
" type_id: TFT_INT64\n",
" }\n",
" }\n",
" }\n",
" }\n",
"}\n",
". Consider either turning off auto-sharding or switching the auto_shard_policy to DATA to shard this dataset. You can do this by creating a new `tf.data.Options()` object then setting `options.experimental_distribute.auto_shard_policy = AutoShardPolicy.DATA` before applying the options object to the dataset via `dataset.with_options(options)`.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"1719/1719 [==============================] - 36s 18ms/step - loss: 0.7568 - accuracy: 0.7579 - val_loss: 0.1375 - val_accuracy: 0.9602\n",
"1719/1719 [==============================] - 36s 18ms/step - loss: 0.7568 - accuracy: 0.7579 - val_loss: 0.1375 - val_accuracy: 0.9602\n",
"Epoch 2/10\n",
"Epoch 2/10\n",
"1719/1719 [==============================] - 29s 17ms/step - loss: 0.2274 - accuracy: 0.9339 - val_loss: 0.0958 - val_accuracy: 0.9726\n",
"1719/1719 [==============================] - 31s 18ms/step - loss: 0.2274 - accuracy: 0.9339 - val_loss: 0.0958 - val_accuracy: 0.9726\n",
"Epoch 3/10\n",
"Epoch 3/10\n",
"1719/1719 [==============================] - 30s 17ms/step - loss: 0.1644 - accuracy: 0.9526 - val_loss: 0.0760 - val_accuracy: 0.9768\n",
"1719/1719 [==============================] - 31s 18ms/step - loss: 0.1644 - accuracy: 0.9526 - val_loss: 0.0760 - val_accuracy: 0.9768\n",
"Epoch 4/10\n",
"Epoch 4/10\n",
"1719/1719 [==============================] - 30s 17ms/step - loss: 0.1326 - accuracy: 0.9613 - val_loss: 0.0631 - val_accuracy: 0.9798\n",
"1719/1719 [==============================] - 31s 18ms/step - loss: 0.1326 - accuracy: 0.9613 - val_loss: 0.0631 - val_accuracy: 0.9798\n",
"Epoch 5/10\n",
"Epoch 5/10\n",
"1719/1719 [==============================] - 30s 17ms/step - loss: 0.1122 - accuracy: 0.9670 - val_loss: 0.0586 - val_accuracy: 0.9838\n",
"1719/1719 [==============================] - 30s 17ms/step - loss: 0.1122 - accuracy: 0.9670 - val_loss: 0.0586 - val_accuracy: 0.9838\n",
"Epoch 6/10\n",
"Epoch 6/10\n",
" 443/1719 [======>.......................] - ETA: 21s - loss: 0.0998 - accuracy: 0.9720"
]
}
],
"source": [
"import subprocess\n",
"import json\n",
"import os\n",
"\n",
"# tf_config_str = os.environ.get('TF_CONFIG')\n",
"# tf_config_dict = json.loads(tf_config_str)\n",
"#\n",
"# # Convert back to string just for pretty printing\n",
"# print(json.dumps(tf_config_dict, indent=2))\n",
"\n",
"import os\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"from tensorflow import keras\n",
"import time\n",
"\n",
"gpus = tf.config.experimental.list_physical_devices('GPU')\n",
"if gpus:\n",
" try:\n",
" for gpu in gpus:\n",
" tf.config.experimental.set_memory_growth(gpu, True)\n",
" except RuntimeError as e:\n",
" print(e)\n",
"# The cluster spec is a dictionary with one key per job,\n",
"# and the values are lists of task addresses (IP:port)\n",
"cluster_spec = { \"worker\":[\"127.0.0.1:9901\",\n",
" \"127.0.0.1:9902\"]\n",
" }\n",
"\n",
"# set the TF_CONFIG environment variable before starting TensorFlow\n",
"# JSON-encoded dictionary containing a cluster specification (under the \"cluster\" key)\n",
"# and the type and index of the current task (under the \"task\" key)\n",
"for index, worker_address in enumerate( cluster_spec[\"worker\"] ):\n",
" \n",
" os.environ['CUDA_VISIBLE_DEVICES']=str(index)\n",
" os.environ[\"TF_CONFIG\"] = json.dumps( { \"cluster\":cluster_spec,\n",
" \"task\":{\"type\":\"worker\",\n",
" \"index\": index}\n",
" } )\n",
" subprocess.Popen( \"python /home/jupyter/task.py\",\n",
" shell = True)"
]
},
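{
"cell_type": "markdown",
"id": "auto-shard-policy-note",
"metadata": {},
"source": [
"The grappler warnings in the output above suggest switching the auto-shard policy to `DATA`. A minimal sketch of that change for `task.py` follows, assuming the same `X_train`/`y_train` arrays, `model` and `callbacks` defined there; the batch size of 32 is only an example."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "auto-shard-policy-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: shard the input by DATA to silence the AUTO-sharding warning\n",
"import tensorflow as tf\n",
"\n",
"options = tf.data.Options()\n",
"options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA\n",
"\n",
"train_set = (tf.data.Dataset.from_tensor_slices((X_train, y_train))\n",
"             .shuffle(10_000)\n",
"             .batch(32)\n",
"             .with_options(options))\n",
"\n",
"# model.fit(train_set, validation_data=(X_valid, y_valid), epochs=10, callbacks=callbacks)"
]
},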
{
"cell_type": "code",
"execution_count": 16,
"id": "c610ad33-a7c9-4694-a6b1-55bb3d03d488",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"cluster\": {\n",
" \"worker\": [\n",
" \"127.0.0.1:9901\",\n",
" \"127.0.0.1:9902\"\n",
" ]\n",
" },\n",
" \"task\": {\n",
" \"type\": \"worker\",\n",
" \"index\": 1\n",
" }\n",
"}\n"
]
}
],
"source": [
"import os\n",
"\n",
"tf_config_str = os.environ.get('TF_CONFIG')\n",
"tf_config_dict = json.loads(tf_config_str)\n",
"\n",
"# Convert back to string just for pretty printing\n",
"print(json.dumps(tf_config_dict, indent=2))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "97a2fa5b-aec5-4e7d-8fb2-956ef018905d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"environment": {
"kernel": "python3",
"name": "tf2-gpu.2-8.m92",
"type": "gcloud",
"uri": "gcr.io/deeplearning-platform-release/tf2-gpu.2-8:m92"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}