Skip to content

Instantly share code, notes, and snippets.

@mtngld
Created January 4, 2018 21:19
Show Gist options
  • Save mtngld/147335874af6fe5d79a5673406679640 to your computer and use it in GitHub Desktop.
Save mtngld/147335874af6fe5d79a5673406679640 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"best_effort_tf_restore.ipynb","version":"0.3.2","views":{},"default_view":{},"provenance":[],"collapsed_sections":[]},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"metadata":{"id":"LsovcXx53CzM","colab_type":"code","colab":{"autoexec":{"startup":false,"wait_interval":0},"output_extras":[{"item_id":19},{"item_id":20}],"base_uri":"https://localhost:8080/","height":904},"outputId":"71efc213-62c6-49ee-ee43-e79bcc8bf40e","executionInfo":{"status":"ok","timestamp":1515099944921,"user_tz":-120,"elapsed":5739,"user":{"displayName":"Matan Goldman","photoUrl":"//lh5.googleusercontent.com/-y-gKlzdbJQY/AAAAAAAAAAI/AAAAAAABrkc/cq6U5NsgO44/s50-c-k-no/photo.jpg","userId":"114114462975770757175"}}},"source":["import tensorflow as tf\n","\n","# First let's train a simple DNNClassifier\n","# with 2 hidden units on MNIST\n","\n","X_FEATURE = 'x' # Name of the input feature.\n","\n","mnist = tf.contrib.learn.datasets.DATASETS['mnist']('/tmp/mnist')\n","\n","train_input_fn = tf.estimator.inputs.numpy_input_fn(\n"," x={X_FEATURE: mnist.train.images},\n"," y=mnist.train.labels.astype(np.int32),\n"," batch_size=100,\n"," num_epochs=None,\n"," shuffle=True)\n","\n","\n","feature_columns = [\n"," tf.feature_column.numeric_column(\n"," X_FEATURE, shape=mnist.train.images.shape[1:])]\n","\n","classifier = tf.estimator.DNNClassifier(\n"," feature_columns=feature_columns,\n"," hidden_units=[10, 10],\n"," n_classes=10,\n"," model_dir='models/iris')\n","\n","classifier.train(input_fn=train_input_fn, steps=2000)\n"],"cell_type":"code","execution_count":6,"outputs":[{"output_type":"stream","text":["Extracting /tmp/mnist/train-images-idx3-ubyte.gz\n","Extracting /tmp/mnist/train-labels-idx1-ubyte.gz\n","Extracting /tmp/mnist/t10k-images-idx3-ubyte.gz\n","Extracting /tmp/mnist/t10k-labels-idx1-ubyte.gz\n","INFO:tensorflow:Using default config.\n","INFO:tensorflow:Using config: {'_model_dir': 'models/iris', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f51f758f400>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}\n","INFO:tensorflow:Create CheckpointSaverHook.\n","INFO:tensorflow:Restoring parameters from models/iris/model.ckpt-200\n","INFO:tensorflow:Saving checkpoints for 201 into models/iris/model.ckpt.\n","INFO:tensorflow:loss = 74.2904, step = 201\n","INFO:tensorflow:global_step/sec: 519.051\n","INFO:tensorflow:loss = 61.6111, step = 301 (0.196 sec)\n","INFO:tensorflow:global_step/sec: 562.76\n","INFO:tensorflow:loss = 26.2714, step = 401 (0.178 sec)\n","INFO:tensorflow:global_step/sec: 550.55\n","INFO:tensorflow:loss = 36.3342, step = 501 (0.181 sec)\n","INFO:tensorflow:global_step/sec: 530.601\n","INFO:tensorflow:loss = 36.5732, step = 601 (0.189 sec)\n","INFO:tensorflow:global_step/sec: 539.529\n","INFO:tensorflow:loss = 31.4095, step = 701 (0.184 sec)\n","INFO:tensorflow:global_step/sec: 533.508\n","INFO:tensorflow:loss = 18.5787, step = 801 (0.189 sec)\n","INFO:tensorflow:global_step/sec: 533.631\n","INFO:tensorflow:loss = 36.0869, step = 901 (0.187 sec)\n","INFO:tensorflow:global_step/sec: 468.801\n","INFO:tensorflow:loss = 40.539, step = 1001 (0.214 sec)\n","INFO:tensorflow:global_step/sec: 455.037\n","INFO:tensorflow:loss = 41.9855, step = 1101 (0.216 sec)\n","INFO:tensorflow:global_step/sec: 491.151\n","INFO:tensorflow:loss = 31.788, step = 1201 (0.208 sec)\n","INFO:tensorflow:global_step/sec: 503.688\n","INFO:tensorflow:loss = 35.5958, step = 1301 (0.202 sec)\n","INFO:tensorflow:global_step/sec: 441.407\n","INFO:tensorflow:loss = 41.1752, step = 1401 (0.218 sec)\n","INFO:tensorflow:global_step/sec: 435.732\n","INFO:tensorflow:loss = 28.97, step = 1501 (0.234 sec)\n","INFO:tensorflow:global_step/sec: 459.19\n","INFO:tensorflow:loss = 39.0298, step = 1601 (0.215 sec)\n","INFO:tensorflow:global_step/sec: 471.421\n","INFO:tensorflow:loss = 41.4038, step = 1701 (0.213 sec)\n","INFO:tensorflow:global_step/sec: 448.85\n","INFO:tensorflow:loss = 38.7126, step = 1801 (0.220 sec)\n","INFO:tensorflow:global_step/sec: 429.715\n","INFO:tensorflow:loss = 39.6939, step = 1901 (0.233 sec)\n","INFO:tensorflow:global_step/sec: 485.01\n","INFO:tensorflow:loss = 22.9617, step = 2001 (0.208 sec)\n","INFO:tensorflow:global_step/sec: 423.397\n","INFO:tensorflow:loss = 16.228, step = 2101 (0.235 sec)\n","INFO:tensorflow:Saving checkpoints for 2200 into models/iris/model.ckpt.\n","INFO:tensorflow:Loss for final step: 32.4603.\n"],"name":"stdout"},{"output_type":"execute_result","data":{"text/plain":["<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7f51f7b004e0>"]},"metadata":{"tags":[]},"execution_count":6}]},{"metadata":{"id":"FcsAbq3F_piR","colab_type":"code","colab":{"autoexec":{"startup":false,"wait_interval":0},"output_extras":[{"item_id":1},{"item_id":2}],"base_uri":"https://localhost:8080/","height":4043},"outputId":"4bb37c2c-7356-43d6-dce3-9765065ce834","executionInfo":{"status":"error","timestamp":1515099996389,"user_tz":-120,"elapsed":954,"user":{"displayName":"Matan Goldman","photoUrl":"//lh5.googleusercontent.com/-y-gKlzdbJQY/AAAAAAAAAAI/AAAAAAABrkc/cq6U5NsgO44/s50-c-k-no/photo.jpg","userId":"114114462975770757175"}}},"source":["# Now lets say we want to add a new hidden unit and continue training\n","# Optimally, I would like the estimator to do a \"best-effort\" restoration, \n","# i.e restore the two first hidden units and train the last unit from scratch\n","\n","classifier = tf.estimator.DNNClassifier(\n"," feature_columns=feature_columns,\n"," hidden_units=[10, 10, 10],\n"," n_classes=10,\n"," model_dir='models/iris')\n","\n","classifier.train(input_fn=train_input_fn, steps=2000)\n","\n","# but that will result in an error, which is understandable, but I am wondering\n","# if there is any way to handle it?\n","# I find it a common scenrio where I train a net and later want to try a small\n","# change without retraining from scratch."],"cell_type":"code","execution_count":7,"outputs":[{"output_type":"stream","text":["INFO:tensorflow:Using default config.\n","INFO:tensorflow:Using config: {'_model_dir': 'models/iris', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f51f7690eb8>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}\n","INFO:tensorflow:Create CheckpointSaverHook.\n","INFO:tensorflow:Restoring parameters from models/iris/model.ckpt-2200\n"],"name":"stdout"},{"output_type":"error","ename":"NotFoundError","evalue":"ignored","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mNotFoundError\u001b[0m Traceback (most recent call last)","\u001b[0;32m/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_do_call\u001b[0;34m(self, fn, *args)\u001b[0m\n\u001b[1;32m 1322\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1323\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1324\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mOpError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_run_fn\u001b[0;34m(session, feed_dict, fetch_list, target_list, options, run_metadata)\u001b[0m\n\u001b[1;32m 1301\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfetch_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget_list\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1302\u001b[0;31m status, run_metadata)\n\u001b[0m\u001b[1;32m 1303\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/errors_impl.py\u001b[0m in \u001b[0;36m__exit__\u001b[0;34m(self, type_arg, value_arg, traceback_arg)\u001b[0m\n\u001b[1;32m 472\u001b[0m \u001b[0mcompat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mc_api\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTF_Message\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstatus\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstatus\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 473\u001b[0;31m c_api.TF_GetCode(self.status.status))\n\u001b[0m\u001b[1;32m 474\u001b[0m \u001b[0;31m# Delete the underlying status object from memory otherwise it stays alive\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mNotFoundError\u001b[0m: Key dnn/hiddenlayer_2/kernel/t_0/Adagrad not found in checkpoint\n\t [[Node: save/RestoreV2_11 = RestoreV2[dtypes=[DT_FLOAT], _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"](_arg_save/Const_0_0, save/RestoreV2_11/tensor_names, save/RestoreV2_11/shape_and_slices)]]","\nDuring handling of the above exception, another exception occurred:\n","\u001b[0;31mNotFoundError\u001b[0m Traceback (most recent call last)","\u001b[0;32m<ipython-input-7-288d85aa455b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 5\u001b[0m model_dir='models/iris')\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mclassifier\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_fn\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrain_input_fn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msteps\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m","\u001b[0;32m/usr/local/lib/python3.6/dist-packages/tensorflow/python/estimator/estimator.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, input_fn, hooks, steps, max_steps, saving_listeners)\u001b[0m\n\u001b[1;32m 300\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 301\u001b[0m \u001b[0msaving_listeners\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_check_listeners_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msaving_listeners\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 302\u001b[0;31m \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_train_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_fn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msaving_listeners\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 303\u001b[0m \u001b[0mlogging\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Loss for final step: %s.'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 304\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.6/dist-packages/tensorflow/python/estimator/estimator.py\u001b[0m in \u001b[0;36m_train_model\u001b[0;34m(self, input_fn, hooks, saving_listeners)\u001b[0m\n\u001b[1;32m 778\u001b[0m \u001b[0msave_summaries_steps\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_config\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_summary_steps\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 779\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_session_config\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 780\u001b[0;31m log_step_count_steps=self._config.log_step_count_steps) as mon_sess:\n\u001b[0m\u001b[1;32m 781\u001b[0m \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 782\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mmon_sess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshould_stop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py\u001b[0m in \u001b[0;36mMonitoredTrainingSession\u001b[0;34m(master, is_chief, checkpoint_dir, scaffold, hooks, chief_only_hooks, save_checkpoint_secs, save_summaries_steps, save_summaries_secs, config, stop_grace_period_secs, log_step_count_steps)\u001b[0m\n\u001b[1;32m 366\u001b[0m \u001b[0mall_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhooks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 367\u001b[0m return MonitoredSession(session_creator=session_creator, hooks=all_hooks,\n\u001b[0;32m--> 368\u001b[0;31m stop_grace_period_secs=stop_grace_period_secs)\n\u001b[0m\u001b[1;32m 369\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 370\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, session_creator, hooks, stop_grace_period_secs)\u001b[0m\n\u001b[1;32m 671\u001b[0m super(MonitoredSession, self).__init__(\n\u001b[1;32m 672\u001b[0m \u001b[0msession_creator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshould_recover\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 673\u001b[0;31m stop_grace_period_secs=stop_grace_period_secs)\n\u001b[0m\u001b[1;32m 674\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 675\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, session_creator, hooks, should_recover, stop_grace_period_secs)\u001b[0m\n\u001b[1;32m 491\u001b[0m stop_grace_period_secs=stop_grace_period_secs)\n\u001b[1;32m 492\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mshould_recover\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 493\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sess\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_RecoverableSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_coordinated_creator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 494\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 495\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sess\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_coordinated_creator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate_session\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, sess_creator)\u001b[0m\n\u001b[1;32m 849\u001b[0m \"\"\"\n\u001b[1;32m 850\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sess_creator\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msess_creator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 851\u001b[0;31m \u001b[0m_WrappedSession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_create_session\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 852\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 853\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_create_session\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py\u001b[0m in \u001b[0;36m_create_session\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 854\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 855\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 856\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sess_creator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate_session\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 857\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0m_PREEMPTION_ERRORS\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 858\u001b[0m logging.info('An error was raised while a session was being created. '\n","\u001b[0;32m/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py\u001b[0m in \u001b[0;36mcreate_session\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 552\u001b[0m \u001b[0;34m\"\"\"Creates a coordinated session.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 553\u001b[0m \u001b[0;31m# Keep the tf_sess for unit testing.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 554\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtf_sess\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_session_creator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate_session\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 555\u001b[0m \u001b[0;31m# We don't want coordinator to suppress any exception.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 556\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcoord\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcoordinator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mCoordinator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclean_stop_exception_types\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py\u001b[0m in \u001b[0;36mcreate_session\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 426\u001b[0m \u001b[0minit_op\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_scaffold\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minit_op\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 427\u001b[0m \u001b[0minit_feed_dict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_scaffold\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minit_feed_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 428\u001b[0;31m init_fn=self._scaffold.init_fn)\n\u001b[0m\u001b[1;32m 429\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 430\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/session_manager.py\u001b[0m in \u001b[0;36mprepare_session\u001b[0;34m(self, master, init_op, saver, checkpoint_dir, checkpoint_filename_with_path, wait_for_checkpoint, max_wait_secs, config, init_feed_dict, init_fn)\u001b[0m\n\u001b[1;32m 271\u001b[0m \u001b[0mwait_for_checkpoint\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mwait_for_checkpoint\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 272\u001b[0m \u001b[0mmax_wait_secs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmax_wait_secs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 273\u001b[0;31m config=config)\n\u001b[0m\u001b[1;32m 274\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mis_loaded_from_checkpoint\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 275\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minit_op\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0minit_fn\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_local_init_op\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/session_manager.py\u001b[0m in \u001b[0;36m_restore_checkpoint\u001b[0;34m(self, master, saver, checkpoint_dir, checkpoint_filename_with_path, wait_for_checkpoint, max_wait_secs, config)\u001b[0m\n\u001b[1;32m 203\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 204\u001b[0m \u001b[0;31m# Loads the checkpoint.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 205\u001b[0;31m \u001b[0msaver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrestore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msess\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mckpt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel_checkpoint_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 206\u001b[0m \u001b[0msaver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecover_last_checkpoints\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mckpt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mall_model_checkpoint_paths\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msess\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py\u001b[0m in \u001b[0;36mrestore\u001b[0;34m(self, sess, save_path)\u001b[0m\n\u001b[1;32m 1664\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcontext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0min_graph_mode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1665\u001b[0m sess.run(self.saver_def.restore_op_name,\n\u001b[0;32m-> 1666\u001b[0;31m {self.saver_def.filename_tensor_name: save_path})\n\u001b[0m\u001b[1;32m 1667\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1668\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_build_eager\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msave_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuild_save\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuild_restore\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 887\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 888\u001b[0m result = self._run(None, fetches, feed_dict, options_ptr,\n\u001b[0;32m--> 889\u001b[0;31m run_metadata_ptr)\n\u001b[0m\u001b[1;32m 890\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrun_metadata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 891\u001b[0m \u001b[0mproto_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTF_GetBuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_metadata_ptr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_run\u001b[0;34m(self, handle, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 1118\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfinal_fetches\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mfinal_targets\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mhandle\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mfeed_dict_tensor\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1119\u001b[0m results = self._do_run(handle, final_targets, final_fetches,\n\u001b[0;32m-> 1120\u001b[0;31m feed_dict_tensor, options, run_metadata)\n\u001b[0m\u001b[1;32m 1121\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1122\u001b[0m \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_do_run\u001b[0;34m(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 1315\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhandle\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1316\u001b[0m return self._do_call(_run_fn, self._session, feeds, fetches, targets,\n\u001b[0;32m-> 1317\u001b[0;31m options, run_metadata)\n\u001b[0m\u001b[1;32m 1318\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1319\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_do_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_prun_fn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_session\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeeds\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfetches\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_do_call\u001b[0;34m(self, fn, *args)\u001b[0m\n\u001b[1;32m 1334\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1335\u001b[0m \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1336\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnode_def\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1337\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1338\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_extend_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mNotFoundError\u001b[0m: Key dnn/hiddenlayer_2/kernel/t_0/Adagrad not found in checkpoint\n\t [[Node: save/RestoreV2_11 = RestoreV2[dtypes=[DT_FLOAT], _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"](_arg_save/Const_0_0, save/RestoreV2_11/tensor_names, save/RestoreV2_11/shape_and_slices)]]\n\nCaused by op 'save/RestoreV2_11', defined at:\n File \"/usr/lib/python3.6/runpy.py\", line 193, in _run_module_as_main\n \"__main__\", mod_spec)\n File \"/usr/lib/python3.6/runpy.py\", line 85, in _run_code\n exec(code, run_globals)\n File \"/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py\", line 16, in <module>\n app.launch_new_instance()\n File \"/usr/local/lib/python3.6/dist-packages/traitlets/config/application.py\", line 658, in launch_instance\n app.start()\n File \"/usr/local/lib/python3.6/dist-packages/ipykernel/kernelapp.py\", line 477, in start\n ioloop.IOLoop.instance().start()\n File \"/usr/local/lib/python3.6/dist-packages/zmq/eventloop/ioloop.py\", line 177, in start\n super(ZMQIOLoop, self).start()\n File \"/usr/local/lib/python3.6/dist-packages/tornado/ioloop.py\", line 887, in start\n handler_func(fd_obj, events)\n File \"/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py\", line 275, in null_wrapper\n return fn(*args, **kwargs)\n File \"/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py\", line 440, in _handle_events\n self._handle_recv()\n File \"/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py\", line 472, in _handle_recv\n self._run_callback(callback, msg)\n File \"/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py\", line 414, in _run_callback\n callback(*args, **kwargs)\n File \"/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py\", line 275, in null_wrapper\n return fn(*args, **kwargs)\n File \"/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\", line 283, in dispatcher\n return self.dispatch_shell(stream, msg)\n File \"/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\", line 235, in dispatch_shell\n handler(stream, idents, msg)\n File \"/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\", line 399, in execute_request\n user_expressions, allow_stdin)\n File \"/usr/local/lib/python3.6/dist-packages/ipykernel/ipkernel.py\", line 196, in do_execute\n res = shell.run_cell(code, store_history=store_history, silent=silent)\n File \"/usr/local/lib/python3.6/dist-packages/ipykernel/zmqshell.py\", line 533, in run_cell\n return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)\n File \"/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py\", line 2718, in run_cell\n interactivity=interactivity, compiler=compiler, result=result)\n File \"/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py\", line 2828, in run_ast_nodes\n if self.run_code(code, result):\n File \"/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py\", line 2882, in run_code\n exec(code_obj, self.user_global_ns, self.user_ns)\n File \"<ipython-input-7-288d85aa455b>\", line 7, in <module>\n classifier.train(input_fn=train_input_fn, steps=2000)\n File \"/usr/local/lib/python3.6/dist-packages/tensorflow/python/estimator/estimator.py\", line 302, in train\n loss = self._train_model(input_fn, hooks, saving_listeners)\n File \"/usr/local/lib/python3.6/dist-packages/tensorflow/python/estimator/estimator.py\", line 780, in _train_model\n log_step_count_steps=self._config.log_step_count_steps) as mon_sess:\n File \"/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py\", line 368, in MonitoredTrainingSession\n stop_grace_period_secs=stop_grace_period_secs)\n File \"/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py\", line 673, in __init__\n stop_grace_period_secs=stop_grace_period_secs)\n File \"/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py\", line 493, in __init__\n self._sess = _RecoverableSession(self._coordinated_creator)\n File \"/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py\", line 851, in __init__\n _WrappedSession.__init__(self, self._create_session())\n File \"/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py\", line 856, in _create_session\n return self._sess_creator.create_session()\n File \"/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py\", line 554, in create_session\n self.tf_sess = self._session_creator.create_session()\n File \"/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py\", line 419, in create_session\n self._scaffold.finalize()\n File \"/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py\", line 212, in finalize\n self._saver.build()\n File \"/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py\", line 1227, in build\n self._build(self._filename, build_save=True, build_restore=True)\n File \"/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py\", line 1263, in _build\n build_save=build_save, build_restore=build_restore)\n File \"/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py\", line 745, in _build_internal\n restore_sequentially, reshape)\n File \"/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py\", line 470, in _AddShardedRestoreOps\n name=\"restore_shard\"))\n File \"/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py\", line 427, in _AddRestoreOps\n tensors = self.restore_op(filename_tensor, saveable, preferred_shard)\n File \"/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py\", line 267, in restore_op\n [spec.tensor.dtype])[0])\n File \"/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gen_io_ops.py\", line 1021, in restore_v2\n shape_and_slices=shape_and_slices, dtypes=dtypes, name=name)\n File \"/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py\", line 787, in _apply_op_helper\n op_def=op_def)\n File \"/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py\", line 2956, in create_op\n op_def=op_def)\n File \"/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py\", line 1470, in __init__\n self._traceback = self._graph._extract_stack() # pylint: disable=protected-access\n\nNotFoundError (see above for traceback): Key dnn/hiddenlayer_2/kernel/t_0/Adagrad not found in checkpoint\n\t [[Node: save/RestoreV2_11 = RestoreV2[dtypes=[DT_FLOAT], _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"](_arg_save/Const_0_0, save/RestoreV2_11/tensor_names, save/RestoreV2_11/shape_and_slices)]]\n"]}]},{"metadata":{"id":"OTwTg6Ku3Ofn","colab_type":"code","colab":{"autoexec":{"startup":false,"wait_interval":0}}},"source":[""],"cell_type":"code","execution_count":0,"outputs":[]}]}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment