'''
A logistic regression example using the meta-graph checkpointing
features of TensorFlow.
Author: João Felipe Santos, based on code by Aymeric Damien
(https://github.com/aymericdamien/TensorFlow-Examples/)
'''
from __future__ import print_function
import tensorflow as tf
import numpy as np
import argparse
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
# Parameters
learning_rate = 0.01
batch_size = 100
display_step = 1
parser = argparse.ArgumentParser()
# Note: --load takes a string; any non-empty value (e.g. '--load True') counts as true
parser.add_argument('--load', default=False)
parser.add_argument('--max_epochs', type=int, default=5)
args = parser.parse_args()
load = args.load
training_epochs = args.max_epochs
# Build the graph and a Saver from scratch, or re-import both from a saved meta-graph
if not load:
    # tf Graph Input
    x = tf.placeholder(tf.float32, [None, 784], name='x') # mnist data image of shape 28*28=784
    y = tf.placeholder(tf.float32, [None, 10], name='y') # 0-9 digits recognition => 10 classes
    # Set model weights
    W = tf.Variable(tf.zeros([784, 10]), name='W')
    b = tf.Variable(tf.zeros([10]), name='b')
    # Construct model
    pred = tf.nn.softmax(tf.matmul(x, W) + b) # Softmax
    # Minimize error using cross entropy
    cost = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred), reduction_indices=1))
    # Gradient Descent
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
    init = tf.initialize_all_variables()
    saver = tf.train.Saver()
    # In order to be able to easily retrieve variables and ops later,
    # we add them to collections
    tf.add_to_collection('train_op', optimizer)
    tf.add_to_collection('cost_op', cost)
    tf.add_to_collection('input', x)
    tf.add_to_collection('target', y)
    tf.add_to_collection('pred', pred)
    initial_epoch = 0
else:
    # Find last executed epoch
    from glob import glob
    history = list(map(lambda x: int(x.split('-')[1][:-5]), glob('model.ckpt-*.meta')))
    last_epoch = np.max(history)
    # Instantiate saver object using previously saved meta-graph
    saver = tf.train.import_meta_graph('model.ckpt-{}.meta'.format(last_epoch))
    initial_epoch = last_epoch + 1
# Launch the graph
with tf.Session() as sess:
    if not load:
        sess.run(init)
    else:
        # Restore from the last checkpoint; the path must include the
        # global_step suffix that saver.save appended (e.g. model.ckpt-4)
        saver.restore(sess, 'model.ckpt-{}'.format(last_epoch))
        optimizer = tf.get_collection('train_op')[0]
        cost = tf.get_collection('cost_op')[0]
        x = tf.get_collection('input')[0]
        y = tf.get_collection('target')[0]
        pred = tf.get_collection('pred')[0]
    # Training cycle
    for epoch in range(initial_epoch, training_epochs):
        avg_cost = 0.
        total_batch = int(mnist.train.num_examples/batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_xs,
                                                          y: batch_ys})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if (epoch+1) % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))
        saver.save(sess, 'model.ckpt', global_step=epoch)
    print("Optimization Finished!")
    # Test model
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print("Accuracy:", accuracy.eval({x: mnist.test.images, y: mnist.test.labels}))
jfsantos (Owner) commented Dec 1, 2016

How to try this:

  1. Run the file without any arguments (python logistic_regression_with_checkpointing.py). It will train for 5 epochs and save a checkpoint after each epoch.
  2. Run the file again, now passing --load True --max_epochs 10. The script will detect that it has already trained for 5 epochs and run for another 5, resuming from the last checkpoint (a minimal sketch of the resume step follows below).
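
For reference, the resume step boils down to a few lines. The following is a minimal sketch (not part of the script above), assuming the model.ckpt-N checkpoints sit in the working directory; it uses tf.train.latest_checkpoint, which reads the checkpoint bookkeeping file that saver.save maintains and returns the newest prefix (e.g. ./model.ckpt-4):

    from glob import glob
    import numpy as np
    import tensorflow as tf

    # Recover the last saved epoch from the meta-graph filenames: 'model.ckpt-4.meta' -> 4
    history = [int(f.split('-')[1][:-5]) for f in glob('model.ckpt-*.meta')]
    last_epoch = int(np.max(history))

    # Rebuild the graph structure from the meta-graph, then restore the variable values
    saver = tf.train.import_meta_graph('model.ckpt-{}.meta'.format(last_epoch))
    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint('.'))
        # Ops/tensors registered with tf.add_to_collection are retrievable by name
        train_op = tf.get_collection('train_op')[0]
        x = tf.get_collection('input')[0]
        y = tf.get_collection('target')[0]
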
laventura commented Aug 30, 2017

Thanks for this. This is useful.

However, I get an error while trying to resume training after checkpointing.

Here's the stack trace. FYI - I added a few print statements to print out the last epoch, and changed the model name. Just cosmetic.
I see that the .meta files are there, and so are the index files, etc.

FWIW, this is TensorFlow 1.1.0.

Any ideas what's incorrect?

tf1 ▶ ~ ▶ Developer ❯ courses ❯ self_driving_car ❯ semantic_segment ▶ $ ▶ python logistic_regression_checkpointing.py --load True --max_epochs 10
Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
Looking for existing checkpoint files...
history: [0, 1, 2, 3, 4]
last epoch: 4
last checkpoint: ./my_model-4.meta
2017-08-30 12:03:12.359184: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.1 instructions, but these are available on your machine and could speed up CPU computations.
2017-08-30 12:03:12.359223: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
2017-08-30 12:03:12.359229: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
2017-08-30 12:03:12.359234: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
2017-08-30 12:03:12.359238: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.
2017-08-30 12:03:12.820262: E tensorflow/stream_executor/cuda/cuda_driver.cc:405] failed call to cuInit: CUDA_ERROR_NO_DEVICE
2017-08-30 12:03:12.820595: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:158] retrieving CUDA diagnostic information for host: Atuls-MacBook-Pro.local
2017-08-30 12:03:12.820608: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:165] hostname: Atuls-MacBook-Pro.local
2017-08-30 12:03:12.820820: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:189] libcuda reported version is: 310.42.25
2017-08-30 12:03:12.821011: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:193] kernel reported version is: Invalid argument: expected %d.%d or %d.%d.%d form for driver version; got ""
-- Restoring model --
2017-08-30 12:03:12.880175: W tensorflow/core/framework/op_kernel.cc:1152] Not found: Unsuccessful TensorSliceReader constructor: Failed to find any matching files for my_model
2017-08-30 12:03:12.880800: W tensorflow/core/framework/op_kernel.cc:1152] Not found: Unsuccessful TensorSliceReader constructor: Failed to find any matching files for my_model
Traceback (most recent call last):
  File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1039, in _do_call
    return fn(*args)
  File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1021, in _run_fn
    status, run_metadata)
  File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/contextlib.py", line 66, in __exit__
    next(self.gen)
  File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/framework/errors_impl.py", line 466, in raise_exception_on_not_ok_status
    pywrap_tensorflow.TF_GetCode(status))
tensorflow.python.framework.errors_impl.NotFoundError: Unsuccessful TensorSliceReader constructor: Failed to find any matching files for my_model
	 [[Node: save/RestoreV2 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_recv_save/Const_0, save/RestoreV2/tensor_names, save/RestoreV2/shape_and_slices)]]

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "logistic_regression_checkpointing.py", line 96, in <module>
    saver.restore(sess, MODEL_FILE)
  File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/training/saver.py", line 1457, in restore
    {self.saver_def.filename_tensor_name: save_path})
  File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 778, in run
    run_metadata_ptr)
  File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 982, in _run
    feed_dict_string, options, run_metadata)
  File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1032, in _do_run
    target_list, options, run_metadata)
  File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1052, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.NotFoundError: Unsuccessful TensorSliceReader constructor: Failed to find any matching files for my_model
	 [[Node: save/RestoreV2 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_recv_save/Const_0, save/RestoreV2/tensor_names, save/RestoreV2/shape_and_slices)]]

Caused by op 'save/RestoreV2', defined at:
  File "logistic_regression_checkpointing.py", line 87, in <module>
    saver = tf.train.import_meta_graph(latest_ckpt_name_base + '.meta')
  File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/training/saver.py", line 1595, in import_meta_graph
    **kwargs)
  File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/framework/meta_graph.py", line 499, in import_scoped_meta_graph
    producer_op_list=producer_op_list)
  File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/framework/importer.py", line 308, in import_graph_def
    op_def=op_def)
  File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
    self._traceback = _extract_stack()

NotFoundError (see above for traceback): Unsuccessful TensorSliceReader constructor: Failed to find any matching files for my_model
	 [[Node: save/RestoreV2 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_recv_save/Const_0, save/RestoreV2/tensor_names, save/RestoreV2/shape_and_slices)]]