Last active
December 19, 2017 20:59
-
-
Save OluwoleOyetoke/174182499e4fbb6e3006adc6c8885880 to your computer and use it in GitHub Desktop.
Pop Examples From TFRecord In TensorFlow
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
@date 17th December, 2017 | |
@Language: Python | |
@author: Oluwole Oyetoke | |
@email: oluwoleoyetoke@gmail.com | |
This script helps to get selected features out of a TF Record and return as an np array | |
Features extracted here: (Image, Labels) | |
#IMPORTS | |
""" | |
from __future__ import absolute_import | |
from __future__ import division | |
from __future__ import print_function | |
import math | |
import numpy as np | |
import tensorflow as tf | |
import os | |
#INT64 wrappers | |
def _int64_feature(value):
    """Wrap a single integer as a tf.train.Feature holding an Int64List."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)
def _bytes_feature(value):
    """Wrap a single bytes value as a tf.train.Feature holding a BytesList."""
    byte_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=byte_list)
#FUNCTION TO GET & PROCESS ALL DATASET DATA | |
def _process_multiple_images(filenames, perform_shuffle=False, repeat_count=1,
                             batch_size=32, available_record=310, num_of_epochs=1):
    """Stream every example out of TFRecord file(s) into two numpy arrays.

    Args:
        filenames: path, or list of paths, to the TFRecord file(s).
        perform_shuffle: if True, shuffle the dataset before batching.
        repeat_count: number of times the dataset is repeated.
        batch_size: number of examples pulled per iterator step.
        available_record: total number of examples in the dataset
            (used only to report how many rounds are needed).
        num_of_epochs: number of passes made over the dataset.

    Returns:
        (complete_labels, complete_images) as np.float32 arrays covering
        every example streamed out of the TFRecord(s).
    """
    def _process_one_image(serialized):
        # Feature keys must match the keys used when the TFRecord was
        # written (e.g. 'image/shape').
        features = {'image/shape': tf.FixedLenFeature([], tf.string),
                    'image/class/label': tf.FixedLenFeature([], tf.int64),
                    'image/class/text': tf.FixedLenFeature([], tf.string),
                    'image/filename': tf.FixedLenFeature([], tf.string),
                    'image/encoded': tf.FixedLenFeature([], tf.string)}
        parsed_example = tf.parse_single_example(serialized, features=features)
        # Decode raw bytes, restore the stored shape, and cast to float32.
        image_raw = tf.decode_raw(parsed_example['image/encoded'], tf.uint8)
        shape = tf.decode_raw(parsed_example['image/shape'], tf.int32)
        label = tf.cast(parsed_example['image/class/label'], dtype=tf.int32)
        reshaped_img = tf.reshape(image_raw, shape)
        casted_img = tf.cast(reshaped_img, tf.float32)
        return [label], [casted_img]

    complete_labels = np.array([])
    complete_images = np.array([])

    dataset = tf.data.TFRecordDataset(filenames=filenames)  # Connect to the TF Record
    dataset = dataset.map(_process_one_image)  # Extract/transform features per example
    if perform_shuffle:
        # FIX: the original accepted perform_shuffle but never applied it.
        dataset = dataset.shuffle(buffer_size=available_record)
    dataset = dataset.repeat(repeat_count)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_initializable_iterator()
    labels_tensor, images_tensor = iterator.get_next()  # Get batch data

    no_of_rounds = int(math.ceil(available_record / batch_size)) + 1
    print("At this batch size of %i, aproximately %i round(s) is needed to stream through the entire dataset" % (batch_size, no_of_rounds))

    count = 1
    # FIX: 'with' guarantees the session is closed even if evaluation raises
    # (the original's sess.close() was skipped on any unexpected error).
    with tf.Session() as sess:
        for _ in range(num_of_epochs):
            sess.run(iterator.initializer)  # Re-initialize the iterator each epoch
            while True:
                try:
                    print("Now evaluating tensors for round number %i out of %i" % (count, no_of_rounds))
                    evaluated_label, evaluated_image = sess.run([labels_tensor, images_tensor])
                    # Convert evaluated tensors to np arrays
                    label_np_array = np.asarray(evaluated_label, dtype=np.float32)
                    image_np_array = np.asarray(evaluated_image, dtype=np.float32)
                    # Squeeze extra singleton dimensions introduced by batching
                    squeezed_label_np_array = label_np_array.squeeze()
                    squeezed_image_np_array = image_np_array.squeeze()
                    # Accumulate this batch into the running totals
                    if count == 1:
                        complete_images = squeezed_image_np_array
                    else:
                        complete_images = np.concatenate((complete_images.squeeze(), squeezed_image_np_array))
                    complete_labels = np.append(complete_labels, squeezed_label_np_array)
                except tf.errors.OutOfRangeError:
                    # Dataset exhausted for this epoch
                    print("End of Dataset Reached")
                    break
                count = count + 1
    return complete_labels, complete_images
#MAIN FUNCTION | |
def main(unused_argv):
    """Count the records in the TFRecord shards, extract all (label, image)
    pairs as numpy arrays, and print their shapes as a sanity check."""
    print("Started\n\n")
    # NOTE: Remember to change the TFRecord(s) directory path to yours
    filenames = ["C:/Users/oluwole.oyetoke/Documents/TFRecord_227x227/train-00000-of-00002","C:/Users/oluwole.oyetoke/Documents/TFRecord_227x227/train-00001-of-00002"]

    # Total number of records across every '.tfrecord' shard.
    record_count = sum(1 for fn in filenames
                       for _ in tf.python_io.tf_record_iterator(fn))

    # Pull every label and image out of the TFRecord(s) as numpy arrays.
    label_np_array, image_np_array = _process_multiple_images(
        filenames, perform_shuffle=False, repeat_count=1, batch_size=50,
        available_record=record_count, num_of_epochs=1)

    print(label_np_array.squeeze().shape)  # Terminal output>> (#no of dataset files,)
    print(image_np_array.squeeze().shape)  # Terminal Output>> (#no of dataset files, 227,227,3)
if __name__ == "__main__": | |
tf.app.run() |
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.