Skip to content

Instantly share code, notes, and snippets.

@OluwoleOyetoke
Last active December 19, 2017 20:59
Show Gist options
  • Save OluwoleOyetoke/174182499e4fbb6e3006adc6c8885880 to your computer and use it in GitHub Desktop.
Save OluwoleOyetoke/174182499e4fbb6e3006adc6c8885880 to your computer and use it in GitHub Desktop.
Pop Examples From TFRecord In TensorFlow
"""
@date 17th December, 2017
@Language: Python
@author: Oluwole Oyetoke
@email: oluwoleoyetoke@gmail.com
This script helps to get selected features out of a TF Record and return as an np array
Features extracted here: (Image, Labels)
#IMPORTS
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import numpy as np
import tensorflow as tf
import os
#INT64 wrappers
def _int64_feature(value):
    """Wrap a single integer in a tf.train.Feature carrying an Int64List."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)
def _bytes_feature(value):
    """Wrap a single bytes value in a tf.train.Feature carrying a BytesList."""
    byte_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=byte_list)
#FUNCTION TO GET & PROCESS ALL DATASET DATA
def _process_multiple_images(filenames, perform_shuffle=False, repeat_count=1, batch_size=32, available_record=310, num_of_epochs=1):
    """Stream every (label, image) example out of one or more TFRecord files.

    Args:
        filenames: Path(s) to the TFRecord file(s).
        perform_shuffle: If True, shuffle the dataset before batching.
            (FIX: this flag was previously accepted but silently ignored.)
        repeat_count: Number of times the dataset is repeated.
        batch_size: Number of examples evaluated per session run.
        available_record: Total number of examples in the dataset; used only
            to report approximately how many rounds are needed.
        num_of_epochs: Number of passes over the (repeated) dataset.

    Returns:
        Tuple (complete_labels, complete_images): two np.float32 arrays
        accumulating every decoded label and image across all epochs.
    """
    def _process_one_image(serialized):
        # Feature keys must match those used when the TFRecord was written,
        # e.g. 'image/shape'.
        features = {'image/shape': tf.FixedLenFeature([], tf.string),
                    'image/class/label': tf.FixedLenFeature([], tf.int64),
                    'image/class/text': tf.FixedLenFeature([], tf.string),
                    'image/filename': tf.FixedLenFeature([], tf.string),
                    'image/encoded': tf.FixedLenFeature([], tf.string)}
        parsed_example = tf.parse_single_example(serialized, features=features)
        # Decode the raw image bytes, restore the stored shape, and cast to
        # float32 for downstream use.
        image_raw = tf.decode_raw(parsed_example['image/encoded'], tf.uint8)
        shape = tf.decode_raw(parsed_example['image/shape'], tf.int32)
        label = tf.cast(parsed_example['image/class/label'], dtype=tf.int32)
        reshaped_img = tf.reshape(image_raw, shape)
        casted_img = tf.cast(reshaped_img, tf.float32)
        label_tensor = [label]
        image_tensor = [casted_img]
        return label_tensor, image_tensor

    complete_labels = np.array([])
    complete_images = np.array([])

    dataset = tf.data.TFRecordDataset(filenames=filenames)  # Connect to the TF Record
    dataset = dataset.map(_process_one_image)  # Extract/transform features per example
    if perform_shuffle:
        # FIX: honor the perform_shuffle flag, which the original ignored.
        # Buffer of available_record gives a full (uniform) shuffle.
        dataset = dataset.shuffle(buffer_size=available_record)
    dataset = dataset.repeat(repeat_count)  # Repeats dataset this # times
    dataset = dataset.batch(batch_size)  # Batch size to use
    iterator = dataset.make_initializable_iterator()
    labels_tensor, images_tensor = iterator.get_next()  # Get batch data
    no_of_rounds = int(math.ceil(available_record / batch_size)) + 1

    print("At this batch size of %i, aproximately %i round(s) is needed to stream through the entire dataset" %(batch_size, no_of_rounds))
    count = 1
    # FIX: 'with' guarantees the session is closed even if evaluation raises
    # something other than OutOfRangeError (the original leaked the session).
    with tf.Session() as sess:
        for _ in range(num_of_epochs):
            sess.run(iterator.initializer)  # Re-initialize the iterator each epoch
            while True:
                try:
                    print("Now evaluating tensors for round number %i out of %i" % (count, no_of_rounds))
                    evaluated_label, evaluated_image = sess.run([labels_tensor, images_tensor])
                    # Convert evaluated tensors to np arrays
                    label_np_array = np.asarray(evaluated_label, dtype=np.float32)
                    image_np_array = np.asarray(evaluated_image, dtype=np.float32)
                    # Squeeze to drop the singleton dims added in _process_one_image
                    squeezed_label_np_array = label_np_array.squeeze()
                    squeezed_image_np_array = image_np_array.squeeze()
                    # Accumulate this batch onto the running totals
                    if count == 1:
                        complete_images = squeezed_image_np_array
                    else:
                        complete_images = np.concatenate((complete_images.squeeze(), squeezed_image_np_array))
                    complete_labels = np.append(complete_labels, squeezed_label_np_array)
                except tf.errors.OutOfRangeError:
                    # Dataset exhausted for this epoch
                    print("End of Dataset Reached")
                    break
                count = count + 1
    return complete_labels, complete_images
#MAIN FUNCTION
def main(unused_argv):
    """Entry point: count the records in the TFRecord files, extract every
    (label, image) pair via _process_multiple_images, and print the shapes
    of the resulting arrays as a sanity check.

    Flow:
      1. List the path(s) to the TFRecord files.
      2. Count the total number of records across all files.
      3. Stream the whole dataset out as two big np arrays (labels, images).
      4. Print the squeezed shapes to confirm the extraction worked.
    """
    print("Started\n\n")
    # NOTE: Remember to change the TFRecord(s) directory path to yours
    filenames = ["C:/Users/oluwole.oyetoke/Documents/TFRecord_227x227/train-00000-of-00002","C:/Users/oluwole.oyetoke/Documents/TFRecord_227x227/train-00001-of-00002"]
    # Determine total number of records in the '.tfrecord' files.
    # Idiom fix: sum over a generator instead of a manual counter loop with
    # an unused loop variable.
    record_count = sum(1 for fn in filenames
                       for _ in tf.python_io.tf_record_iterator(fn))
    # Get np arrays of all the labels and images in the TF Record
    label_np_array, image_np_array = _process_multiple_images(filenames, perform_shuffle=False, repeat_count=1, batch_size=50, available_record=record_count, num_of_epochs=1)
    print(label_np_array.squeeze().shape)  # Terminal output>> (#no of dataset files,)
    print(image_np_array.squeeze().shape)  # Terminal Output>> (#no of dataset files, 227,227,3)


if __name__ == "__main__":
    tf.app.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment