Last active
December 19, 2017 20:59
-
-
Save OluwoleOyetoke/174182499e4fbb6e3006adc6c8885880 to your computer and use it in GitHub Desktop.
Pop Examples From TFRecord In TensorFlow
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
@date 17th December, 2017 | |
@Language: Python | |
@author: Oluwole Oyetoke | |
@email: oluwoleoyetoke@gmail.com | |
This script helps to get selected features out of a TF Record and return as an np array | |
Features extracted here: (Image, Labels) | |
#IMPORTS | |
""" | |
from __future__ import absolute_import | |
from __future__ import division | |
from __future__ import print_function | |
import math | |
import numpy as np | |
import tensorflow as tf | |
import os | |
#INT64 wrappers | |
def _int64_feature(value):
    """Wrap a single integer as a tf.train.Feature holding an Int64List."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)
def _bytes_feature(value):
    """Wrap a single bytes value as a tf.train.Feature holding a BytesList."""
    byte_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=byte_list)
#FUNCTION TO GET & PROCESS ALL DATASET DATA | |
def _process_multiple_images(filenames, perform_shuffle=False, repeat_count=1,
                             batch_size=32, available_record=310, num_of_epochs=1):
    """Stream every example out of TFRecord file(s) into two numpy arrays.

    Args:
        filenames: path, or list of paths, to the TFRecord file(s).
        perform_shuffle: if True, shuffle the dataset before batching.
        repeat_count: number of times the dataset is repeated.
        batch_size: number of examples pulled per iterator step.
        available_record: total number of examples in the dataset
            (used only to report how many rounds are needed).
        num_of_epochs: number of passes made over the dataset.

    Returns:
        (complete_labels, complete_images) as np.float32 arrays covering
        every example streamed out of the TFRecord(s).
    """
    def _process_one_image(serialized):
        # Feature keys must match the keys used when the TFRecord was
        # written (e.g. 'image/shape').
        features = {'image/shape': tf.FixedLenFeature([], tf.string),
                    'image/class/label': tf.FixedLenFeature([], tf.int64),
                    'image/class/text': tf.FixedLenFeature([], tf.string),
                    'image/filename': tf.FixedLenFeature([], tf.string),
                    'image/encoded': tf.FixedLenFeature([], tf.string)}
        parsed_example = tf.parse_single_example(serialized, features=features)
        # Decode raw bytes, restore the stored shape, and cast to float32.
        image_raw = tf.decode_raw(parsed_example['image/encoded'], tf.uint8)
        shape = tf.decode_raw(parsed_example['image/shape'], tf.int32)
        label = tf.cast(parsed_example['image/class/label'], dtype=tf.int32)
        reshaped_img = tf.reshape(image_raw, shape)
        casted_img = tf.cast(reshaped_img, tf.float32)
        return [label], [casted_img]

    complete_labels = np.array([])
    complete_images = np.array([])

    dataset = tf.data.TFRecordDataset(filenames=filenames)  # Connect to the TF Record
    dataset = dataset.map(_process_one_image)  # Extract/transform features per example
    if perform_shuffle:
        # FIX: the original accepted perform_shuffle but never applied it.
        dataset = dataset.shuffle(buffer_size=available_record)
    dataset = dataset.repeat(repeat_count)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_initializable_iterator()
    labels_tensor, images_tensor = iterator.get_next()  # Get batch data

    no_of_rounds = int(math.ceil(available_record / batch_size)) + 1
    print("At this batch size of %i, aproximately %i round(s) is needed to stream through the entire dataset" % (batch_size, no_of_rounds))

    count = 1
    # FIX: 'with' guarantees the session is closed even if evaluation raises
    # (the original's sess.close() was skipped on any unexpected error).
    with tf.Session() as sess:
        for _ in range(num_of_epochs):
            sess.run(iterator.initializer)  # Re-initialize the iterator each epoch
            while True:
                try:
                    print("Now evaluating tensors for round number %i out of %i" % (count, no_of_rounds))
                    evaluated_label, evaluated_image = sess.run([labels_tensor, images_tensor])
                    # Convert evaluated tensors to np arrays
                    label_np_array = np.asarray(evaluated_label, dtype=np.float32)
                    image_np_array = np.asarray(evaluated_image, dtype=np.float32)
                    # Squeeze extra singleton dimensions introduced by batching
                    squeezed_label_np_array = label_np_array.squeeze()
                    squeezed_image_np_array = image_np_array.squeeze()
                    # Accumulate this batch into the running totals
                    if count == 1:
                        complete_images = squeezed_image_np_array
                    else:
                        complete_images = np.concatenate((complete_images.squeeze(), squeezed_image_np_array))
                    complete_labels = np.append(complete_labels, squeezed_label_np_array)
                except tf.errors.OutOfRangeError:
                    # Dataset exhausted for this epoch
                    print("End of Dataset Reached")
                    break
                count = count + 1
    return complete_labels, complete_images
#MAIN FUNCTION | |
def main(unused_argv):
    """Count the records in the TFRecord shards, extract all (label, image)
    pairs as numpy arrays, and print their shapes as a sanity check."""
    print("Started\n\n")
    # NOTE: Remember to change the TFRecord(s) directory path to yours
    filenames = ["C:/Users/oluwole.oyetoke/Documents/TFRecord_227x227/train-00000-of-00002","C:/Users/oluwole.oyetoke/Documents/TFRecord_227x227/train-00001-of-00002"]

    # Total number of records across every '.tfrecord' shard.
    record_count = sum(1 for fn in filenames
                       for _ in tf.python_io.tf_record_iterator(fn))

    # Pull every label and image out of the TFRecord(s) as numpy arrays.
    label_np_array, image_np_array = _process_multiple_images(
        filenames, perform_shuffle=False, repeat_count=1, batch_size=50,
        available_record=record_count, num_of_epochs=1)

    print(label_np_array.squeeze().shape)  # Terminal output>> (#no of dataset files,)
    print(image_np_array.squeeze().shape)  # Terminal Output>> (#no of dataset files, 227,227,3)
if __name__ == "__main__": | |
tf.app.run() |
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.