# TFRecord Generator and Reader
#
# Gist usmcamp0811/b1cb530077922b55f39c8c5fdc91d23f, created February 8, 2017 16:42.
# (GitHub page text: "Save usmcamp0811/b1cb530077922b55f39c8c5fdc91d23f to your
# computer and use it in GitHub Desktop.")
# (GitHub page text: "This file contains bidirectional Unicode text that may be
# interpreted or compiled differently than what appears below. To review, open
# the file in an editor that reveals hidden Unicode characters. Learn more about
# bidirectional Unicode characters.")
import os
import re

import numpy as np
import pandas as pd
import PIL
import tensorflow as tf
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
from PIL import Image, ImageOps
from tqdm import tqdm
def make_q_list(filepathlist, filetype):
    '''
    Build parallel lists of file paths and labels from a list of directories.

    Each directory in `filepathlist` is scanned (non-recursively) for regular
    files whose names end with `filetype`. Every match is labelled with the
    base name of the directory it was found in.

    Args:
        filepathlist: iterable of directory paths to scan.
        filetype: file-name suffix to keep, e.g. 'jpg' or '.csv'.

    Returns:
        (filepaths, labels): two lists of equal length; labels[i] is the base
        name of the directory that contains filepaths[i].
    '''
    filepaths = []
    labels = []
    for path in filepathlist:
        # normpath strips a trailing separator so basename is never empty.
        data_label = os.path.basename(os.path.normpath(path))
        # os.listdir returns entries in arbitrary order; sort so the queue
        # (and therefore the record order) is reproducible between runs.
        for data in sorted(os.listdir(path)):
            data_file = os.path.join(path, data)
            # Skip sub-directories whose names merely end with the suffix.
            if data.endswith(filetype) and os.path.isfile(data_file):
                filepaths.append(data_file)
                labels.append(data_label)
    return filepaths, labels
class TF_Record_Generator(object):
    '''
    Convert a queue of table files or image files into one TFRecord file.

    All of the work happens in the constructor: the kind of input (table or
    image) is taken from the extension of the first file in the queue, every
    file is serialised into `tf_filename`, and the writer is closed before the
    constructor returns so the records are flushed to disk.

    Table rows are written as a single feature named 'x' using the converter
    selected by `dtype` ('float', 'int' or 'byte'); mixed-type tables may need
    pre-processing so that every column fits that one type.

    Images are written with the features 'height', 'width', 'depth', 'label'
    and 'image_raw' (raw uint8 bytes).

    RECOMMENDATION: the reader in this file needs the flattened image size up
    front, so keep the image size in the TFRecord file name (or resize with
    `img_resize`), and keep track of `class_index` to map integer labels back
    to class names.
    '''

    # Extensions (lower case) that select the image / table code path.
    IMG_EXT = ('.jpg', '.png', '.bmp', '.gif')
    TBL_EXT = ('.csv', '.h5', '.hdf')

    def __init__(self, tf_filename, file_queue_list=None, labels_list=None,
                 dtype='float', class_index=None, img_resize=None):
        '''
        Args:
            tf_filename: path of the TFRecord file to create.
            file_queue_list: paths of the files to serialise (must be non-empty).
            labels_list: one label per image file; required for images only.
            dtype: 'float', 'int' or 'byte' - feature type used for table rows.
            class_index: optional list of class names; a label's integer value
                is its position in this list. Built from `labels_list`
                (sorted) when omitted.
            img_resize: optional edge length; images are resized and cropped
                to img_resize x img_resize before being written.

        Raises:
            ValueError: if the queue is empty, or labels are missing for images.
        '''
        converters = {'float': self._float_feature,
                      'int': self._int64_feature,
                      'byte': self._bytes_feature}
        # An unknown dtype is kept as given; it only matters for tables and
        # is rejected in table_2TFR.
        self.dtype = converters.get(dtype, dtype)
        self.labels_list = labels_list
        self.img_resize = img_resize
        self.file_queue_list = (list(file_queue_list)
                                if file_queue_list is not None else [])
        if not self.file_queue_list:
            raise ValueError('file_queue_list must contain at least one file path')

        kind = self._queue_kind(self.file_queue_list)
        if kind == 'image' and labels_list is None:
            raise ValueError('labels_list is required when writing image files')

        self.writer = tf.python_io.TFRecordWriter(tf_filename)
        try:
            if kind == 'table':
                print('Table')
                self.table_2TFR()
            elif kind == 'image':
                print('Image')
                # If you don't have a list of classes, make one. Sorting makes
                # the label -> int mapping identical from run to run (a bare
                # set has no stable order for strings).
                if class_index is None:
                    self.class_index = sorted(set(labels_list))
                else:
                    self.class_index = class_index
                self.img_2TFR()
                print('Image Class Index:', self.class_index)
            else:
                print(os.path.splitext(self.file_queue_list[0])[1],
                      'is not supported at this time...')
        finally:
            # Always close, otherwise buffered records may never reach disk.
            self.writer.close()

    @classmethod
    def _queue_kind(cls, file_queue_list):
        '''Return 'table', 'image' or None based on the first file's extension.'''
        ext = os.path.splitext(file_queue_list[0])[1].lower()
        if ext in cls.TBL_EXT:
            return 'table'
        if ext in cls.IMG_EXT:
            return 'image'
        return None

    @staticmethod
    def _as_list(value):
        '''Return a scalar or (nested) numeric sequence as a flat Python list.'''
        return np.asarray(value).ravel().tolist()

    def _int64_feature(self, value):
        '''Wrap an int, or a sequence of ints, in an int64 Feature.'''
        return tf.train.Feature(
            int64_list=tf.train.Int64List(value=self._as_list(value)))

    def _float_feature(self, value):
        '''Wrap a float, or a sequence of floats, in a float Feature.'''
        return tf.train.Feature(
            float_list=tf.train.FloatList(value=self._as_list(value)))

    def _bytes_feature(self, value):
        '''Wrap one bytes object, or a sequence of them, in a bytes Feature.'''
        # Do not route bytes through numpy: fixed-width 'S' arrays drop
        # trailing NUL bytes, which would corrupt raw image data.
        if isinstance(value, (list, tuple, np.ndarray)):
            items = list(value)
        else:
            items = [value]
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=items))

    def table_2TFR(self):
        '''
        Write every row of every table file in the queue as one Example with a
        single feature 'x'. Processing stops at the first unsupported file.

        Raises:
            ValueError: if the dtype given to the constructor is not one of
                'float', 'int' or 'byte'.
        '''
        if not callable(self.dtype):
            raise ValueError("dtype must be 'float', 'int' or 'byte', got %r"
                             % (self.dtype,))
        for path in tqdm(self.file_queue_list):
            ext = os.path.splitext(path)[1]
            if ext.lower() == '.csv':
                data = pd.read_csv(path).values
            elif ext.lower() in ('.h5', '.hdf'):
                data = pd.read_hdf(path).values
            else:
                print(ext, 'is not supported at this time...')
                break
            for row in data:
                example = tf.train.Example(
                    features=tf.train.Features(feature={
                        'x': self.dtype(row)
                    })
                )
                self.writer.write(example.SerializeToString())

    def label_to_int(self, labels_list=None, index=None):
        '''
        Convert class labels to integer class ids.

        Args:
            labels_list: sequence of labels (None is treated as empty).
            index: optional list of class names; a label's id is its first
                position in this list. When omitted the index is the sorted
                set of labels.

        Returns:
            (int_classes, class_index): a uint32 numpy array with one id per
            label, and the class index that was used, so ids can be mapped
            back to names.

        Raises:
            ValueError: if a label is not present in `index`.
        '''
        if labels_list is None:
            labels_list = []
        class_index = sorted(set(labels_list)) if index is None else index
        # One dict lookup per label instead of a linear list.index() scan;
        # setdefault keeps the first position when the index has duplicates.
        lookup = {}
        for position, name in enumerate(class_index):
            lookup.setdefault(name, position)
        try:
            int_classes = [lookup[label] for label in labels_list]
        except KeyError as err:
            raise ValueError('label %r is not in the class index' % (err.args[0],))
        return np.array(int_classes, dtype=np.uint32), class_index

    def image_resizer(self, image, size):
        '''
        Resize `image` (a numpy array) to `size` pixels wide, then fit/crop it
        to a size x size square. Returns the result as a numpy array.
        '''
        basewidth = size
        img = Image.fromarray(image)
        wpercent = basewidth / float(img.size[0])
        # Never let a very wide image collapse to a zero-pixel height.
        hsize = max(1, int(float(img.size[1]) * wpercent))
        # LANCZOS is the filter ANTIALIAS was an alias for; the ANTIALIAS
        # name itself no longer exists in recent Pillow releases.
        img = img.resize((basewidth, hsize), Image.LANCZOS)
        # TODO: Adjust this for odd size images
        # crop the sides off to make a square image
        img = ImageOps.fit(img, (size, size), Image.LANCZOS)
        return np.asarray(img)

    def read_image(self, path2Img, lbl_int, size=None):
        '''
        Load one image and pair it with its integer label.

        Args:
            path2Img: path of the image file.
            lbl_int: integer class id of the image.
            size: optional edge length passed to image_resizer.

        Returns:
            (images_only, labels_only, image_shape): a uint8 array of shape
            (1, n_pixels) holding the flattened image, an int32 array of shape
            (1,) holding the label, and the image's shape before flattening.
        '''
        # The context manager releases the file handle; np.asarray copies the
        # pixel data out before the file is closed.
        with Image.open(path2Img) as handle:
            pixels = np.asarray(handle, np.uint8)
        if size is not None:
            pixels = self.image_resizer(pixels, size)
        image_shape = pixels.shape
        images_only = np.asarray(pixels.flatten(), np.uint8)[np.newaxis, :]
        labels_only = np.array([lbl_int], dtype=np.int32)
        return images_only, labels_only, image_shape

    def img_2TFR(self):
        '''
        Write every image in the queue as one Example holding its height,
        width, depth, integer label and raw uint8 bytes.

        Raises:
            ValueError: if there are fewer labels than image files.
        '''
        # take class list and convert the class name to int values
        int_classes, _ = self.label_to_int(labels_list=self.labels_list,
                                           index=self.class_index)
        if len(int_classes) < len(self.file_queue_list):
            raise ValueError('labels_list has %d entries for %d image files'
                             % (len(int_classes), len(self.file_queue_list)))
        pairs = zip(self.file_queue_list, int_classes)
        # for every img in the queue read the image and write it to the TFR
        for path, class_id in tqdm(pairs, total=len(self.file_queue_list)):
            img, lbl, image_shape = self.read_image(path, class_id,
                                                    size=self.img_resize)
            rows = image_shape[0]
            cols = image_shape[1]
            # Grayscale images have no channel axis; record them as depth 1.
            depth = image_shape[2] if len(image_shape) > 2 else 1
            example = tf.train.Example(features=tf.train.Features(feature={
                'height': self._int64_feature(rows),
                'width': self._int64_feature(cols),
                'depth': self._int64_feature(depth),
                'label': self._int64_feature(int(lbl[0])),
                'image_raw': self._bytes_feature(img.tobytes())}))
            self.writer.write(example.SerializeToString())
class TF_Record_Reader(object):
    '''
    Build TensorFlow input tensors from TFRecord files.

    TFR2_img returns shuffled batches of (image, label) tensors. TFR2_df
    returns either the whole table as one tensor (train_shuffle_batch=False,
    unshuffled) or shuffled batches of rows (train_shuffle_batch=True).
    The batching is queue based, so queue runners must be started in the
    session before the returned tensors are evaluated.
    '''

    @staticmethod
    def _scope_name(path):
        '''
        Turn a file path into a string usable as a TensorFlow scope name.

        Characters outside [A-Za-z0-9_.-/] become '_' and leading characters
        that may not start a name (e.g. the '/' of an absolute path) are
        dropped. Already valid names are returned unchanged.
        '''
        name = re.sub(r'[^A-Za-z0-9_.\-/]', '_', path)
        name = re.sub(r'^[^A-Za-z0-9.]+', '', name)
        return name or 'shuffle_batch'

    def TFR2_img(self, tf_filepath, batch_size,
                 num_epochs=None, n_classes=None, one_hot=False, flat_imshape=None,
                 num_threads=2):
        '''
        Return shuffled batches of images and labels from an image TFRecord.

        Args:
            tf_filepath: path of the TFRecord file.
            batch_size: number of examples per batch.
            num_epochs: passes over the file (None = repeat indefinitely).
            n_classes: number of classes; required when one_hot is True.
            one_hot: return labels one-hot encoded instead of as int ids.
            flat_imshape: length of one flattened image
                (height * width * depth); required.
            num_threads: number of threads feeding the shuffle queue.

        Returns:
            (images, labels) batch tensors.

        Raises:
            ValueError: if flat_imshape is missing, or one_hot is requested
                without n_classes.
        '''
        # Fail early with a clear message instead of deep inside TensorFlow.
        if flat_imshape is None:
            raise ValueError('flat_imshape (height * width * depth) is required')
        if one_hot and n_classes is None:
            raise ValueError('n_classes is required when one_hot=True')
        self.tf_filepath = tf_filepath
        self.num_threads = num_threads
        self.batch_size = batch_size
        self.flat_imshape = flat_imshape
        filename_queue = tf.train.string_input_producer(
            [tf_filepath], num_epochs=num_epochs)
        self.images, self.labels = self.image_read_and_decode(filename_queue)
        if one_hot:
            self.labels = tf.one_hot(self.labels, n_classes, dtype=tf.int32)
        self.type = 'img'
        self.enqueue_many = False
        return self.shuffle_batch()

    def image_read_and_decode(self, filename_queue):
        '''
        Read one serialized Example from `filename_queue` and decode it.

        Returns:
            (image, label): the image as a flat float32 tensor of length
            self.flat_imshape and the label as an int32 scalar tensor.
        '''
        reader = tf.TFRecordReader()
        _, serialized_example = reader.read(filename_queue)
        features = tf.parse_single_example(
            serialized_example,
            features={
                'image_raw': tf.FixedLenFeature([], tf.string),
                'label': tf.FixedLenFeature([], tf.int64),
                'height': tf.FixedLenFeature([], tf.int64),
                'width': tf.FixedLenFeature([], tf.int64),
                'depth': tf.FixedLenFeature([], tf.int64)
            })
        image = tf.decode_raw(features['image_raw'], tf.uint8)
        # TODO: Fix this so it can get the shape of the image by itself.
        # The stored height/width/depth are parsed above but not used yet;
        # batching needs a static shape, so the caller supplies it.
        image.set_shape([self.flat_imshape])
        image = tf.cast(image, tf.float32)
        label = tf.cast(features['label'], tf.int32)
        return image, label

    def shuffle_batch(self):
        '''
        Batch and shuffle the tensors prepared by TFR2_img or TFR2_df.

        Returns:
            (images, labels) for image data, or a list holding the single
            batched table tensor for table data.
        '''
        if self.type == 'img':
            data = [self.images, self.labels]
        elif self.type == 'df':
            data = [self.dataframe]
        else:
            raise ValueError('unknown data type %r' % (self.type,))
        # This is similar to the 'inputs' function in Tensorflow tutorials/code
        example_batch = tf.train.shuffle_batch(
            data, batch_size=self.batch_size, num_threads=self.num_threads,
            capacity=1000, enqueue_many=self.enqueue_many,
            # Ensures a minimum amount of shuffling of examples.
            min_after_dequeue=10,
            # A raw path (absolute, or containing spaces) is not a valid
            # scope name, so sanitise it first.
            name=self._scope_name(self.tf_filepath))
        if self.type == 'img':
            return example_batch[0], example_batch[1]
        # when running to convert to DF looks like this...
        #   df = sess.run([example_batch])
        #   df = pd.DataFrame(df[0])
        return example_batch

    def TFR2_df(self, tf_filepath, cols_count, batch_size=None,
                train_shuffle_batch=False, num_epochs=2, num_threads=2):
        '''
        Load a table TFRecord (feature 'x' of `cols_count` floats per row).

        Args:
            tf_filepath: path of the TFRecord file.
            cols_count: number of values stored in each row.
            batch_size: rows per batch; required when train_shuffle_batch.
            train_shuffle_batch: False returns the whole table as one tensor,
                True returns shuffled batches of rows.
            num_epochs: stored on the instance; not used by this method.
            num_threads: number of threads feeding the shuffle queue.

        Returns:
            A [n_rows, cols_count] tensor, or the result of shuffle_batch().

        Raises:
            ValueError: if shuffling is requested without batch_size, or the
                file contains no records.
        '''
        if train_shuffle_batch and batch_size is None:
            raise ValueError('batch_size is required when train_shuffle_batch=True')
        self.train_shuffle_batch = train_shuffle_batch
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.num_threads = num_threads
        self.tf_filepath = tf_filepath
        features = {'x': tf.FixedLenFeature([cols_count], tf.float32)}
        # NOTE: this adds one parse op per record to the graph, so it is only
        # practical for files that comfortably fit in memory.
        rows = [tf.parse_single_example(s_example, features=features)['x']
                for s_example in tf.python_io.tf_record_iterator(self.tf_filepath)]
        if not rows:
            raise ValueError('%s contains no records' % (self.tf_filepath,))
        self.type = 'df'
        self.enqueue_many = True
        # Stacking the rows along axis 0 yields the same [n_rows, cols_count]
        # tensor as concatenating expanded rows, without depending on the
        # argument order of tf.concat, which differs between TF releases.
        self.dataframe = tf.stack(rows)
        if not self.train_shuffle_batch:
            return self.dataframe
        return self.shuffle_batch()
if __name__ == "__main__":
    # Demo: write and read back an image TFRecord and a table TFRecord.
    # The paths below are machine specific; point them at your own data.

    # ---- Images ----
    # Find some images to test this out with...
    IMG_SIZE = 200   # images are resized/cropped to IMG_SIZE x IMG_SIZE
    IMG_DEPTH = 3    # assumes RGB input images
    ONE_HOT = False
    cats = '/home/mcamp/Pictures/test/cat'
    dogs = '/home/mcamp/Pictures/test/dog'
    path_list = [cats, dogs]
    q, label = make_q_list(path_list, 'jpg')
    tf_file = 'testtf.tfrecords'
    # Resize while writing so every record matches the flat shape that the
    # reader is told to expect below.
    TF_Record_Generator(tf_file, file_queue_list=q, labels_list=label,
                        img_resize=IMG_SIZE)
    tfr = TF_Record_Reader()
    Img_train_batch, lbl_train_batch = tfr.TFR2_img(
        tf_filepath=tf_file, batch_size=2, num_epochs=None,
        n_classes=2, one_hot=ONE_HOT,
        flat_imshape=IMG_SIZE * IMG_SIZE * IMG_DEPTH)

    # ---- DataFrames ----
    for i in range(10):
        filename = '/home/mcamp/PycharmProjects/ACDC_VisMag_LSTM/Data/random_csv' + str(i) + '.csv'
        pd.DataFrame(np.arange(0.0, 5000.0).reshape((100, 50))).to_csv(filename)
    filepathlist = ['/home/mcamp/PycharmProjects/ACDC_VisMag_LSTM/Data/']
    q, _ = make_q_list(filepathlist, '.csv')
    tffilename = 'Demo_TFR.tfrecords'
    TF_Record_Generator(tffilename, file_queue_list=q)
    tfr = TF_Record_Reader()
    # to_csv also writes the index column, hence 50 data columns + 1 = 51.
    # Return whole TFR file in one batch
    DF_train_whole = tfr.TFR2_df(tffilename, cols_count=51,
                                 train_shuffle_batch=False, num_epochs=2, num_threads=2)
    # Return shuffled and batched TFR
    DF_train_batch = tfr.TFR2_df(tffilename, batch_size=3, cols_count=51,
                                 train_shuffle_batch=True, num_epochs=2, num_threads=2)

    with tf.Session() as sess:
        init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        sess.run(init_op)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        try:
            Xwhole, Xbatch = sess.run([DF_train_whole, DF_train_batch])
            print('All Records Shape:', Xwhole.shape, 'Batch Records Shape:', Xbatch.shape)
        finally:
            # Stop the queue-runner threads before the session is closed.
            coord.request_stop()
            coord.join(threads)

    with tf.Session() as sess:
        init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        sess.run(init_op)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        try:
            images, labels = sess.run([Img_train_batch, lbl_train_batch])
        finally:
            coord.request_stop()
            coord.join(threads)
        image = np.reshape(images[0], (IMG_SIZE, IMG_SIZE, IMG_DEPTH))
        image = np.array(image, dtype=np.uint8)
        im = Image.fromarray(image)
        # argmax only makes sense for one-hot labels; a plain integer label
        # is already the class id.
        int_label = int(np.argmax(labels[0])) if ONE_HOT else int(labels[0])
        print('Int Label:', int_label)
        im.show()
# (GitHub page text: "Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment")
# Gist comment: "Is there a license for this?"