# TensorFlow video input pipeline using TFRecord files (for the Kinetics dataset)
def decode(serialized_example, sess=None):
    """Parse one serialized video example into a frame tensor and its label.

    Each frame is read as a JPG-encoded byte string stored under the keys
    'frames/0000', 'frames/0001', ... and the class under 'class_label'.
    The first SEQ_NUM_FRAMES frames are decoded from JPG and stacked into
    one tensor of shape (N,H,W,3).

    :param serialized_example: serialized example (scalar string tensor)
    :param sess: unused; kept, and now optional, for backward compatibility
    :return: tuple: (frames (tf.uint8), class_label (tf.int64))
    """
    # Build the frame keys once so parsing and decoding cannot drift apart.
    frame_keys = ["frames/{:04d}".format(i) for i in range(SEQ_NUM_FRAMES)]

    # Prepare feature list; read encoded JPG images as bytes
    features = {"class_label": tf.FixedLenFeature((), tf.int64)}
    for key in frame_keys:
        features[key] = tf.FixedLenFeature((), tf.string)

    # Parse into tensors
    parsed_features = tf.parse_single_example(serialized_example, features)

    # Decode the encoded JPG images. channels=3 forces RGB output so that
    # the stacked tensor always has a trailing dimension of 3.
    images = [
        tf.image.decode_jpeg(parsed_features[key], channels=3)
        for key in frame_keys
    ]

    # Pack the frames into one big tensor of shape (N,H,W,3)
    images = tf.stack(images)
    label = tf.cast(parsed_features['class_label'], tf.int64)

    # TODO: sample a random start offset instead of always taking the first
    # SEQ_NUM_FRAMES frames; this needs dynamically built feature keys.
    return images, label
def video_left_right_flip(images):
    """Flip every frame of a video left-to-right.

    tf.image.flip_left_right is applied frame by frame so that the flip is
    applied identically to the whole clip; the random decision whether to
    flip at all is left to the caller.

    :param images: Tensor containing video frames (N,H,W,3)
    :return: Tensor containing the left-right flipped video frames (N,H,W,3)
    """
    flipped_frames = [
        tf.image.flip_left_right(frame) for frame in tf.unstack(images)
    ]
    return tf.stack(flipped_frames)
def preprocess_video(images, label):
    """Randomly crop, optionally flip, and normalize one video clip.

    Steps, in order:
      1. Take one random CROP_SIZE x CROP_SIZE spatial crop, shared by all
         frames of the clip.
      2. With probability 0.5, flip the whole clip left-right.
      3. Cast to float and rescale from [0, 255] to [-0.5, +0.5].

    :param images: Tensor (tf.uint8) containing video frames (N,H,W,3)
    :param label: Tensor (tf.int64) holding the class label; passed through
    :return: tuple: (frames (tf.float32) of shape
        (N,CROP_SIZE,CROP_SIZE,3), label)
    """
    # One crop window for the full clip keeps the frames spatially aligned.
    crop_shape = (SEQ_NUM_FRAMES, CROP_SIZE, CROP_SIZE, 3)
    cropped = tf.random_crop(images, crop_shape)

    # Draw a single random number so the flip decision is per video,
    # not per frame.
    coin = tf.random_uniform(shape=[], minval=0, maxval=1, dtype=tf.float32)
    do_flip = tf.less(coin, 0.5)
    maybe_flipped = tf.cond(
        do_flip,
        lambda: video_left_right_flip(cropped),
        lambda: tf.identity(cropped),
    )

    # Normalization: [0, 255] => [-0.5, +0.5] floats
    as_float = tf.cast(maybe_flipped, tf.float32)
    normalized = as_float * (1./255.) - 0.5
    return normalized, label
# Demo / smoke test: builds the input pipeline over the validation TFRecord
# files and steps through the resulting frames in an OpenCV window.
# NOTE(review): as seen here this block has lost its indentation and several
# right-hand sides (flagged below); restore them before running.
# NOTE(review): `tf` and `cv2` are used but their imports are not visible in
# this section -- confirm they are imported at the top of the file.
if __name__ == "__main__":
import glob
# Collect all validation shards (hard-coded, machine-specific path).
tfrecord_files = glob.glob("/home/tomrunia/data/Kinetics/Full/tfrecords/val/*.tfrecords")
sess = tf.Session()
# NOTE(review): expression missing -- presumably an initializer op; confirm.
init_op =
# NOTE(review): expression missing -- presumably creates the dataset from
# `tfrecord_files`; confirm against the original file.
dataset =
# Repeat the input NUM_EPOCHS times.
dataset = dataset.repeat(NUM_EPOCHS)
# NOTE(review): two expressions missing -- presumably the `decode` and
# `preprocess_video` mapping steps; confirm.
dataset =
dataset =
# The argument is the shuffle buffer size. Shuffling happens after
# `repeat`, so the buffer may mix examples across epoch boundaries.
dataset = dataset.shuffle(1000 + 3 * BATCH_SIZE)
dataset = dataset.batch(BATCH_SIZE)
iterator = dataset.make_one_shot_iterator()
next_batch = iterator.get_next()
while True:
# Fetch a new batch from the dataset
# NOTE(review): expression missing -- presumably evaluates `next_batch`
# in `sess`; confirm.
batch_videos, batch_labels =
# NOTE(review): assumes every batch holds exactly BATCH_SIZE samples; a
# smaller final batch would make the indexing below fail -- confirm.
for sample_idx in range(BATCH_SIZE):
print("Class label = {}".format(batch_labels[sample_idx]))
for frame_idx in range(SEQ_NUM_FRAMES):
# NOTE(review): frames were normalized to [-0.5, +0.5] by
# preprocess_video (if it is mapped above); check how imshow renders them.
cv2.imshow("image", batch_videos[sample_idx,frame_idx])
# Block until a key is pressed before showing the next frame.
key = cv2.waitKey(0)
# NOTE(review): the body of this branch is not visible here -- presumably
# it exits the viewer when 'q' is pressed; confirm.
if key == ord('q'):
