jackyyeh5111/cupliad_preprocessing.py

## cupliad_preprocessing.py
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Preprocess images and bounding boxes for "classification" not detection.
We perform two sets of operations in preprocessing stage:
(a) operations that are applied to both training and testing data,
(b) operations that are applied only to training data for the purpose of
    data augmentation.
A preprocessing function receives a set of inputs,
e.g. an image, a filename and bounding boxes,
performs an operation on them, and returns them.
Some examples are: randomly cropping the image, randomly mirroring the image,
                   randomly changing the brightness, contrast, hue and
                   randomly jittering the bounding boxes.
The preprocess function receives an image, a filename and bboxes tensors.
The image is a rank 4 tensor: [1, height, width, channels] with
dtype=tf.float32. The groundtruth_boxes is a rank 2 tensor: [N, 4] where
in each row there is a box with [ymin xmin ymax xmax].
Boxes are in normalized coordinates meaning
their coordinate values range in [0, 1]

Important Note: In tensor_dict, images is a rank 4 tensor, but preprocessing
functions receive a rank 3 tensor for processing the image. Thus, inside the
preprocess function we squeeze the image to become a rank 3 tensor and then
we pass it to the functions. At the end of the preprocess we expand the image
back to rank 4.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import sys
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops
import tensorflow as tf

# Shorthand alias for the TF-Slim library that lives under tf.contrib.
slim = tf.contrib.slim

# Presumably per-channel (R, G, B) pixel means on a 0-255 scale, intended for
# mean subtraction -- TODO confirm; they are not referenced in the visible
# part of this file.
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94

# Presumably lower/upper bounds for the shorter image side when resizing --
# TODO confirm against the call sites.
_RESIZE_SIDE_MIN = 256
_RESIZE_SIDE_MAX = 512
# TODO(mttang): This method is needed because the current
# tf.image.rgb_to_grayscale method does not support quantization. Replace with
# tf.image.rgb_to_grayscale after quantization support is added.
def _rgb_to_grayscale(images, name=None):
  """Converts one or more images from RGB to Grayscale.

  Outputs a tensor of the same `DType` and rank as `images`.  The size of the
  last dimension of the output is 1, containing the Grayscale value of the
  pixels.

  Args:
    images: The RGB tensor to convert. Last dimension must have size 3 and
      should contain RGB values.
    name: A name for the operation (optional).

  Returns:
    The converted grayscale image(s).
  """
  with tf.name_scope(name, 'rgb_to_grayscale', [images]) as name:
    images = tf.convert_to_tensor(images, name='images')
    # Remember the original dtype so the result can be converted back.
    orig_dtype = images.dtype
    flt_image = tf.image.convert_image_dtype(images, tf.float32)

    # Luma weights for converting between RGB and grayscale.
    # https://en.wikipedia.org/wiki/Luma_%28video%29
    rgb_weights = [0.2989, 0.5870, 0.1140]
    # Index of the last (channel) axis, computed dynamically so that inputs of
    # any rank are supported.
    channel_axis = tf.expand_dims(tf.rank(images) - 1, 0)
    # `keepdims` is the supported spelling; `keep_dims` is deprecated.
    gray_float = tf.reduce_sum(
        flt_image * rgb_weights, channel_axis, keepdims=True)
    # The reduction axis is dynamic, so restore the static shape by hand.
    gray_float.set_shape(images.get_shape()[:-1].concatenate([1]))
    return tf.image.convert_image_dtype(gray_float, orig_dtype, name=name)


def normalize_image(image, original_minval, original_maxval, target_minval,
                    target_maxval):
  """Linearly remaps pixel values from one value range to another.

  Values in [original_minval, original_maxval] are mapped onto
  [target_minval, target_maxval]; the image is cast to float32 first.

  Args:
    image: image tensor; the file's convention is a rank 3 tensor of shape
      [height, width, channels].
    original_minval: minimum of the current value range.
    original_maxval: maximum of the current value range.
    target_minval: minimum of the desired value range.
    target_maxval: maximum of the desired value range.

  Returns:
    image: float32 tensor with the same shape as the input image.
  """
  with tf.name_scope('NormalizeImage', values=[image]):
    src_lo, src_hi = float(original_minval), float(original_maxval)
    dst_lo, dst_hi = float(target_minval), float(target_maxval)

    # Shift so that the source minimum sits at zero.
    shifted = tf.subtract(tf.cast(image, dtype=tf.float32), src_lo)
    # Stretch/compress to the width of the target range.
    range_ratio = (dst_hi - dst_lo) / (src_hi - src_lo)
    rescaled = tf.multiply(shifted, range_ratio)
    # Shift so that zero lands on the target minimum.
    return tf.add(rescaled, dst_lo)

def _mean_image_subtraction(image, means):
  """Centers a single image by subtracting a per-channel mean.

  For example:
    means = [123.68, 116.779, 103.939]
    image = _mean_image_subtraction(image, means)

  The static rank of `image` must be known.

  Args:
    image: a tensor of size [height, width, C].
    means: a C-vector of values to subtract from each channel.

  Returns:
    the centered image.

  Raises:
    ValueError: If the rank of `image` is unknown or not three, or if the
      number of channels in `image` doesn't match the number of values in
      `means`.
  """
  static_shape = image.get_shape()
  if static_shape.ndims != 3:
    raise ValueError('Input must be of size [height, width, C>0]')
  num_channels = static_shape.as_list()[-1]
  if len(means) != num_channels:
    raise ValueError('len(means) must match the number of channels')

  # Split along the channel axis, shift each channel, then stitch them back.
  per_channel = tf.split(axis=2, num_or_size_splits=num_channels, value=image)
  centered = [channel - mean for channel, mean in zip(per_channel, means)]
  return tf.concat(axis=2, values=centered)


def _mean_images_subtraction(images, means):
  """Centers a batch of images by subtracting a per-channel mean.

  For example:
    means = [123.68, 116.779, 103.939]
    images = _mean_images_subtraction(images, means)

  The static rank of `images` must be known.

  Args:
    images: a tensor of size [batch, height, width, C].
    means: a C-vector of values to subtract from each channel.

  Returns:
    the centered images.

  Raises:
    ValueError: If the rank of `images` is unknown or not four, or if the
      number of channels in `images` doesn't match the number of values in
      `means`.
  """
  static_shape = images.get_shape()
  if static_shape.ndims != 4:
    raise ValueError('Input must be of size [batch, height, width, C>0]')
  num_channels = static_shape.as_list()[-1]
  if len(means) != num_channels:
    raise ValueError('len(means) must match the number of channels')

  # Split along the channel axis, shift each channel, then stitch them back.
  per_channel = tf.split(axis=3, num_or_size_splits=num_channels, value=images)
  centered = [channel - mean for channel, mean in zip(per_channel, means)]
  return tf.concat(axis=3, values=centered)


def random_horizontal_flip(image, seed=None):
  """Mirrors the image left-to-right with probability 0.5.

  Args:
    image: rank 3 float32 tensor with shape [height, width, channels].
    seed: random seed

  Returns:
    image: image which is the same shape as input image.
  """
  # Draw a uniform sample and flip whenever it exceeds 0.5.
  coin = tf.random_uniform([], seed=seed)
  should_flip = tf.greater(coin, 0.5)

  return tf.cond(
      should_flip,
      lambda: tf.image.flip_left_right(image),
      lambda: image)


def random_vertical_flip(image, seed=None):
  """Randomly flips the image vertically (upside down).

  The probability of flipping the image is 50%.

  Args:
    image: rank 3 float32 tensor with shape [height, width, channels].
    seed: random seed

  Returns:
    image: image which is the same shape as input image.
  """

  def _flip_image(image):
    # Mirror the image top-to-bottom.
    image_flipped = tf.image.flip_up_down(image)
    return image_flipped

  # Draw a uniform sample and flip whenever it exceeds 0.5.
  do_a_flip_random = tf.random_uniform([], seed=seed)
  do_a_flip_random = tf.greater(do_a_flip_random, 0.5)
  image = tf.cond(do_a_flip_random, lambda: _flip_image(image),
                  lambda: image)

  # Return the (possibly flipped) image; a bare `return` would hand callers
  # None.
  return image


def random_rotation90(image, seed=None):
  """Rotates the image by 90 degrees with probability 0.5.

  The rotation is done by `tf.image.rot90`. This can be combined with
  random_horizontal_flip and random_vertical_flip to produce an output with a
  uniform distribution of the eight possible 90 degree rotation / reflection
  combinations.

  Args:
    image: rank 3 float32 tensor with shape [height, width, channels].
    seed: random seed

  Returns:
    image: the input image or its rotated version (note that a 90 degree
      rotation swaps height and width for non-square images).
  """
  # Draw a uniform sample and rotate whenever it exceeds 0.5.
  coin = tf.random_uniform([], seed=seed)
  should_rotate = tf.greater(coin, 0.5)

  return tf.cond(
      should_rotate,
      lambda: tf.image.rot90(image),
      lambda: image)


def random_pixel_value_scale(image,
                             minval=0.9,
                             maxval=1.1,
                             seed=None):
  """Multiplies every value of the image by its own random factor.

  An independent factor is drawn uniformly from [minval, maxval) for each
  element of the image tensor; the product is clipped to [0, 255].

  Args:
    image: rank 3 float32 tensor contains 1 image -> [height, width, channels]
           with pixel values varying between [0, 255].
    minval: lower ratio of scaling pixel values.
    maxval: upper ratio of scaling pixel values.
    seed: random seed.

  Returns:
    image: image which is the same shape as input image.
  """
  with tf.name_scope('RandomPixelValueScale', values=[image]):
    # One random gain per tensor element (same shape as the image).
    per_value_gain = tf.random_uniform(
        tf.shape(image), minval=minval, maxval=maxval, dtype=tf.float32,
        seed=seed)
    scaled = tf.multiply(image, per_value_gain)
    clipped = tf.clip_by_value(scaled, 0.0, 255.0)

  return clipped


def _augment_only_rgb_channels(image, augment_function):
  """Applies `augment_function` to the first three channels only.

  Channels from index 3 onwards are passed through untouched and re-attached
  after the augmented RGB slice.

  Args:
    image: tensor indexed as [height, width, channels].
    augment_function: callable taking and returning the RGB slice.

  Returns:
    The augmented RGB channels concatenated with the remaining channels along
    the last axis.
  """
  augmented_rgb = augment_function(image[:, :, :3])
  extra_channels = image[:, :, 3:]
  return tf.concat([augmented_rgb, extra_channels], -1)


def random_adjust_brightness(image,
                             max_delta=0.1,
                             seed=None):
  """Shifts the brightness of the RGB channels by a random amount.

  A single delta is drawn uniformly from [-max_delta, max_delta) and applied
  to the image rescaled to [0, 1]; the result is scaled back and clipped to
  [0, 255].

  Args:
    image: rank 3 float32 tensor contains 1 image -> [height, width, channels]
           with pixel values varying between [0, 255].
    max_delta: how much to change the brightness. A value between [0, 1).
    seed: random seed.

  Returns:
    image: image which is the same shape as input image.
  """
  with tf.name_scope('RandomAdjustBrightness', values=[image]):
    brightness_shift = tf.random_uniform([], -max_delta, max_delta, seed=seed)

    def _shift_brightness(rgb):
      # Work in [0, 1], then return to the [0, 255] scale.
      shifted = tf.image.adjust_brightness(rgb / 255, brightness_shift) * 255
      return tf.clip_by_value(
          shifted, clip_value_min=0.0, clip_value_max=255.0)

    return _augment_only_rgb_channels(image, _shift_brightness)


def random_adjust_contrast(image,
                           min_delta=0.9,
                           max_delta=1.1,
                           seed=None):
  """Scales the contrast of the RGB channels by a random factor.

  A single factor is drawn uniformly from [min_delta, max_delta) and applied
  to the image rescaled to [0, 1]; the result is scaled back and clipped to
  [0, 255].

  Args:
    image: rank 3 float32 tensor contains 1 image -> [height, width, channels]
           with pixel values varying between [0, 255].
    min_delta: lower bound of the contrast factor.
    max_delta: upper bound of the contrast factor. The drawn factor is
               multiplied to the current contrast of the image.
    seed: random seed.

  Returns:
    image: image which is the same shape as input image.
  """
  with tf.name_scope('RandomAdjustContrast', values=[image]):
    factor = tf.random_uniform([], min_delta, max_delta, seed=seed)

    def _scale_contrast(rgb):
      # Work in [0, 1], then return to the [0, 255] scale.
      adjusted = tf.image.adjust_contrast(rgb / 255, factor) * 255
      return tf.clip_by_value(
          adjusted, clip_value_min=0.0, clip_value_max=255.0)

    return _augment_only_rgb_channels(image, _scale_contrast)


def random_adjust_hue(image,
                      max_delta=0.02,
                      seed=None):
  """Rotates the hue of the RGB channels by a random amount.

  A single delta is drawn uniformly from [-max_delta, max_delta) and applied
  to the image rescaled to [0, 1]; the result is scaled back and clipped to
  [0, 255].

  Args:
    image: rank 3 float32 tensor contains 1 image -> [height, width, channels]
           with pixel values varying between [0, 255].
    max_delta: maximum magnitude of the random hue change.
    seed: random seed.

  Returns:
    image: image which is the same shape as input image.
  """
  with tf.name_scope('RandomAdjustHue', values=[image]):
    hue_shift = tf.random_uniform([], -max_delta, max_delta, seed=seed)

    def _shift_hue(rgb):
      # Work in [0, 1], then return to the [0, 255] scale.
      adjusted = tf.image.adjust_hue(rgb / 255, hue_shift) * 255
      return tf.clip_by_value(
          adjusted, clip_value_min=0.0, clip_value_max=255.0)

    return _augment_only_rgb_channels(image, _shift_hue)


def random_adjust_saturation(image,
                             min_delta=0.8,
                             max_delta=1.25,
                             seed=None):
  """Scales the saturation of the RGB channels by a random factor.

  A single factor is drawn uniformly from [min_delta, max_delta) and applied
  to the image rescaled to [0, 1]; the result is scaled back and clipped to
  [0, 255].

  Args:
    image: rank 3 float32 tensor contains 1 image -> [height, width, channels]
           with pixel values varying between [0, 255].
    min_delta: lower bound of the saturation factor.
    max_delta: upper bound of the saturation factor. The drawn factor is
               multiplied to the current saturation of the image.
    seed: random seed.

  Returns:
    image: image which is the same shape as input image.
  """
  with tf.name_scope('RandomAdjustSaturation', values=[image]):
    factor = tf.random_uniform([], min_delta, max_delta, seed=seed)

    def _scale_saturation(rgb):
      # Work in [0, 1], then return to the [0, 255] scale.
      adjusted = tf.image.adjust_saturation(rgb / 255, factor) * 255
      return tf.clip_by_value(
          adjusted, clip_value_min=0.0, clip_value_max=255.0)

    return _augment_only_rgb_channels(image, _scale_saturation)


def random_add_PCA_noise(image,
                         max_delta=0.1,
                         seed=None):
  """Applies a random brightness shift to the RGB channels.

  NOTE(review): despite its name, this function does not compute any PCA
  based noise. It performs a random brightness adjustment under the
  'RandomAdjustBrightness' name scope -- confirm whether real PCA noise was
  intended.

  A single delta is drawn uniformly from [-max_delta, max_delta) and applied
  to the image rescaled to [0, 1]; the result is scaled back and clipped to
  [0, 255].

  Args:
    image: rank 3 float32 tensor contains 1 image -> [height, width, channels]
           with pixel values varying between [0, 255].
    max_delta: how much to change the brightness. A value between [0, 1).
    seed: random seed.

  Returns:
    image: image which is the same shape as input image.
  """
  with tf.name_scope('RandomAdjustBrightness', values=[image]):
    brightness_shift = tf.random_uniform([], -max_delta, max_delta, seed=seed)

    def _shift_brightness(rgb):
      # Work in [0, 1], then return to the [0, 255] scale.
      shifted = tf.image.adjust_brightness(rgb / 255, brightness_shift) * 255
      return tf.clip_by_value(
          shifted, clip_value_min=0.0, clip_value_max=255.0)

    return _augment_only_rgb_channels(image, _shift_brightness)

def random_distort_color(image, color_ordering=0):
  """Randomly distorts color.

  Applies one of five fixed recipes of random color adjustments, selected by
  `color_ordering`:
    0: brightness, saturation, hue, contrast.
    1: brightness, contrast, saturation, hue.
    2: mild brightness and contrast.
    3: mild brightness only.
    4: mild brightness and contrast, followed by random_add_PCA_noise.
  Each step clips its output, so the image stays between 0 and 255.

  Args:
    image: rank 3 float32 tensor contains 1 image -> [height, width, channels]
           with pixel values varying between [0, 255].
    color_ordering: Python int, a type of distortion
      (valid values: 0, 1, 2, 3, 4).

  Returns:
    image: image which is the same shape as input image.

  Raises:
    ValueError: if color_ordering is not in {0, 1, 2, 3, 4}.
  """
  # Validate up front so an invalid ordering fails before any op is created.
  if color_ordering not in (0, 1, 2, 3, 4):
    raise ValueError('color_ordering must be in {0, 1, 2, 3, 4}')

  with tf.name_scope('RandomDistortColor', values=[image]):
    if color_ordering == 0:
      image = random_adjust_brightness(image, max_delta=32. / 255.)
      image = random_adjust_saturation(image, min_delta=0.5, max_delta=1.5)
      image = random_adjust_hue(image, max_delta=0.2)
      image = random_adjust_contrast(image, min_delta=0.5, max_delta=1.5)
    elif color_ordering == 1:
      image = random_adjust_brightness(image, max_delta=32. / 255.)
      image = random_adjust_contrast(image, min_delta=0.5, max_delta=1.5)
      image = random_adjust_saturation(image, min_delta=0.5, max_delta=1.5)
      image = random_adjust_hue(image, max_delta=0.2)
    elif color_ordering == 2:
      image = random_adjust_brightness(image, max_delta=0.1)
      image = random_adjust_contrast(image, min_delta=0.8, max_delta=1.1)
    elif color_ordering == 3:
      image = random_adjust_brightness(image, max_delta=0.1)
    elif color_ordering == 4:
      image = random_adjust_brightness(image, max_delta=0.1)
      image = random_adjust_contrast(image, min_delta=0.8, max_delta=1.1)
      image = random_add_PCA_noise(image, max_delta=0.1)
    return image


def image_to_float(image):
  """Casts image pixel values to float32 (used in Faster R-CNN).

  Args:
    image: input image, e.g. in tf.uint8 or some other format.

  Returns:
    image: image in tf.float32 format.
  """
  with tf.name_scope('ImageToFloat', values=[image]):
    return tf.cast(image, dtype=tf.float32)


def _get_image_info(image):
  """Returns the dynamic (height, width, num_channels) of the image.

  The values are read from the first three entries of `tf.shape(image)`.
  """
  height, width, channels = (tf.shape(image)[axis] for axis in range(3))
  return (height, width, channels)


def _largest_size_at_least(height, width, smallest_side):
  """Computes a new shape whose LARGER side equals `smallest_side`.

  The scale factor is `smallest_side` divided by the larger of `height` and
  `width`, so the original aspect ratio is preserved and the whole image fits
  within a `smallest_side` x `smallest_side` square.

  Args:
    height: an int32 scalar tensor indicating the current height.
    width: an int32 scalar tensor indicating the current width.
    smallest_side: A python integer or scalar `Tensor` indicating the size
      the larger side should have after resize.

  Returns:
    new_height: an int32 scalar tensor indicating the new height.
    new_width: an int32 scalar tensor indicating the new width.
  """
  target = tf.convert_to_tensor(smallest_side, dtype=tf.int32)

  height_f = tf.to_float(height)
  width_f = tf.to_float(width)
  target_f = tf.to_float(target)

  # Divide by whichever side is longer (width is used on ties).
  height_is_longer = tf.greater(height_f, width_f)
  scale = tf.cond(height_is_longer,
                  lambda: target_f / height_f,
                  lambda: target_f / width_f)
  scaled_height = tf.to_int32(tf.rint(height_f * scale))
  scaled_width = tf.to_int32(tf.rint(width_f * scale))
  return scaled_height, scaled_width


def _smallest_size_at_least(height, width, smallest_side):
  """Computes a new shape whose SMALLER side equals `smallest_side`.

  The scale factor is `smallest_side` divided by the smaller of `height` and
  `width`, so the original aspect ratio is preserved.

  Args:
    height: an int32 scalar tensor indicating the current height.
    width: an int32 scalar tensor indicating the current width.
    smallest_side: A python integer or scalar `Tensor` indicating the size of
      the smallest side after resize.

  Returns:
    new_height: an int32 scalar tensor indicating the new height.
    new_width: an int32 scalar tensor indicating the new width.
  """
  target = tf.convert_to_tensor(smallest_side, dtype=tf.int32)

  height_f = tf.to_float(height)
  width_f = tf.to_float(width)
  target_f = tf.to_float(target)

  # Divide by whichever side is shorter (height is used on ties).
  height_is_longer = tf.greater(height_f, width_f)
  scale = tf.cond(height_is_longer,
                  lambda: target_f / width_f,
                  lambda: target_f / height_f)
  scaled_height = tf.to_int32(tf.rint(height_f * scale))
  scaled_width = tf.to_int32(tf.rint(width_f * scale))
  return scaled_height, scaled_width

def _small_aspect_preserving_resize(image, smallest_side):
  """Resizes an image so that its longer side equals `smallest_side`.

  The original aspect ratio is preserved. Because the longer side is the one
  matched to `smallest_side`, the result fits inside a
  `smallest_side` x `smallest_side` square (hence "small" in the name).

  Args:
    image: A 3-D image `Tensor` with 3 channels.
    smallest_side: A python integer or scalar `Tensor` indicating the size
      the longer side should have after resize.

  Returns:
    resized_image: A 3-D tensor containing the resized image.
  """
  smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32)

  shape = tf.shape(image)
  height = shape[0]
  width = shape[1]
  new_height, new_width = _largest_size_at_least(height, width, smallest_side)
  # resize_bilinear works on batches, so add a leading batch dimension.
  image = tf.expand_dims(image, 0)
  resized_image = tf.image.resize_bilinear(image, [new_height, new_width],
                                           align_corners=False)
  # Drop only the batch dimension added above. Squeezing every size-1
  # dimension would also remove a height or width that happens to be 1.
  resized_image = tf.squeeze(resized_image, axis=[0])
  resized_image.set_shape([None, None, 3])
  return resized_image


def _aspect_preserving_resize(image, smallest_side):
  """Resizes an image so that its shorter side equals `smallest_side`.

  The original aspect ratio is preserved.

  Args:
    image: A 3-D image `Tensor` with 3 channels.
    smallest_side: A python integer or scalar `Tensor` indicating the size of
      the smallest side after resize.

  Returns:
    resized_image: A 3-D tensor containing the resized image.
  """
  smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32)

  shape = tf.shape(image)
  height = shape[0]
  width = shape[1]
  new_height, new_width = _smallest_size_at_least(height, width, smallest_side)
  # resize_bilinear works on batches, so add a leading batch dimension.
  image = tf.expand_dims(image, 0)
  resized_image = tf.image.resize_bilinear(image, [new_height, new_width],
                                           align_corners=False)
  # Drop only the batch dimension added above. Squeezing every size-1
  # dimension would also remove a height or width that happens to be 1.
  resized_image = tf.squeeze(resized_image, axis=[0])
  resized_image.set_shape([None, None, 3])
  return resized_image


def subtract_channel_mean(image, means=None):
  """Normalizes an image by subtracting a mean from each channel.

  Args:
    image: A 3D tensor of shape [height, width, channels]
    means: float list containing a mean for each channel. Although the
      default is None, a value must be supplied because its length is
      checked against the number of channels.

  Returns:
    normalized_images: a tensor of shape [height, width, channels]

  Raises:
    ValueError: if image is not a 3D tensor or if the number of means is not
      equal to the number of channels.
  """
  with tf.name_scope('SubtractChannelMean', values=[image, means]):
    static_shape = image.get_shape()
    if len(static_shape) != 3:
      raise ValueError('Input must be of size [height, width, channels]')
    if len(means) != static_shape[-1]:
      raise ValueError('len(means) must match the number of channels')
    # Nest `means` twice so it broadcasts against [height, width, channels].
    broadcastable_means = [[means]]
    return image - broadcastable_means


def one_hot_encoding(labels, num_classes=None):
  """Encodes a set of class labels as a single multi-hot vector.

  Example usage:
    labels = tf.constant([1, 4], dtype=tf.int64)
    one_hot = one_hot_encoding(labels, num_classes=5)
    one_hot.eval()    # evaluates to [0, 1, 0, 0, 1]

  Args:
    labels: A tensor of shape [None] corresponding to the labels. It is
      concatenated with an int64 constant, so it presumably has to be int64
      as well -- TODO confirm.
    num_classes: Number of classes in the dataset.

  Returns:
    onehot_labels: a tensor of shape [num_classes] corresponding to the one hot
      encoding of the labels.

  Raises:
    ValueError: if num_classes is not specified.
  """
  with tf.name_scope('OneHotEncoding', values=[labels]):
    if num_classes is None:
      raise ValueError('num_classes must be specified')

    # Prepend a -1 label so that there is always at least one row to reduce
    # over, even when `labels` is empty (yielding an all-zeros vector).
    sentinel = tf.constant([-1], dtype=tf.int64)
    padded_labels = tf.concat([sentinel, labels], axis=0)
    one_hot_rows = tf.one_hot(padded_labels, num_classes, 1, 0)

    # Collapse the per-label rows into one [num_classes] vector.
    return tf.reduce_max(one_hot_rows, 0)


def rgb_to_gray(image):
  """Converts a 3 channel RGB image to a 1 channel grayscale image.

  Thin public wrapper around `_rgb_to_grayscale`.

  Args:
    image: Rank 3 float32 tensor containing 1 image -> [height, width, 3]
           with pixel values varying between [0, 1].

  Returns:
    image: A single channel grayscale image -> [height, width, 1].
  """
  gray_image = _rgb_to_grayscale(image)
  return gray_image


def convert_class_logits_to_softmax(multiclass_scores, temperature=1.0):
  """Applies a temperature-scaled softmax to multiclass logits.

  Args:
    multiclass_scores: float32 tensor of shape
      [num_instances, num_classes] representing the score for each box for each
      class. Must be stored as logits.
    temperature: Scale factor the logits are divided by prior to applying
      softmax. Larger temperatures give more uniform distributions after
      softmax.

  Returns:
    multiclass_scores: float32 tensor of shape
      [num_instances, num_classes] with scaling and softmax applied.
  """
  scaled_logits = tf.divide(
      multiclass_scores, temperature, name='scale_logits')
  return tf.nn.softmax(scaled_logits, name='softmax')


def center_pad_to_square(image):
  """Pads the image centrally so that it becomes square.

  The side length of the result is the larger of the input height and width.

  Args:
    image: an image of shape [height, width, channels].

  Returns:
    image of shape [new_length, new_length, channels]
  """
  rank = tf.rank(image)
  rank_check = tf.Assert(
      tf.equal(rank, 3),
      ['Wrong rank for tensor  %s [expected] [actual]',
       image.name, 3, rank])

  # Everything below only runs once the rank check has passed.
  with tf.control_dependencies([rank_check]):
    dynamic_shape = tf.shape(image)
    height, width = dynamic_shape[0], dynamic_shape[1]
    side = tf.math.maximum(height, width)
    padded = tf.image.resize_image_with_crop_or_pad(image, side, side)
    square_shape = tf.stack([side, side, dynamic_shape[2]])
  return tf.reshape(padded, square_shape)


def _is_bbox_not_in_crop(bboxes, image_height, image_width,
                         offset_height, offset_width,
                         crop_height, crop_width):
  """Flags the bounding boxes that lie completely outside a crop window.

  Box coordinates are scaled by the image size and compared against the crop
  window [offset, offset + crop size] along each axis.

  Args:
    bboxes:  rank 2 float32 tensor containing the bounding boxes with shape
             [num_instances, 4].
             Boxes are in normalized form meaning their coordinates vary
             between [0, 1].
             Each row is in the form of [ymin, xmin, ymax, xmax].
    image_height: height of the image the boxes are normalized against.
    image_width: width of the image the boxes are normalized against.
    offset_height: a scalar tensor indicating the height offset.
    offset_width: a scalar tensor indicating the width offset.
    crop_height: the height of the cropped image.
    crop_width: the width of the cropped image.

  Returns:
    A rank 1 boolean tensor with one entry per box, True when the box does
    not overlap the crop window.
  """
  img_w = tf.cast(image_width, tf.float32)
  img_h = tf.cast(image_height, tf.float32)
  window_top = tf.cast(offset_height, tf.float32)
  window_left = tf.cast(offset_width, tf.float32)
  window_h = tf.cast(crop_height, tf.float32)
  window_w = tf.cast(crop_width, tf.float32)
  window_bottom = window_top + window_h
  window_right = window_left + window_w

  # Box starts beyond the bottom or right edge of the window.
  starts_below = bboxes[:, 0] * img_h > window_bottom
  starts_right = bboxes[:, 1] * img_w > window_right
  beyond_far_edges = tf.math.logical_or(starts_below, starts_right)

  # Box ends before the top or left edge of the window.
  ends_above = bboxes[:, 2] * img_h < window_top
  ends_left = bboxes[:, 3] * img_w < window_left
  before_near_edges = tf.math.logical_or(ends_above, ends_left)

  outside = tf.math.logical_or(beyond_far_edges, before_near_edges)
  return tf.reshape(outside, [-1])


def _crop(image, bboxes, labels, offset_height, offset_width,
          crop_height, crop_width):
  """Crops the given image using the provided offsets and sizes.

  Labels whose bounding box lies entirely outside the cropped window are
  discarded. The bounding boxes themselves are not returned.

  Note that the method doesn't assume we know the input image size but it does
  assume we know the input image rank.

  Args:
    image: an image of shape [height, width, channels].
    bboxes:  rank 2 float32 tensor containing the bounding boxes with shape
             [num_instances, 4].
             Boxes are in normalized form meaning their coordinates vary
             between [0, 1].
             Each row is in the form of [ymin, xmin, ymax, xmax].
    labels: tensor with one entry per bounding box along dimension 0.
    offset_height: a scalar tensor indicating the height offset.
    offset_width: a scalar tensor indicating the width offset.
    crop_height: the height of the cropped image.
    crop_width: the width of the cropped image.

  Returns:
    the cropped image and the labels of the boxes that are not entirely
    outside the crop.

  Raises:
    InvalidArgumentError: (at graph run time) if the rank is not 3, if the
      image dimensions are less than the crop size, or if `bboxes` and
      `labels` differ in their first dimension.
  """
  original_shape = tf.shape(image)
  image_height = original_shape[0]
  image_width = original_shape[1]

  rank_assertion = tf.Assert(
      tf.equal(tf.rank(image), 3),
      ['Rank of image must be equal to 3.'])
  with tf.control_dependencies([rank_assertion]):
    cropped_shape = tf.stack([crop_height, crop_width, original_shape[2]])

  size_assertion = tf.Assert(
      tf.logical_and(
          tf.greater_equal(image_height, crop_height),
          tf.greater_equal(image_width, crop_width)),
      ['Crop size greater than the image size.'])

  offsets = tf.to_int32(tf.stack([offset_height, offset_width, 0]))

  dimension_assertion = tf.Assert(
      tf.equal(tf.shape(bboxes)[0], tf.shape(labels)[0]),
      ['Dimension 0 of bboxes and labels should not be different.'])

  def _labels_inside_crop():
    # Keep only the labels whose box is not entirely outside the crop.
    outside_crop = _is_bbox_not_in_crop(
        bboxes, image_height, image_width,
        offset_height, offset_width, crop_height, crop_width)
    return tf.boolean_mask(labels, tf.math.logical_not(outside_crop))

  # Use tf.slice instead of crop_to_bounding box as it accepts tensors to
  # define the crop size.
  with tf.control_dependencies([size_assertion, dimension_assertion]):
    cropped_image = tf.slice(image, offsets, cropped_shape)
    # With no boxes at all (a sample without annotations) there is nothing to
    # prune, so the labels pass through unchanged.
    kept_labels = tf.cond(tf.equal(tf.size(bboxes), 0),
                          lambda: labels,
                          _labels_inside_crop)

  return tf.reshape(cropped_image, cropped_shape), kept_labels


def _random_crop(image, bboxes, labels, min_crop_ratio=0.999999):
  """Randomly crops one image and forwards bboxes/labels to `_crop`.

  The crop height and width are drawn independently and uniformly from
  [int(side_length * min_crop_ratio), side_length], both ends included. A
  random offset that keeps the crop window inside the image is then drawn and
  the actual slicing (and label pruning) is delegated to `_crop`.

  Args:
    image: an image tensor; asserted to be rank 3 at run time.
    bboxes: bounding boxes of the image, passed through to `_crop`.
    labels: labels of the bounding boxes, passed through to `_crop`.
    min_crop_ratio: python number, the ratio of
      minimum_crop_side_length/side_length. A value of 1.0 keeps the whole
      image.

  Returns:
    The result of `_crop`: the cropped image and the labels (the label of a
    bbox which is not in the cropped image is pruned out).

  Raises:
    InvalidArgumentError: at run time, if the rank of `image` is not 3.
  """
  with tf.name_scope('RandomCrop', values=[image, bboxes, labels]):
    image_rank = tf.rank(image)
    rank_assert = tf.Assert(
        tf.equal(image_rank, 3),
        ['Wrong rank for tensor  %s [expected] [actual]',
         image.name, 3, image_rank])
    asserts = [rank_assert]

    with tf.control_dependencies(asserts):
      image_shape = tf.shape(image)
    image_height = image_shape[0]
    image_width = image_shape[1]

    # Use tf.random_uniform and not numpy.random.rand as doing the former would
    # generate random numbers at graph eval time, unlike the latter which
    # generates random numbers at graph definition time.
    #
    # An explicit float32 dtype lets callers pass a python int (e.g. 1) too.
    min_crop_ratio = tf.constant(min_crop_ratio, dtype=tf.float32)

    def _random_side_length(side_length):
      """Draws a crop side length in [min_side, side_length], inclusive."""
      min_side = tf.cast(
          tf.cast(side_length, dtype=tf.float32) * min_crop_ratio,
          dtype=tf.int32)
      # For integer dtypes `maxval` is exclusive. Without the +1 the full side
      # length could never be drawn and min_crop_ratio=1.0 would request the
      # empty range [side_length, side_length), which fails at run time.
      return tf.random_uniform(
          [], minval=min_side, maxval=side_length + 1, dtype=tf.int32)

    crop_height = _random_side_length(image_height)
    crop_width = _random_side_length(image_width)

    # Largest valid offset is (side - crop); +1 because maxval is exclusive.
    with tf.control_dependencies(asserts):
      max_offset_height = tf.reshape(image_height - crop_height + 1, [])
      max_offset_width = tf.reshape(image_width - crop_width + 1, [])
    offset_height = tf.random_uniform(
        [], maxval=max_offset_height, dtype=tf.int32)
    offset_width = tf.random_uniform(
        [], maxval=max_offset_width, dtype=tf.int32)

    return _crop(image, bboxes, labels, offset_height, offset_width,
                 crop_height, crop_width)


def _get_camera_int_id(filename):
  """Maps the camera token of a filename to an int32 camera group id.

  The filename is split on '--' and the token at index 3 is compared against
  the camera names 'Camera0' ... 'Camera10'. Camera0, Camera1 and Camera2 map
  to 0, 1 and 2; Camera3 to Camera6 all map to 3; Camera7 to Camera10 all map
  to 4.

  No default branch is given to `tf.case` (`default=None`).
  NOTE(review): presumably a filename with any other camera token fails at
  run time -- confirm against the `tf.case` documentation.

  Args:
    filename: tf.string tensor with shape (), which represents filename.

  Returns:
    camera_int_id: tf.int32 tensor with shape ().
  """
  # (camera token, group id) pairs, in the order the predicates are created.
  camera_to_group = (
      (b'Camera0', 0),
      (b'Camera1', 1),
      (b'Camera2', 2),
      (b'Camera3', 3),
      (b'Camera4', 3),
      (b'Camera5', 3),
      (b'Camera6', 3),
      (b'Camera7', 4),
      (b'Camera8', 4),
      (b'Camera9', 4),
      (b'Camera10', 4),
  )

  def _make_branch(group_id):
    # A factory gives every branch its own bound `group_id`.
    return lambda: tf.constant(
        group_id, dtype=tf.int32, name='camera_mask_id')

  split_name = tf.strings.split(tf.expand_dims(filename, 0), sep='--')
  camera_token = split_name.values[3]

  branches = {}
  for camera_name, group_id in camera_to_group:
    branches[tf.equal(camera_token, camera_name)] = _make_branch(group_id)

  return tf.case(branches, default=None, exclusive=True)

def _mix_up_augmentation(images, labels, filenames, seed=None):
  """Applies equal-weight mix-up to the Camera1 samples of a batch.

  Samples whose camera id (see `_get_camera_int_id`) equals 1 are selected.
  When more than one such sample exists, each of them is, with probability
  0.5, replaced by the 0.5/0.5 average of itself and a randomly chosen sample
  from the same selection; its label is averaged the same way. All other
  samples are left untouched.

  Note that the returned batch is reordered: the Camera1 samples come first,
  followed by the remaining samples. Images and labels are reordered together,
  but they no longer line up with `filenames`.

  Args:
    images: rank 4 float32 tensor contains
            N image -> [N, height, width, 3].
            with pixel values varying between [-128, 128]
    labels: rank 2 float32 tensor containing
            the (multilabel) onehot labels -> [N, num_classes].
    filenames: rank1 string tensor contains N string.
    seed: optional python integer forwarded to the random ops so the
          augmentation can be made reproducible.
  Return:
    the preprocessed images and labels, with the same shapes as the inputs.
  """
  def _mix_up(images, labels, seed=None):
    num_img = tf.shape(images)[0]
    do_mix = tf.greater(tf.random_uniform([num_img], seed=seed), 0.5)
    # Random partner for every sample. The seed is forwarded here as well;
    # otherwise the shuffle would stay non-deterministic even with a seed.
    rand_idx = tf.reshape(
        tf.random_shuffle(tf.range(num_img), seed=seed), [num_img, 1])
    shuffled_imgs = tf.gather_nd(images, rand_idx)
    shuffled_labs = tf.gather_nd(labels, rand_idx)
    mixed_imgs = 0.5 * images + 0.5 * shuffled_imgs
    mixed_labs = 0.5 * labels + 0.5 * shuffled_labs
    # `do_mix` is a per-sample mask that selects whole images / label rows.
    images = tf.where(do_mix, mixed_imgs, images)
    labels = tf.where(do_mix, mixed_labs, labels)

    return images, labels

  camera_ids = tf.map_fn(_get_camera_int_id, filenames,
                         dtype=tf.int32, back_prop=False)

  tobe_mixed_idxs = tf.equal(camera_ids, 1)
  notbe_mixed_idxs = tf.math.logical_not(tobe_mixed_idxs)
  tobe_mixed_imgs = tf.boolean_mask(images, tobe_mixed_idxs)
  tobe_mixed_labs = tf.boolean_mask(labels, tobe_mixed_idxs)
  notbe_mixed_imgs = tf.boolean_mask(images, notbe_mixed_idxs)
  notbe_mixed_labs = tf.boolean_mask(labels, notbe_mixed_idxs)

  # Mixing needs at least two candidates; otherwise pass them through.
  mixed_images, mixed_labels = tf.cond(
      tf.greater(tf.shape(tobe_mixed_imgs)[0], tf.constant(1, dtype=tf.int32)),
      # The caller's seed used to be dropped here, making `seed` a no-op.
      lambda: _mix_up(tobe_mixed_imgs, tobe_mixed_labs, seed=seed),
      lambda: (tobe_mixed_imgs, tobe_mixed_labs)
  )

  images = tf.reshape(
      tf.concat([mixed_images, notbe_mixed_imgs], axis=0), tf.shape(images))
  labels = tf.reshape(
      tf.concat([mixed_labels, notbe_mixed_labs], axis=0), tf.shape(labels))

  return images, labels
def _square_image_preprocess(image, filename, resize_side):
  """Square image specific preprocessor.

  The image is resized with `_small_aspect_preserving_resize`, padded with
  `center_pad_to_square`, and then, only when the camera id derived from
  `filename` is <= 2, passed through `random_rotation90` followed by
  `random_horizontal_flip`.

  For resize function, we apply the function used in research/slim.
  align_corners are set to false. However, in TFOD, align_corners is set to
  true.

  Args:
    image: an image of shape [height, width, channels].
    filename: tf.string tensor with shape (), which represents filename.
    resize_side: a python int indicating the resized side length of the image.
  Return:
    image: Image shape will be [resize_side, resize_side, 3]
  """
  with tf.name_scope('SquareImagePreprocess',
                     values=[image, filename, resize_side]):
    # resize
    image = _small_aspect_preserving_resize(image, resize_side)
    # pad to square
    image = center_pad_to_square(image)
    # Rotate/flip according to the filename. `_get_camera_int_id` does the
    # filename parsing itself, so no separate token split is needed here.
    camera_int_id = _get_camera_int_id(filename)
    image = tf.cond(
        camera_int_id <= tf.constant(2, dtype=tf.int32),
        lambda: random_horizontal_flip(random_rotation90(image)),
        lambda: image)
    return image

def _non_square_image_preprocess(image, output_height, output_width):
  """Non-square image specific preprocessor.

  Bilinearly resizes the image to exactly [output_height, output_width]
  without preserving the aspect ratio.

  For resize function, we apply the function used in research/slim.
  align_corners are set to false. However, in TFOD, align_corners is set to
  true.
  In research/slim, the resize function is aspect ratio preserving. But we
  haven't considered this case yet.

  Args:
    image: an image of shape [height, width, channels].
    output_height: a python int indicating the height of the resized image.
    output_width: a python int indicating the width of the resized image.
  Return:
    image: Image shape will be [output_height, output_width, channels]
  """
  with tf.name_scope('NonSquareImagePreprocess', values=[image]):
    # resize_bilinear works on batches, so add a batch axis of size 1.
    image = tf.expand_dims(image, 0)
    image = tf.image.resize_bilinear(image, [output_height, output_width],
                                     align_corners=False)
    # Remove only that batch axis. A bare tf.squeeze would also drop any real
    # dimension of size 1 (e.g. a 1-pixel side or a single channel).
    image = tf.squeeze(image, axis=[0])
    return image

def _label_hacking(onehot_label, filename):
  """Experimental hook for rewriting a label; currently a pass-through.

  Both label rewriting experiments kept below are disabled, so the label is
  returned unchanged and `filename` is only used as a name scope input.

  Args:
    onehot_label: A `Tensor` of [num_classes] representing the one hot
      multi-label.
    filename: A `Tensor` string.

  Returns:
    The (currently unmodified) onehot label.
  """
  with tf.name_scope('LabelHacking', values=[onehot_label, filename]):
    # Disabled experiment 1: modify the label to force rough and thrumark to
    # appear at the same time, so these two classes become the same.
    #
    #   max_val = tf.reduce_max(onehot_label[-2:],keepdims=True)
    #   onehot_label = tf.concat([onehot_label[:-2], max_val, max_val],axis=0)
    #
    # Disabled experiment 2: modify the label to force break under normal
    # light to be treated as stain, if cam_id < 2 or cam_id > 3.
    #
    #   def _break_to_stain(onehot_label):
    #     stain_label = tf.reduce_max(onehot_label[1:3],keepdims=True)
    #     break_label = tf.constant([0])
    #     dummy_label = tf.constant([0])
    #     onehot_label=tf.concat(
    #       [dummy_label, break_label,stain_label, onehot_label[3:]],axis=0)
    #     return onehot_label
    #
    #   cam_id = _get_camera_int_id(filename)
    #   onehot_label = tf.cond(tf.logical_or(cam_id < 2, cam_id > 3),
    #     lambda: _break_to_stain(onehot_label),
    #     lambda: onehot_label)
    hacked_label = onehot_label
  return hacked_label

def preprocess_for_train_easy(image, bboxes, labels, filename, num_classes,
                         output_height, output_width, use_PCA_noise):
  """Preprocesses the given image for training with light augmentation.

  The image is resized to the output size, randomly flipped left/right and
  mean-subtracted. When `output_height == output_width` the resize goes
  through `_small_aspect_preserving_resize` + `center_pad_to_square`;
  otherwise the image is bilinearly resized to
  [output_height, output_width] directly.

  Args:
    image: A `Tensor` representing an image of arbitrary size.
    bboxes: bounding boxes of the image. Currently unused.
    labels: labels of the image, encoded with `one_hot_encoding`.
    filename: filename tensor of the image. Currently unused.
    num_classes: number of classes passed to `one_hot_encoding`.
    output_height: python int, the height of the image after preprocessing.
    output_width: python int, the width of the image after preprocessing.
    use_PCA_noise: currently unused.
      NOTE(review): presumably meant to switch on `add_PCA_noise`, but it is
      not wired in -- confirm the intended behavior before enabling it.

  Returns:
    A tuple (image, label): the preprocessed image of shape
    [output_height, output_width, 3] and its one hot encoded label.
  """
  image = tf.cond(
      tf.constant(output_height == output_width, dtype=tf.bool),
      lambda: center_pad_to_square(
          _small_aspect_preserving_resize(image, output_height)),
      # Squeeze only the batch axis added by expand_dims. A bare tf.squeeze
      # would also drop any real dimension of size 1.
      lambda: tf.squeeze(
          tf.image.resize_bilinear(tf.expand_dims(image, 0),
                                   [output_height, output_width],
                                   align_corners=False),
          axis=[0])
  )
  image.set_shape([output_height, output_width, 3])
  image = tf.to_float(image)
  label = one_hot_encoding(labels, num_classes=num_classes)
  image = tf.image.random_flip_left_right(image)

  image = _mean_image_subtraction(image, [_R_MEAN, _G_MEAN, _B_MEAN])

  return image, label

def add_PCA_noise(image, mean=0.0, stddev=0.1):
  """Adds colour noise derived from the SVD of the image's channel covariance.

  The pixels are flattened to rows of 3 channels, standardised per channel,
  and the 3x3 covariance of the standardised pixels is decomposed with SVD.
  Gaussian noise is scaled by the singular values, multiplied by `U`, added
  to the original pixels and the result is clipped to [0, 255].

  NOTE(review): the noise is drawn per pixel and multiplied by `U` as-is
  (not `U` transposed). Classic PCA colour jitter draws one 3-vector per
  image and treats the columns of `U` as eigenvectors. The math is kept
  unchanged here to preserve the existing augmentation -- confirm intent.

  Args:
    image: an image tensor whose last dimension holds 3 channels.
    mean: python float, mean of the gaussian noise.
    stddev: python float, standard deviation of the gaussian noise.

  Returns:
    A float32 tensor with the same shape as `image`.
  """

  def tf_cov(x):
    # ref: https://stackoverflow.com/questions/47709854/how-to-get-covariance-matrix-in-tensorflow?rq=1
    mean_x = tf.reduce_mean(x, axis=0, keepdims=True)
    mx = tf.matmul(tf.transpose(mean_x), mean_x)
    vx = tf.matmul(tf.transpose(x), x)/tf.cast(tf.shape(x)[0], tf.float32)
    cov_xx = vx - mx
    return cov_xx

  with tf.name_scope('AddPCANoise', values=[image]):
    # Remember the input shape so the flattened pixels can be restored.
    original_shape = tf.shape(image)
    pixels = tf.cast(tf.reshape(image, [-1, 3]), tf.float32)

    # Guard against a constant channel: a zero std would turn the whole
    # image into NaN through the covariance and the SVD.
    channel_std = tf.maximum(tf.keras.backend.std(pixels, axis=0),
                             tf.keras.backend.epsilon())
    renorm_pixels = (pixels - tf.reduce_mean(pixels, axis=0)) / channel_std

    cov_matrix = tf_cov(renorm_pixels) # covariance matrix
    S, U, V = tf.linalg.svd(cov_matrix) # eigen value(S), eigen vector(U)

    rand = tf.random_normal(tf.shape(pixels), mean=mean, stddev=stddev)
    delta = tf.matmul(rand*tf.expand_dims(S, axis=0), U)

    noisy = tf.clip_by_value(pixels + delta,
                             clip_value_min=0.0, clip_value_max=255.0)

    # Return an image, not the flattened [num_pixels, 3] working buffer.
    return tf.reshape(noisy, original_shape)


def preprocess_for_train(image, bboxes, labels, filename, num_classes,
                         output_height, output_width):
  """Preprocesses the given image for training with full augmentation.

  Pipeline: random crop (which also prunes labels) -> one hot encoding ->
  `_label_hacking` -> square / non-square resize -> float conversion ->
  `random_distort_color` -> mean subtraction.

  Args:
    image: A `Tensor` representing an image of arbitrary size.
    bboxes: bounding boxes of the image, consumed by `_random_crop`.
    labels: labels of the bounding boxes, consumed by `_random_crop`.
    filename: filename tensor of the image.
    num_classes: number of classes passed to `one_hot_encoding`.
    output_height: The height of the image after preprocessing.
    output_width: The width of the image after preprocessing.

  Returns:
    A tuple (image, label): the preprocessed image and its one hot label.
  """
  # Random crop first, so the label only reflects what survives the crop.
  cropped_image, kept_labels = _random_crop(image, bboxes, labels)
  onehot = _label_hacking(
      one_hot_encoding(kept_labels, num_classes=num_classes), filename)

  # Resize, rotate, flip according to the characteristic of the image. Square
  # outputs get the cameraID dependent rotation; others are only resized.
  def _square_branch():
    return _square_image_preprocess(cropped_image, filename, output_height)

  def _non_square_branch():
    return _non_square_image_preprocess(
        cropped_image, output_height, output_width)

  wants_square = tf.constant(output_height == output_width, dtype=tf.bool)
  resized = tf.cond(wants_square, _square_branch, _non_square_branch)

  # Adjust color, then pin the static shape.
  distorted = random_distort_color(tf.to_float(resized), color_ordering=2)
  distorted.set_shape([output_height, output_width, 3])

  centered = _mean_image_subtraction(distorted, [_R_MEAN, _G_MEAN, _B_MEAN])
  return centered, onehot

def preprocess_for_eval(image, bboxes, labels, filename, num_classes,
                        output_height, output_width):
  """Preprocesses the given image for evaluation (no random augmentation).

  The image is resized to the output size and mean-subtracted. When
  `output_height == output_width` the resize goes through
  `_small_aspect_preserving_resize` + `center_pad_to_square`; otherwise the
  image is bilinearly resized to [output_height, output_width] directly.

  Args:
    image: A `Tensor` representing an image of arbitrary size.
    bboxes: bounding boxes of the image. Currently unused.
    labels: labels of the image, encoded with `one_hot_encoding`.
    filename: filename tensor of the image, passed to `_label_hacking`.
    num_classes: number of classes passed to `one_hot_encoding`.
    output_height: python int, the height of the image after preprocessing.
    output_width: python int, the width of the image after preprocessing.

  Returns:
    A tuple (image, label): the preprocessed image of shape
    [output_height, output_width, 3] and its one hot encoded label.
  """
  image = tf.cond(
      tf.constant(output_height == output_width, dtype=tf.bool),
      lambda: center_pad_to_square(
          _small_aspect_preserving_resize(image, output_height)),
      # Squeeze only the batch axis added by expand_dims. A bare tf.squeeze
      # would also drop any real dimension of size 1.
      lambda: tf.squeeze(
          tf.image.resize_bilinear(tf.expand_dims(image, 0),
                                   [output_height, output_width],
                                   align_corners=False),
          axis=[0])
  )
  image.set_shape([output_height, output_width, 3])
  image = tf.to_float(image)
  label = one_hot_encoding(labels, num_classes=num_classes)
  label = _label_hacking(label, filename)
  return _mean_image_subtraction(image, [_R_MEAN, _G_MEAN, _B_MEAN]), label

def preprocess_for_freezing(image):
  """Mean-subtracts `image`; used by `preprocess_image` when freezing.

  No resizing, label handling or augmentation happens here.

  NOTE(review): this calls `_mean_images_subtraction` (plural), while the
  train/eval paths call `_mean_image_subtraction`. Presumably a batched
  variant defined elsewhere in this file -- confirm it exists.

  Args:
    image: the image tensor to normalise.

  Returns:
    The result of `_mean_images_subtraction` (image only, no label).
  """
  channel_means = [_R_MEAN, _G_MEAN, _B_MEAN]
  return _mean_images_subtraction(image, channel_means)

def preprocess_image(image, output_height, output_width,
                     bboxes, labels, filename, num_classes,
                     use_more_augmentation=True, is_training=False,
                     is_freezing=False, use_PCA_noise=False):
  """Dispatches to the preprocessing routine matching the requested mode.

  Mode selection:
    * is_training and use_more_augmentation -> `preprocess_for_train`
    * is_training only                      -> `preprocess_for_train_easy`
    * not is_training and is_freezing       -> `preprocess_for_freezing`
    * otherwise                             -> `preprocess_for_eval`

  Args:
    image: rank 3 tensor containing 1 image -> [height, width, 3].
    output_height: The height of the image after preprocessing.
    output_width: The width of the image after preprocessing.
    bboxes: rank 2 float32 tensor containing
            the bounding boxes -> [N, 4].
            Boxes are in normalized form meaning
            their coordinates vary between [0, 1].
            Each row is in the form
            of [ymin, xmin, ymax, xmax].
    labels: labels of the bounding boxes.
    filename: filename tensor of the image.
    num_classes: number of classes used for the one hot encoding.
    use_more_augmentation: python bool, selects the heavier training
      pipeline. Only consulted when `is_training` is true.
    is_training: `True` if we're preprocessing the image for training and
      `False` otherwise.
    is_freezing: python bool, selects the freezing pipeline. Only consulted
      when `is_training` is false.
    use_PCA_noise: forwarded to `preprocess_for_train_easy` only.

  Returns:
    A tuple (image, label) for the training and evaluation modes; the
    freezing mode returns the preprocessed image only.
  """
  # TODO check if image is rank 4 or 3

  if is_training and use_more_augmentation:
    return preprocess_for_train(image, bboxes, labels, filename, num_classes,
                                output_height, output_width)

  if is_training:
    return preprocess_for_train_easy(image, bboxes, labels, filename,
                                     num_classes, output_height, output_width,
                                     use_PCA_noise)

  if is_freezing:
    return preprocess_for_freezing(image)

  return preprocess_for_eval(image, bboxes, labels, filename, num_classes,
                             output_height, output_width)

def batch_preprocess_fn(images, labels, filenames, is_training=True):
  """Batch level preprocessing: mix-up augmentation during training only.

  Args:
    images: rank 4 float32 tensor contains
            N image -> [N, height, width, 3].
            with pixel values varying between [-128, 128]
    labels: rank 2 float32 tensor containing
            the (multilabel) onehot labels -> [N, num_classes].
    filenames: rank1 string tensor contains N string.
    is_training: python bool. When false the batch is returned untouched,
            so evaluation data is never mixed.
  Return:
    the preprocessed images and labels
  """
  # Mix-up is a training-time augmentation. `is_training` used to be ignored,
  # which applied it (and its batch reordering) to evaluation batches too.
  if not is_training:
    return images, labels

  images, labels = _mix_up_augmentation(images, labels, filenames)
  return images, labels