@jiangzhongbo
Forked from ogl4jo3/blaze_face_detect.py
Created April 27, 2020 11:35
Demo BlazeFace model.
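The script below generates the SSD anchors for the model, runs face_detection_front.tflite (link in the code) through the TFLite interpreter on webcam frames, decodes the raw regressor/classificator outputs into face boxes, applies greedy NMS, and draws the surviving boxes and scores with OpenCV. It needs opencv-python, numpy, and TensorFlow; press ESC to quit.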
import cv2
import time
import math
import numpy as np
import tensorflow as tf

class SsdAnchorsCalculatorOptions:
    def __init__(self, input_size_width, input_size_height, min_scale, max_scale,
                 num_layers, feature_map_width, feature_map_height,
                 strides, aspect_ratios, anchor_offset_x=0.5, anchor_offset_y=0.5,
                 reduce_boxes_in_lowest_layer=False, interpolated_scale_aspect_ratio=1.0,
                 fixed_anchor_size=False):
        # Size of input images.
        self.input_size_width = input_size_width
        self.input_size_height = input_size_height
        # Min and max scales for generating anchor boxes on feature maps.
        self.min_scale = min_scale
        self.max_scale = max_scale
        # The offset for the center of anchors. The value is in the scale of stride.
        # E.g. 0.5 meaning 0.5 * |current_stride| in pixels.
        self.anchor_offset_x = anchor_offset_x
        self.anchor_offset_y = anchor_offset_y
        # Number of output feature maps to generate the anchors on.
        self.num_layers = num_layers
        # Sizes of output feature maps to create anchors. Either feature_map size or
        # stride should be provided.
        self.feature_map_width = feature_map_width
        self.feature_map_height = feature_map_height
        self.feature_map_width_size = len(feature_map_width)
        self.feature_map_height_size = len(feature_map_height)
        # Strides of each output feature maps.
        self.strides = strides
        self.strides_size = len(strides)
        # List of different aspect ratios to generate anchors.
        self.aspect_ratios = aspect_ratios
        self.aspect_ratios_size = len(aspect_ratios)
        # A boolean to indicate whether the fixed 3 boxes per location is used in the lowest layer.
        self.reduce_boxes_in_lowest_layer = reduce_boxes_in_lowest_layer
        # An additional anchor is added with this aspect ratio and a scale
        # interpolated between the scale for a layer and the scale for the next layer
        # (1.0 for the last layer). This anchor is not included if this value is 0.
        self.interpolated_scale_aspect_ratio = interpolated_scale_aspect_ratio
        # Whether to use fixed width and height (e.g. both 1.0) for each anchor.
        # This option can be used when the predicted anchor width and height are in pixels.
        self.fixed_anchor_size = fixed_anchor_size

    def to_string(self):
        return ('input_size_width: {:}\ninput_size_height: {:}\nmin_scale: {:}\nmax_scale: {:}\n'
                'anchor_offset_x: {:}\nanchor_offset_y: {:}\nnum_layers: {:}\n'
                'feature_map_width: {:}\nfeature_map_height: {:}\nstrides: {:}\naspect_ratios: {:}\n'
                'reduce_boxes_in_lowest_layer: {:}\ninterpolated_scale_aspect_ratio: {:}\n'
                'fixed_anchor_size: {:}').format(
            self.input_size_width, self.input_size_height, self.min_scale, self.max_scale,
            self.anchor_offset_x, self.anchor_offset_y, self.num_layers,
            self.feature_map_width, self.feature_map_height, self.strides, self.aspect_ratios,
            self.reduce_boxes_in_lowest_layer, self.interpolated_scale_aspect_ratio,
            self.fixed_anchor_size)

class Anchor:
    def __init__(self, x_center, y_center, h, w):
        self.x_center = x_center
        self.y_center = y_center
        self.h = h
        self.w = w

    def to_string(self):
        return 'x_center: {:}, y_center: {:}, h: {:}, w: {:}'.format(self.x_center, self.y_center, self.h, self.w)

class Detection:
    def __init__(self, score, class_id, xmin, ymin, width, height):
        self.score = score
        self.class_id = class_id
        self.xmin = xmin
        self.ymin = ymin
        self.width = width
        self.height = height

    def to_string(self):
        return 'score: {:}, class_id: {:}, xmin: {:}, ymin: {:}, width: {:}, height: {:}'.format(
            self.score, self.class_id, self.xmin, self.ymin, self.width, self.height)

class TfLiteTensorsToDetectionsCalculatorOptions:
    def __init__(self, num_classes, num_boxes, num_coords, keypoint_coord_offset,
                 ignore_classes, score_clipping_thresh, min_score_thresh,
                 num_keypoints=0, num_values_per_keypoint=2, box_coord_offset=0,
                 x_scale=0.0, y_scale=0.0, w_scale=0.0, h_scale=0.0, apply_exponential_on_box_size=False,
                 reverse_output_order=False, sigmoid_score=False, flip_vertically=False):
        # The number of output classes predicted by the detection model.
        self.num_classes = num_classes
        # The number of output boxes predicted by the detection model.
        self.num_boxes = num_boxes
        # The number of output values per box predicted by the detection model. The
        # values contain bounding boxes, keypoints, etc.
        self.num_coords = num_coords
        # The offset of keypoint coordinates in the location tensor.
        self.keypoint_coord_offset = keypoint_coord_offset
        # The number of predicted keypoints.
        self.num_keypoints = num_keypoints
        # The dimension of each keypoint, e.g. number of values predicted for each keypoint.
        self.num_values_per_keypoint = num_values_per_keypoint
        # The offset of box coordinates in the location tensor.
        self.box_coord_offset = box_coord_offset
        # Parameters for decoding SSD detection model.
        self.x_scale = x_scale
        self.y_scale = y_scale
        self.w_scale = w_scale
        self.h_scale = h_scale
        self.apply_exponential_on_box_size = apply_exponential_on_box_size
        # Whether to reverse the order of predicted x, y from output.
        # If false, the order is [y_center, x_center, h, w], if true the order is
        # [x_center, y_center, w, h].
        self.reverse_output_order = reverse_output_order
        # The ids of classes that should be ignored while decoding the score for
        # each predicted box.
        self.ignore_classes = ignore_classes
        self.sigmoid_score = sigmoid_score
        self.score_clipping_thresh = score_clipping_thresh
        # Whether the detection coordinates from the input tensors should be flipped
        # vertically (along the y-direction). This is useful, for example, when the
        # input tensors represent detections defined with a coordinate system where
        # the origin is at the top-left corner, whereas the desired detection
        # representation has a bottom-left origin (e.g., in OpenGL).
        self.flip_vertically = flip_vertically
        # Score threshold for preserving decoded detections.
        self.min_score_thresh = min_score_thresh

    def to_string(self):
        return ('num_classes: {:}\nnum_boxes: {:}\nnum_coords: {:}\nkeypoint_coord_offset: {:}\n'
                'num_keypoints: {:}\nnum_values_per_keypoint: {:}\nbox_coord_offset: {:}\n'
                'x_scale: {:}\ny_scale: {:}\nw_scale: {:}\nh_scale: {:}\n'
                'apply_exponential_on_box_size: {:}\nreverse_output_order: {:}\nignore_classes: {:}\n'
                'sigmoid_score: {:}\nscore_clipping_thresh: {:}\nflip_vertically: {:}\nmin_score_thresh: {:}').format(
            self.num_classes, self.num_boxes, self.num_coords, self.keypoint_coord_offset,
            self.num_keypoints, self.num_values_per_keypoint, self.box_coord_offset,
            self.x_scale, self.y_scale, self.w_scale, self.h_scale,
            self.apply_exponential_on_box_size, self.reverse_output_order,
            self.ignore_classes, self.sigmoid_score, self.score_clipping_thresh,
            self.flip_vertically, self.min_score_thresh)

def DecodeBoxes(raw_boxes, anchors, options):
    boxes = np.zeros(options.num_boxes * options.num_coords)
    for i in range(options.num_boxes):
        box_offset = i * options.num_coords + options.box_coord_offset
        y_center = raw_boxes[box_offset]
        x_center = raw_boxes[box_offset + 1]
        h = raw_boxes[box_offset + 2]
        w = raw_boxes[box_offset + 3]
        if options.reverse_output_order:
            x_center = raw_boxes[box_offset]
            y_center = raw_boxes[box_offset + 1]
            w = raw_boxes[box_offset + 2]
            h = raw_boxes[box_offset + 3]
        x_center = x_center / options.x_scale * anchors[i].w + anchors[i].x_center
        y_center = y_center / options.y_scale * anchors[i].h + anchors[i].y_center
        if options.apply_exponential_on_box_size:
            h = np.exp(h / options.h_scale) * anchors[i].h
            w = np.exp(w / options.w_scale) * anchors[i].w
        else:
            h = h / options.h_scale * anchors[i].h
            w = w / options.w_scale * anchors[i].w
        ymin = y_center - h / 2.0
        xmin = x_center - w / 2.0
        ymax = y_center + h / 2.0
        xmax = x_center + w / 2.0
        boxes[i * options.num_coords + 0] = ymin
        boxes[i * options.num_coords + 1] = xmin
        boxes[i * options.num_coords + 2] = ymax
        boxes[i * options.num_coords + 3] = xmax
        if options.num_keypoints:
            for k in range(options.num_keypoints):
                offset = i * options.num_coords + options.keypoint_coord_offset + k * options.num_values_per_keypoint
                keypoint_y = raw_boxes[offset]
                keypoint_x = raw_boxes[offset + 1]
                if options.reverse_output_order:
                    keypoint_x = raw_boxes[offset]
                    keypoint_y = raw_boxes[offset + 1]
                boxes[offset] = keypoint_x / options.x_scale * anchors[i].w + anchors[i].x_center
                boxes[offset + 1] = keypoint_y / options.y_scale * anchors[i].h + anchors[i].y_center
    return boxes
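
# Note (added): with the options used in main() below, DecodeBoxes() returns a
# flat array of num_boxes * num_coords = 896 * 16 floats. For box i, the first
# four values are [ymin, xmin, ymax, xmax] and the remaining twelve are the six
# facial keypoints as (x, y) pairs, all normalized to [0, 1] relative to the
# 128x128 model input (the anchors are already in normalized coordinates).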

def ConvertToDetections(detection_boxes, detection_scores, detection_classes, options):
    output_detections = []
    for i in range(options.num_boxes):
        if detection_scores[i] < options.min_score_thresh:
            # print('passed, score lower than threshold')
            continue
        print("box_idx:{:}".format(i))
        box_offset = i * options.num_coords
        detection = ConvertToDetection(
            detection_boxes[box_offset + 0], detection_boxes[box_offset + 1],
            detection_boxes[box_offset + 2], detection_boxes[box_offset + 3],
            detection_scores[i], detection_classes[i], options.flip_vertically)
        # Add keypoints. TODO:
        # if (options.num_keypoints > 0):
        #     location_data = detection.mutable_location_data()
        #     kp_id = 0
        #     while (kp_id < options.num_keypoints * options.num_values_per_keypoint):
        #         keypoint = location_data->add_relative_keypoints()
        #         keypoint_index = box_offset + options.keypoint_coord_offset + kp_id
        #         keypoint->set_x(detection_boxes[keypoint_index + 0])
        #         keypoint->set_y(options.flip_vertically
        #                         ? 1.f - detection_boxes[keypoint_index + 1]
        #                         : detection_boxes[keypoint_index + 1])
        #         kp_id += options.num_values_per_keypoint
        output_detections.append(detection)
    return output_detections

def ConvertToDetection(box_ymin, box_xmin, box_ymax, box_xmax, score, class_id, flip_vertically):
    # Detection detection;
    # detection.add_score(score);
    # detection.add_label_id(class_id);
    # LocationData* location_data = detection.mutable_location_data();
    # location_data->set_format(LocationData::RELATIVE_BOUNDING_BOX);
    # LocationData::RelativeBoundingBox* relative_bbox = location_data->mutable_relative_bounding_box();
    # relative_bbox->set_xmin(box_xmin);
    # relative_bbox->set_ymin(flip_vertically ? 1.f - box_ymax : box_ymin);
    # relative_bbox->set_width(box_xmax - box_xmin);
    # relative_bbox->set_height(box_ymax - box_ymin);
    detection = Detection(score, class_id, box_xmin,
                          (1.0 - box_ymax if flip_vertically else box_ymin),
                          (box_xmax - box_xmin), (box_ymax - box_ymin))
    # print('score: {:}, class_id: {:}\n, xmin: {:}, ymin: {:}, width: {:}, height: {:}'.format(score, class_id, box_xmin, (1.0 - box_ymax if flip_vertically else box_ymin), (box_xmax - box_xmin), (box_ymax - box_ymin)))
    return detection

def ProcessCPU(raw_boxes, raw_scores, anchors_, options):
    # Postprocessing on CPU for model without postprocessing op. E.g. output
    # raw score tensor and box tensor. Anchor decoding will be handled below.
    boxes = DecodeBoxes(raw_boxes, anchors_, options)
    detection_scores = np.zeros(options.num_boxes)
    detection_classes = np.zeros(options.num_boxes)
    # Filter classes by scores.
    for i in range(options.num_boxes):
        class_id = -1
        max_score = np.finfo(float).min
        # Find the top score for box i.
        for score_idx in range(options.num_classes):
            # if (ignore_classes_.find(score_idx) == ignore_classes_.end()) {
            score = raw_scores[i * options.num_classes + score_idx]
            if options.sigmoid_score:
                if options.score_clipping_thresh > 0:
                    score = -options.score_clipping_thresh if score < -options.score_clipping_thresh else score
                    score = options.score_clipping_thresh if score > options.score_clipping_thresh else score
                score = 1.0 / (1.0 + np.exp(-score))
            if max_score < score:
                max_score = score
                class_id = score_idx
            # }
        detection_scores[i] = max_score
        detection_classes[i] = class_id
    print('--------------------------------')
    print('boxes: ')
    print(boxes.shape)
    print(boxes)
    print('--------------------------------')
    print('detection_scores: ')
    print(detection_scores.shape)
    print(detection_scores)
    print('--------------------------------')
    print('detection_classes: ')
    print(detection_classes.shape)
    print(detection_classes)
    output_detections = ConvertToDetections(boxes, detection_scores, detection_classes, options)
    return output_detections
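
# Note (added): ProcessCPU() expects the model outputs already flattened to 1-D:
# raw_boxes with num_boxes * num_coords values (896 * 16 here) and raw_scores
# with num_boxes * num_classes values (896 * 1 here). main() below does this by
# reshaping the 'regressors' and 'classificators' tensors before calling it.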

def orig_nms(detections, threshold):
    """Greedy non-maximum suppression.

    :detections: list of Detection (normalized xmin/ymin/width/height plus score)
    :threshold: overlap threshold, e.g. 0.5
    :returns: the detections kept after suppression
    """
    if len(detections) <= 0:
        return np.array([])
    x1 = []
    x2 = []
    y1 = []
    y2 = []
    s = []
    for detection in detections:
        x1.append(detection.xmin)
        x2.append(detection.xmin + detection.width)
        y1.append(detection.ymin)
        y2.append(detection.ymin + detection.height)
        s.append(detection.score)
    x1 = np.array(x1)
    x2 = np.array(x2)
    y1 = np.array(y1)
    y2 = np.array(y2)
    s = np.array(s)
    area = np.multiply(x2 - x1 + 1, y2 - y1 + 1)
    I = np.array(s.argsort())  # indices of scores, ascending
    pick = []
    while len(I) > 0:
        xx1 = np.maximum(x1[I[-1]], x1[I[0:-1]])
        yy1 = np.maximum(y1[I[-1]], y1[I[0:-1]])
        xx2 = np.minimum(x2[I[-1]], x2[I[0:-1]])
        yy2 = np.minimum(y2[I[-1]], y2[I[0:-1]])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        o = inter / (area[I[-1]] + area[I[0:-1]] - inter)
        pick.append(I[-1])
        I = I[np.where(o <= threshold)[0]]
    return list(np.array(detections)[pick])
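
# Note (added): this is greedy NMS; it repeatedly keeps the highest-scoring
# remaining box and drops every box whose overlap with it exceeds `threshold`.
# main() below calls it with threshold=0.3 on normalized [0, 1] coordinates.
# The "+ 1" padding in the area/overlap terms is carried over from
# pixel-coordinate NMS code; with normalized coordinates it inflates the
# overlap values, so suppression is more aggressive than a plain IoU
# threshold of 0.3 would be.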

def gen_anchors(options):
    anchors = []
    # Verify the options.
    if options.strides_size != options.num_layers:
        print("strides_size and num_layers must be equal.")
        return []
    layer_id = 0
    while layer_id < options.strides_size:
        anchor_height = []
        anchor_width = []
        aspect_ratios = []
        scales = []
        # For same strides, we merge the anchors in the same order.
        last_same_stride_layer = layer_id
        while (last_same_stride_layer < options.strides_size
               and options.strides[last_same_stride_layer] == options.strides[layer_id]):
            scale = options.min_scale + (options.max_scale - options.min_scale) * 1.0 * last_same_stride_layer / (options.strides_size - 1.0)
            if last_same_stride_layer == 0 and options.reduce_boxes_in_lowest_layer:
                # For first layer, it can be specified to use predefined anchors.
                aspect_ratios.append(1.0)
                aspect_ratios.append(2.0)
                aspect_ratios.append(0.5)
                scales.append(0.1)
                scales.append(scale)
                scales.append(scale)
            else:
                for aspect_ratio_id in range(options.aspect_ratios_size):
                    aspect_ratios.append(options.aspect_ratios[aspect_ratio_id])
                    scales.append(scale)
                if options.interpolated_scale_aspect_ratio > 0.0:
                    scale_next = (1.0 if last_same_stride_layer == options.strides_size - 1
                                  else options.min_scale + (options.max_scale - options.min_scale) * 1.0 * (last_same_stride_layer + 1) / (options.strides_size - 1.0))
                    scales.append(math.sqrt(scale * scale_next))
                    aspect_ratios.append(options.interpolated_scale_aspect_ratio)
            last_same_stride_layer += 1
        for i in range(len(aspect_ratios)):
            ratio_sqrts = math.sqrt(aspect_ratios[i])
            anchor_height.append(scales[i] / ratio_sqrts)
            anchor_width.append(scales[i] * ratio_sqrts)
        feature_map_height = 0
        feature_map_width = 0
        if options.feature_map_height_size > 0:
            feature_map_height = options.feature_map_height[layer_id]
            feature_map_width = options.feature_map_width[layer_id]
        else:
            stride = options.strides[layer_id]
            feature_map_height = math.ceil(1.0 * options.input_size_height / stride)
            feature_map_width = math.ceil(1.0 * options.input_size_width / stride)
        for y in range(feature_map_height):
            for x in range(feature_map_width):
                for anchor_id in range(len(anchor_height)):
                    # TODO: Support specifying anchor_offset_x, anchor_offset_y.
                    x_center = (x + options.anchor_offset_x) * 1.0 / feature_map_width
                    y_center = (y + options.anchor_offset_y) * 1.0 / feature_map_height
                    w = 0
                    h = 0
                    if options.fixed_anchor_size:
                        w = 1.0
                        h = 1.0
                    else:
                        w = anchor_width[anchor_id]
                        h = anchor_height[anchor_id]
                    new_anchor = Anchor(x_center, y_center, h, w)
                    anchors.append(new_anchor)
        layer_id = last_same_stride_layer
    return anchors
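
# Note (added): with the BlazeFace options used in main() below (128x128 input,
# strides [8, 16, 16, 16], aspect_ratios [1.0], interpolated_scale_aspect_ratio
# 1.0), gen_anchors() produces 2 anchors per cell on the 16x16 grid of the
# stride-8 layer and 6 per cell on the 8x8 grid shared by the three stride-16
# layers: 16*16*2 + 8*8*6 = 896, matching num_boxes=896 in the detection options.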

def main():
    # Options to generate anchors for SSD object detection models.
    ssd_anchors_calculator_options = SsdAnchorsCalculatorOptions(
        input_size_width=128, input_size_height=128, min_scale=0.1484375, max_scale=0.75,
        anchor_offset_x=0.5, anchor_offset_y=0.5, num_layers=4,
        feature_map_width=[], feature_map_height=[],
        strides=[8, 16, 16, 16], aspect_ratios=[1.0],
        reduce_boxes_in_lowest_layer=False, interpolated_scale_aspect_ratio=1.0,
        fixed_anchor_size=True)
    print('------------------------------------------------')
    print('SsdAnchorsCalculatorOptions: ')
    print(ssd_anchors_calculator_options.to_string())
    anchors = gen_anchors(ssd_anchors_calculator_options)
    # print('------------------------------------------------')
    # print('Anchors: ')
    # print('number: {:}'.format(len(anchors)))
    # for i, anchor in enumerate(anchors):
    #     print('Anchor {:}'.format(i))
    #     print(anchor.to_string())
    options = TfLiteTensorsToDetectionsCalculatorOptions(
        num_classes=1, num_boxes=896, num_coords=16,
        keypoint_coord_offset=4, ignore_classes=[], score_clipping_thresh=100.0, min_score_thresh=0.75,
        num_keypoints=6, num_values_per_keypoint=2, box_coord_offset=0,
        x_scale=128.0, y_scale=128.0, w_scale=128.0, h_scale=128.0, apply_exponential_on_box_size=False,
        reverse_output_order=True, sigmoid_score=True, flip_vertically=False)
    print('------------------------------------------------')
    print('TfLiteTensorsToDetectionsCalculatorOptions: ')
    print(options.to_string())
    # BlazeFace model:
    # https://github.com/google/mediapipe/tree/master/mediapipe/models/face_detection_front.tflite
    model_path = './face_detection_front.tflite'
    # Load TFLite model and allocate tensors.
    # Note: the original gist used tf.contrib.lite.Interpreter (TF 1.x only);
    # tf.lite.Interpreter is the current location of the same class.
    interpreter = tf.lite.Interpreter(model_path=model_path)
    interpreter.allocate_tensors()
    # Get input and output tensors.
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    print('--------------------------------')
    print("input_details: ")
    print(input_details)
    print("output_details: ")
    print(output_details)
    # capture = cv2.VideoCapture('./videoplayback_1.mp4')
    capture = cv2.VideoCapture(0)
    frame_cnt = 0
    accum_time = 0
    curr_fps = 0
    fps = "FPS: ??"
    prev_time = time.time()
    while True:
        ret, img = capture.read()
        # img = cv2.imread('./test_image.jpg')
        frame_cnt += 1
        print('-------- frame_cnt: ' + str(frame_cnt) + ' --------')
        if ret == True:
            # Only touch the frame once we know it was actually read.
            img_height = img.shape[0]
            img_width = img.shape[1]
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            preprocess_start_time = time.time()
            # input shape is [1, height, width, channels]
            input_height = input_details[0]["shape"][1]
            input_width = input_details[0]["shape"][2]
            # resize
            input_data = cv2.resize(img_rgb, (input_width, input_height)).astype(np.float32)
            # preprocess: scale pixel values to [-1, 1]
            # input_data = (input_data)
            input_data = ((input_data - 127.5) / 127.5)
            # input_data = ((input_data) / 255)
            input_data = np.expand_dims(input_data, axis=0)
            preprocess_end_time = time.time()
            inference_start_time = time.time()
            # set input data
            interpreter.set_tensor(input_details[0]["index"], input_data)
            interpreter.invoke()
            regressors = interpreter.get_tensor(output_details[0]["index"])
            classificators = interpreter.get_tensor(output_details[1]["index"])
            inference_end_time = time.time()
            # print('--------------------------------')
            # print('regressors: ')
            # print(regressors.shape)
            # print(regressors)
            # print('--------------------------------')
            # print('classificators: ')
            # print(classificators.shape)
            # print(classificators)
            postprocess_start_time = time.time()
            raw_boxes = np.reshape(regressors, int(regressors.shape[0] * regressors.shape[1] * regressors.shape[2]))
            raw_scores = np.reshape(classificators, int(classificators.shape[0] * classificators.shape[1] * classificators.shape[2]))
            detections = ProcessCPU(raw_boxes, raw_scores, anchors, options)
            detections = orig_nms(detections, 0.3)
            print('--------------------------------')
            print('detections: ')
            print('number: {:}'.format(len(detections)))
            for detection in detections:
                print(detection.to_string())
                x1 = int(img_width * detection.xmin)
                x2 = int(img_width * (detection.xmin + detection.width))
                y1 = int(img_height * detection.ymin)
                y2 = int(img_height * (detection.ymin + detection.height))
                print("x1: {:}, y1: {:}\nx2: {:}, y2: {:}".format(x1, y1, x2, y2))
                cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2)
                cv2.putText(img, '{:.2f}'.format(detection.score), (x1, y1 - 6),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
            postprocess_end_time = time.time()
            print('preprocess cost: {:.2f} ms'.format((preprocess_end_time - preprocess_start_time) * 1000))
            print('inference cost: {:.2f} ms'.format((inference_end_time - inference_start_time) * 1000))
            print('postprocess cost: {:.2f} ms'.format((postprocess_end_time - postprocess_start_time) * 1000))
            curr_time = time.time()
            exec_time = curr_time - prev_time
            prev_time = curr_time
            accum_time = accum_time + exec_time
            curr_fps = curr_fps + 1
            if accum_time > 1:
                accum_time = accum_time - 1
                fps = "FPS: " + str(curr_fps)
                curr_fps = 0
            print(fps)
            cv2.putText(img, text=fps, org=(10, 25),
                        fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=0.60, color=(255, 0, 0), thickness=2)
            cv2.imshow('img', img)
            c = cv2.waitKey(1) & 0xff
            if c == 27:
                break
            # if frame_cnt > 100:
            #     exit(0)
        else:
            # Stop when no frame could be read (e.g. camera unavailable or end of video).
            break


if __name__ == "__main__":
    main()