# Created October 30, 2019 03:26
# Real-time face detection with an ultra_light_model
# -*- coding: utf-8 -*-
# @Author: fyr91
# @Date: 2019-10-22 15:05:15
# @Last Modified by: fyr91
# @Last Modified time: 2019-10-30 11:25:26
import cv2
import numpy as np
import onnx
import onnxruntime as ort
from onnx_tf.backend import prepare
def area_of(left_top, right_bottom):
    """Compute the areas of rectangles given two corners.

    Args:
        left_top (N, 2): left top corner.
        right_bottom (N, 2): right bottom corner.

    Returns:
        area (N): return the area.
    """
    left_top = np.asarray(left_top)
    right_bottom = np.asarray(right_bottom)
    # Clamp negative extents to zero so that a "rectangle" whose right/bottom
    # corner lies before its left/top corner contributes an area of 0 rather
    # than a spurious product of two negative (or mixed-sign) sides.
    hw = np.clip(right_bottom - left_top, 0.0, None)
    return hw[..., 0] * hw[..., 1]
def iou_of(boxes0, boxes1, eps=1e-5):
    """Return intersection-over-union (Jaccard index) of boxes.

    Boxes are in corner form: the first two values of the last axis are the
    left-top corner and the last two are the right-bottom corner.

    Args:
        boxes0 (N, 4): ground truth boxes.
        boxes1 (N or 1, 4): predicted boxes.
        eps: a small number to avoid 0 as denominator.

    Returns:
        iou (N): IoU values.
    """
    boxes0 = np.asarray(boxes0)
    boxes1 = np.asarray(boxes1)

    # The intersection rectangle is bounded by the innermost corners of the
    # two boxes; area_of() clamps it to zero when the boxes do not overlap.
    overlap_left_top = np.maximum(boxes0[..., :2], boxes1[..., :2])
    overlap_right_bottom = np.minimum(boxes0[..., 2:], boxes1[..., 2:])
    overlap_area = area_of(overlap_left_top, overlap_right_bottom)

    area0 = area_of(boxes0[..., :2], boxes0[..., 2:])
    area1 = area_of(boxes1[..., :2], boxes1[..., 2:])
    # union = area0 + area1 - intersection; eps guards against 0 / 0 when
    # both boxes are degenerate.
    return overlap_area / (area0 + area1 - overlap_area + eps)
def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200):
    """Perform hard non-maximum-suppression to filter out boxes with iou greater
    than threshold.

    Args:
        box_scores (N, 5): boxes in corner-form and probabilities.
        iou_threshold: intersection over union threshold.
        top_k: keep top_k results. If k <= 0, keep all the results.
        candidate_size: only consider the candidates with the highest scores.

    Returns:
        The kept rows of ``box_scores`` (k, 5), ordered from the highest
        score to the lowest.
    """
    scores = box_scores[:, -1]
    boxes = box_scores[:, :-1]
    picked = []
    # argsort is ascending, so the best-scoring candidates sit at the end;
    # keep only the last `candidate_size` of them.
    indexes = np.argsort(scores)[-candidate_size:]
    while len(indexes) > 0:
        # The highest-scoring remaining box is always kept.
        current = indexes[-1]
        picked.append(int(current))
        # Stop once top_k boxes are collected (only when top_k > 0) or when
        # the box just picked was the last candidate.
        if 0 < top_k == len(picked) or len(indexes) == 1:
            break
        current_box = boxes[current, :]
        indexes = indexes[:-1]
        rest_boxes = boxes[indexes, :]
        iou = iou_of(
            rest_boxes,
            np.expand_dims(current_box, axis=0),
        )
        # Drop every candidate that overlaps the picked box too much.
        indexes = indexes[iou <= iou_threshold]
    return box_scores[picked, :]
def predict(width, height, confidences, boxes, prob_threshold, iou_threshold=0.5, top_k=-1):
    """Select boxes that contain human faces.

    Both ``confidences`` and ``boxes`` are expected to carry a leading batch
    axis; only the first batch element is used.

    Args:
        width: original image width
        height: original image height
        confidences (1, N, 2): confidence array
        boxes (1, N, 4): boxes array in corner-form
        prob_threshold: only boxes whose class probability is strictly
            greater than this value are considered.
        iou_threshold: intersection over union threshold.
        top_k: keep top_k results. If k <= 0, keep all the results.

    Returns:
        boxes (k, 4): an array of boxes kept
        labels (k): an array of labels for each boxes kept
        probs (k): an array of probabilities for each boxes being in corresponding labels

        When nothing passes the threshold, three empty arrays are returned.
    """
    boxes = boxes[0]
    confidences = confidences[0]
    picked_box_probs = []
    picked_labels = []
    # Class index 0 is skipped -- presumably the background class; confirm
    # against the model's output layout.
    for class_index in range(1, confidences.shape[1]):
        probs = confidences[:, class_index]
        mask = probs > prob_threshold
        probs = probs[mask]
        if probs.shape[0] == 0:
            continue
        subset_boxes = boxes[mask, :]
        box_probs = np.concatenate([subset_boxes, probs.reshape(-1, 1)], axis=1)
        box_probs = hard_nms(
            box_probs,
            iou_threshold=iou_threshold,
            top_k=top_k,
        )
        picked_box_probs.append(box_probs)
        picked_labels.extend([class_index] * box_probs.shape[0])
    if not picked_box_probs:
        return np.array([]), np.array([]), np.array([])
    picked_box_probs = np.concatenate(picked_box_probs)
    # Scale x coordinates by the image width and y coordinates by its height
    # (assumes the model emits coordinates normalised to [0, 1] -- TODO confirm).
    picked_box_probs[:, 0] *= width
    picked_box_probs[:, 1] *= height
    picked_box_probs[:, 2] *= width
    picked_box_probs[:, 3] *= height
    return picked_box_probs[:, :4].astype(np.int32), np.array(picked_labels), picked_box_probs[:, 4]
# Open the default webcam (device index 0).
video_capture = cv2.VideoCapture(0)

onnx_path = 'ultra_light/ultra_light_models/ultra_light_640.onnx'
onnx_model = onnx.load(onnx_path)
# NOTE(review): the onnx-tf predictor is built but inference below goes
# through ONNX Runtime only; kept so the module-level name stays available.
predictor = prepare(onnx_model)
ort_session = ort.InferenceSession(onnx_path)
input_name = ort_session.get_inputs()[0].name

# Font used for the label drawn under each detected face.
font = cv2.FONT_HERSHEY_DUPLEX

try:
    while True:
        ret, frame = video_capture.read()
        if frame is not None:
            h, w, _ = frame.shape

            # preprocess img acquired
            img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # convert bgr to rgb
            img = cv2.resize(img, (640, 480))  # resize
            # Shift and scale 8-bit pixel values to roughly [-1, 1].
            img_mean = np.array([127, 127, 127])
            img = (img - img_mean) / 128
            # HWC -> CHW, then add a leading batch axis.
            img = np.transpose(img, [2, 0, 1])
            img = np.expand_dims(img, axis=0)
            img = img.astype(np.float32)

            confidences, boxes = ort_session.run(None, {input_name: img})
            boxes, labels, probs = predict(w, h, confidences, boxes, 0.7)

            for i in range(boxes.shape[0]):
                # Plain Python ints: some OpenCV builds reject NumPy integer
                # scalars as point coordinates.
                x1, y1, x2, y2 = (int(v) for v in boxes[i, :])
                cv2.rectangle(frame, (x1, y1), (x2, y2), (80, 18, 236), 2)
                cv2.rectangle(frame, (x1, y2 - 20), (x2, y2), (80, 18, 236), cv2.FILLED)
                text = f"face: {labels[i]}"
                cv2.putText(frame, text, (x1 + 6, y2 - 6), font, 0.5, (255, 255, 255), 1)

            cv2.imshow('Video', frame)

        # Hit 'q' on the keyboard to quit!
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release handle to the webcam
    video_capture.release()
    cv2.destroyAllWindows()
