Last active
February 17, 2023 13:06
-
-
Save Neutree/d23796f383af104c78a1a0722d73cf6b to your computer and use it in GitHub Desktop.
yolov5 code for rk3588/rk3588s (board rock5b)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# yolov5 code for rk3588/rk3588s (board rock5b)
# @author neucrack neucrack.com
# @license MIT
import sys | |
import cv2 | |
import numpy as np | |
from rknnlite.api import RKNNLite | |
# Detections whose objectness or best class probability falls below this
# threshold are discarded (see filter_boxes).
OBJ_THRESH = 0.25
# IoU threshold used by non-maximum suppression: overlapping boxes with
# IoU above this value are suppressed (see nms_boxes).
NMS_THRESH = 0.45
def sigmoid(x):
    """Elementwise logistic sigmoid: 1 / (1 + exp(-x))."""
    z = np.exp(-x)
    return 1.0 / (1.0 + z)
def xywh2xyxy(x):
    """Convert boxes from center form [cx, cy, w, h] to corner form [x1, y1, x2, y2].

    Args:
        x: ndarray of shape (N, 4) in center form.

    Returns:
        New ndarray of shape (N, 4) in corner form; input is not modified.
    """
    out = np.copy(x)
    half_w = x[:, 2] / 2
    half_h = x[:, 3] / 2
    out[:, 0] = x[:, 0] - half_w  # top-left x
    out[:, 1] = x[:, 1] - half_h  # top-left y
    out[:, 2] = x[:, 0] + half_w  # bottom-right x
    out[:, 3] = x[:, 1] + half_h  # bottom-right y
    return out
def process(input, mask, anchors, input_size = 640):
    """Decode one raw YOLOv5 head output into candidate boxes.

    Args:
        input: ndarray of shape (grid_h, grid_w, n_anchors, 5 + n_classes),
            raw (pre-sigmoid) network output for one detection head.
        mask: anchor indices belonging to this head (e.g. [0, 1, 2]).
        anchors: full list of [w, h] anchor pairs for all heads.
        input_size: network input resolution in pixels; the grid stride is
            derived from it.

    Returns:
        (box, box_confidence, box_class_probs) where box holds
        [cx, cy, w, h] in input-image pixel coordinates.
    """
    anchors = [anchors[i] for i in mask]
    grid_h, grid_w = map(int, input.shape[0:2])

    # Objectness score and per-class probabilities.
    box_confidence = sigmoid(input[..., 4])
    box_confidence = np.expand_dims(box_confidence, axis=-1)
    box_class_probs = sigmoid(input[..., 5:])

    # YOLOv5 v5+ decoding: xy offset in [-0.5, 1.5] relative to the cell.
    box_xy = sigmoid(input[..., :2]) * 2 - 0.5

    # Build the per-cell coordinate grid. Fix: the original tiled columns
    # by grid_w and rows by grid_h, which is only correct when the grid is
    # square; tile counts now use the opposite dimension so non-square
    # grids decode correctly too (square grids are unchanged).
    col = np.tile(np.arange(0, grid_w), grid_h).reshape(-1, grid_w)
    row = np.tile(np.arange(0, grid_h).reshape(-1, 1), grid_w)
    # Generalized from a hard-coded 3 to the number of anchors in the mask.
    col = col.reshape(grid_h, grid_w, 1, 1).repeat(len(anchors), axis=-2)
    row = row.reshape(grid_h, grid_w, 1, 1).repeat(len(anchors), axis=-2)
    grid = np.concatenate((col, row), axis=-1)

    box_xy += grid
    box_xy *= int(input_size / grid_h)  # stride: pixels per grid cell

    # wh decoding: (2 * sigmoid)^2 scaled by the anchor dimensions.
    box_wh = pow(sigmoid(input[..., 2:4]) * 2, 2)
    box_wh = box_wh * anchors

    box = np.concatenate((box_xy, box_wh), axis=-1)
    return box, box_confidence, box_class_probs
def filter_boxes(boxes, box_confidences, box_class_probs):
    """Keep only candidates whose scores clear OBJ_THRESH.

    Note: this is a bit different from the original yolov5 post-process —
    objectness and the best class probability are each thresholded
    separately rather than their product.

    Args:
        boxes: ndarray, candidate boxes.
        box_confidences: ndarray, objectness scores.
        box_class_probs: ndarray, per-class probabilities.

    Returns:
        (boxes, classes, scores) for the surviving candidates.
    """
    n_classes = box_class_probs.shape[-1]
    boxes = boxes.reshape(-1, 4)
    box_confidences = box_confidences.reshape(-1)
    box_class_probs = box_class_probs.reshape(-1, n_classes)

    # First pass: objectness must clear the threshold.
    obj_keep = np.where(box_confidences >= OBJ_THRESH)
    boxes = boxes[obj_keep]
    box_confidences = box_confidences[obj_keep]
    box_class_probs = box_class_probs[obj_keep]

    # Second pass: the best class probability must clear it as well.
    best_scores = np.max(box_class_probs, axis=-1)
    classes = np.argmax(box_class_probs, axis=-1)
    cls_keep = np.where(best_scores >= OBJ_THRESH)

    boxes = boxes[cls_keep]
    classes = classes[cls_keep]
    # Final score is objectness times the best class probability.
    scores = (best_scores * box_confidences)[cls_keep]
    return boxes, classes, scores
def nms_boxes(boxes, scores):
    """Greedy non-maximum suppression for one class.

    Args:
        boxes: ndarray of [x1, y1, x2, y2] boxes.
        scores: ndarray of matching confidence scores.

    Returns:
        ndarray of indices into `boxes` that survive suppression.
    """
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    areas = widths * heights

    # Visit candidates from highest to lowest score.
    remaining = scores.argsort()[::-1]
    keep = []
    while remaining.size > 0:
        best = remaining[0]
        keep.append(best)
        rest = remaining[1:]

        # Intersection of the current best box with each remaining box.
        ix1 = np.maximum(x1[best], x1[rest])
        iy1 = np.maximum(y1[best], y1[rest])
        ix2 = np.minimum(x1[best] + widths[best], x1[rest] + widths[rest])
        iy2 = np.minimum(y1[best] + heights[best], y1[rest] + heights[rest])
        # Tiny epsilon keeps degenerate touching boxes from being treated
        # as zero-overlap (kept from the original implementation).
        iw = np.maximum(0.0, ix2 - ix1 + 0.00001)
        ih = np.maximum(0.0, iy2 - iy1 + 0.00001)
        inter = iw * ih

        # Keep only boxes whose IoU with the best box is within the limit.
        iou = inter / (areas[best] + areas[rest] - inter)
        remaining = rest[np.where(iou <= NMS_THRESH)[0]]
    return np.array(keep)
def yolov5_post_process(input_data, input_size = 640):
    """Full YOLOv5 post-process over the three detection heads.

    Args:
        input_data: list of three head outputs shaped (H, W, 3, 5+classes).
        input_size: network input resolution in pixels.

    Returns:
        (boxes, classes, scores) ndarrays with boxes in [x1, y1, x2, y2]
        form, or (None, None, None) when nothing survives thresholding.
    """
    masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
    anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45],
               [59, 119], [116, 90], [156, 198], [373, 326]]

    # Decode and threshold each head independently, then merge.
    all_boxes, all_classes, all_scores = [], [], []
    for head, mask in zip(input_data, masks):
        b, c, s = process(head, mask, anchors, input_size)
        b, c, s = filter_boxes(b, c, s)
        all_boxes.append(b)
        all_classes.append(c)
        all_scores.append(s)

    boxes = xywh2xyxy(np.concatenate(all_boxes))
    classes = np.concatenate(all_classes)
    scores = np.concatenate(all_scores)

    # Apply NMS per class so different classes never suppress each other.
    nboxes, nclasses, nscores = [], [], []
    for cls in set(classes):
        picked = np.where(classes == cls)
        b = boxes[picked]
        c = classes[picked]
        s = scores[picked]
        keep = nms_boxes(b, s)
        nboxes.append(b[keep])
        nclasses.append(c[keep])
        nscores.append(s[keep])

    if not nclasses and not nscores:
        return None, None, None
    return (np.concatenate(nboxes),
            np.concatenate(nclasses),
            np.concatenate(nscores))
def draw(image, boxes, scores, classes, labels):
    """Draw detection boxes and class labels onto `image` (in place).

    Fix: the docstring documented a nonexistent `all_classes` parameter,
    and the locals were named top/left/right/bottom while actually holding
    x1/y1/x2/y2. The drawn/printed values were already correct; names and
    docs now match what the code does.

    Args:
        image: image to annotate (modified in place).
        boxes: ndarray of [x1, y1, x2, y2] boxes.
        scores: ndarray, confidence score per box.
        classes: ndarray, class index per box.
        labels: sequence of class names indexed by class id.
    """
    for box, score, cl in zip(boxes, scores, classes):
        left, top, right, bottom = box
        print('class: {}, score: {}'.format(labels[cl], score))
        print('box coordinate left,top,right,down: [{}, {}, {}, {}]'.format(left, top, right, bottom))
        left = int(left)
        top = int(top)
        right = int(right)
        bottom = int(bottom)
        cv2.rectangle(image, (left, top), (right, bottom), (255, 0, 0), 2)
        # Label goes just above the top-left corner of the box.
        cv2.putText(image, '{0} {1:.2f}'.format(labels[cl], score),
                    (left, top - 6),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.6, (0, 0, 255), 2)
def read_cam(nn, w, h, callback, labels):
    """Capture camera frames via GStreamer and run `callback` on each.

    Args:
        nn: initialized RKNNLite instance, passed through to `callback`.
        w, h: desired crop size (network input size) in pixels.
        callback: callable(nn, rgb_image, labels) invoked once per frame.
        labels: class-name sequence passed through to `callback`.
    """
    # Native sensor limits the capture size is derived from.
    # NOTE(review): `min_h` appears to be the sensor's MAX height — verify.
    max_w = 3840
    min_h = 2160
    # Choose a capture size whose aspect lets a centered w x h crop fit.
    if max_w / w > min_h / h:
        cap_h = h
        cap_w = max_w / (min_h // h)
    else:
        cap_w = w
        cap_h = min_h / (max_w // w)
    # Round up to a multiple of 16 (16-byte alignment).
    cap_w = int((cap_w + 15) // 16 * 16)
    cap_h = int((cap_h + 15) // 16 * 16)
    print("capture size:", cap_w, cap_h)
    cap = cv2.VideoCapture(f"v4l2src device=/dev/video11 ! video/x-raw,format=NV12,width={cap_w},height={cap_h}, framerate=30/1 ! appsink")
    if cap.isOpened():
        cv2.namedWindow("demo", cv2.WINDOW_AUTOSIZE)
        while True:
            ret_val, img = cap.read()
            # Fix: the original ignored ret_val, so a failed grab passed
            # None into cvtColor and crashed; stop the loop cleanly instead.
            if not ret_val:
                print("camera read failed")
                break
            # img2 = cv2.cvtColor(img, cv2.COLOR_YUV2BGR_NV12)
            img2 = cv2.cvtColor(img, cv2.COLOR_YUV2RGB_NV12)
            # crop wxh from center of img2
            img2 = img2[(img2.shape[0] - h) // 2:(img2.shape[0] + h) // 2, (img2.shape[1] - w) // 2:(img2.shape[1] + w) // 2]
            callback(nn, img2, labels)
    else:
        print("camera open failed")
    cv2.destroyAllWindows()
def nn_init(rknn_model):
    """Load an RKNN model file and bring up the NPU runtime.

    Args:
        rknn_model: path to the .rknn model file.

    Returns:
        A ready-to-use RKNNLite instance.

    Raises:
        Exception: if loading the model or initializing the runtime fails.
    """
    lite = RKNNLite()
    if lite.load_rknn(rknn_model) != 0:
        raise Exception('Load RKNN model failed')
    # Spread inference across all three NPU cores (use NPU_CORE_0 for one).
    if lite.init_runtime(core_mask=RKNNLite.NPU_CORE_0_1_2) != 0:
        raise Exception('Init runtime environment failed')
    return lite
def on_image(nn, img, labels):
    """Run inference on one frame, post-process the detections, display.

    Args:
        nn: initialized RKNNLite instance.
        img: RGB HWC image, already sized to the network input.
        labels: class-name sequence used when drawing detections.
    """
    # Normalization will be done automatically in the inference method.
    outs = nn.inference(inputs=[img])
    for i, out in enumerate(outs):
        print(f"output {i}: {out.shape}")
    # Reshape each of the three heads to (3, C, H, W), then transpose to
    # (H, W, 3, C) — the layout yolov5_post_process expects.
    input_data = []
    for out in outs[:3]:
        reshaped = out.reshape([3, -1] + list(out.shape[-2:]))
        input_data.append(np.transpose(reshaped, (2, 3, 0, 1)))
    boxes, classes, scores = yolov5_post_process(input_data)
    # Convert back to BGR for OpenCV display.
    img2 = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    if boxes is not None:
        draw(img2, boxes, scores, classes, labels = labels)
    cv2.imshow('demo',img2)
    cv2.waitKey(1)
if __name__ == '__main__':
    # Resolution the rknn model expects at its input.
    width = 640
    height = 640

    # Exactly one argument: the path to the .rknn model file.
    if len(sys.argv) != 2:
        print("Usage:")
        print("    python yolov5_camera.py yolov5s.rknn")
        sys.exit(0)
    model = sys.argv[1]

    # COCO class names, kept verbatim (including the original's stray
    # trailing spaces — indices must match the model's training labels).
    labels = ("person", "bicycle", "car", "motorbike ", "aeroplane ", "bus ", "train", "truck ", "boat", "traffic light",
              "fire hydrant", "stop sign ", "parking meter", "bench", "bird", "cat", "dog ", "horse ", "sheep", "cow", "elephant",
              "bear", "zebra ", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
              "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife ",
              "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza ", "donut", "cake", "chair", "sofa",
              "pottedplant", "bed", "diningtable", "toilet ", "tvmonitor", "laptop ", "mouse ", "remote ", "keyboard ", "cell phone", "microwave ",
              "oven ", "toaster", "sink", "refrigerator ", "book", "clock", "vase", "scissors ", "teddy bear ", "hair drier", "toothbrush ")

    nn = nn_init(model)
    read_cam(nn, width, height, on_image, labels)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment