Created
September 2, 2020 18:25
-
-
Save 26medias/505408fd58505cb1c1d0083c7a2cd69a to your computer and use it in GitHub Desktop.
TensorRT YoloV3 running on a video via OpenCV
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# Copyright 1993-2020 NVIDIA Corporation. All rights reserved. | |
# | |
# NOTICE TO LICENSEE: | |
# | |
# This source code and/or documentation ("Licensed Deliverables") are | |
# subject to NVIDIA intellectual property rights under U.S. and | |
# international Copyright laws. | |
# | |
# These Licensed Deliverables contained herein is PROPRIETARY and | |
# CONFIDENTIAL to NVIDIA and is being provided under the terms and | |
# conditions of a form of NVIDIA software license agreement by and | |
# between NVIDIA and Licensee ("License Agreement") or electronically | |
# accepted by Licensee. Notwithstanding any terms or conditions to | |
# the contrary in the License Agreement, reproduction or disclosure | |
# of the Licensed Deliverables to any third party without the express | |
# written consent of NVIDIA is prohibited. | |
# | |
# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE | |
# LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE | |
# SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS | |
# PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. | |
# NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED | |
# DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, | |
# NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. | |
# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE | |
# LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY | |
# SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY | |
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, | |
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS | |
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE | |
# OF THESE LICENSED DELIVERABLES. | |
# | |
# U.S. Government End Users. These Licensed Deliverables are a | |
# "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT | |
# 1995), consisting of "commercial computer software" and "commercial | |
# computer software documentation" as such terms are used in 48 | |
# C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government | |
# only as a commercial end item. Consistent with 48 C.F.R.12.212 and | |
# 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all | |
# U.S. Government End Users acquire the Licensed Deliverables with | |
# only those rights set forth herein. | |
# | |
# Any use of the Licensed Deliverables in individual and commercial | |
# software must include, in the user documentation and internal | |
# comments to the code, the above Disclaimer and U.S. Government End | |
# Users Notice. | |
# | |
import math | |
from PIL import Image | |
import numpy as np | |
import os | |
# YOLOv3-608 has been trained with these 80 categories from COCO: | |
# Lin, Tsung-Yi, et al. "Microsoft COCO: Common Objects in Context." | |
# European Conference on Computer Vision. Springer, Cham, 2014. | |
def load_label_categories(label_file_path):
    """Read the category names from a label file, one name per line.

    Keyword arguments:
    label_file_path -- path of the text file holding one category name per line

    Returns a list of strings with the trailing newline of each line removed.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(label_file_path) as label_file:
        return [line.rstrip('\n') for line in label_file]
# The label file is expected to sit next to this module.
LABEL_FILE_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'coco_labels.txt')
ALL_CATEGORIES = load_label_categories(LABEL_FILE_PATH)

# Let's make sure that there are 80 classes, as expected for the COCO data set.
# An explicit raise is used instead of an `assert` statement so the check is not
# stripped when Python runs with -O; the exception type is kept unchanged.
CATEGORY_NUM = len(ALL_CATEGORIES)
if CATEGORY_NUM != 80:
    raise AssertionError(
        'Expected 80 categories in {}, found {}'.format(LABEL_FILE_PATH, CATEGORY_NUM))
class PreprocessYOLO(object):
    """Resize PIL images to the YOLOv3 input resolution and convert them into
    normalized NCHW float32 arrays.
    """

    def __init__(self, yolo_input_resolution):
        """Initialize with the input resolution for YOLOv3, which will stay fixed in this sample.

        Keyword arguments:
        yolo_input_resolution -- two-dimensional tuple with the target network's (spatial)
        input resolution in HW order
        """
        self.yolo_input_resolution = yolo_input_resolution

    def process(self, input_image):
        """Return the given image together with its pre-processed version that can be
        fed into a YOLOv3 network.

        Keyword arguments:
        input_image -- already loaded image object exposing a PIL-style
        `resize(size, resample=...)` method

        Returns a tuple (image_raw, image_preprocessed): the untouched input image and
        a float32 array in NCHW layout with values divided by 255.
        """
        image_raw, image_resized = self._load_and_resize(input_image)
        image_preprocessed = self._shuffle_and_normalize(image_resized)
        return image_raw, image_preprocessed

    def _load_and_resize(self, input_image):
        """Resize the given image to the network input resolution.

        Return the input image before resizing (required for visualization),
        and the resized image as a NumPy float32 array.

        Keyword arguments:
        input_image -- already loaded image object exposing a PIL-style
        `resize(size, resample=...)` method
        """
        image_raw = input_image
        # Expecting yolo_input_resolution in (height, width) format, adjusting to PIL
        # convention (width, height):
        new_resolution = (
            self.yolo_input_resolution[1],
            self.yolo_input_resolution[0])
        image_resized = image_raw.resize(
            new_resolution, resample=Image.BICUBIC)
        image_resized = np.array(image_resized, dtype=np.float32, order='C')
        return image_raw, image_resized

    def _shuffle_and_normalize(self, image):
        """Divide a NumPy image array by 255 and convert it from HWC format
        ("channels last") to NCHW format ("channels first" with leading batch dimension).

        The input array is left untouched; a new float32 array is returned.

        Keyword arguments:
        image -- image as three-dimensional NumPy array, in HWC format
        """
        # Out-of-place division: the caller's array is not modified and integer
        # inputs are accepted as well.
        image = np.asarray(image, dtype=np.float32) / 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format:
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        return np.ascontiguousarray(image, dtype=np.float32)
class PostprocessYOLO(object):
    """Class for post-processing the three outputs tensors from YOLOv3-608."""

    def __init__(self,
                 yolo_masks,
                 yolo_anchors,
                 obj_threshold,
                 nms_threshold,
                 yolo_input_resolution):
        """Initialize with all values that will be kept when processing several frames.
        Assuming 3 outputs of the network in the case of (large) YOLOv3.

        Keyword arguments:
        yolo_masks -- a list of 3 three-dimensional tuples for the YOLO masks
        yolo_anchors -- a list of 9 two-dimensional tuples for the YOLO anchors
        obj_threshold -- threshold for object coverage, float value between 0 and 1
        nms_threshold -- threshold for non-max suppression algorithm,
        float value between 0 and 1
        yolo_input_resolution -- two-dimensional tuple with the target network's (spatial)
        input resolution in HW order
        """
        self.masks = yolo_masks
        self.anchors = yolo_anchors
        self.object_threshold = obj_threshold
        self.nms_threshold = nms_threshold
        self.input_resolution_yolo = yolo_input_resolution

    def process(self, outputs, resolution_raw):
        """Take the YOLOv3 outputs generated from a TensorRT forward pass, post-process them
        and return a list of bounding boxes for detected object together with their category
        and their confidences in separate lists.

        Returns (None, None, None) when nothing survives filtering.

        Keyword arguments:
        outputs -- outputs from a TensorRT engine in NCHW format
        resolution_raw -- the original spatial resolution from the input PIL image in WH order
        """
        outputs_reshaped = [self._reshape_output(output) for output in outputs]
        return self._process_yolo_output(outputs_reshaped, resolution_raw)

    def _reshape_output(self, output):
        """Reshape a TensorRT output from NCHW to NHWC format (with expected C=255),
        and then return it in (height,width,3,85) dimensionality after further reshaping.

        Keyword argument:
        output -- an output from a TensorRT engine after inference
        """
        output = np.transpose(output, [0, 2, 3, 1])
        _, height, width, _ = output.shape
        # 3 anchors per cell, each with 4 box values, 1 objectness value and
        # CATEGORY_NUM=80 class scores:
        return np.reshape(output, (height, width, 3, 4 + 1 + CATEGORY_NUM))

    def _process_yolo_output(self, outputs_reshaped, resolution_raw):
        """Take in a list of reshaped YOLO outputs in (height,width,3,85) shape and
        return a list of bounding boxes for detected object together with their category and
        their confidences in separate lists.

        Returns (None, None, None) when no box passes the object threshold.

        Keyword arguments:
        outputs_reshaped -- list of three reshaped YOLO outputs as NumPy arrays
        with shape (height,width,3,85)
        resolution_raw -- the original spatial resolution from the input PIL image in WH order
        """
        # Every output tensor is paired with its mask; each pair yields candidate boxes,
        # their category predictions and their scores:
        boxes, categories, confidences = [], [], []
        for output, mask in zip(outputs_reshaped, self.masks):
            box, box_confidence, class_probs = self._process_feats(output, mask)
            box, category, score = self._filter_boxes(box, box_confidence, class_probs)
            boxes.append(box)
            categories.append(category)
            confidences.append(score)
        boxes = np.concatenate(boxes)
        categories = np.concatenate(categories)
        confidences = np.concatenate(confidences)

        # Scale boxes back to original image shape:
        width, height = resolution_raw
        boxes = boxes * [width, height, width, height]

        # Apply non-max suppression separately per category. np.unique gives a
        # deterministic (sorted) category order.
        nms_boxes, nms_categories, nms_scores = [], [], []
        for category in np.unique(categories):
            idxs = np.where(categories == category)
            category_boxes = boxes[idxs]
            category_scores = confidences[idxs]
            keep = self._nms_boxes(category_boxes, category_scores)
            nms_boxes.append(category_boxes[keep])
            nms_categories.append(categories[idxs][keep])
            nms_scores.append(category_scores[keep])

        if not nms_boxes:
            return None, None, None

        return (np.concatenate(nms_boxes),
                np.concatenate(nms_categories),
                np.concatenate(nms_scores))

    @staticmethod
    def _sigmoid(values):
        """Return the element-wise sigmoid of a NumPy array."""
        # exp() overflowing to inf for very negative inputs yields the correct
        # limit 0.0, so the overflow warning is silenced here.
        with np.errstate(over='ignore'):
            return 1.0 / (1.0 + np.exp(-values))

    def _process_feats(self, output_reshaped, mask):
        """Take in a reshaped YOLO output in height,width,3,85 format together with its
        corresponding YOLO mask and return the detected bounding boxes, the confidence,
        and the class probability in each cell/pixel.

        Boxes are returned as (x, y, width, height) with x,y the top-left corner, all
        relative to the network input (x/width divided by the input width, y/height by
        the input height).

        Keyword arguments:
        output_reshaped -- reshaped YOLO output as NumPy arrays with shape (height,width,3,85)
        mask -- tuple of indexes into the anchor list for this output
        """
        # Work in float64, which is also what the former scalar math.exp path produced.
        output = np.asarray(output_reshaped, dtype=np.float64)
        grid_h, grid_w, _, _ = output.shape

        anchors = [self.anchors[i] for i in mask]
        # Reshape to height, width, num_anchors, box_params:
        anchors_tensor = np.reshape(anchors, [1, 1, len(anchors), 2])

        box_xy = self._sigmoid(output[..., :2])
        box_wh = np.exp(output[..., 2:4]) * anchors_tensor
        box_confidence = self._sigmoid(output[..., 4:5])
        box_class_probs = self._sigmoid(output[..., 5:])

        # Cell offsets: col[y, x] == x and row[y, x] == y, both of shape (grid_h, grid_w),
        # so non-square grids are handled as well. The singleton anchor axis broadcasts
        # over however many anchors the mask selects.
        col, row = np.meshgrid(np.arange(grid_w), np.arange(grid_h))
        grid = np.stack((col, row), axis=-1)[:, :, np.newaxis, :]

        box_xy += grid
        box_xy /= (grid_w, grid_h)
        # The stored resolution is in HW order while box_wh is in (width, height) order:
        input_h, input_w = self.input_resolution_yolo
        box_wh /= (input_w, input_h)
        # Move from the box center to its top-left corner:
        box_xy -= (box_wh / 2.)
        boxes = np.concatenate((box_xy, box_wh), axis=-1)

        return boxes, box_confidence, box_class_probs

    def _filter_boxes(self, boxes, box_confidences, box_class_probs):
        """Take in the unfiltered bounding box descriptors and discard each cell
        whose score is lower than the object threshold set during class initialization.

        Returns the surviving boxes, their best class index and the matching score.

        Keyword arguments:
        boxes -- bounding box coordinates with shape (height,width,3,4); 4 for
        x,y,width,height coordinates of the boxes
        box_confidences -- bounding box confidences with shape (height,width,3,1); 1 for as
        confidence scalar per element
        box_class_probs -- class probabilities with shape (height,width,3,CATEGORY_NUM)
        """
        box_scores = box_confidences * box_class_probs
        box_classes = np.argmax(box_scores, axis=-1)
        box_class_scores = np.max(box_scores, axis=-1)
        pos = np.where(box_class_scores >= self.object_threshold)

        return boxes[pos], box_classes[pos], box_class_scores[pos]

    def _nms_boxes(self, boxes, box_confidences):
        """Apply the Non-Maximum Suppression (NMS) algorithm on the bounding boxes with their
        confidence scores and return an array with the indexes of the bounding boxes we want to
        keep (and display later).

        Keyword arguments:
        boxes -- a NumPy array containing N bounding-box coordinates that survived filtering,
        with shape (N,4); 4 for x,y,width,height coordinates of the boxes
        box_confidences -- a Numpy array containing the corresponding confidences with shape N
        """
        x_coord = boxes[:, 0]
        y_coord = boxes[:, 1]
        width = boxes[:, 2]
        height = boxes[:, 3]

        areas = width * height
        # Candidate indexes sorted by descending confidence:
        ordered = box_confidences.argsort()[::-1]

        keep = []
        while ordered.size > 0:
            # Index of the current (most confident remaining) element:
            i = ordered[0]
            rest = ordered[1:]
            keep.append(i)

            xx1 = np.maximum(x_coord[i], x_coord[rest])
            yy1 = np.maximum(y_coord[i], y_coord[rest])
            xx2 = np.minimum(x_coord[i] + width[i], x_coord[rest] + width[rest])
            yy2 = np.minimum(y_coord[i] + height[i], y_coord[rest] + height[rest])

            # NOTE(review): the intersection uses a "+ 1" pixel convention while `areas`
            # does not; kept as-is to preserve the existing suppression results.
            width1 = np.maximum(0.0, xx2 - xx1 + 1)
            height1 = np.maximum(0.0, yy2 - yy1 + 1)
            intersection = width1 * height1
            union = areas[i] + areas[rest] - intersection

            # Compute the Intersection over Union (IoU) score:
            iou = intersection / union

            # Keep only the candidates whose overlap with the current box does not
            # exceed the threshold; "+ 1" maps positions in `rest` back to `ordered`.
            indexes = np.where(iou <= self.nms_threshold)[0]
            ordered = ordered[indexes + 1]

        return np.array(keep)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import cv2 as cv | |
import os | |
import sys | |
import tensorrt as trt | |
import pycuda.driver as cuda | |
import pycuda.autoinit | |
from PIL import ImageDraw | |
from PIL import Image | |
from data_processing_cv import PreprocessYOLO, PostprocessYOLO, ALL_CATEGORIES | |
import sys, os | |
sys.path.insert(1, os.path.join(sys.path[0], "..")) | |
import common | |
# Logger instance handed to the TensorRT builder, parser and runtime below.
TRT_LOGGER = trt.Logger()
print(cv.__version__)

# Locations of the ONNX model and of the serialized TensorRT engine.
onnx_file_path = 'yolov3.onnx'
engine_file_path = "yolov3.trt"

# Network input resolution in (height, width) order, and the pre-processor
# that resizes frames to it.
input_resolution_yolov3_HW = (608, 608)
preprocessor = PreprocessYOLO(input_resolution_yolov3_HW)
# Imported from onnx_to_tensorrt.py, modified for OpenCV | |
def draw_bboxes(image_raw, bboxes, confidences, categories, all_categories, bbox_color='blue'):
    """Draw one rectangle per detected bounding box onto an OpenCV image, in place.

    Keyword arguments:
    image_raw -- image to draw on; it is modified in place and also returned
    bboxes -- iterable of boxes, each as (x, y, width, height) with x,y the top-left
    corner, or None when nothing was detected
    confidences -- iterable with one score per box (only printed, not drawn)
    categories -- iterable with one category index per box (only printed, not drawn)
    all_categories -- list of category names (currently unused)
    bbox_color -- currently unused; rectangles are drawn with the fixed color (0, 255, 255)
    """
    print(bboxes, confidences, categories)
    # The post-processor reports "no detections" as None values; nothing to draw then.
    if bboxes is None or confidences is None or categories is None:
        return image_raw
    for box, score, category in zip(bboxes, confidences, categories):
        x_coord, y_coord, width, height = box
        # cv.rectangle needs plain integer pixel coordinates.
        top_left = (int(round(x_coord)), int(round(y_coord)))
        bottom_right = (int(round(x_coord + width)), int(round(y_coord + height)))
        cv.rectangle(image_raw, top_left, bottom_right, (0, 255, 255), 1)
    return image_raw
# Imported from onnx_to_tensorrt.py | |
def get_engine(onnx_file_path, engine_file_path=""):
    """Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it.

    Keyword arguments:
    onnx_file_path -- path of the ONNX model used when an engine has to be built
    engine_file_path -- path of the serialized engine; read when the file exists,
    otherwise written after a successful build (skipped when empty)

    Returns the engine, or None when the ONNX file cannot be parsed or the build fails.
    Exits the process with status 1 when the ONNX file does not exist.
    """
    def build_engine():
        """Takes an ONNX file and creates a TensorRT engine to run inference with"""
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(common.EXPLICIT_BATCH) as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser:
            builder.max_workspace_size = 1 << 28  # 256MiB
            builder.max_batch_size = 1
            # Parse model file
            if not os.path.exists(onnx_file_path):
                print('ONNX file {} not found, please run yolov3_to_onnx.py first to generate it.'.format(onnx_file_path))
                # A missing model is an error, so report a non-zero exit status.
                sys.exit(1)
            print('Loading ONNX file from path {}...'.format(onnx_file_path))
            with open(onnx_file_path, 'rb') as model:
                print('Beginning ONNX file parsing')
                if not parser.parse(model.read()):
                    print('ERROR: Failed to parse the ONNX file.')
                    for error in range(parser.num_errors):
                        print(parser.get_error(error))
                    return None
            # The actual yolov3.onnx is generated with batch size 64. Reshape input to batch size 1
            network.get_input(0).shape = [1, 3, 608, 608]
            print('Completed parsing of ONNX file')
            print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))
            engine = builder.build_cuda_engine(network)
            if engine is None:
                # Guard against a failed build instead of crashing on engine.serialize().
                print('ERROR: Failed to build the TensorRT engine.')
                return None
            print("Completed creating Engine")
            # Only cache the engine when a target path was given; open("") would raise.
            if engine_file_path:
                with open(engine_file_path, "wb") as f:
                    f.write(engine.serialize())
            return engine

    if os.path.exists(engine_file_path):
        # If a serialized engine exists, use it instead of building an engine.
        print("Reading engine from file {}".format(engine_file_path))
        with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())
    return build_engine()
# Create the engine
engine = get_engine(onnx_file_path, engine_file_path)
if engine is None:
    # get_engine returns None when the ONNX model cannot be parsed; stop with a
    # clear message instead of an AttributeError on the next line.
    raise SystemExit('Could not load or build a TensorRT engine from {} / {}'.format(
        engine_file_path, onnx_file_path))
context = engine.create_execution_context()
# Buffers and stream handed to common.do_inference_v2 for every frame.
inputs, outputs, bindings, stream = common.allocate_buffers(engine)
# Output shapes expected by the post-processor
_YOLO_OUTPUT_SHAPES = [(1, 255, 19, 19), (1, 255, 38, 38), (1, 255, 76, 76)]

# The post-processor only stores its configuration, so it is created once here
# instead of once per frame.
_YOLO_POSTPROCESSOR = PostprocessYOLO(
    # A list of 3 three-dimensional tuples for the YOLO masks
    yolo_masks=[(6, 7, 8), (3, 4, 5), (0, 1, 2)],
    # A list of 9 two-dimensional tuples for the YOLO anchors
    yolo_anchors=[(10, 13), (16, 30), (33, 23), (30, 61), (62, 45),
                  (59, 119), (116, 90), (156, 198), (373, 326)],
    # Threshold for object coverage, float value between 0 and 1
    obj_threshold=0.6,
    # Threshold for non-max suppression algorithm, float value between 0 and 1
    nms_threshold=0.5,
    yolo_input_resolution=input_resolution_yolov3_HW)


# Apply YoloV3 on an OpenCV frame, return boxes, classes, scores
def getYolo(img):
    """Run YOLOv3 inference on a single OpenCV frame.

    Keyword arguments:
    img -- frame as delivered by OpenCV; it is converted with cv.COLOR_BGR2RGB
    before being handed to PIL

    Returns the tuple (boxes, classes, scores) produced by PostprocessYOLO.process.
    """
    # Convert the CV2 image to PIL
    img = cv.cvtColor(img, cv.COLOR_BGR2RGB)
    im_pil = Image.fromarray(img)
    image_raw, image = preprocessor.process(im_pil)
    # Store the shape of the original input image in WH format, we will need it for later
    shape_orig_WH = image_raw.size
    # Set host input to the image. The common.do_inference function will copy the input to the GPU before executing.
    inputs[0].host = image
    # Do inference with TensorRT
    trt_outputs = common.do_inference_v2(
        context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
    # Before doing post-processing, we need to reshape the outputs as the common.do_inference will give us flat arrays.
    trt_outputs = [output.reshape(shape)
                   for output, shape in zip(trt_outputs, _YOLO_OUTPUT_SHAPES)]
    # Run the post-processing algorithms on the TensorRT outputs and get the bounding box details of detected objects
    boxes, classes, scores = _YOLO_POSTPROCESSOR.process(trt_outputs, shape_orig_WH)
    return boxes, classes, scores
# Read a video, run YoloV3 at every frame
cap = cv.VideoCapture('traffic.mp4')
# The window only has to be created once, not on every frame.
cv.namedWindow("result", cv.WINDOW_AUTOSIZE)
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("Can't receive frame (stream end?). Exiting ...")
            break
        # Apply YoloV3
        boxes, classes, scores = getYolo(frame)
        # Draw the rects; the post-processor reports "no detections" as None.
        if boxes is not None:
            draw_bboxes(frame, boxes, scores, classes, ALL_CATEGORIES)
        cv.imshow("result", frame)
        # Mask to the low byte so the comparison also works where waitKey
        # returns extra high bits.
        if cv.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the capture and close the window even if a frame fails.
    cap.release()
    cv.destroyAllWindows()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment