Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
YOLOv4 on OpenCV DNN
import cv2
import time
# Thresholds handed to model.detect() as its confidence and NMS arguments.
CONFIDENCE_THRESHOLD = 0.2
NMS_THRESHOLD = 0.4
# Box colours; the drawing loop cycles through them by class id.
COLORS = [(0, 255, 255), (255, 255, 0), (0, 255, 0), (255, 0, 0)]


def _as_scalar(value):
    """Return value[0] for one-element sequences/arrays, or value itself.

    Older OpenCV releases return class ids and scores as one-element arrays,
    while 4.5.4 and later return plain scalars; this accepts both.
    """
    return value[0] if hasattr(value, "__len__") else value


# One class label per line; surrounding whitespace is stripped.
with open("classes.txt", "r") as f:
    class_names = [cname.strip() for cname in f.readlines()]

vc = cv2.VideoCapture("demo.mp4")
if not vc.isOpened():
    raise SystemExit("failed to open demo.mp4")

net = cv2.dnn.readNet("yolov4.weights", "yolov4.cfg")
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA_FP16)

model = cv2.dnn_DetectionModel(net)
model.setInputParams(size=(416, 416), scale=1/255, swapRB=True)

# Keep processing frames until a key is pressed or the video ends.
while cv2.waitKey(1) < 1:
    (grabbed, frame) = vc.read()
    if not grabbed:
        break

    # Time only the detection call; drawing is timed separately below.
    start = time.perf_counter()
    classes, scores, boxes = model.detect(frame, CONFIDENCE_THRESHOLD, NMS_THRESHOLD)
    end = time.perf_counter()

    start_drawing = time.perf_counter()
    for (classid, score, box) in zip(classes, scores, boxes):
        class_index = int(_as_scalar(classid))
        color = COLORS[class_index % len(COLORS)]
        # Fall back to the numeric id when classes.txt has too few entries.
        if 0 <= class_index < len(class_names):
            class_label = class_names[class_index]
        else:
            class_label = str(class_index)
        label = "%s : %f" % (class_label, float(_as_scalar(score)))
        cv2.rectangle(frame, box, color, 2)
        cv2.putText(frame, label, (box[0], box[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
    end_drawing = time.perf_counter()

    # Guard against a zero-length interval so the FPS division cannot raise.
    elapsed = end - start
    fps = 1 / elapsed if elapsed > 0 else 0.0
    fps_label = "FPS: %.2f (excluding drawing time of %.2fms)" % (fps, (end_drawing - start_drawing) * 1000)
    cv2.putText(frame, fps_label, (0, 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)
    cv2.imshow("detections", frame)

# Release the capture and close the window instead of exiting mid-loop.
vc.release()
cv2.destroyAllWindows()
#include <iostream>
#include <queue>
#include <iterator>
#include <sstream>
#include <fstream>
#include <iomanip>
#include <chrono>
#include <opencv2/core.hpp>
#include <opencv2/dnn.hpp>
#include <opencv2/dnn/all_layers.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
// Minimum class score for a box to be collected in main(); with 0 every
// box/class pair passes this first comparison.
constexpr float CONFIDENCE_THRESHOLD = 0.0f;
// Overlap threshold passed to cv::dnn::NMSBoxes in main().
constexpr float NMS_THRESHOLD = 0.4f;
// Number of per-class score columns main() reads from each output row.
constexpr int NUM_CLASSES = 80;

// Palette for bounding boxes; main() picks an entry with c % NUM_COLORS.
const cv::Scalar colors[] = {
    cv::Scalar(0, 255, 255),
    cv::Scalar(255, 255, 0),
    cv::Scalar(0, 255, 0),
    cv::Scalar(255, 0, 0)
};
const auto NUM_COLORS = sizeof(colors) / sizeof(*colors);
int main()
{
std::vector<std::string> class_names;
{
std::ifstream class_file("classes.txt");
if (!class_file)
{
std::cerr << "failed to open classes.txt\n";
return 0;
}
std::string line;
while (std::getline(class_file, line))
class_names.push_back(line);
}
cv::VideoCapture source("demo.mp4");
auto net = cv::dnn::readNetFromDarknet("yolov4.cfg", "yolov4.weights");
net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);
// net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
// net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
auto output_names = net.getUnconnectedOutLayersNames();
cv::Mat frame, blob;
std::vector<cv::Mat> detections;
while(cv::waitKey(1) < 1)
{
source >> frame;
if (frame.empty())
{
cv::waitKey();
break;
}
auto total_start = std::chrono::steady_clock::now();
cv::dnn::blobFromImage(frame, blob, 0.00392, cv::Size(608, 608), cv::Scalar(), true, false, CV_32F);
net.setInput(blob);
auto dnn_start = std::chrono::steady_clock::now();
net.forward(detections, output_names);
auto dnn_end = std::chrono::steady_clock::now();
std::vector<int> indices[NUM_CLASSES];
std::vector<cv::Rect> boxes[NUM_CLASSES];
std::vector<float> scores[NUM_CLASSES];
for (auto& output : detections)
{
const auto num_boxes = output.rows;
for (int i = 0; i < num_boxes; i++)
{
auto x = output.at<float>(i, 0) * frame.cols;
auto y = output.at<float>(i, 1) * frame.rows;
auto width = output.at<float>(i, 2) * frame.cols;
auto height = output.at<float>(i, 3) * frame.rows;
cv::Rect rect(x - width/2, y - height/2, width, height);
for (int c = 0; c < NUM_CLASSES; c++)
{
auto confidence = *output.ptr<float>(i, 5 + c);
if (confidence >= CONFIDENCE_THRESHOLD)
{
boxes[c].push_back(rect);
scores[c].push_back(confidence);
}
}
}
}
for (int c = 0; c < NUM_CLASSES; c++)
cv::dnn::NMSBoxes(boxes[c], scores[c], 0.0, NMS_THRESHOLD, indices[c]);
for (int c= 0; c < NUM_CLASSES; c++)
{
for (size_t i = 0; i < indices[c].size(); ++i)
{
const auto color = colors[c % NUM_COLORS];
auto idx = indices[c][i];
const auto& rect = boxes[c][idx];
cv::rectangle(frame, cv::Point(rect.x, rect.y), cv::Point(rect.x + rect.width, rect.y + rect.height), color, 3);
std::ostringstream label_ss;
label_ss << class_names[c] << ": " << std::fixed << std::setprecision(2) << scores[c][idx];
auto label = label_ss.str();
int baseline;
auto label_bg_sz = cv::getTextSize(label.c_str(), cv::FONT_HERSHEY_COMPLEX_SMALL, 1, 1, &baseline);
cv::rectangle(frame, cv::Point(rect.x, rect.y - label_bg_sz.height - baseline - 10), cv::Point(rect.x + label_bg_sz.width, rect.y), color, cv::FILLED);
cv::putText(frame, label.c_str(), cv::Point(rect.x, rect.y - baseline - 5), cv::FONT_HERSHEY_COMPLEX_SMALL, 1, cv::Scalar(0, 0, 0));
}
}
auto total_end = std::chrono::steady_clock::now();
float inference_fps = 1000.0 / std::chrono::duration_cast<std::chrono::milliseconds>(dnn_end - dnn_start).count();
float total_fps = 1000.0 / std::chrono::duration_cast<std::chrono::milliseconds>(total_end - total_start).count();
std::ostringstream stats_ss;
stats_ss << std::fixed << std::setprecision(2);
stats_ss << "Inference FPS: " << inference_fps << ", Total FPS: " << total_fps;
auto stats = stats_ss.str();
int baseline;
auto stats_bg_sz = cv::getTextSize(stats.c_str(), cv::FONT_HERSHEY_COMPLEX_SMALL, 1, 1, &baseline);
cv::rectangle(frame, cv::Point(0, 0), cv::Point(stats_bg_sz.width, stats_bg_sz.height + 10), cv::Scalar(0, 0, 0), cv::FILLED);
cv::putText(frame, stats.c_str(), cv::Point(0, stats_bg_sz.height + 5), cv::FONT_HERSHEY_COMPLEX_SMALL, 1, cv::Scalar(255, 255, 255));
cv::namedWindow("output");
cv::imshow("output", frame);
}
return 0;
}
@vvmatorin
Copy link

vvmatorin commented Jul 26, 2021

@vvmatorin Can you check if the outputs match exactly with NMS disabled in both Darknet and OpenCV?

@YashasSamaga I set:

  • nms = 0 for darknet, with confidence threshold = 0.1
  • NMS_THRESHOLD = 0 for OpenCV, with CONFIDENCE_THRESHOLD = 0.1

The outputs don't match: the OpenCV output has fewer bounding boxes overall. Results are below:

@vvmatorin did you find any solution for this?

I haven't found a solution sadly.
I experimented a bit with rebuilding OpenCV for GPU inference with different parameters, but it didn't help.
At the moment, my guess is that it has something to do with the way darknet resizes images and how that differs from OpenCV, but I don't have time to check it, so I stopped there.

@AlexeyAB
Copy link

AlexeyAB commented Jul 26, 2021

About different resize approaches in the Darknet (letter_box=1 vs letter_box=0) and OpenCV-dnn AlexeyAB/darknet#232 (comment)

@marvision-ai
Copy link

marvision-ai commented Jul 26, 2021

@vvmatorin @Fetulhak do you see the same issue if you resize all your images with cv2.resize and scale your annotations to your specific [net] custom_width x custom_height BEFORE training with darknet? I think the goal is to make sure you have your data at the size you want before training the models.

Note that if you do not resize prior to training, this is the slowest training option, as Darknet must continuously resize all of the images as they are loaded from disk. Darknet does not keep a local image cache. When resizing images, Darknet does not maintain or respect aspect ratio unless you use letterbox. All images will be stretched as necessary to match the exact network dimensions defined in the "[net]" section of the cfg file.

@sctrueew
Copy link

sctrueew commented Jul 27, 2021

Hi everyone,

I've trained a model and some objects are multilabel annotated. When I use the darknet for detecting gives me labels for each object but OpenCV-DNN only returns one label.

I used the latest version of the darknet for detecting and I used OpenCV-dnn for detecting in OpenCV.
I found this example of OpenCV and I used this too and everything is Okay but the inference is slow and FP is much because I have almost 400 classes.
I changed the parameters in
cv::dnn::NMSBoxes(boxes, confidences, CONFIDENCE_THRESHOLD, NMS_THRESHOLD, indices); but I couldn't give a good result.

Where is the problem?

Thanks in advance!

@YashasSamaga
Copy link
Author

YashasSamaga commented Jul 27, 2021

@zpmmehrdad

I've trained a model and some objects are multilabel annotated. When I use the darknet for detecting gives me labels for each object but OpenCV-DNN only returns one label.

I suspect it's performing a single NMS. What you need is classwise NMS. You need to store boxes of a particular class separately and perform NMS on each such class separately.

Okay but the inference is slow and FP is much because I have almost 400 classes.

You should check what step in the pipeline is taking most of the time. I think the DNN inference is probably fast and IO is the bottleneck. You have to identify the bottleneck and try to mitigate it.

@sctrueew
Copy link

sctrueew commented Jul 27, 2021

@YashasSamaga
I suspect it's performing a single NMS. What you need is classwise NMS. You need to store boxes of a particular class separately and perform NMS on each such class separately.

Thanks for the reply, how can I set up NMS for a few specific classes, not all.

@YashasSamaga
Copy link
Author

YashasSamaga commented Jul 28, 2021

@zpmmehrdad The C++ example in this gist does classwise NMS. DetectionModel class allows you to choose between classwise NMS and across class NMS.

@sctrueew
Copy link

sctrueew commented Jul 28, 2021

@YashasSamaga Hi,
I tested this DetectionModel but some objects that are multi-label only detect one class. This version works well but have two problems, FP is high and very slow with about 450 classes.

@YashasSamaga
Copy link
Author

YashasSamaga commented Jul 28, 2021

@zpmmehrdad Can you try with the other NMS configuration in DetectionModel? (try both options and see if one of them meets your requirements)

What is FP? What GPU do you have? Did you use DNN_TARGET_CUDA or DNN_TARGET_CUDA_FP16?

@aabilityuk
Copy link

aabilityuk commented Sep 4, 2021

Hi, YashasSamaga!!! Thank you for a code sharing, i am using your code example for yolov4-tiny model inference trained with images dataset (416x416) for face mask detection! Everything is working good, except of one thing, when i stay close to webcam (approximately 0.5 - 1 meters) algorithm constantly determines the face mask is on even if there is no mask on the face, if i stay 1,5 and more meters far from the web camera everything works as expected and face masks determines in correct way!!! Could you please to help me tune the code to solve my problem

@YashasSamaga
Copy link
Author

YashasSamaga commented Sep 4, 2021

Hi, YashasSamaga!!! Thank you for a code sharing, i am using your code example for yolov4-tiny model inference trained with images dataset (416x416) for face mask detection! Everything is working good, except of one thing, when i stay close to webcam (approximately 0.5 - 1 meters) algorithm constantly determines the face mask is on even if there is no mask on the face, if i stay 1,5 and more meters far from the web camera everything works as expected and face masks determines in correct way!!! Could you please to help me tune the code to solve my problem

I think this is an issue with your dataset used for training and less to do with OpenCV or DNN inference. Maybe your dataset lacks sufficient samples where people are close to the camera. I don't really know much about training models but here is an idea:

  1. detect faces using a face detector
  2. crop around the face (include some background too)
  3. resize the image

This might augment your dataset to have more samples where the face occupies a large portion of the image. I am not sure if this will exactly mimic being closer to the camera but it may work.

@aabilityuk
Copy link

aabilityuk commented Sep 4, 2021

I think this is an issue with your dataset used for training and less to do with OpenCV or DNN inference. Maybe your dataset lacks sufficient samples where people are close to the camera. I don't really know much about training models but here is an idea:

1. detect faces using a face detector

2. crop around the face (include some background too)

3. resize the image

This might augment your dataset to have more samples where the face occupies a large portion of the image. I am not sure if this will exactly mimic being closer to the camera but it may work.

Thank you very much for a quick reply and tips you gave me! I will try to use another images dataset with closer faces and will see what happened

@dgp52
Copy link

dgp52 commented Sep 22, 2021

Hi @YashasSamaga, I'm having the similar issue like others. OpenCV is missing some of the detections as compared to the darknet detections. Has anyone found any solution to this issue?

Heres what I have (Using Google colab):
Inside yolov4-custom.cfg

[net]
#Testing
batch=1
subdivisions=1
#Training
#batch=64
#subdivisions=64
width=640
height=640
.
.
filters=27
classes=4
.

Test image size is 640x640

Darknet:
!./darknet/darknet detector test custom_data_sides_colored/labelled_data.data /content/drive/MyDrive/yolo/test_model/sides-colored/yolov4-custom.cfg /content/drive/MyDrive/yolo/backup-yolov4-models/yolov4-custom_final.weights /content/drive/MyDrive/custom-object-detection/color-img/640-bw.jpg -thresh 0.01

Console info after running the above code:

CUDA-version: 11010 (11020), cuDNN: 7.6.5, GPU count: 1  
OpenCV version: 3.2.0
.

Notice how it prints "OpenCV version 3.2.0". Could this be part of the problem ? because it's using different version than the installed cv2 version (4.5.3)?

OpenCV:

cv2.__version__
4.5.3
# Load the custom YOLOv4 weights/cfg and the 640x640 test image.
net = cv2.dnn.readNet("/content/drive/MyDrive/yolo/backup-yolov4-models/yolov4-custom_final.weights", "/content/drive/MyDrive/yolo/test_model/sides-colored/yolov4-custom.cfg")
img = cv2.imread('/content/drive/MyDrive/custom-object-detection/color-img/640-bw.jpg', flags=cv2.IMREAD_COLOR) 
# Same 0.01 confidence threshold as the darknet "-thresh 0.01" run above.
CONFIDENCE_THRESHOLD = 0.01
NMS_THRESHOLD = 0.4

#Tried DNN_BACKEND_CUDA and DNN_TARGET_CUDA_FP16 but no difference
#net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
#net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA_FP16)

# Run on the OpenCV CPU backend.
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)
model = cv2.dnn_DetectionModel(net)
# Input size matches width/height in the [net] section of the cfg (640x640).
model.setInputParams(size=(640, 640), scale=1/255, swapRB=True,crop= False)
classes, scores, boxes = model.detect(img, CONFIDENCE_THRESHOLD, NMS_THRESHOLD)
# Print every detection and outline it in black on the image.
for idx, (classid, score, box) in enumerate(zip(classes, scores, boxes)):
  print("Class: " + str(classid) + " Score:" + str(score))
  cv2.rectangle(img, box, (0, 0, 0), 2)
# cv2_imshow is not defined in this snippet; presumably the Colab display helper.
cv2_imshow(img) 

@YashasSamaga
Copy link
Author

YashasSamaga commented Sep 22, 2021

@dgp52 Does it happen with DNN_BACKEND_OPENCV and DNN_TARGET_CPU?

@Fetulhak
Copy link

Fetulhak commented Sep 22, 2021

@YashasSamaga I have tried a lot of options to see why this happens but sadly I did not get any solution. The results in Darknet are far better than OpenCv-dnn as well as other tensorflow implementations of darknet yolo.

@dgp52
Copy link

dgp52 commented Sep 22, 2021

@dgp52 Does it happen with DNN_BACKEND_OPENCV and DNN_TARGET_CPU?

Correct. It happens with DNN_BACKEND_OPENCV and DNN_TARGET_CPU. I have also tried using DNN_BACKEND_CUDA and DNN_TARGET_CUDA_FP16 but no difference.

@YashasSamaga
Copy link
Author

YashasSamaga commented Sep 22, 2021

Correct. It happens with DNN_BACKEND_OPENCV and DNN_TARGET_CPU. I have also tried using DNN_BACKEND_CUDA and DNN_TARGET_CUDA_FP16 but no difference.

@dgp52 @Fetulhak @zpmmehrdad @vvmatorin and others

OpenCV has internal tests most of which are shared across all backends. The CUDA backend always trails the OpenCV CPU backend in terms of correctness and features; it just mimics what OpenCV CPU backend does (this is like the reference implementation). Since it also happens in the CPU backend, the patch must first go there. I think an issue should be opened at OpenCV repository reporting this issue in the OCV CPU backend (and mention as a sidenote that the same behavior is observed in the CUDA backend).

Please also check AlexeyAB's comment:

About different resize approaches in the Darknet (letter_box=1 vs letter_box=0) and OpenCV-dnn AlexeyAB/darknet#232 (comment)

@dgp52
Copy link

dgp52 commented Sep 25, 2021

@YashasSamaga @vvmatorin @marvision-ai @Fetulhak
Explicitly setting the thresh value in the test cfg file worked for me! Now it's giving me exact detections!

Here's what I think is happening. Feel free to correct me but it seems like OpenCV has a default confidence value it checks against during the detection process. Almost like a minimum threshold, so anything lower than this value won't get detected. We can overwrite this value by adding thresh = 0.01 or any default value you would like. Add this value for all three yolo layers.

Ex:

[yolo]
thresh = 0.01
.
.
[yolo]
thresh = 0.01
.
.
[yolo]
thresh = 0.01

I'm curious to see if this works for others.

@Fetulhak
Copy link

Fetulhak commented Sep 26, 2021

@dgp52 this seems very interesting if it works. I will check it for my dataset.

@angeloken
Copy link

angeloken commented Oct 4, 2021

When I changed yolov3 to yolov4's cfg and weights, a cv::Exception error occurred during dnn::readNetFromDarknet. I think it may be because of the opencv version problem, because 4.2 was proposed at the end of last year, but yolov4 was proposed in April this year.

i got the same error before...

i just ignored the FromDarknet and just used dnn.readNet

@hlacikd
Copy link

hlacikd commented Nov 17, 2021

starting from opencv 4.5.4 classid is no longer list, so code needs to be fixed
from
label = "%s : %f" % (class_names[classid[0]], score)
to
label = "%s : %f" % (class_names[classid], score)

@marvision-ai
Copy link

marvision-ai commented Dec 8, 2021

@dgp52 very interesting! You didn't actually have to train the network with thresh=0.01 correct? Only set it in the .cfg file for a trained model?

@dgp52
Copy link

dgp52 commented Jan 1, 2022

@marvision-ai Sorry for not getting back to you sooner. That is correct. Just needed to update the .cfg file.

@PROGRAMMINGENGINEER-NIKI
Copy link

PROGRAMMINGENGINEER-NIKI commented Jan 13, 2022

Hi Yashas, I need to access GPU power when running YOLOv4. What CUDA and Cudnn version should I install?

Because I installed CUDA 11.5 and cudnn 8.3.2, but when I install OpenCV, it does not find cudnn, as a result, it does not run fast.

@YashasSamaga
Copy link
Author

YashasSamaga commented Jan 13, 2022

I haven't checked the latest versions of cuDNN but cuDNN 7.6.5 gave the best performance six months ago.

@doleron
Copy link

doleron commented Jan 18, 2022

Hi @YashasSamaga , I wrote this code for YOLO V5 / OpenCV / DNN

C++

#include <chrono>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include <opencv2/opencv.hpp>

/**
 * Reads class labels from "config_files/classes.txt", one label per line.
 *
 * Leading and trailing whitespace (including the '\r' left behind by CRLF
 * line endings) is removed from every label. Blank lines are kept as empty
 * labels so that a label's position still equals its class id.
 *
 * @return the labels in file order; empty when the file cannot be opened.
 */
std::vector<std::string> load_class_list()
{
    static const char *const kWhitespace = " \t\r\n";

    std::vector<std::string> class_list;
    std::ifstream ifs("config_files/classes.txt");
    std::string line;
    while (std::getline(ifs, line))
    {
        const std::string::size_type first = line.find_first_not_of(kWhitespace);
        if (first == std::string::npos)
        {
            // Whitespace-only line: keep the slot so later ids do not shift.
            class_list.emplace_back();
            continue;
        }
        const std::string::size_type last = line.find_last_not_of(kWhitespace);
        class_list.push_back(line.substr(first, last - first + 1));
    }
    return class_list;
}

/**
 * Loads "config_files/yolov5s.onnx" and stores the configured network in net.
 *
 * @param net      receives the loaded network.
 * @param is_cuda  true selects the CUDA backend with the FP16 target,
 *                 false selects the OpenCV backend with the CPU target.
 */
void load_net(cv::dnn::Net &net, bool is_cuda)
{
    cv::dnn::Net loaded = cv::dnn::readNet("config_files/yolov5s.onnx");

    // Announce the choice first, then apply the matching backend/target pair.
    std::cout << (is_cuda ? "Attempty to use CUDA\n" : "Running on CPU\n");
    loaded.setPreferableBackend(is_cuda ? cv::dnn::DNN_BACKEND_CUDA : cv::dnn::DNN_BACKEND_OPENCV);
    loaded.setPreferableTarget(is_cuda ? cv::dnn::DNN_TARGET_CUDA_FP16 : cv::dnn::DNN_TARGET_CPU);

    net = loaded;
}

// Drawing palette; main() picks an entry with classId % colors.size().
const std::vector<cv::Scalar> colors = {
    cv::Scalar(255, 255, 0),
    cv::Scalar(0, 255, 0),
    cv::Scalar(0, 255, 255),
    cv::Scalar(255, 0, 0),
};

// Blob size given to the network; detect() also divides the padded image
// size by these to scale boxes back to image pixels.
const float INPUT_WIDTH = 640.0f;
const float INPUT_HEIGHT = 640.0f;

// SCORE_THRESHOLD: minimum best-class score in detect(), also passed to NMSBoxes.
const float SCORE_THRESHOLD = 0.2f;
// NMS_THRESHOLD: overlap threshold passed to NMSBoxes.
const float NMS_THRESHOLD = 0.4f;
// CONFIDENCE_THRESHOLD: minimum value of element 4 of a prediction row.
const float CONFIDENCE_THRESHOLD = 0.4f;

// One detection that survived non-maximum suppression in detect().
struct Detection
{
    // Column index of the highest class score in the prediction row.
    int class_id;
    // Element 4 of the prediction row (the value compared against CONFIDENCE_THRESHOLD).
    float confidence;
    // Top-left based box, already scaled by detect()'s x_factor / y_factor.
    cv::Rect box;
};

/**
 * Pads an image to a square canvas.
 *
 * The canvas side is the larger of the source's width and height, it is
 * 8-bit 3-channel and zero-filled, and the source is copied into its
 * top-left corner.
 *
 * @param source  image to pad.
 * @return the square canvas containing the source.
 */
cv::Mat format_yolov5(const cv::Mat &source) {
    const int side = source.cols < source.rows ? source.rows : source.cols;
    cv::Mat canvas(side, side, CV_8UC3, cv::Scalar::all(0));
    const cv::Rect top_left(0, 0, source.cols, source.rows);
    source.copyTo(canvas(top_left));
    return canvas;
}

/**
 * Runs the network on one image and appends the detections that survive
 * non-maximum suppression to output.
 *
 * The image is padded to a square by format_yolov5(), converted to an
 * INPUT_WIDTH x INPUT_HEIGHT blob and forwarded. Each prediction row is read
 * as [cx, cy, w, h, confidence, class scores...]. The row count and row
 * length are taken from the output tensor's shape, so models with a
 * different number of classes or candidate boxes are handled.
 *
 * @param image      frame to analyse (not modified here).
 * @param net        loaded network.
 * @param output     detections are appended to this vector.
 * @param className  class labels; at most className.size() class scores are
 *                   inspected per row.
 */
void detect(cv::Mat &image, cv::dnn::Net &net, std::vector<Detection> &output, const std::vector<std::string> &className) {
    cv::Mat blob;

    auto input_image = format_yolov5(image);

    cv::dnn::blobFromImage(input_image, blob, 1./255., cv::Size(INPUT_WIDTH, INPUT_HEIGHT), cv::Scalar(), true, false);
    net.setInput(blob);
    std::vector<cv::Mat> outputs;
    net.forward(outputs, net.getUnconnectedOutLayersNames());
    if (outputs.empty()) {
        return;
    }

    // Derive the layout from the tensor instead of assuming 25200 x 85.
    cv::Mat &predictions = outputs[0];
    int rows = 0;
    int dimensions = 0;
    if (predictions.dims == 3) {
        // batch x rows x dimensions; only the first batch entry is read
        rows = predictions.size[1];
        dimensions = predictions.size[2];
    } else if (predictions.dims == 2) {
        rows = predictions.size[0];
        dimensions = predictions.size[1];
    }

    // Never look at more class scores than the row holds or than we have labels for.
    const int scores_in_row = dimensions - 5;
    const int label_count = static_cast<int>(className.size());
    const int num_classes = label_count < scores_in_row ? label_count : scores_in_row;
    if (rows <= 0 || num_classes <= 0) {
        return;
    }

    // Boxes are predicted in blob coordinates; scale them back to the padded image.
    const float x_factor = input_image.cols / INPUT_WIDTH;
    const float y_factor = input_image.rows / INPUT_HEIGHT;

    float *data = reinterpret_cast<float *>(predictions.data);

    std::vector<int> class_ids;
    std::vector<float> confidences;
    std::vector<cv::Rect> boxes;

    for (int i = 0; i < rows; ++i, data += dimensions) {

        const float confidence = data[4];
        if (confidence < CONFIDENCE_THRESHOLD) {
            continue;
        }

        // Wrap the class scores of this row (no copy) to find the best class.
        float *classes_scores = data + 5;
        cv::Mat scores(1, num_classes, CV_32FC1, classes_scores);
        cv::Point class_id;
        double max_class_score;
        minMaxLoc(scores, 0, &max_class_score, 0, &class_id);
        if (max_class_score <= SCORE_THRESHOLD) {
            continue;
        }

        confidences.push_back(confidence);
        class_ids.push_back(class_id.x);

        // Convert the centre-based box to top-left based pixel coordinates.
        const float x = data[0];
        const float y = data[1];
        const float w = data[2];
        const float h = data[3];
        const int left = int((x - 0.5 * w) * x_factor);
        const int top = int((y - 0.5 * h) * y_factor);
        const int width = int(w * x_factor);
        const int height = int(h * y_factor);
        boxes.push_back(cv::Rect(left, top, width, height));
    }

    std::vector<int> nms_result;
    cv::dnn::NMSBoxes(boxes, confidences, SCORE_THRESHOLD, NMS_THRESHOLD, nms_result);
    for (const int idx : nms_result) {
        Detection result;
        result.class_id = class_ids[idx];
        result.confidence = confidences[idx];
        result.box = boxes[idx];
        output.push_back(result);
    }
}

int main(int argc, char **argv)
{

    std::vector<std::string> class_list = load_class_list();

    cv::Mat frame;
    cv::VideoCapture capture("sample.mp4");
    if (!capture.isOpened())
    {
        std::cerr << "Error opening video file\n";
        return -1;
    }

    bool is_cuda = argc > 1 && strcmp(argv[1], "cuda") == 0;

    cv::dnn::Net net;
    load_net(net, is_cuda);

    auto start = std::chrono::high_resolution_clock::now();
    int frame_count = 0;
    float fps = -1;
    int total_frames = 0;

    while (true)
    {
        capture.read(frame);
        if (frame.empty())
        {
            std::cout << "End of stream\n";
            break;
        }

        std::vector<Detection> output;
        detect(frame, net, output, class_list);

        frame_count++;
        total_frames++;

        int detections = output.size();

        for (int i = 0; i < detections; ++i)
        {

            auto detection = output[i];
            auto box = detection.box;
            auto classId = detection.class_id;
            const auto color = colors[classId % colors.size()];
            cv::rectangle(frame, box, color, 3);

            cv::rectangle(frame, cv::Point(box.x, box.y - 20), cv::Point(box.x + box.width, box.y), color, cv::FILLED);
            cv::putText(frame, class_list[classId].c_str(), cv::Point(box.x, box.y - 5), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
        }

        if (frame_count >= 30)
        {

            auto end = std::chrono::high_resolution_clock::now();
            fps = frame_count * 1000.0 / std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

            frame_count = 0;
            start = std::chrono::high_resolution_clock::now();
        }

        if (fps > 0)
        {

            std::ostringstream fps_label;
            fps_label << std::fixed << std::setprecision(2);
            fps_label << "FPS: " << fps;
            std::string fps_label_str = fps_label.str();

            cv::putText(frame, fps_label_str.c_str(), cv::Point(10, 25), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 0, 255), 2);
        }

        cv::imshow("output", frame);

        if (cv::waitKey(1) != -1)
        {
            capture.release();
            std::cout << "finished by user\n";
            break;
        }
    }

    std::cout << "Total frames: " << total_frames << "\n";

    return 0;
}

Python

import cv2
import time
import sys
import numpy as np

def build_model(is_cuda):
    """Load "config_files/yolov5s.onnx" and select where it runs.

    Args:
        is_cuda: truthy selects the CUDA backend with the FP16 target,
            falsy selects the OpenCV backend with the CPU target.

    Returns:
        The configured network object returned by cv2.dnn.readNet.
    """
    net = cv2.dnn.readNet("config_files/yolov5s.onnx")
    if not is_cuda:
        print("Running on CPU")
        backend, target = cv2.dnn.DNN_BACKEND_OPENCV, cv2.dnn.DNN_TARGET_CPU
    else:
        print("Attempty to use CUDA")
        backend, target = cv2.dnn.DNN_BACKEND_CUDA, cv2.dnn.DNN_TARGET_CUDA_FP16
    net.setPreferableBackend(backend)
    net.setPreferableTarget(target)
    return net

# Blob size fed to the network, in pixels.
INPUT_WIDTH, INPUT_HEIGHT = 640, 640

# Detection thresholds.
CONFIDENCE_THRESHOLD = 0.4
SCORE_THRESHOLD = 0.2
NMS_THRESHOLD = 0.4

def detect(image, net):
    """Run one forward pass of net on image and return the raw predictions.

    The image is converted to an INPUT_WIDTH x INPUT_HEIGHT blob scaled by
    1/255 with the R and B channels swapped and no cropping.
    """
    net.setInput(
        cv2.dnn.blobFromImage(
            image, 1/255.0, (INPUT_WIDTH, INPUT_HEIGHT), swapRB=True, crop=False
        )
    )
    return net.forward()

def load_capture():
    """Return a cv2.VideoCapture opened on "sample.mp4"."""
    return cv2.VideoCapture("sample.mp4")

def load_classes():
    """Read "config_files/classes.txt" and return its lines, whitespace-stripped.

    The list keeps file order, so a label's index is its line number (0-based).
    """
    with open("config_files/classes.txt", "r") as handle:
        return [line.strip() for line in handle]

# Labels are loaded once at module level; the drawing loop indexes this list by class id.
class_list = load_classes()

def wrap_detection(input_image, output_data, confidence_threshold=0.4,
                   score_threshold=0.25, nms_threshold=0.45):
    """Turn raw network predictions into class ids, confidences and boxes.

    Each row of output_data is read as
    [cx, cy, w, h, confidence, class scores...]. Rows are kept when the
    confidence and the best class score pass their thresholds; boxes are
    converted to top-left based [left, top, width, height] in pixels of
    input_image and then filtered with cv2.dnn.NMSBoxes.

    Args:
        input_image: the image given to the network; only its height and
            width are used.
        output_data: indexable collection of prediction rows.
        confidence_threshold: minimum value of row[4] for a row to be kept.
        score_threshold: minimum best class score, also used as the score
            threshold of NMSBoxes.
        nms_threshold: overlap threshold of NMSBoxes.

    Returns:
        (class_ids, confidences, boxes): three parallel lists holding the
        detections that survived non-maximum suppression.
    """
    class_ids = []
    confidences = []
    boxes = []

    # shape is (rows, cols, ...), i.e. height first, then width.
    image_height, image_width = input_image.shape[:2]

    x_factor = image_width / INPUT_WIDTH
    y_factor = image_height / INPUT_HEIGHT

    for row in output_data:
        confidence = row[4]
        if confidence < confidence_threshold:
            continue

        classes_scores = row[5:]
        _, _, _, max_indx = cv2.minMaxLoc(classes_scores)
        class_id = max_indx[1]
        if classes_scores[class_id] <= score_threshold:
            continue

        confidences.append(confidence)
        class_ids.append(class_id)

        # Centre-based box -> top-left based box in image pixels.
        x, y, w, h = row[0].item(), row[1].item(), row[2].item(), row[3].item()
        left = int((x - 0.5 * w) * x_factor)
        top = int((y - 0.5 * h) * y_factor)
        width = int(w * x_factor)
        height = int(h * y_factor)
        boxes.append(np.array([left, top, width, height]))

    indexes = cv2.dnn.NMSBoxes(boxes, confidences, score_threshold, nms_threshold)

    result_class_ids = []
    result_confidences = []
    result_boxes = []

    # Depending on the OpenCV version, NMSBoxes returns a flat array, an
    # Nx1 array, or an empty tuple; flattening gives plain indices for all.
    for i in np.array(indexes).flatten():
        keep = int(i)
        result_confidences.append(confidences[keep])
        result_class_ids.append(class_ids[keep])
        result_boxes.append(boxes[keep])

    return result_class_ids, result_confidences, result_boxes

def format_yolov5(frame):
    """Pad a 3-dimensional image array to a square, zero-filled uint8 canvas.

    The canvas side is the larger of the frame's height and width, it has
    3 channels, and the frame is copied into its top-left corner.
    """
    height, width, _channels = frame.shape
    side = max(width, height)
    canvas = np.zeros((side, side, 3), np.uint8)
    canvas[:height, :width] = frame
    return canvas


# Drawing palette; the loop below cycles through it by class id.
colors = [(255, 255, 0), (0, 255, 0), (0, 255, 255), (255, 0, 0)]

# "cuda" as the first command-line argument requests the CUDA backend.
is_cuda = sys.argv[1:2] == ["cuda"]

net = build_model(is_cuda)
capture = load_capture()

# FPS is averaged over windows of 30 frames; fps stays negative until the
# first window has completed.
start = time.time_ns()
frame_count = 0
total_frames = 0
fps = -1

while True:

    frame = capture.read()[1]
    if frame is None:
        print("End of stream")
        break

    # Detection runs on the square-padded copy; drawing happens on the frame.
    inputImage = format_yolov5(frame)
    outs = detect(inputImage, net)

    class_ids, confidences, boxes = wrap_detection(inputImage, outs[0])

    frame_count += 1
    total_frames += 1

    for (classid, confidence, box) in zip(class_ids, confidences, boxes):
        color = colors[int(classid) % len(colors)]
        label_origin = (box[0], box[1] - 20)
        label_corner = (box[0] + box[2], box[1])
        cv2.rectangle(frame, box, color, 2)
        cv2.rectangle(frame, label_origin, label_corner, color, -1)
        cv2.putText(frame, class_list[classid], (box[0], box[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, .5, (0,0,0))

    if frame_count >= 30:
        end = time.time_ns()
        fps = 1_000_000_000 * frame_count / (end - start)
        frame_count = 0
        start = time.time_ns()

    if fps > 0:
        fps_label = f"FPS: {fps:.2f}"
        cv2.putText(frame, fps_label, (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

    cv2.imshow("output", frame)

    # Any key press ends the loop.
    if cv2.waitKey(1) >= 0:
        print("finished by user")
        break

print(f"Total frames: {total_frames}")

A more up-to-date version, with instructions for running the code, can be found here: https://github.com/doleron/yolov4-opencv-cpp-python

@YashGugliya
Copy link

YashGugliya commented Feb 15, 2022

I want to use the output from the opencv object detections how can i do that?

@shridharkini6
Copy link

shridharkini6 commented Mar 15, 2022

@YashasSamaga
How to achieve batched inference using cv2.dnn_DetectionModel.detect for yolo

@mochechan
Copy link

mochechan commented Jul 18, 2022

The code in python is fine for me, but its c++ version doesn't work for me.

Compile:

/usr/bin/c++   -lstdc++  -g -pg -O0 -pthread -lpthread -lstdc++fs  -std=c++14 -std=c++17 -fPIC -std=gnu++11 -rdynamic -I/usr/local/include/tkDNN/ -I/usr/local/cuda/include -std=c++1z  -std=gnu++1z  CMakeFiles/predev2.dir/src/yolov4_opencv_dnn_cuda.cpp.o -o predev2   -L/usr/local/cuda/lib64  -Wl,-rpath,/usr/local/cuda/lib64:/usr/local/lib -ltkDNN -lcurl /usr/lib/aarch64-linux-gnu/libnvinfer.so /usr/local/cuda/lib64/libcudart_static.a -ldl /usr/lib/aarch64-linux-gnu/librt.so /usr/lib/aarch64-linux-gnu/libcublas.so /usr/lib/aarch64-linux-gnu/libcudnn.so /usr/lib/aarch64-linux-gnu/libnvinfer.so /usr/local/lib/libopencv_gapi.so.4.6.0 /usr/local/lib/libopencv_stitching.so.4.6.0 /usr/local/lib/libopencv_alphamat.so.4.6.0 /usr/local/lib/libopencv_aruco.so.4.6.0 /usr/local/lib/libopencv_barcode.so.4.6.0 /usr/local/lib/libopencv_bgsegm.so.4.6.0 /usr/local/lib/libopencv_bioinspired.so.4.6.0 /usr/local/lib/libopencv_ccalib.so.4.6.0 /usr/local/lib/libopencv_cudabgsegm.so.4.6.0 /usr/local/lib/libopencv_cudafeatures2d.so.4.6.0 /usr/local/lib/libopencv_cudaobjdetect.so.4.6.0 /usr/local/lib/libopencv_cudastereo.so.4.6.0 /usr/local/lib/libopencv_dnn_objdetect.so.4.6.0 /usr/local/lib/libopencv_dnn_superres.so.4.6.0 /usr/local/lib/libopencv_dpm.so.4.6.0 /usr/local/lib/libopencv_face.so.4.6.0 /usr/local/lib/libopencv_freetype.so.4.6.0 /usr/local/lib/libopencv_fuzzy.so.4.6.0 /usr/local/lib/libopencv_hdf.so.4.6.0 /usr/local/lib/libopencv_hfs.so.4.6.0 /usr/local/lib/libopencv_img_hash.so.4.6.0 /usr/local/lib/libopencv_intensity_transform.so.4.6.0 /usr/local/lib/libopencv_line_descriptor.so.4.6.0 /usr/local/lib/libopencv_mcc.so.4.6.0 /usr/local/lib/libopencv_quality.so.4.6.0 /usr/local/lib/libopencv_rapid.so.4.6.0 /usr/local/lib/libopencv_reg.so.4.6.0 /usr/local/lib/libopencv_rgbd.so.4.6.0 /usr/local/lib/libopencv_saliency.so.4.6.0 /usr/local/lib/libopencv_stereo.so.4.6.0 /usr/local/lib/libopencv_structured_light.so.4.6.0 /usr/local/lib/libopencv_superres.so.4.6.0 
/usr/local/lib/libopencv_surface_matching.so.4.6.0 /usr/local/lib/libopencv_tracking.so.4.6.0 /usr/local/lib/libopencv_videostab.so.4.6.0 /usr/local/lib/libopencv_wechat_qrcode.so.4.6.0 /usr/local/lib/libopencv_xfeatures2d.so.4.6.0 /usr/local/lib/libopencv_xobjdetect.so.4.6.0 /usr/local/lib/libopencv_xphoto.so.4.6.0 /usr/local/cuda/lib64/libcudart_static.a -ldl /usr/lib/aarch64-linux-gnu/librt.so /usr/lib/aarch64-linux-gnu/libcublas.so /usr/lib/aarch64-linux-gnu/libcudnn.so -lpthread /usr/local/lib/libopencv_shape.so.4.6.0 /usr/local/lib/libopencv_highgui.so.4.6.0 /usr/local/lib/libopencv_datasets.so.4.6.0 /usr/local/lib/libopencv_plot.so.4.6.0 /usr/local/lib/libopencv_text.so.4.6.0 /usr/local/lib/libopencv_ml.so.4.6.0 /usr/local/lib/libopencv_phase_unwrapping.so.4.6.0 /usr/local/lib/libopencv_cudacodec.so.4.6.0 /usr/local/lib/libopencv_videoio.so.4.6.0 /usr/local/lib/libopencv_cudaoptflow.so.4.6.0 /usr/local/lib/libopencv_cudalegacy.so.4.6.0 /usr/local/lib/libopencv_cudawarping.so.4.6.0 /usr/local/lib/libopencv_optflow.so.4.6.0 /usr/local/lib/libopencv_ximgproc.so.4.6.0 /usr/local/lib/libopencv_video.so.4.6.0 /usr/local/lib/libopencv_imgcodecs.so.4.6.0 /usr/local/lib/libopencv_objdetect.so.4.6.0 /usr/local/lib/libopencv_calib3d.so.4.6.0 /usr/local/lib/libopencv_dnn.so.4.6.0 /usr/local/lib/libopencv_features2d.so.4.6.0 /usr/local/lib/libopencv_flann.so.4.6.0 /usr/local/lib/libopencv_photo.so.4.6.0 /usr/local/lib/libopencv_cudaimgproc.so.4.6.0 /usr/local/lib/libopencv_cudafilters.so.4.6.0 /usr/local/lib/libopencv_imgproc.so.4.6.0 /usr/local/lib/libopencv_cudaarithm.so.4.6.0 /usr/local/lib/libopencv_core.so.4.6.0 /usr/local/lib/libopencv_cudev.so.4.6.0 

Run:

$ gdb -batch -ex run -ex where -ex list -ex quit --args  ./predev2 
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/aarch64-linux-gnu/libthread_db.so.1".
[h264 @ 0x55557908e0] top block unavailable for requested intra mode -1
[h264 @ 0x55557908e0] error while decoding MB 2 0, bytestream 213160
[h264 @ 0x55557908e0] top block unavailable for requested intra mode -1
[h264 @ 0x55557908e0] error while decoding MB 69 0, bytestream 669
[New Thread 0x7f97f79b50 (LWP 3049)]
[New Thread 0x7f97778b50 (LWP 3050)]
[New Thread 0x7f96f77b50 (LWP 3051)]
[New Thread 0x7f96776b50 (LWP 3052)]
[New Thread 0x7f95f75b50 (LWP 3053)]
[New Thread 0x7f95774b50 (LWP 3054)]
[h264 @ 0x5555792890] top block unavailable for requested intra mode -1
[h264 @ 0x5555792890] error while decoding MB 2 0, bytestream 213160

Thread 1 "predev2" received signal SIGSEGV, Segmentation fault.
0x0000007fa3cbadc0 in cv::_InputArray::size(int) const () from /usr/lib/aarch64-linux-gnu/libopencv_core.so.4.1
#0  0x0000007fa3cbadc0 in cv::_InputArray::size(int) const () at /usr/lib/aarch64-linux-gnu/libopencv_core.so.4.1
#1  0x0000007fb6f8edfc in cv::resize(cv::_InputArray const&, cv::_OutputArray const&, cv::Size_<int>, double, double, int) () at /usr/local/lib/libopencv_imgproc.so.406
#2  0x0000007fb72f5d04 in cv::dnn::dnn4_v20220524::blobFromImages(cv::_InputArray const&, cv::_OutputArray const&, double, cv::Size_<int>, cv::Scalar_<double> const&, bool, bool, int) () at /usr/local/lib/libopencv_dnn.so.406
#3  0x0000007fb72f6864 in cv::dnn::dnn4_v20220524::blobFromImage(cv::_InputArray const&, cv::_OutputArray const&, double, cv::Size_<int> const&, cv::Scalar_<double> const&, bool, bool, int) () at /usr/local/lib/libopencv_dnn.so.406
#4  0x0000005555560498 in main() () at /home/a/ai22/src/yolov4_opencv_dnn_cuda.cpp:66
16	constexpr float CONFIDENCE_THRESHOLD = 0;
17	constexpr float NMS_THRESHOLD = 0.4;
18	constexpr int NUM_CLASSES = 80;
19	
20	// colors for bounding boxes
21	const cv::Scalar colors[] = {
22	    {0, 255, 255},
23	    {255, 255, 0},
24	    {0, 255, 0},
25	    {255, 0, 0}
A debugging session is active.

	Inferior 1 [process 3043] will be killed.

Quit anyway? (y or n) [answered Y; input not from terminal]

The following line makes the issue. What's the problem? How to solve this problem?

cv::dnn::blobFromImage(frame, blob, 0.00392, cv::Size(608, 608), cv::Scalar(), true, false, CV_32F);

Even if I change the size to cv::Size(512, 512) to fit my yolov4.cfg, the issue persists.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment