YOLOv4 on OpenCV DNN
import cv2
import time

CONFIDENCE_THRESHOLD = 0.2
NMS_THRESHOLD = 0.4
COLORS = [(0, 255, 255), (255, 255, 0), (0, 255, 0), (255, 0, 0)]

class_names = []
with open("classes.txt", "r") as f:
    class_names = [cname.strip() for cname in f.readlines()]

vc = cv2.VideoCapture("demo.mp4")

net = cv2.dnn.readNet("yolov4.weights", "yolov4.cfg")
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA_FP16)

model = cv2.dnn_DetectionModel(net)
model.setInputParams(size=(416, 416), scale=1/255, swapRB=True)

while cv2.waitKey(1) < 1:
    (grabbed, frame) = vc.read()
    if not grabbed:
        exit()

    start = time.time()
    classes, scores, boxes = model.detect(frame, CONFIDENCE_THRESHOLD, NMS_THRESHOLD)
    end = time.time()

    start_drawing = time.time()
    for (classid, score, box) in zip(classes, scores, boxes):
        color = COLORS[int(classid) % len(COLORS)]
        label = "%s : %f" % (class_names[classid[0]], score)
        cv2.rectangle(frame, box, color, 2)
        cv2.putText(frame, label, (box[0], box[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
    end_drawing = time.time()

    fps_label = "FPS: %.2f (excluding drawing time of %.2fms)" % (1 / (end - start), (end_drawing - start_drawing) * 1000)
    cv2.putText(frame, fps_label, (0, 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)
    cv2.imshow("detections", frame)
#include <iostream>
#include <queue>
#include <iterator>
#include <sstream>
#include <fstream>
#include <iomanip>
#include <chrono>

#include <opencv2/core.hpp>
#include <opencv2/dnn.hpp>
#include <opencv2/dnn/all_layers.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

constexpr float CONFIDENCE_THRESHOLD = 0;
constexpr float NMS_THRESHOLD = 0.4;
constexpr int NUM_CLASSES = 80;

// colors for bounding boxes
const cv::Scalar colors[] = {
    {0, 255, 255},
    {255, 255, 0},
    {0, 255, 0},
    {255, 0, 0}
};
const auto NUM_COLORS = sizeof(colors)/sizeof(colors[0]);

int main()
{
    std::vector<std::string> class_names;
    {
        std::ifstream class_file("classes.txt");
        if (!class_file)
        {
            std::cerr << "failed to open classes.txt\n";
            return 0;
        }

        std::string line;
        while (std::getline(class_file, line))
            class_names.push_back(line);
    }

    cv::VideoCapture source("demo.mp4");

    auto net = cv::dnn::readNetFromDarknet("yolov4.cfg", "yolov4.weights");
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);
    // net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
    // net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
    auto output_names = net.getUnconnectedOutLayersNames();

    cv::Mat frame, blob;
    std::vector<cv::Mat> detections;
    while (cv::waitKey(1) < 1)
    {
        source >> frame;
        if (frame.empty())
        {
            cv::waitKey();
            break;
        }

        auto total_start = std::chrono::steady_clock::now();
        cv::dnn::blobFromImage(frame, blob, 0.00392, cv::Size(608, 608), cv::Scalar(), true, false, CV_32F);
        net.setInput(blob);

        auto dnn_start = std::chrono::steady_clock::now();
        net.forward(detections, output_names);
        auto dnn_end = std::chrono::steady_clock::now();

        std::vector<int> indices[NUM_CLASSES];
        std::vector<cv::Rect> boxes[NUM_CLASSES];
        std::vector<float> scores[NUM_CLASSES];

        for (auto& output : detections)
        {
            const auto num_boxes = output.rows;
            for (int i = 0; i < num_boxes; i++)
            {
                auto x = output.at<float>(i, 0) * frame.cols;
                auto y = output.at<float>(i, 1) * frame.rows;
                auto width = output.at<float>(i, 2) * frame.cols;
                auto height = output.at<float>(i, 3) * frame.rows;
                cv::Rect rect(x - width/2, y - height/2, width, height);

                for (int c = 0; c < NUM_CLASSES; c++)
                {
                    auto confidence = *output.ptr<float>(i, 5 + c);
                    if (confidence >= CONFIDENCE_THRESHOLD)
                    {
                        boxes[c].push_back(rect);
                        scores[c].push_back(confidence);
                    }
                }
            }
        }

        for (int c = 0; c < NUM_CLASSES; c++)
            cv::dnn::NMSBoxes(boxes[c], scores[c], 0.0, NMS_THRESHOLD, indices[c]);

        for (int c = 0; c < NUM_CLASSES; c++)
        {
            for (size_t i = 0; i < indices[c].size(); ++i)
            {
                const auto color = colors[c % NUM_COLORS];

                auto idx = indices[c][i];
                const auto& rect = boxes[c][idx];
                cv::rectangle(frame, cv::Point(rect.x, rect.y), cv::Point(rect.x + rect.width, rect.y + rect.height), color, 3);

                std::ostringstream label_ss;
                label_ss << class_names[c] << ": " << std::fixed << std::setprecision(2) << scores[c][idx];
                auto label = label_ss.str();

                int baseline;
                auto label_bg_sz = cv::getTextSize(label.c_str(), cv::FONT_HERSHEY_COMPLEX_SMALL, 1, 1, &baseline);
                cv::rectangle(frame, cv::Point(rect.x, rect.y - label_bg_sz.height - baseline - 10), cv::Point(rect.x + label_bg_sz.width, rect.y), color, cv::FILLED);
                cv::putText(frame, label.c_str(), cv::Point(rect.x, rect.y - baseline - 5), cv::FONT_HERSHEY_COMPLEX_SMALL, 1, cv::Scalar(0, 0, 0));
            }
        }

        auto total_end = std::chrono::steady_clock::now();

        float inference_fps = 1000.0 / std::chrono::duration_cast<std::chrono::milliseconds>(dnn_end - dnn_start).count();
        float total_fps = 1000.0 / std::chrono::duration_cast<std::chrono::milliseconds>(total_end - total_start).count();

        std::ostringstream stats_ss;
        stats_ss << std::fixed << std::setprecision(2);
        stats_ss << "Inference FPS: " << inference_fps << ", Total FPS: " << total_fps;
        auto stats = stats_ss.str();

        int baseline;
        auto stats_bg_sz = cv::getTextSize(stats.c_str(), cv::FONT_HERSHEY_COMPLEX_SMALL, 1, 1, &baseline);
        cv::rectangle(frame, cv::Point(0, 0), cv::Point(stats_bg_sz.width, stats_bg_sz.height + 10), cv::Scalar(0, 0, 0), cv::FILLED);
        cv::putText(frame, stats.c_str(), cv::Point(0, stats_bg_sz.height + 5), cv::FONT_HERSHEY_COMPLEX_SMALL, 1, cv::Scalar(255, 255, 255));

        cv::namedWindow("output");
        cv::imshow("output", frame);
    }

    return 0;
}
@dnaveenr

dnaveenr commented Jun 18, 2020

Thanks for this Yashas. Could you please provide the corresponding python code for Efficient YOLOv3 Inference on OpenCV?

@YashasSamaga
Author

YashasSamaga commented Jun 19, 2020

@dnaveenr You just have to change the auto net = cv::dnn::readNetFromDarknet("yolov4.cfg", "yolov4.weights"); line. The code works for both YOLOv3 and YOLOv4.

@dnaveenr

dnaveenr commented Jun 19, 2020

Ok, thanks. I meant the equivalent Python code, but it's fine. I think the main difference is the following:

for (auto name : net.getUnconnectedOutLayersNames())
{
    int layerId = net.getLayerId(name);
    auto layer = net.getLayer(layerId).dynamicCast<cv::dnn::RegionLayer>();
    if (!layer.empty())
        layer->nmsThreshold = 0;
}

I'll make the changes.

@YashasSamaga
Author

YashasSamaga commented Jun 19, 2020

@dnaveenr Sorry, I read too fast. You can get the same effect by manually setting nms_threshold=0 (add it if not present already) in all [yolo] blocks in yolovN.cfg. I don't think nmsThreshold is exposed in python.
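For reference, after that edit each [yolo] block in yolovN.cfg would contain something like this (other keys in the block unchanged):

[yolo]
nms_threshold=0
...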

@dnaveenr

dnaveenr commented Jun 19, 2020

No problem. Thanks. I will try it out.

@sky-fly97

sky-fly97 commented Jun 26, 2020

Hello, I got an error that cv::dnn::dnn4_v20191202::RegionLayer has no member nmsThreshold (at lines 52-53), but if I delete those lines it works fine!

@sky-fly97

sky-fly97 commented Jun 26, 2020

By the way, how do I save the MP4 demo after running?

@YashasSamaga
Author

YashasSamaga commented Jun 26, 2020

@sky-fly97 I can guess that your OpenCV version is not the latest master. In that case, you will see performance regressions if you do not set the nms threshold to zero. The old solution is to set nms_threshold=0 in all [yolo] blocks in yolov3.cfg.

If you need YOLOv4 support, you need the master branch.

This is what happened:

  1. nmsThreshold was added a month ago to address spurious NMS in opencv/opencv#17371
  2. A new fix which sets nmsThreshold to zero by default was added in opencv/opencv#17592 (and the old fix is redundant now)

It's not required anymore and I have removed it from this gist.

@sky-fly97

sky-fly97 commented Jun 26, 2020

Yeah, now my version of OpenCV is 4.2. Thank you very much!

@YashasSamaga
Author

YashasSamaga commented Jun 26, 2020

@sky-fly97

By the way, how do I save the MP4 demo after running?

You need VideoWriter.

cv::VideoWriter writer("output.mp4", cv::VideoWriter::fourcc('M', 'P', 'E', 'G'), 30, cv::Size(width, height));

cv::Mat frame; // what you want to write
writer << frame;
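For the Python script in this gist, a minimal sketch of the same idea (untested; assumes vc is the cv2.VideoCapture opened above):

fps = vc.get(cv2.CAP_PROP_FPS)
width = int(vc.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(vc.get(cv2.CAP_PROP_FRAME_HEIGHT))
writer = cv2.VideoWriter("output.mp4", cv2.VideoWriter_fourcc(*"MPEG"), fps, (width, height))
# inside the loop, after drawing:
writer.write(frame)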

Yeah, now my version of opencv is 4.2.Thank you very much!

There have been a lot of performance improvements since then. OpenCV 4.2 was the first release with CUDA support.

@sky-fly97

sky-fly97 commented Jun 27, 2020


Thanks!

@sky-fly97

sky-fly97 commented Jun 27, 2020

When I changed yolov3 to yolov4's cfg and weights, a cv::Exception occurred during dnn::readNetFromDarknet. I think it may be an OpenCV version problem, because 4.2 was released at the end of last year, but YOLOv4 was proposed in April this year.

@YashasSamaga
Author

YashasSamaga commented Jun 27, 2020

@sky-fly97 There is no release which supports YOLOv4 yet. Support has been added for the next release on master. You need to use master if you need YOLOv4 support.

@sky-fly97

sky-fly97 commented Jun 27, 2020

When compiling the latest master version, I followed the same process but there were still some problems that caused the compilation to fail, which I did not encounter with the 4.2 and 4.3 versions... Could you please share your compiled OpenCV files? Thank you very much! It doesn't matter if it's inconvenient; I will study it again!

@YashasSamaga
Author

YashasSamaga commented Jun 27, 2020

@sky-fly97 What is the error that is causing the compilation to fail?

@sky-fly97

sky-fly97 commented Jun 27, 2020

I just asked this question in the community: https://github.com/opencv/opencv/issues/17677
Cannot specify link libraries for target "opencv_gapi"

@YashasSamaga
Author

YashasSamaga commented Jun 27, 2020

@sky-fly97

If you only need DNN with CUDA support, you need the following modules:

  • cudev
  • opencv_core
  • opencv_dnn
  • opencv_imgproc

You might also require the following to read/write/display images and videos:

  • opencv_imgcodecs
  • opencv_highgui
  • opencv_videoio

You can disable the rest.
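For example, a minimal configure step might look something like this (a sketch; the flags are OpenCV's standard CMake options, and the cudev module comes from opencv_contrib):

cmake -D WITH_CUDA=ON -D WITH_CUDNN=ON -D OPENCV_DNN_CUDA=ON \
      -D BUILD_LIST=cudev,core,dnn,imgproc,imgcodecs,highgui,videoio \
      -D OPENCV_EXTRA_MODULES_PATH=../opencv_contrib/modules ..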

@hlacik

hlacik commented Jul 17, 2020

Thank you very much for this; I have one question.
I understand that DNN_TARGET_CUDA is faster than DNN_TARGET_CUDA_FP16 on a GTX 1080 (since it has no half-precision cores),
but why does the same apply to the Jetson Nano?

DNN_TARGET_CUDA gives ~17fps, while DNN_TARGET_CUDA_FP16 only ~1fps

I am using custom trained yolov4_tiny (darknet)

@YashasSamaga
Author

YashasSamaga commented Jul 17, 2020

but why the same applies for jetson nano?

That's unusual. How did you measure the FPS? Note that the first few forward passes will be slow due to lazy initialization.

@hlacik

hlacik commented Jul 17, 2020

Unmodified yolov4.py python script from you, OpenCV compiled yesterday from git (master branch) with CUDA and cuDNN from Jetson JetPack 4.4 (CUDA 10.2, cuDNN 8).

Could it be an issue with cuDNN 8? Should I try the older cuDNN 7?

net = cv2.dnn.readNet(
    "yolov4-tiny_lp_final.weights", "yolov4-tiny_lp.cfg"
)
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA_FP16)

result :

FPS: 0.22 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.90 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.90 (excluding drawing time of 0.01ms)
FPS: 0.90 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.90 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)

and

net = cv2.dnn.readNet(
    "yolov4-tiny_lp_final.weights", "yolov4-tiny_lp.cfg"
)
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)

result

FPS: 0.31 (excluding drawing time of 0.02ms)
FPS: 16.19 (excluding drawing time of 0.01ms)
FPS: 16.98 (excluding drawing time of 0.01ms)
FPS: 17.39 (excluding drawing time of 0.01ms)
FPS: 17.40 (excluding drawing time of 0.01ms)
FPS: 17.76 (excluding drawing time of 0.01ms)
FPS: 17.96 (excluding drawing time of 0.01ms)
FPS: 17.90 (excluding drawing time of 0.01ms)
FPS: 17.82 (excluding drawing time of 0.01ms)
FPS: 17.77 (excluding drawing time of 0.01ms)
FPS: 17.52 (excluding drawing time of 0.01ms)
FPS: 17.61 (excluding drawing time of 0.01ms)
FPS: 17.87 (excluding drawing time of 0.01ms)
FPS: 17.75 (excluding drawing time of 0.01ms)
FPS: 17.70 (excluding drawing time of 0.01ms)
FPS: 17.50 (excluding drawing time of 0.01ms)
FPS: 17.61 (excluding drawing time of 0.01ms)
FPS: 17.56 (excluding drawing time of 0.01ms)
FPS: 17.80 (excluding drawing time of 0.01ms)
FPS: 17.95 (excluding drawing time of 0.01ms)
FPS: 17.78 (excluding drawing time of 0.01ms)
FPS: 17.89 (excluding drawing time of 0.01ms)
FPS: 17.87 (excluding drawing time of 0.01ms)
FPS: 17.83 (excluding drawing time of 0.01ms)
FPS: 17.91 (excluding drawing time of 0.01ms)
FPS: 17.66 (excluding drawing time of 0.01ms)

@YashasSamaga
Author

YashasSamaga commented Jul 17, 2020

@hlacik Let's continue the discussion at AlexeyAB/darknet#6245

@hlacik

hlacik commented Jul 19, 2020

@YashasSamaga Sorry for disturbing again. I am searching for DNN documentation and have not found many samples/guides on the official OpenCV site or repo; I understand that this is because DNN is new. Will yolov4.py be added as an example to OpenCV? Where should I search for more usage examples?
I have found and read through your gists, which certainly give a brief introduction.

@YashasSamaga
Author

YashasSamaga commented Jul 19, 2020

@hlacik There are several samples here: https://github.com/opencv/opencv/tree/master/samples/dnn

The DNN module is several years old but the CUDA backend is just around six months old. All the old samples work with CUDA backend too. Only the target and backend have to be set.

The samples don't seem to be updated to use the high-level DL API.

@kmsravindra

kmsravindra commented Jul 29, 2020

@YashasSamaga, you have published a Python version and a C++ version of YOLO. Is there any FPS / inference time comparison (from capture to display) for these two versions? If there is a link that you have published, please point me to it.
Are the benchmarks that you published here (or on the AlexeyAB repo) from capture to display? If so, I am confused whether they were obtained using the C++ version or the Python version.

@YashasSamaga
Author

YashasSamaga commented Jul 29, 2020

@kmsravindra

Is there any FPS / inference time comparison (from capture to display) for these two versions?

The C++ version reports the inference and total time (preprocessing + inference + NMS + postprocessing). The Python version reports the total time only. Capture and display are not included in either version. There is no comparison that includes capture and display, as these vary across systems and can be hidden with pipelining, so they are often not considered in benchmarks.

Are the benchmarks that you published here (or on the AlexeyAB repo) from capture to display? If so, were they obtained using the C++ version or the Python version?

The benchmark results I published are here. The numbers you see in Darknet's ReadMe were taken from there. Those numbers were obtained using a different C++ program which measures the inference time only.

You may be able to find more benchmark results from other users here: AlexeyAB/darknet#6245

@kmsravindra

kmsravindra commented Jul 29, 2020

@YashasSamaga, Thanks for the links.

  1. It would be very helpful to understand what results in this higher performance of OpenCV-dnn over darknet. Is it TensorRT-type optimizations? Could you please share any article / blog on the same?
  2. Does opencv-dnn target-cuda need tensor cores? OR can I get such performance improvements even on a GTX 1080 Ti (which doesn't have tensor cores)?
  3. Perhaps you might be aware of deepstream SDK from nvidia that claims to reduce the memcpy between cpu and gpu while processing the video streams. Deepstream claims high throughput and low latency due to these optimizations. Is there any plan / chance that opencv capture to display process could also do a similar thing to reduce the latency further?

@YashasSamaga
Author

YashasSamaga commented Jul 29, 2020

@kmsravindra

It would be very helpful to understand what results in this higher performance of OpenCV-dnn over darknet. Is it TensorRT-type optimizations? Could you please share any article / blog on the same?

There is no article or blog post yet. CUDA support for OpenCV DNN first shipped in 4.2.0, just seven months ago. Maybe I'll write one someday.

OpenCV DNN has a set of backend agnostic optimizations such as fusing linear operators like batch normalization and scaling with convolution. Then there are the backend-specific optimizations such as fusing activation and elementwise layers (like residual connections) with convolution. Further optimizations are applied at each operation to reduce tensor ranks or reduce expensive operations like transpose or slice to copy. Finally, the CUDA code at the lowest level is optimized at the instruction level based on profiling results.
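As a concrete illustration of the linear-operator fusion (standard algebra, not OpenCV's exact code): a convolution y = W*x + b followed by batch normalization z = gamma * (y - mu) / sqrt(sigma^2 + eps) + beta collapses into a single convolution with W' = (gamma / sqrt(sigma^2 + eps)) * W and b' = gamma * (b - mu) / sqrt(sigma^2 + eps) + beta, so the normalization costs nothing at inference time.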

I don't have much idea about the internals of Darknet. It does a few of the optimizations such as fusing batch normalisation with convolution but is not as rigorous as OpenCV.

TensorRT applies much more optimizations in addition to the ones listed above. OpenCV does not support TensorRT yet.

Does opencv-dnn target-cuda need tensor cores? OR can I get such performance improvements even on a GTX 1080 Ti (which doesn't have tensor cores)?

It's optional. OpenCV DNN works on all devices with compute capability 3.0 and above. It uses TensorCores in the FP16 target (DNN_TARGET_CUDA_FP16) if available. TensorCores are only used for convolution. The optimizations listed in the previous section are always applied on all devices.

Perhaps you might be aware of deepstream SDK from nvidia that claims to reduce the memcpy between cpu and gpu while processing the video streams. Deepstream claims high throughput and low latency due to these optimizations. Is there any plan / chance that opencv capture to display process could also do a similar thing to reduce the latency further?

These features are not available in OpenCV as of today. You can open a feature request issue at OpenCV's repository.

@YashasSamaga
Author

YashasSamaga commented Aug 27, 2020

@marvision-ai

Scenario 1:

load model A
model A inference (for some time)
unload model A from GPU memory
Load model b
model B Inference (for some time)
unload model B from GPU memory

This would be insanely slow because OpenCV will have to reinitialize for every forward pass.

Scenario 2 (I ask this one because I am not sure if it's memory-safe to load both at the same time)

load model A
load model B
model A Inference
model B inference

This is safe and the initialization is done just once.

@YashasSamaga
Author

YashasSamaga commented Aug 27, 2020

Doesn't matter how slow. For instance: I am running all of these on the Jetson Xavier. It does not have enough memory to load all 5 different models I am using for different cases simultaneously, does it?

I think it would be nice to have an API to control memory usage: prefer fastest, prefer the least memory, some middle ground option, etc.

If I want to do taskA with modelA and then switch to task B with modelB and I have a few sec between each task, that is fine as long as I know that the mem is being dealloc properly.

You need to destroy the object to release the resources owned by the network.

net = cv2.dnn.readNet(...)
.
.
.
del net

For Scenario 2, for instance: is this correct?

Yes

Is it possible loading both in memory will decrease overall FPS of each since they are both loaded into a shared GPU resource? Or does it not work like that since I am running them in one after the other (series not parallel)?

FPS will not change in series configuration. The FPS of individual networks might drop in parallel configuration but the overall throughput will be higher than series configuration (i.e. both latency and throughput increase).

It might be more efficient to use multiple models as this can keep the GPU saturated with work when you are doing the CPU part of the work for some other model.

@YashasSamaga
Author

YashasSamaga commented Aug 27, 2020

I am not sure what the limit is on the Xavier for the number of models it can load, but if I knew a way to probe the available remaining space, that would make it easier.

You can check this using nvidia-smi.

@marvision-ai

marvision-ai commented Sep 4, 2020

@YashasSamaga few more quick questions:

  1. When you increase NMS_Threshold - you get more faulty detections and multiple boxes per object.... it seems like 0.4 is the sweet spot. Could you explain why?
  2. Say for instance I have input images @ 1920x1080 pix. I am running inference on them with a model of 608x608.
    Those images will be resized from 1920x1080 --> 608x608 --> and then I am returned boxes from the detections.
    The boxes I am returned are scaled to the original non-resized image inside that function correct? Just want to confirm.

@YashasSamaga
Author

YashasSamaga commented Sep 4, 2020

@marvision-ai

When you increase NMS_Threshold - you get more faulty detections and multiple boxes per object.... it seems like 0.4 is the sweet spot. Could you explain why?

I think this value is empirically derived. This varies from model to model.

Say for instance I have input images @ 1920x1080 pix. I am running inference on them with a model of 608x608.
Those images will be resized from 1920x1080 --> 608x608 --> and then I am returned boxes from the detections.
The boxes I am returned are scaled to the original non-resized image inside that function correct? Just want to confirm

Yes.

@26medias

26medias commented Sep 9, 2020

Do you have the list of classes to share?
I downloaded one but I'm detected as a "thing", bottles are "prickly pears", glasses are "pawpaw", whatever that is...
I can't find the proper class list for V4.

Thanks!

@piotrostr

piotrostr commented Sep 18, 2020

Awesome stuff, man. I didn't know about the dnn_DetectionModel class. Thanks!

@jhonjam

jhonjam commented Oct 4, 2020


What version of OpenCV are you using? I have a problem:

Unsupported activation: mish in function 'cv::dnn::darknet::ReadDarknetFromCfgStream'

@YashasSamaga
Author

YashasSamaga commented Oct 4, 2020

Unsupported activation: mish in function 'cv::dnn::darknet::ReadDarknetFromCfgStream'

@jhonjam You need OpenCV 4.4 or above.

@marvision-ai

marvision-ai commented Oct 14, 2020

@YashasSamaga Hi again,

I just wanted to ask a quick question: does this implementation of opencv support batched inference? I am not sure if you responded to this or answered this somewhere else.
If it does, how would one do it in the code you provided?

@YashasSamaga
Author

YashasSamaga commented Oct 14, 2020

does this implementation of opencv support batched inference? I am not sure if you responded to this or answered this somewhere else.
If it does, how would one do it in the code you provided?

Yes, it supports batch inference but it's not supported by the high level API. You have to manually do the preprocessing/postprocessing and call Net::forward (just like in the C++ example). Please check opencv/opencv#17838.
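A hedged Python sketch of the manual path (untested; frames is an assumed list of equally-sized BGR images, net from cv2.dnn.readNet as above):

blob = cv2.dnn.blobFromImages(frames, 1/255.0, (416, 416), swapRB=True, crop=False)
net.setInput(blob)
outs = net.forward(net.getUnconnectedOutLayersNames())
# each output now carries a batch dimension; decode boxes and run NMS
# per image, as in the C++ example above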

@arnaud-nt2i

arnaud-nt2i commented Dec 1, 2020

@YashasSamaga Hi !
While trying to use the C++ code I got this error:
OpenCV(4.5.0) Error: Parsing error (Unknown layer type: sam) in cv::dnn::darknet::ReadDarknetFromCfgStream, file C:\opencv-4.5.0\modules\dnn\src\darknet\darknet_io.cpp, line 865

There are indeed some SAM layers in my .cfg; is there a way to use OpenCV DNN without retraining the network without SAM layers?

@YashasSamaga
Author

YashasSamaga commented Dec 2, 2020

@arnaud-nt2i Please open a feature request issue at OpenCV repository.

@pablodz

pablodz commented Dec 9, 2020

Hello there, check out this repo: I added a dockerized yolov4-tiny (only for fish detection) with Streamlit, based on a repo by Cuda Chen.
https://github.com/DZPeru/fishv4
https://fishv4.herokuapp.com/

@mmustafa-ja

mmustafa-ja commented Mar 2, 2021

How can we change this from videos to images? I have an images folder that I need to run YOLOv4 on. Thanks

@YashasSamaga
Author

YashasSamaga commented Mar 2, 2021

@mmustafa0601 The demo scripts load frames from videos using cv::VideoCapture. You have to load frames using cv::imread. The DNN part of the code remains exactly the same.

@mmustafa-ja

mmustafa-ja commented Mar 2, 2021

Thanks for the reply. That I know. The thing is, I have to read from a folder full of images. Can you give me the line of code I should use, because I am getting an error the way I am using imread. Really appreciate the help!

@YashasSamaga
Author

YashasSamaga commented Mar 3, 2021

@mmustafa0601 Are you writing the code in python or C++?

import os
import cv2

images = [f.path for f in os.scandir("path/to/images/dir")]
for path in images:
    frame = cv2.imread(path)
    cv2.imshow('frame', frame)
    cv2.waitKey(0)  # without a waitKey the window never updates
What error do you get? What format are your images in?

@mmustafa-ja

mmustafa-ja commented Mar 3, 2021

I am using C++. I would appreciate it if you could share the relevant code in C++. I am relatively new to both C++ and OpenCV, that's why I am having trouble with it. Thanks for the assistance.

@YashasSamaga
Author

YashasSamaga commented Mar 3, 2021

@jasonbeach

jasonbeach commented Mar 5, 2021

@mmustafa0601, the VideoCapture object supports loading multiple images from a single folder. I haven't tried it, but OpenCV's documentation makes it look like it's as simple as providing something like path_to_images/img_%02d.jpg (for images named img_00.jpg, img_01.jpg, img_02.jpg, ...) as the filename arg to VideoCapture, and then using VideoCapture as you would for a video. A minimal sketch follows.
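A minimal sketch of that approach in Python (untested; the same filename pattern also works with cv::VideoCapture in C++):

vc = cv2.VideoCapture("path_to_images/img_%02d.jpg")
while True:
    grabbed, frame = vc.read()
    if not grabbed:
        break
    # run model.detect(frame, ...) exactly as in the video demo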

@jasonbeach

jasonbeach commented Mar 5, 2021

Just curious what framerate you are able to get. I built OpenCV 4.5.1 from source with CUDA enabled and was getting the warning setUpNet DNN module was not built with CUDA backend; switching to CPU, and was processing frames at a rate of 1-2 fps. nvidia-smi confirmed the GPU wasn't being used.

I figured out I needed to set WITH_CUDNN=ON and OPENCV_DNN_CUDA=ON in CMake. After rebuilding and reinstalling, I no longer got the warning and nvidia-smi indicated the GPU was indeed being used, but I was still only getting 1-2 fps. My laptop GPU isn't great (an MX150) but I would have thought I'd get at least a bit of a bump. The video I'm feeding it is 720p. I'm just using the stock yolov4 network and weights.

@YashasSamaga
Author

YashasSamaga commented Mar 5, 2021

@jasonbeach Please try with DNN_TARGET_CUDA. Your GPU does not provide high FP16 performance.

@jasonbeach

jasonbeach commented Mar 5, 2021

@jasonbeach Please try with DNN_TARGET_CUDA. Your GPU does not provide high FP16 performance.

Yes I have:

    net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);

@YashasSamaga
Author

YashasSamaga commented Mar 5, 2021

@jasonbeach Some quick points to note:

  1. the first forward pass is slow
  2. 720p as input directly to the DNN is expected to be slow (but I believe that you're resizing manually or letting OpenCV DNN automatically resize input to something smaller, say 416x416)
  3. YOLOv4 gives around 12 FPS for 416x416 on GTX 1050. It should be lower for MX150.

If nothing works, you might have to switch to YOLOv4-Tiny or other faster variants.

@shuternay

shuternay commented Mar 21, 2021

It seems you also need to swap R and B channels: model.setInputParams(size=(416, 416), scale=1/255, swapRB=True). Without it I get predictions that differ from darknet's.
See https://docs.opencv.org/master/da/d9d/tutorial_dnn_yolo.html and https://github.com/opencv/opencv/blob/master/samples/dnn/object_detection.py#L265

@YashasSamaga
Author

YashasSamaga commented Mar 22, 2021

@shuternay Thanks for pointing it out. I have fixed it.

@vvmatorin

vvmatorin commented May 7, 2021

@YashasSamaga thank you very much for sharing this code!

I wonder if you could help me with an issue I have discovered using it: I have been using the Python code for image detection, and I get some misdetections compared to the original darknet model (trained with a custom .cfg, where I only changed the image size, number of iterations and classes) on bboxes with low confidence.

My model size is 832x832 (original images are smaller and I am not resizing them manually). I have updated the input parameters and tried setting CONFIDENCE_THRESHOLD to [0; 0.2] and changing NMS_THRESHOLD between [0.2; 0.75]. Yet it never detects bboxes that the original darknet detects with threshold = 0.1.

Below are the images (same cfg and weights):

  1. darknet detector test obj.data yolov4-obj.cfg yolov4.weights image.jpg -ext_output -dont_show -thresh 0.1
  2. opencv code above with: CONFIDENCE_THRESHOLD=0 and NMS_THRESHOLD=0.4

@YashasSamaga
Author

YashasSamaga commented May 7, 2021

@vvmatorin I think this might be a problem with your model since it works with the original darknet model. Does your model work in darknet?

@vvmatorin

vvmatorin commented May 7, 2021

@vvmatorin I think this might be a problem with your model since it works with the original darknet model. Does your model work in darknet?

Sorry, I might have phrased my issue wrong.
Basically, I trained the darknet YOLOv4 model on my custom data and configuration to recognize a single class.

Then I ran inference on the same image as above with the AlexeyAB library using my final weights/config as input (setting the confidence threshold to 0.1), and got the output shown in the first image (command at p. 1).
Finally, I ran inference on the same image using the same weights/config, but with the OpenCV dnn module, and got different results (losing low-confidence predictions).

@YashasSamaga
Author

YashasSamaga commented May 8, 2021

@vvmatorin

Yet it never detects bboxes that the original darknet detects with threshold = 0.1

Can you recheck that your input is correct? BGR or RGB? Normalized image?

Can you check if (DNN_BACKEND_OPENCV, DNN_TARGET_CPU) gives correct detections? If it does, then there might be an issue with the CUDA backend; otherwise, it might be an issue in your preprocessing or postprocessing (or you used some unsupported layer). The official YOLOv4 model is fully supported in both the CUDA and OCV CPU backends. If you have added a new layer or altered the settings of some layer, you might have to check whether OpenCV supports those layers. Sometimes the DNN module won't diagnose unsupported layers; it will simply give wrong outputs.

@vvmatorin

vvmatorin commented May 8, 2021

@YashasSamaga

Can you recheck if your input is correct? BGR or RGB? normalized image?

For preprocessing I use the same code as you shared (which should do resizing, swapping channels and normalization for input):
model.setInputParams(size=(832, 832), scale=1/255, swapRB=True, crop=False)

You can find the full code snippet attached (I'm using 4.5.1 version of OpenCV):

Can you check if (DNN_BACKEND_OPENCV, DNN_TARGET_CPU) gives correct detections?

The confidence and bboxes for both (OPENCV+CPU and CUDA) backends are exactly the same as above (different from darknet).

The official YOLOv4 model is fully supported in both CUDA and OCV CPU backends. If you have added a new layer or alter the settings of some layer, then you might have to check if OpenCV supports those layers. Sometimes the DNN module won't diagnose unsupported layers. It will simply give wrong outputs.

I am using the official config for both training and inference without making any changes to it apart from the ones below (but changing input size and number of classes should be supported):

[net]
subdivisions=64
width=832
height=832

[convolutional]
filters=18

[yolo]
classes=1

Could it be something related to how resizing is done, .jpg format or having only a single class? I see people are mentioning the same issue here.

@YashasSamaga
Author

YashasSamaga commented May 8, 2021

@vvmatorin Can you check if the outputs match exactly with NMS disabled in both Darknet and OpenCV?

@vvmatorin

vvmatorin commented May 8, 2021

@vvmatorin Can you check if the outputs match exactly with NMS disabled in both Darknet and OpenCV?

@YashasSamaga I set:

  • nms = 0 for darknet, with confidence threshold = 0.1
  • NMS_THRESHOLD = 0 for OpenCV, with CONFIDENCE_THRESHOLD = 0.1

The outputs don't match, as the OpenCV output has fewer bounding boxes overall. Results are below:

@marvision-ai

marvision-ai commented May 12, 2021

@vvmatorin & @YashasSamaga
Has this been discussed more yet? I actually saw similar results in the past but I didn't think much of it. This conversation is now making me curious if my networks are behaving the same. I am very keen to see what results @vvmatorin has to offer.
Thank you!

@Fetulhak

Fetulhak commented Jul 25, 2021

@vvmatorin @YashasSamaga @marvision-ai Hi. I get different results for the same image and the same conf_thresh when I use OCV dnn and darknet. When I change swapRB to False I get better results, but there are still different objects detected in the two versions. Any idea what to modify?

@YashasSamaga
Author

YashasSamaga commented Jul 25, 2021

@vvmatorin @marvision-ai @Fetulhak Does this happen with OpenCV CPU backend?

@Fetulhak

Fetulhak commented Jul 25, 2021

Hi @YashasSamaga, I have tried the following backend/target combinations:

net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
#net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA_FP16)
#net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

BTW, when I change swapRB to False I get more detections, but some of them are different objects than the ones I obtained using darknet from @AlexeyAB.

Is there any other parameter to be changed?

@YashasSamaga
Author

YashasSamaga commented Jul 25, 2021

@Fetulhak What happens if you try with DNN_BACKEND_OPENCV and DNN_TARGET_CPU?

@Fetulhak

Fetulhak commented Jul 25, 2021

@YashasSamaga It is the same detection result; nothing changed. I have also tried different values of nms_thresh, but no change.

@AlexeyAB

AlexeyAB commented Jul 25, 2021

@Fetulhak Try to use default yolov4.cfg/weights with this image https://github.com/opencv/opencv_extra/blob/master/testdata/dnn/dog416.png

@Fetulhak

Fetulhak commented Jul 25, 2021

Hi @AlexeyAB, thanks for providing your great darknet tool. It works well with the default yolov4 cfg and weights on COCO or ImageNet. However, I was using my own cfg and weights customized for my own dataset. When I check a single image using your ./darknet detector test I get a very promising result; when I bring it to OpenCV Python the performance is totally different and much lower. I don't know why this is happening. Is it due to an implementation difference, or what?

@AlexeyAB

AlexeyAB commented Jul 25, 2021

@Fetulhak Hi, thanks! Currently OpenCV dnn doesn't support Scaled-YOLOv4 cfg files; I will try to fix it.

@Fetulhak

Fetulhak commented Jul 26, 2021

@AlexeyAB @YashasSamaga Another question: import darknet works correctly, but when I use netMain = darknet.load_net_custom(configPath.encode("ascii"), weightPath.encode("ascii"), 0, 1) # batch size = 1
it generates an error message; any idea how to fix it?
AttributeError: module 'darknet' has no attribute 'load_net_custom'

@Fetulhak

Fetulhak commented Jul 26, 2021

@AlexeyAB @YashasSamaga I edited the Makefile as follows, following the instructions:

!sed -i 's/GPU=0/GPU=1/g' Makefile
!sed -i 's/OPENCV=0/OPENCV=1/g' Makefile
!sed -i 's/LIBSO=0/LIBSO=1/g' Makefile
!cat Makefile
!make

When we set LIBSO=1 it builds the darknet.so library.
However, the darknet.py file has a line which points to libdarknet.so, which is not found:

if os.name == "posix":
    cwd = os.path.dirname(__file__)
    lib = CDLL(cwd + "/libdarknet.so", RTLD_GLOBAL)
elif os.name == "nt":
    cwd = os.path.dirname(__file__)
    os.environ['PATH'] = cwd + ';' + os.environ['PATH']
    lib = CDLL("darknet.dll", RTLD_GLOBAL)

@Fetulhak

Fetulhak commented Jul 26, 2021

The outputs don't match, as the OpenCV output has fewer bounding boxes overall.

@vvmatorin did you find any solution for this?

@vvmatorin

vvmatorin commented Jul 26, 2021


@vvmatorin did you find any solution for this?

I haven't found a solution, sadly.
I experimented a bit with rebuilding OpenCV for GPU inference with different parameters, but it didn't help.
At the moment my guess is that it has something to do with the way darknet resizes images and how that differs from OpenCV, but I don't have time to check it, so I stopped there.

@AlexeyAB

AlexeyAB commented Jul 26, 2021

About the different resize approaches in Darknet (letter_box=1 vs letter_box=0) and OpenCV-dnn: AlexeyAB/darknet#232 (comment)
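For context, here is a minimal letterbox-style resize sketch in Python (an illustration of the general idea behind letter_box=1, i.e. keep the aspect ratio and pad rather than stretch; not Darknet's exact code):

import cv2
import numpy as np

def letterbox(img, size=608, pad_value=127):
    h, w = img.shape[:2]
    scale = min(size / w, size / h)                     # fit the longer side
    nw, nh = int(round(w * scale)), int(round(h * scale))
    resized = cv2.resize(img, (nw, nh))
    canvas = np.full((size, size, 3), pad_value, dtype=img.dtype)
    top, left = (size - nh) // 2, (size - nw) // 2      # center the image
    canvas[top:top + nh, left:left + nw] = resized
    return canvas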

@marvision-ai

marvision-ai commented Jul 26, 2021

@vvmatorin @Fetulhak Do you see the same issue if you resize all your images with cv2.resize and scale your annotations to your specific [net] custom_width x custom_height BEFORE training with darknet? I think the goal is to make sure your data is at the size you want before training the models.

Note that if you do not resize prior to training, this is the slowest training option, as Darknet must continuously resize all of the images as they are loaded from disk; Darknet does not keep a local image cache. When resizing images, Darknet does not maintain or respect the aspect ratio unless you use letterbox. All images will be stretched as necessary to match the exact network dimensions defined in the [net] section of the cfg file.

@sctrueew

sctrueew commented Jul 27, 2021

Hi everyone,

I've trained a model in which some objects have multilabel annotations. When I use darknet for detection it gives me a label for each object, but OpenCV-DNN only returns one label.

I used the latest version of darknet for detection, and I used OpenCV-dnn for detection in OpenCV.
I found this OpenCV example and used it too, and everything is okay, but the inference is slow and FP is high because I have almost 400 classes.
I changed the parameters in
cv::dnn::NMSBoxes(boxes, confidences, CONFIDENCE_THRESHOLD, NMS_THRESHOLD, indices); but I couldn't get a good result.

Where is the problem?

Thanks in advance!

@YashasSamaga
Author

YashasSamaga commented Jul 27, 2021

@zpmmehrdad

I've trained a model in which some objects have multilabel annotations. When I use darknet for detection it gives me a label for each object, but OpenCV-DNN only returns one label.

I suspect it's performing a single NMS. What you need is classwise NMS. You need to store boxes of a particular class separately and perform NMS on each such class separately.
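A minimal Python sketch of classwise NMS (boxes, scores and class_ids are assumed names coming from manual decoding of net.forward() outputs; cv2.dnn.NMSBoxes expects boxes as [x, y, w, h]):

for c in range(num_classes):
    cls_boxes = [b for b, k in zip(boxes, class_ids) if k == c]
    cls_scores = [s for s, k in zip(scores, class_ids) if k == c]
    keep = cv2.dnn.NMSBoxes(cls_boxes, cls_scores, CONFIDENCE_THRESHOLD, NMS_THRESHOLD)
    # the entries of cls_boxes selected by keep are the final detections for class c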

Okay, but the inference is slow and FP is high because I have almost 400 classes.

You should check what step in the pipeline is taking most of the time. I think the DNN inference is probably fast and IO is the bottleneck. You have to identify the bottleneck and try to mitigate it.

@sctrueew

sctrueew commented Jul 27, 2021

@YashasSamaga
I suspect it's performing a single NMS. What you need is classwise NMS. You need to store boxes of a particular class separately and perform NMS on each such class separately.

Thanks for the reply. How can I set up NMS for a few specific classes, not all?

@YashasSamaga
Author

YashasSamaga commented Jul 28, 2021

@zpmmehrdad The C++ example in this gist does classwise NMS. DetectionModel class allows you to choose between classwise NMS and across class NMS.
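If I recall correctly (assuming OpenCV 4.5.1 or newer, where DetectionModel gained setNmsAcrossClasses), the toggle looks like this in Python:

model = cv2.dnn_DetectionModel(net)
model.setNmsAcrossClasses(False)  # per-class NMS; pass True to run NMS across all classes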

@sctrueew

sctrueew commented Jul 28, 2021

@YashasSamaga Hi,
I tested DetectionModel, but some multi-label objects still get only one class detected. This version works well but has two problems: FP is high, and it is very slow with about 450 classes.

@YashasSamaga
Author

YashasSamaga commented Jul 28, 2021

@zpmmehrdad Can you try with the other NMS configuration in DetectionModel? (try both options and see if one of them meets your requirements)

What is FP? What GPU do you have? Did you use DNN_TARGET_CUDA or DNN_TARGET_CUDA_FP16?

@aabilityuk

aabilityuk commented Sep 4, 2021

Hi, YashasSamaga! Thank you for sharing this code. I am using your example for inference with a yolov4-tiny model trained on an image dataset (416x416) for face mask detection. Everything works well, except for one thing: when I stand close to the webcam (approximately 0.5 - 1 meters) the algorithm constantly determines the face mask is on, even if there is no mask on the face; if I stand 1.5 or more meters from the camera, everything works as expected and face masks are determined correctly. Could you please help me tune the code to solve this problem?

@YashasSamaga
Author

YashasSamaga commented Sep 4, 2021

when I stand close to the webcam (approximately 0.5 - 1 meters) the algorithm constantly determines the face mask is on, even if there is no mask on the face [...]

I think this is an issue with your dataset used for training and less to do with OpenCV or DNN inference. Maybe your dataset lacks sufficient samples where people are close to the camera. I don't really know much about training models but here is an idea:

  1. detect faces using a face detector
  2. crop around the face (include some background too)
  3. resize the image

This might augment your dataset to have more samples where the face occupies a large portion of the image. I am not sure if this will exactly mimic being closer to the camera but it may work.

@aabilityuk

aabilityuk commented Sep 4, 2021


Thank you very much for the quick reply and the tips! I will try another image dataset with closer faces and see what happens.

@dgp52

dgp52 commented Sep 22, 2021

Hi @YashasSamaga, I'm having a similar issue to others: OpenCV is missing some of the detections compared to the darknet detections. Has anyone found a solution to this issue?

Here's what I have (using Google Colab). Inside yolov4-custom.cfg:

[net]
#Testing
batch=1
subdivisions=1
#Training
#batch=64
#subdivisions=64
width=640
height=640
.
.
filters=27
classes=4
.

Test image size is 640x640

Darknet:
!./darknet/darknet detector test custom_data_sides_colored/labelled_data.data /content/drive/MyDrive/yolo/test_model/sides-colored/yolov4-custom.cfg /content/drive/MyDrive/yolo/backup-yolov4-models/yolov4-custom_final.weights /content/drive/MyDrive/custom-object-detection/color-img/640-bw.jpg -thresh 0.01

Console info after running the above code:

CUDA-version: 11010 (11020), cuDNN: 7.6.5, GPU count: 1  
OpenCV version: 3.2.0
.

Notice how it prints "OpenCV version: 3.2.0". Could this be part of the problem, since it's using a different version than the installed cv2 version (4.5.3)?

OpenCV:

cv2.__version__
4.5.3
net = cv2.dnn.readNet("/content/drive/MyDrive/yolo/backup-yolov4-models/yolov4-custom_final.weights", "/content/drive/MyDrive/yolo/test_model/sides-colored/yolov4-custom.cfg")
img = cv2.imread('/content/drive/MyDrive/custom-object-detection/color-img/640-bw.jpg', flags=cv2.IMREAD_COLOR) 
CONFIDENCE_THRESHOLD = 0.01
NMS_THRESHOLD = 0.4

#Tried DNN_BACKEND_CUDA and DNN_TARGET_CUDA_FP16 but no difference
#net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
#net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA_FP16)

net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)
model = cv2.dnn_DetectionModel(net)
model.setInputParams(size=(640, 640), scale=1/255, swapRB=True,crop= False)
classes, scores, boxes = model.detect(img, CONFIDENCE_THRESHOLD, NMS_THRESHOLD)
for idx, (classid, score, box) in enumerate(zip(classes, scores, boxes)):
  print("Class: " + str(classid) + " Score:" + str(score))
  cv2.rectangle(img, box, (0, 0, 0), 2)
cv2_imshow(img) 

@YashasSamaga
Author

YashasSamaga commented Sep 22, 2021

@dgp52 Does it happen with DNN_BACKEND_OPENCV and DNN_TARGET_CPU?

@Fetulhak

Fetulhak commented Sep 22, 2021

@YashasSamaga I have tried a lot of options to see why this happens, but sadly I did not find any solution. The results in Darknet are far better than OpenCV-dnn, as well as other TensorFlow implementations of darknet YOLO.

@dgp52

dgp52 commented Sep 22, 2021

@dgp52 Does it happen with DNN_BACKEND_OPENCV and DNN_TARGET_CPU?

Correct. It happens with DNN_BACKEND_OPENCV and DNN_TARGET_CPU. I have also tried using DNN_BACKEND_CUDA and DNN_TARGET_CUDA_FP16 but no difference.

@YashasSamaga
Author

YashasSamaga commented Sep 22, 2021

Correct. It happens with DNN_BACKEND_OPENCV and DNN_TARGET_CPU. I have also tried using DNN_BACKEND_CUDA and DNN_TARGET_CUDA_FP16 but no difference.

@dgp52 @Fetulhak @zpmmehrdad @vvmatorin and others

OpenCV has internal tests most of which are shared across all backends. The CUDA backend always trails the OpenCV CPU backend in terms of correctness and features; it just mimics what OpenCV CPU backend does (this is like the reference implementation). Since it also happens in the CPU backend, the patch must first go there. I think an issue should be opened at OpenCV repository reporting this issue in the OCV CPU backend (and mention as a sidenote that the same behavior is observed in the CUDA backend).

Please also check AlexeyAB's comment:

About different resize approaches in the Darknet (letter_box=1 vs letter_box=0) and OpenCV-dnn AlexeyAB/darknet#232 (comment)

@dgp52

dgp52 commented Sep 25, 2021

@YashasSamaga @vvmatorin @marvision-ai @Fetulhak
Explicitly setting the thresh value in the test cfg file worked for me! Now it's giving me exact detections!

Here's what I think is happening (feel free to correct me): it seems like OpenCV has a default confidence value it checks against during the detection process, almost like a minimum threshold, so anything lower than this value won't get detected. We can override it by adding thresh = 0.01 (or whatever default you would like) to all three [yolo] layers.

Ex:

[yolo]
thresh = 0.01
.
.
[yolo]
thresh = 0.01
.
.
[yolo]
thresh = 0.01

I'm curious to see if this works for others.

@Fetulhak

Fetulhak commented Sep 26, 2021

@dgp52 this seems very interesting if it works. I will check it for my dataset.

@angeloken

angeloken commented Oct 4, 2021

When I changed yolov3 to yolov4's cfg and weights, a cv::Exception occurred during dnn::readNetFromDarknet. I think it may be an OpenCV version problem, because 4.2 was released at the end of last year, but YOLOv4 was proposed in April this year.

I got the same error before...

I just skipped readNetFromDarknet and used dnn.readNet instead.

@hlacik

hlacik commented Nov 17, 2021

Starting from OpenCV 4.5.4, classid is no longer a list, so the code needs to be fixed
from
label = "%s : %f" % (class_names[classid[0]], score)
to
label = "%s : %f" % (class_names[classid], score)
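A variant that should work on both old and new versions (an assumption, reusing the same int() cast the demo already applies for colors):

label = "%s : %f" % (class_names[int(classid)], score)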

@marvision-ai

marvision-ai commented Dec 8, 2021

@dgp52 very interesting! You didn't actually have to train the network with thresh=0.01 correct? Only set it in the .cfg file for a trained model?

@dgp52

dgp52 commented Jan 1, 2022

@marvision-ai Sorry for not getting back to you sooner. That is correct. Just needed to update the .cfg file.

@PROGRAMMINGENGINEER-NIKI

PROGRAMMINGENGINEER-NIKI commented Jan 13, 2022

Hi Yashas, I need to use the GPU when running YOLOv4. Which CUDA and cuDNN versions should I install?

I installed CUDA 11.5 and cuDNN 8.3.2, but when I build OpenCV it does not find cuDNN, and as a result it does not run fast.

@YashasSamaga
Author

YashasSamaga commented Jan 13, 2022

I haven't checked the latest versions of cuDNN but cuDNN 7.6.5 gave the best performance six months ago.

@doleron

doleron commented Jan 18, 2022

Hi @YashasSamaga, I wrote this code for YOLOv5 / OpenCV / DNN

C++

#include <fstream>
#include <chrono>  // std::chrono timing used in main()
#include <cstring> // strcmp

#include <opencv2/opencv.hpp>

std::vector<std::string> load_class_list()
{
    std::vector<std::string> class_list;
    std::ifstream ifs("config_files/classes.txt");
    std::string line;
    while (getline(ifs, line))
    {
        class_list.push_back(line);
    }
    return class_list;
}

void load_net(cv::dnn::Net &net, bool is_cuda)
{
    auto result = cv::dnn::readNet("config_files/yolov5s.onnx");
    if (is_cuda)
    {
        std::cout << "Attempting to use CUDA\n";
        result.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
        result.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA_FP16);
    }
    else
    {
        std::cout << "Running on CPU\n";
        result.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
        result.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
    }
    net = result;
}

const std::vector<cv::Scalar> colors = {cv::Scalar(255, 255, 0), cv::Scalar(0, 255, 0), cv::Scalar(0, 255, 255), cv::Scalar(255, 0, 0)};

const float INPUT_WIDTH = 640.0;
const float INPUT_HEIGHT = 640.0;
const float SCORE_THRESHOLD = 0.2;
const float NMS_THRESHOLD = 0.4;
const float CONFIDENCE_THRESHOLD = 0.4;

struct Detection
{
    int class_id;
    float confidence;
    cv::Rect box;
};

// pad the image to a square canvas so the 640x640 resize preserves the aspect ratio
cv::Mat format_yolov5(const cv::Mat &source) {
    int col = source.cols;
    int row = source.rows;
    int _max = MAX(col, row);
    cv::Mat result = cv::Mat::zeros(_max, _max, CV_8UC3);
    source.copyTo(result(cv::Rect(0, 0, col, row)));
    return result;
}

void detect(cv::Mat &image, cv::dnn::Net &net, std::vector<Detection> &output, const std::vector<std::string> &className) {
    cv::Mat blob;

    auto input_image = format_yolov5(image);
    
    cv::dnn::blobFromImage(input_image, blob, 1./255., cv::Size(INPUT_WIDTH, INPUT_HEIGHT), cv::Scalar(), true, false);
    net.setInput(blob);
    std::vector<cv::Mat> outputs;
    net.forward(outputs, net.getUnconnectedOutLayersNames());

    // scale factors to map boxes from the 640x640 network input back to the padded image
    float x_factor = input_image.cols / INPUT_WIDTH;
    float y_factor = input_image.rows / INPUT_HEIGHT;
    
    float *data = (float *)outputs[0].data;

    // each output row: x, y, w, h, objectness score, then 80 class scores
    const int dimensions = 85;
    const int rows = 25200;
    
    std::vector<int> class_ids;
    std::vector<float> confidences;
    std::vector<cv::Rect> boxes;

    for (int i = 0; i < rows; ++i) {

        float confidence = data[4];
        if (confidence >= CONFIDENCE_THRESHOLD) {

            float * classes_scores = data + 5;
            cv::Mat scores(1, className.size(), CV_32FC1, classes_scores);
            cv::Point class_id;
            double max_class_score;
            minMaxLoc(scores, 0, &max_class_score, 0, &class_id);
            if (max_class_score > SCORE_THRESHOLD) {

                confidences.push_back(confidence);

                class_ids.push_back(class_id.x);

                float x = data[0];
                float y = data[1];
                float w = data[2];
                float h = data[3];
                int left = int((x - 0.5 * w) * x_factor);
                int top = int((y - 0.5 * h) * y_factor);
                int width = int(w * x_factor);
                int height = int(h * y_factor);
                boxes.push_back(cv::Rect(left, top, width, height));
            }

        }

        data += dimensions;

    }

    std::vector<int> nms_result;
    cv::dnn::NMSBoxes(boxes, confidences, SCORE_THRESHOLD, NMS_THRESHOLD, nms_result);
    for (int i = 0; i < nms_result.size(); i++) {
        int idx = nms_result[i];
        Detection result;
        result.class_id = class_ids[idx];
        result.confidence = confidences[idx];
        result.box = boxes[idx];
        output.push_back(result);
    }
}

int main(int argc, char **argv)
{

    std::vector<std::string> class_list = load_class_list();

    cv::Mat frame;
    cv::VideoCapture capture("sample.mp4");
    if (!capture.isOpened())
    {
        std::cerr << "Error opening video file\n";
        return -1;
    }

    bool is_cuda = argc > 1 && strcmp(argv[1], "cuda") == 0;

    cv::dnn::Net net;
    load_net(net, is_cuda);

    auto start = std::chrono::high_resolution_clock::now();
    int frame_count = 0;
    float fps = -1;
    int total_frames = 0;

    while (true)
    {
        capture.read(frame);
        if (frame.empty())
        {
            std::cout << "End of stream\n";
            break;
        }

        std::vector<Detection> output;
        detect(frame, net, output, class_list);

        frame_count++;
        total_frames++;

        int detections = output.size();

        for (int i = 0; i < detections; ++i)
        {

            auto detection = output[i];
            auto box = detection.box;
            auto classId = detection.class_id;
            const auto color = colors[classId % colors.size()];
            cv::rectangle(frame, box, color, 3);

            cv::rectangle(frame, cv::Point(box.x, box.y - 20), cv::Point(box.x + box.width, box.y), color, cv::FILLED);
            cv::putText(frame, class_list[classId].c_str(), cv::Point(box.x, box.y - 5), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
        }

        if (frame_count >= 30)
        {

            auto end = std::chrono::high_resolution_clock::now();
            fps = frame_count * 1000.0 / std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

            frame_count = 0;
            start = std::chrono::high_resolution_clock::now();
        }

        if (fps > 0)
        {

            std::ostringstream fps_label;
            fps_label << std::fixed << std::setprecision(2);
            fps_label << "FPS: " << fps;
            std::string fps_label_str = fps_label.str();

            cv::putText(frame, fps_label_str.c_str(), cv::Point(10, 25), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 0, 255), 2);
        }

        cv::imshow("output", frame);

        if (cv::waitKey(1) != -1)
        {
            capture.release();
            std::cout << "finished by user\n";
            break;
        }
    }

    std::cout << "Total frames: " << total_frames << "\n";

    return 0;
}
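
If anyone wants to build the C++ version: it only depends on OpenCV, so on Linux something like g++ -std=c++17 main.cpp -o yolov5 $(pkg-config --cflags --libs opencv4) should work, assuming pkg-config knows about your OpenCV 4 install (the source file name is a placeholder).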

Python

import cv2
import time
import sys
import numpy as np

def build_model(is_cuda):
    net = cv2.dnn.readNet("config_files/yolov5s.onnx")
    if is_cuda:
        print("Attempty to use CUDA")
        net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
        net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA_FP16)
    else:
        print("Running on CPU")
        net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
        net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)
    return net

INPUT_WIDTH = 640
INPUT_HEIGHT = 640
SCORE_THRESHOLD = 0.2
NMS_THRESHOLD = 0.4
CONFIDENCE_THRESHOLD = 0.4

def detect(image, net):
    blob = cv2.dnn.blobFromImage(image, 1/255.0, (INPUT_WIDTH, INPUT_HEIGHT), swapRB=True, crop=False)
    net.setInput(blob)
    preds = net.forward()
    return preds

def load_capture():
    capture = cv2.VideoCapture("sample.mp4")
    return capture

def load_classes():
    class_list = []
    with open("config_files/classes.txt", "r") as f:
        class_list = [cname.strip() for cname in f.readlines()]
    return class_list

class_list = load_classes()

def wrap_detection(input_image, output_data):
    class_ids = []
    confidences = []
    boxes = []

    rows = output_data.shape[0]

    # numpy shape order is (height, width, channels)
    image_height, image_width, _ = input_image.shape

    x_factor = image_width / INPUT_WIDTH
    y_factor = image_height / INPUT_HEIGHT

    for r in range(rows):
        row = output_data[r]
        confidence = row[4]
        if confidence >= CONFIDENCE_THRESHOLD:

            classes_scores = row[5:]
            _, _, _, max_indx = cv2.minMaxLoc(classes_scores)
            class_id = max_indx[1]
            if (classes_scores[class_id] > .25):

                confidences.append(confidence)

                class_ids.append(class_id)

                x, y, w, h = row[0].item(), row[1].item(), row[2].item(), row[3].item() 
                left = int((x - 0.5 * w) * x_factor)
                top = int((y - 0.5 * h) * y_factor)
                width = int(w * x_factor)
                height = int(h * y_factor)
                box = np.array([left, top, width, height])
                boxes.append(box)

    indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.25, 0.45) 

    result_class_ids = []
    result_confidences = []
    result_boxes = []

    for i in indexes:
        result_confidences.append(confidences[i])
        result_class_ids.append(class_ids[i])
        result_boxes.append(boxes[i])

    return result_class_ids, result_confidences, result_boxes

# pad the frame to a square canvas so the 640x640 resize preserves the aspect ratio
def format_yolov5(frame):

    row, col, _ = frame.shape
    _max = max(col, row)
    result = np.zeros((_max, _max, 3), np.uint8)
    result[0:row, 0:col] = frame
    return result


colors = [(255, 255, 0), (0, 255, 0), (0, 255, 255), (255, 0, 0)]

is_cuda = len(sys.argv) > 1 and sys.argv[1] == "cuda"

net = build_model(is_cuda)
capture = load_capture()

start = time.time_ns()
frame_count = 0
total_frames = 0
fps = -1

while True:

    _, frame = capture.read()
    if frame is None:
        print("End of stream")
        break

    inputImage = format_yolov5(frame)
    outs = detect(inputImage, net)

    class_ids, confidences, boxes = wrap_detection(inputImage, outs[0])

    frame_count += 1
    total_frames += 1

    for (classid, confidence, box) in zip(class_ids, confidences, boxes):
        color = colors[int(classid) % len(colors)]
        cv2.rectangle(frame, box, color, 2)
        cv2.rectangle(frame, (box[0], box[1] - 20), (box[0] + box[2], box[1]), color, -1)
        cv2.putText(frame, class_list[classid], (box[0], box[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, .5, (0,0,0))

    if frame_count >= 30:
        end = time.time_ns()
        fps = 1000000000 * frame_count / (end - start)
        frame_count = 0
        start = time.time_ns()
    
    if fps > 0:
        fps_label = "FPS: %.2f" % fps
        cv2.putText(frame, fps_label, (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

    cv2.imshow("output", frame)

    if cv2.waitKey(1) > -1:
        print("finished by user")
        break

print("Total frames: " + str(total_frames))

A more up-to-date version, along with instructions to run the code, can be found here: https://github.com/doleron/yolov4-opencv-cpp-python
