YOLOv4 inference using OpenCV DNN
Python version (yolov4.py):

import cv2
import time

CONFIDENCE_THRESHOLD = 0.2
NMS_THRESHOLD = 0.4
COLORS = [(0, 255, 255), (255, 255, 0), (0, 255, 0), (255, 0, 0)]

class_names = []
with open("classes.txt", "r") as f:
    class_names = [cname.strip() for cname in f.readlines()]

vc = cv2.VideoCapture("demo.mp4")

net = cv2.dnn.readNet("yolov4.weights", "yolov4.cfg")
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA_FP16)

model = cv2.dnn_DetectionModel(net)
model.setInputParams(size=(416, 416), scale=1/255, swapRB=True)

while cv2.waitKey(1) < 1:
    (grabbed, frame) = vc.read()
    if not grabbed:
        exit()

    start = time.time()
    classes, scores, boxes = model.detect(frame, CONFIDENCE_THRESHOLD, NMS_THRESHOLD)
    end = time.time()

    start_drawing = time.time()
    for (classid, score, box) in zip(classes, scores, boxes):
        color = COLORS[int(classid) % len(COLORS)]
        label = "%s : %f" % (class_names[classid[0]], score)
        cv2.rectangle(frame, box, color, 2)
        cv2.putText(frame, label, (box[0], box[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
    end_drawing = time.time()

    fps_label = "FPS: %.2f (excluding drawing time of %.2fms)" % (1 / (end - start), (end_drawing - start_drawing) * 1000)
    cv2.putText(frame, fps_label, (0, 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)
    cv2.imshow("detections", frame)
C++ version:

#include <iostream>
#include <queue>
#include <iterator>
#include <sstream>
#include <fstream>
#include <iomanip>
#include <chrono>

#include <opencv2/core.hpp>
#include <opencv2/dnn.hpp>
#include <opencv2/dnn/all_layers.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

constexpr float CONFIDENCE_THRESHOLD = 0;
constexpr float NMS_THRESHOLD = 0.4;
constexpr int NUM_CLASSES = 80;

// colors for bounding boxes
const cv::Scalar colors[] = {
    {0, 255, 255},
    {255, 255, 0},
    {0, 255, 0},
    {255, 0, 0}
};
const auto NUM_COLORS = sizeof(colors)/sizeof(colors[0]);

int main()
{
    std::vector<std::string> class_names;
    {
        std::ifstream class_file("classes.txt");
        if (!class_file)
        {
            std::cerr << "failed to open classes.txt\n";
            return 0;
        }

        std::string line;
        while (std::getline(class_file, line))
            class_names.push_back(line);
    }

    cv::VideoCapture source("demo.mp4");

    auto net = cv::dnn::readNetFromDarknet("yolov4.cfg", "yolov4.weights");
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);
    // net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
    // net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
    auto output_names = net.getUnconnectedOutLayersNames();

    cv::Mat frame, blob;
    std::vector<cv::Mat> detections;
    while (cv::waitKey(1) < 1)
    {
        source >> frame;
        if (frame.empty())
        {
            cv::waitKey();
            break;
        }

        auto total_start = std::chrono::steady_clock::now();
        cv::dnn::blobFromImage(frame, blob, 0.00392, cv::Size(608, 608), cv::Scalar(), true, false, CV_32F);
        net.setInput(blob);

        auto dnn_start = std::chrono::steady_clock::now();
        net.forward(detections, output_names);
        auto dnn_end = std::chrono::steady_clock::now();

        std::vector<int> indices[NUM_CLASSES];
        std::vector<cv::Rect> boxes[NUM_CLASSES];
        std::vector<float> scores[NUM_CLASSES];

        for (auto& output : detections)
        {
            const auto num_boxes = output.rows;
            for (int i = 0; i < num_boxes; i++)
            {
                auto x = output.at<float>(i, 0) * frame.cols;
                auto y = output.at<float>(i, 1) * frame.rows;
                auto width = output.at<float>(i, 2) * frame.cols;
                auto height = output.at<float>(i, 3) * frame.rows;
                cv::Rect rect(x - width/2, y - height/2, width, height);

                for (int c = 0; c < NUM_CLASSES; c++)
                {
                    auto confidence = *output.ptr<float>(i, 5 + c);
                    if (confidence >= CONFIDENCE_THRESHOLD)
                    {
                        boxes[c].push_back(rect);
                        scores[c].push_back(confidence);
                    }
                }
            }
        }

        for (int c = 0; c < NUM_CLASSES; c++)
            cv::dnn::NMSBoxes(boxes[c], scores[c], 0.0, NMS_THRESHOLD, indices[c]);

        for (int c = 0; c < NUM_CLASSES; c++)
        {
            for (size_t i = 0; i < indices[c].size(); ++i)
            {
                const auto color = colors[c % NUM_COLORS];

                auto idx = indices[c][i];
                const auto& rect = boxes[c][idx];
                cv::rectangle(frame, cv::Point(rect.x, rect.y), cv::Point(rect.x + rect.width, rect.y + rect.height), color, 3);

                std::ostringstream label_ss;
                label_ss << class_names[c] << ": " << std::fixed << std::setprecision(2) << scores[c][idx];
                auto label = label_ss.str();

                int baseline;
                auto label_bg_sz = cv::getTextSize(label.c_str(), cv::FONT_HERSHEY_COMPLEX_SMALL, 1, 1, &baseline);
                cv::rectangle(frame, cv::Point(rect.x, rect.y - label_bg_sz.height - baseline - 10), cv::Point(rect.x + label_bg_sz.width, rect.y), color, cv::FILLED);
                cv::putText(frame, label.c_str(), cv::Point(rect.x, rect.y - baseline - 5), cv::FONT_HERSHEY_COMPLEX_SMALL, 1, cv::Scalar(0, 0, 0));
            }
        }

        auto total_end = std::chrono::steady_clock::now();

        float inference_fps = 1000.0 / std::chrono::duration_cast<std::chrono::milliseconds>(dnn_end - dnn_start).count();
        float total_fps = 1000.0 / std::chrono::duration_cast<std::chrono::milliseconds>(total_end - total_start).count();

        std::ostringstream stats_ss;
        stats_ss << std::fixed << std::setprecision(2);
        stats_ss << "Inference FPS: " << inference_fps << ", Total FPS: " << total_fps;
        auto stats = stats_ss.str();

        int baseline;
        auto stats_bg_sz = cv::getTextSize(stats.c_str(), cv::FONT_HERSHEY_COMPLEX_SMALL, 1, 1, &baseline);
        cv::rectangle(frame, cv::Point(0, 0), cv::Point(stats_bg_sz.width, stats_bg_sz.height + 10), cv::Scalar(0, 0, 0), cv::FILLED);
        cv::putText(frame, stats.c_str(), cv::Point(0, stats_bg_sz.height + 5), cv::FONT_HERSHEY_COMPLEX_SMALL, 1, cv::Scalar(255, 255, 255));

        cv::namedWindow("output");
        cv::imshow("output", frame);
    }

    return 0;
}
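
If you build against a pkg-config-based OpenCV install, compiling the C++ version might look like this (a sketch; the source file name yolov4.cpp and the opencv4 package name are assumptions that depend on your setup):

g++ -std=c++17 yolov4.cpp -o yolov4 $(pkg-config --cflags --libs opencv4)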
@dnaveenr commented Jun 18, 2020

Thanks for this Yashas. Could you please provide the corresponding python code for Efficient YOLOv3 Inference on OpenCV?

@YashasSamaga (owner) commented Jun 19, 2020

@dnaveenr You just have to change the auto net = cv::dnn::readNetFromDarknet("yolov4.cfg", "yolov4.weights"); line. The code works for both YOLOv3 and YOLOv4.
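
For example, for YOLOv3 the line would become (a sketch; file names per the standard Darknet release):

auto net = cv::dnn::readNetFromDarknet("yolov3.cfg", "yolov3.weights");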

@dnaveenr commented Jun 19, 2020

OK, thanks. I meant the equivalent Python code, but it's fine. I think the main difference is the following:

for (auto name : net.getUnconnectedOutLayersNames())
{
    int layerId = net.getLayerId(name);
    auto layer = net.getLayer(layerId).dynamicCast<cv::dnn::RegionLayer>();
    if (!layer.empty())
        layer->nmsThreshold = 0;
}

I'll make the changes.

@YashasSamaga (owner) commented Jun 19, 2020

@dnaveenr Sorry, I read too fast. You can get the same effect by manually setting nms_threshold=0 (add it if not present already) in all [yolo] blocks in yolovN.cfg. I don't think nmsThreshold is exposed in python.
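
For reference, each [yolo] block would gain one extra line, along these lines (a sketch; the block's other keys stay unchanged):

[yolo]
# ... existing keys unchanged ...
nms_threshold=0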

@dnaveenr commented Jun 19, 2020

No problem. Thanks. I will try it out.

@sky-fly97 commented Jun 26, 2020

Hello, I got an error that cv::dnn::dnn4_v20191202::RegionLayer has no member nmsThreshold at lines 52-53, but if I delete those lines it works!

@sky-fly97 commented Jun 26, 2020

By the way, how to save the MP4 demo after running?

@YashasSamaga (owner) commented Jun 26, 2020

@sky-fly97 I can guess that your OpenCV version is not the latest master. In that case, you will see performance regressions if you do not set the nms threshold to zero. The old solution is to set nms_threshold=0 in all [yolo] blocks in yolov3.cfg.

If you need YOLOv4 support, you need the master branch.

This is what happened:

  1. nmsThreshold was added a month ago to address spurious NMS in opencv/opencv#17371
  2. A new fix which sets nmsThreshold to zero by default was added in opencv/opencv#17592 (and the old fix is redundant now)

It's not required anymore and I have removed it from this gist.

@sky-fly97 commented Jun 26, 2020

Yeah, now my version of OpenCV is 4.2. Thank you very much!

@YashasSamaga (owner) commented Jun 26, 2020

@sky-fly97

By the way, how to save the MP4 demo after running?

You need VideoWriter.

cv::VideoWriter writer("output.mp4", cv::VideoWriter::fourcc('M', 'P', 'E', 'G'), 30, cv::Size(width, height));

cv::Mat frame; // what you want to write
writer << frame;

Yeah, now my version of OpenCV is 4.2. Thank you very much!

There have been a lot of performance improvements since then. OpenCV 4.2 was the first release with CUDA support.

@sky-fly97 commented Jun 27, 2020

(quoting the reply above)

Thanks!

@sky-fly97 commented Jun 27, 2020

When I changed the yolov3 cfg and weights to yolov4's, a cv::Exception error occurred during dnn::readNetFromDarknet. I think it may be an OpenCV version problem, because 4.2 was released at the end of last year, but YOLOv4 was released in April this year.

@YashasSamaga (owner) commented Jun 27, 2020

@sky-fly97 There is no release which supports YOLOv4 yet. Support has been added for the next release on master. You need to use master if you need YOLOv4 support.

@sky-fly97 commented Jun 27, 2020

When compiling the latest master version, I followed the same process but there were still some problems that caused the compilation to fail, which I did not encounter with the 4.2 and 4.3 versions... Could you please share your compiled OpenCV build? Thank you very much! It doesn't matter if it's inconvenient; I will study it again!

@YashasSamaga (owner) commented Jun 27, 2020

@sky-fly97 What is the error that is causing the compilation to fail?

@sky-fly97 commented Jun 27, 2020

I just asked this question in the community: https://github.com/opencv/opencv/issues/17677
The error is: Cannot specify link libraries for target "opencv_gapi"

@YashasSamaga (owner) commented Jun 27, 2020

@sky-fly97

If you only need DNN with CUDA support, you need the following modules:

  • cudev
  • opencv_core
  • opencv_dnn
  • opencv_imgproc

You might also require the following to read/write/display images and videos:

  • opencv_imgcodecs
  • opencv_highgui
  • opencv_videoio

You can disable the rest.
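
For example, a CMake invocation restricted to those modules might look like this (a sketch; BUILD_LIST is the stock OpenCV mechanism for limiting built modules, and the opencv_contrib path is an assumption about your checkout layout):

cmake -DBUILD_LIST=cudev,core,dnn,imgproc,imgcodecs,highgui,videoio \
      -DWITH_CUDA=ON -DWITH_CUDNN=ON -DOPENCV_DNN_CUDA=ON \
      -DOPENCV_EXTRA_MODULES_PATH=../opencv_contrib/modules ..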

@hlacik commented Jul 17, 2020

Thank you very much for this. I have one question:
I understand that DNN_TARGET_CUDA is faster than DNN_TARGET_CUDA_FP16 on a GTX 1080 (since it has no half-precision cores),
but why does the same apply to the Jetson Nano?

DNN_TARGET_CUDA gives ~17fps, while DNN_TARGET_CUDA_FP16 only ~1fps

I am using a custom-trained yolov4_tiny (Darknet) model.

@YashasSamaga (owner) commented Jul 17, 2020

but why does the same apply to the Jetson Nano?

That's unusual. How did you measure the FPS? Note that the first few forward passes will be slow due to lazy initialization.

@hlacik commented Jul 17, 2020

Your unmodified yolov4.py Python script; OpenCV compiled yesterday from git (master branch) with CUDA and cuDNN, on Jetson JetPack 4.4 (CUDA 10.2, cuDNN 8).

Could it be an issue with cuDNN 8? Should I try the older cuDNN 7?

net = cv2.dnn.readNet("yolov4-tiny_lp_final.weights", "yolov4-tiny_lp.cfg")
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA_FP16)

result :

FPS: 0.22 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.90 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.90 (excluding drawing time of 0.01ms)
FPS: 0.90 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.90 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)
FPS: 0.91 (excluding drawing time of 0.01ms)

and

net = cv2.dnn.readNet("yolov4-tiny_lp_final.weights", "yolov4-tiny_lp.cfg")
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)

result

FPS: 0.31 (excluding drawing time of 0.02ms)
FPS: 16.19 (excluding drawing time of 0.01ms)
FPS: 16.98 (excluding drawing time of 0.01ms)
FPS: 17.39 (excluding drawing time of 0.01ms)
FPS: 17.40 (excluding drawing time of 0.01ms)
FPS: 17.76 (excluding drawing time of 0.01ms)
FPS: 17.96 (excluding drawing time of 0.01ms)
FPS: 17.90 (excluding drawing time of 0.01ms)
FPS: 17.82 (excluding drawing time of 0.01ms)
FPS: 17.77 (excluding drawing time of 0.01ms)
FPS: 17.52 (excluding drawing time of 0.01ms)
FPS: 17.61 (excluding drawing time of 0.01ms)
FPS: 17.87 (excluding drawing time of 0.01ms)
FPS: 17.75 (excluding drawing time of 0.01ms)
FPS: 17.70 (excluding drawing time of 0.01ms)
FPS: 17.50 (excluding drawing time of 0.01ms)
FPS: 17.61 (excluding drawing time of 0.01ms)
FPS: 17.56 (excluding drawing time of 0.01ms)
FPS: 17.80 (excluding drawing time of 0.01ms)
FPS: 17.95 (excluding drawing time of 0.01ms)
FPS: 17.78 (excluding drawing time of 0.01ms)
FPS: 17.89 (excluding drawing time of 0.01ms)
FPS: 17.87 (excluding drawing time of 0.01ms)
FPS: 17.83 (excluding drawing time of 0.01ms)
FPS: 17.91 (excluding drawing time of 0.01ms)
FPS: 17.66 (excluding drawing time of 0.01ms)

@YashasSamaga (owner) commented Jul 17, 2020

@hlacik Let's continue the discussion at AlexeyAB/darknet#6245

@hlacik commented Jul 19, 2020

@YashasSamaga Sorry for disturbing again. I am searching for DNN documentation and have not found many samples/guides on the official OpenCV site or repo; I understand this is because DNN is new. Will yolov4.py be added as an example to OpenCV? Where should I search for more usage examples?
I have found and read through your gists, which give a brief introduction for sure.

@YashasSamaga (owner) commented Jul 19, 2020

@hlacik There are several samples here: https://github.com/opencv/opencv/tree/master/samples/dnn

The DNN module is several years old but the CUDA backend is just around six months old. All the old samples work with CUDA backend too. Only the target and backend have to be set.

The samples don't seem to be updated to use the high-level DL API.

@kmsravindra commented Jul 29, 2020

@YashasSamaga, you have published a Python version and a C++ version of YOLO. Is there any FPS/inference time comparison (from capture to display) for these two versions? If there is a link that you have published, please point me to it.
Are the benchmarks that you published here (or on the AlexeyAB repo) from capture to display? If so, I am confused whether they were obtained using the C++ version or the Python version.

@YashasSamaga (owner) commented Jul 29, 2020

@kmsravindra

Is there any FPS / inference time comparison (from capture to display) for these two different versions ?

The C++ version reports the inference and total time (preprocessing + inference + NMS + postprocessing). The Python version reports the total time only. Capturing and display are not included in either version. There is no comparison that includes capturing and display, as these vary across systems and can be hidden with pipelines; they are therefore often not considered in benchmarks.

Are the benchmarks that you published here (or on alexeyAB repo) are from capture to display? If so, I am confused if they were obtained by using c++ version or python version?

The benchmark results I published are here. The numbers you see in Darknet's ReadMe were taken from there. These numbers were obtained using different C++ code which measures the inference time only.

You may be able to find more benchmark results from other users here: AlexeyAB/darknet#6245

@kmsravindra commented Jul 29, 2020

@YashasSamaga, Thanks for the links.

  1. It would be very helpful to understand what results in this higher performance of opencv-dnn over darknet. Is it TensorRT-type optimizations? Could you please share any article/blog on the same?
  2. Does opencv-dnn target-cuda need tensor cores? Or can I get such performance improvements even on a GTX 1080 Ti (which doesn't have tensor cores)?
  3. Perhaps you are aware of the DeepStream SDK from NVIDIA, which claims to reduce memcpy between CPU and GPU while processing video streams. DeepStream claims high throughput and low latency due to these optimizations. Is there any plan/chance that the OpenCV capture-to-display pipeline could do something similar to reduce latency further?
@YashasSamaga (owner) commented Jul 29, 2020

@kmsravindra

It will be very helpful to understand what results in this higher performance using opencv-dnn over darknet? Is it tensorRT type of optimizations? Could you please share any article / blog on the same?

There is no article or blog post yet. CUDA support for OpenCV DNN was first released in 4.2.0, just seven months ago. Maybe I'll write one someday.

OpenCV DNN has a set of backend agnostic optimizations such as fusing linear operators like batch normalization and scaling with convolution. Then there are the backend-specific optimizations such as fusing activation and elementwise layers (like residual connections) with convolution. Further optimizations are applied at each operation to reduce tensor ranks or reduce expensive operations like transpose or slice to copy. Finally, the CUDA code at the lowest level is optimized at the instruction level based on profiling results.
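
To make the linear-operator fusion concrete: a convolution z = w*x + b followed by batch normalization y = gamma * (z - mu) / sqrt(var + eps) + beta can be folded offline into a single convolution with w' = w * gamma / sqrt(var + eps) and b' = (b - mu) * gamma / sqrt(var + eps) + beta, so the batch-norm layer disappears entirely at inference time.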

I don't have much idea about the internals of Darknet. It does a few of the optimizations such as fusing batch normalisation with convolution but is not as rigorous as OpenCV.

TensorRT applies much more optimizations in addition to the ones listed above. OpenCV does not support TensorRT yet.

Does opencv-dnn taget-cuda need tensor cores? OR can I get such performance improvements even on GTX 1080Ti ( doesn't have tensor cores)?

It's optional. OpenCV DNN works on all devices with compute capability 3.0 and above. It uses TensorCores in the FP16 target (DNN_TARGET_CUDA_FP16) if available. TensorCores are only used for convolution. The optimizations listed in the previous section are always applied on all devices.

Perhaps you might be aware of deepstream SDK from nvidia that claims to reduce the memcpy between cpu and gpu while processing the video streams. Deepstream claims high throughput and low latency due to these optimizations. Is there any plan / chance that opencv capture to display process could also do a similar thing to reduce the latency further?

These features are not available in OpenCV as of today. You can open a feature request issue at OpenCV's repository.

@YashasSamaga (owner) commented Aug 27, 2020

@marvision-ai

Scenario 1:

load model A
model A inference (for some time)
unload model A from GPU memory
Load model b
model B Inference (for some time)
unload model B from GPU memory

This would be insanely slow because OpenCV will have to reinitialize for every forward pass.

Scenario 2 (I ask this one because I am not sure if it's memory-safe to load both at the same time)

load model A
load model B
model A Inference
model B inference

This is safe and the initialization is done just once.

@YashasSamaga (owner) commented Aug 27, 2020

Doesn't matter how slow. For instance: I am running all of these on the Jetson Xavier. It does not have enough memory to load all 5 different models I am using for different cases simultaneously, does it?

I think it would be nice to have an API to control memory usage: prefer fastest, prefer the least memory, some middle ground option, etc.

If I want to do task A with model A and then switch to task B with model B and I have a few seconds between each task, that is fine as long as I know that the memory is being deallocated properly.

You need to destroy the object to release the resources owned by the network.

net = cv2.dnn.readNet(...)
.
.
.
del net

For Scenerio 2 for instance: Is this correct?

Yes

Is it possible loading both in memory will decrease overall FPS of each since they are both loaded into a shared GPU resource? Or does it not work like that since I am running them in one after the other (series not parallel)?

FPS will not change in series configuration. The FPS of individual networks might drop in parallel configuration but the overall throughput will be higher than series configuration (i.e. both latency and throughput increase).

It might be more efficient to use multiple models as this can keep the GPU saturated with work when you are doing the CPU part of the work for some other model.

@YashasSamaga (owner) commented Aug 27, 2020

I am not sure what the limit is on the Xavier for the number of models it can load, but if I knew a way to probe the available remaining space, that would make it easier.

You can check this using nvidia-smi.
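
For example (a standard nvidia-smi query; the exact fields available can vary with driver version):

nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv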

@marvision-ai commented Sep 4, 2020

@YashasSamaga few more quick questions:

  1. When you increase NMS_THRESHOLD, you get more faulty detections and multiple boxes per object... it seems like 0.4 is the sweet spot. Could you explain why?
  2. Say for instance I have input images @ 1920x1080 pix. I am running inference on them with a model of 608x608.
    Those images will be resized from 1920x1080 --> 608x608 --> and then I am returned boxes from the detections.
    The boxes I am returned are scaled to the original non-resized image inside that function correct? Just want to confirm.
@YashasSamaga (owner) commented Sep 4, 2020

@marvision-ai

When you increase NMS_Threshold - you get more faulty detections and multiple boxes per object.... it seems like 0.4 is the sweet spot. Could you explain why?

I think this value is empirically derived. This varies from model to model.

Say for instance I have input images @ 1920x1080 pix. I am running inference on them with a model of 608x608.
Those images will be resized from 1920x1080 --> 608x608 --> and then I am returned boxes from the detections.
The boxes I am returned are scaled to the original non-resized image inside that function correct? Just want to confirm

Yes.

@26medias commented Sep 9, 2020

Do you have the list of classes to share?
I downloaded one but I'm a "thing", bottles are "prickly pears", glasses are "pawpaw", whatever that is...
I can't find the proper class list for V4.

Thanks!

@piotr-ost commented Sep 18, 2020

Awesome stuff man, I didn't know about the dnn_DetectionModel class. Thanks!

@jhonjam commented Oct 4, 2020

(quoting @hlacik's Jetson Nano benchmark comment from above)

What version of OpenCV are you using? I get this error:

Unsupported activation: mish in function 'cv::dnn::darknet::ReadDarknetFromCfgStream'

@YashasSamaga (owner) commented Oct 4, 2020

Unsupported activation: mish in function 'cv::dnn::darknet::ReadDarknetFromCfgStream'

@jhonjam You need OpenCV 4.4 or above.

@marvision-ai commented Oct 14, 2020

@YashasSamaga Hi again,

I just wanted to ask a quick question: does this implementation of opencv support batched inference? I am not sure if you responded to this or answered this somewhere else.
If it does, how would one do it in the code you provided?

@YashasSamaga (owner) commented Oct 14, 2020

does this implementation of opencv support batched inference? I am not sure if you responded to this or answered this somewhere else.
If it does, how would one do it in the code you provided?

Yes, it supports batch inference but it's not supported by the high level API. You have to manually do the preprocessing/postprocessing and call Net::forward (just like in the C++ example). Please check opencv/opencv#17838.
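
A minimal C++ batching sketch (assuming net and source are set up as in the gist above; the batched output layout follows the discussion in the linked issue):

// pack several frames into one NCHW blob and run a single forward pass
std::vector<cv::Mat> frames;
cv::Mat f;
while (frames.size() < 4 && source.read(f))
    frames.push_back(f.clone());

cv::Mat blob;
cv::dnn::blobFromImages(frames, blob, 1/255.0, cv::Size(416, 416), cv::Scalar(), true, false);
net.setInput(blob);

std::vector<cv::Mat> detections;
net.forward(detections, net.getUnconnectedOutLayersNames());
// each output Mat now covers the whole batch and must be sliced per image
// during postprocessing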

@arnaud-nt2i commented Dec 1, 2020

@YashasSamaga Hi!
While trying to use the C++ code I got this error:
OpenCV(4.5.0) Error: Parsing error (Unknown layer type: sam) in cv::dnn::darknet::ReadDarknetFromCfgStream, file C:\opencv-4.5.0\modules\dnn\src\darknet\darknet_io.cpp, line 865

There are indeed some SAM layers in my .cfg. Is there a way to use OpenCV DNN without retraining the network without the SAM layers?

@YashasSamaga (owner) commented Dec 2, 2020

@arnaud-nt2i Please open a feature request issue at the OpenCV repository.

@pablodz commented Dec 9, 2020

Hello there, check out this repo: I added a dockerized yolov4-tiny (fish detection only) with Streamlit, based on a repo by Cuda Chen.
https://github.com/DZPeru/fishv4
https://fishv4.herokuapp.com/

@mmustafa0601 commented Mar 2, 2021

How can we change this from videos to images? I have a folder of images that I need to run YOLOv4 on. Thanks.

@YashasSamaga (owner) commented Mar 2, 2021

@mmustafa0601 The demo scripts load frames from videos using cv::VideoCapture. You have to load frames using cv::imread. The DNN part of the code remains exactly the same.

@mmustafa0601 commented Mar 2, 2021

Thanks for the reply. That I know. The thing is I have to read from a folder full of images. Can you give me the line of code I should use? I am getting an error the way I am using imread. Really appreciate the help!

@YashasSamaga (owner) commented Mar 3, 2021

@mmustafa0601 Are you writing the code in Python or C++?

import os
import cv2

images = [f.path for f in os.scandir("path/to/images/dir")]
for path in images:
    frame = cv2.imread(path)
    cv2.imshow('frame', frame)
    cv2.waitKey()  # wait for a key press before moving to the next image

What error do you get? What format are your images in?

@mmustafa0601 commented Mar 3, 2021

I am using C++. I would appreciate it if you could share the relevant code in C++. I am relatively new to both C++ and OpenCV; that's why I am having trouble with it. Thanks for the assistance.

@jasonbeach commented Mar 5, 2021

@mmustafa0601, the VideoCapture object supports loading multiple images from a single folder. I haven't tried it, but OpenCV's documentation makes it look like it's as simple as providing something like path_to_images/img_%02d.jpg (for images named img_00.jpg, img_01.jpg, img_02.jpg, ...) as the filename argument to VideoCapture and then using it as you would for a video, as sketched below.
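
A sketch of that usage (the pattern string is illustrative):

cv::VideoCapture source("path_to_images/img_%02d.jpg");
cv::Mat frame;
while (source.read(frame))
{
    // run the same detection code on frame as for video input
}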

@jasonbeach commented Mar 5, 2021

Just curious what framerate you are able to get. I built OpenCV 4.5.1 from source with CUDA enabled and was getting the warning setUpNet DNN module was not built with CUDA backend; switching to CPU, and was processing frames at a rate of 1-2 fps. nvidia-smi confirmed the GPU wasn't being used.

I figured out I needed to set WITH_CUDNN=ON and OPENCV_DNN_CUDA=ON in cmake. After rebuilding and reinstalling, I no longer got the error and nvidia-smi indicated the GPU was indeed being used, but I was still only getting 1-2 fps. My laptop GPU isn't great (an MX150) but I would have thought I'd get at least a bit of a bump. The video I'm feeding it is 720p. I'm just using the stock yolov4 network and weights.

@YashasSamaga (owner) commented Mar 5, 2021

@jasonbeach Please try with DNN_TARGET_CUDA. Your GPU does not provide high FP16 performance.

@jasonbeach commented Mar 5, 2021

@jasonbeach Please try with DNN_TARGET_CUDA. Your GPU does not provide high FP16 performance.

Yes I have:

    net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);
@YashasSamaga (owner) commented Mar 5, 2021

@jasonbeach Some quick points to note:

  1. the first forward pass is slow
  2. 720p as input directly to the DNN is expected to be slow (but I believe that you're resizing manually or letting OpenCV DNN automatically resize input to something smaller, say 416x416)
  3. YOLOv4 gives around 12 FPS for 416x416 on GTX 1050. It should be lower for MX150.

If nothing works, you might have to switch to YOLOv4-Tiny or other faster variants.
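
Switching to the tiny variant only changes the file names passed to readNetFromDarknet (names as released with Darknet):

auto net = cv::dnn::readNetFromDarknet("yolov4-tiny.cfg", "yolov4-tiny.weights");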

@shuternay commented Mar 21, 2021

It seems you also need to swap R and B channels: model.setInputParams(size=(416, 416), scale=1/255, swapRB=True). Without it I get predictions that differ from darknet's.
See https://docs.opencv.org/master/da/d9d/tutorial_dnn_yolo.html and https://github.com/opencv/opencv/blob/master/samples/dnn/object_detection.py#L265

@YashasSamaga (owner) commented Mar 22, 2021

@shuternay Thanks for pointing it out. I have fixed it.

@vvmatorin commented May 7, 2021

@YashasSamaga thank you very much for sharing this code!

I wonder if you could help me with an issue I have discovered using it: I have been using the Python code for image detection, and I get some misdetections compared to the original darknet model (trained with a custom .cfg, changing only the image size, number of iterations, and classes) on bboxes with low confidence.

My model size is 832x832 (original images are smaller and I am not resizing them manually), I have updated the input parameters, and I have tried setting CONFIDENCE_THRESHOLD in [0, 0.2] and changing NMS_THRESHOLD between [0.2, 0.75]. Yet it never detects bboxes that the original darknet detects with threshold = 0.1.

Below are the images (same cfg and weights):

  1. darknet detector test obj.data yolov4-obj.cfg yolov4.weights image.jpg -ext_output -dont_show -thresh 0.1
  2. opencv code above with: CONFIDENCE_THRESHOLD=0 and NMS_THRESHOLD=0.4

@YashasSamaga (owner) commented May 7, 2021

@vvmatorin I think this might be a problem with your model since it works with the original darknet model. Does your model work in darknet?

@vvmatorin commented May 7, 2021

@vvmatorin I think this might be a problem with your model since it works with the original darknet model. Does your model work in darknet?

Sorry, I might have phrased my issue wrong.
Basically, I trained the darknet YOLOv4 model with my custom data and configuration to recognize a single class.

Then I ran inference on the same image as above with the AlexeyAB library, using my final weights/config as input (setting the confidence threshold to 0.1), and got the output shown in the first image (command at p.1).
Finally, I ran inference on the same image using the same weights/config, but with the OpenCV dnn module, and got different results (losing the low-confidence predictions).

@YashasSamaga (owner) commented May 8, 2021

@vvmatorin

Yet it can never detect bboxes, that original darknet detects with threshold = 0.1

Can you recheck if your input is correct? BGR or RGB? Normalized image?

Can you check if (DNN_BACKEND_OPENCV, DNN_TARGET_CPU) gives correct detections? If it does, then there might be an issue with the CUDA backend; otherwise, it might be an issue in your preprocessing or postprocessing (or you used some unsupported layer). The official YOLOv4 model is fully supported in both the CUDA and OCV CPU backends. If you have added a new layer or altered the settings of some layer, then you might have to check whether OpenCV supports those layers. Sometimes the DNN module won't diagnose unsupported layers; it will simply give wrong outputs.
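
For that CPU check, the Python equivalent of the commented-out C++ lines in the gist would be:

net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)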

@vvmatorin commented May 8, 2021

@YashasSamaga

Can you recheck if your input is correct? BGR or RGB? normalized image?

For preprocessing I use the same code as you shared (which should do resizing, swapping channels and normalization for input):
model.setInputParams(size=(832, 832), scale=1/255, swapRB=True, crop=False)

You can find the full code snippet attached (I'm using OpenCV 4.5.1):

Can you check if (DNN_BACKEND_OPENCV, DNN_TARGET_CPU) gives correct detections?

The confidence and bboxes for both (OPENCV+CPU and CUDA) backends are exactly the same as above (different from darknet).

The official YOLOv4 model is fully supported in both CUDA and OCV CPU backends. If you have added a new layer or alter the settings of some layer, then you might have to check if OpenCV supports those layers. Sometimes the DNN module won't diagnose unsupported layers. It will simply give wrong outputs.

I am using the official config for both training and inference without making any changes to it apart from the ones below (but changing input size and number of classes should be supported):

[net]
subdivisions=64
width=832
height=832

[convolutional]
filters=18

[yolo]
classes=1

Could it be something related to how resizing is done, .jpg format or having only a single class? I see people are mentioning the same issue here.

@YashasSamaga (owner) commented May 8, 2021

@vvmatorin Can you check if the outputs match exactly with NMS disabled in both Darknet and OpenCV?

@vvmatorin commented May 8, 2021

@vvmatorin Can you check if the outputs match exactly with NMS disabled in both Darknet and OpenCV?

@YashasSamaga I set:

  • nms = 0 for darknet, with confidence threshold = 0.1
  • NMS_THRESHOLD = 0 for OpenCV, with CONFIDENCE_THRESHOLD = 0.1

The outputs don't match, as the OpenCV output has fewer bounding boxes overall. Results are below:

@marvision-ai commented May 12, 2021

@vvmatorin & @YashasSamaga
Has this been discussed more yet? I actually saw similar results in the past but I didn't think much of it. This conversation is now making me curious if my networks are behaving the same. I am very keen to see what results @vvmatorin has to offer.
Thank you!
