From 372b36c1d3e1af1bc879096de10872599c5f7eab Mon Sep 17 00:00:00 2001
From: Abduragim Shtanchaev <44877829+Abdurrahheem@users.noreply.github.com>
Date: Wed, 31 Jan 2024 10:46:58 +0400
Subject: [PATCH] Merge pull request #24898 from Abdurrahheem:ash/yolo_ducumentation

Documentation for YOLO usage in OpenCV #24898

This PR introduces documentation for the usage of the YOLO detection model family in OpenCV.
It is not to be merged before #24691, as the sample will need to be changed.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable.
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
---
 doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown | 226 +++++++++--
 samples/dnn/yolo_detector.cpp                | 370 +++++++++++++++++++
 2 files changed, 572 insertions(+), 24 deletions(-)
 create mode 100644 samples/dnn/yolo_detector.cpp

diff --git a/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown b/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown
index 7db3b797e9..a2d4b2a306 100644
--- a/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown
+++ b/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown
@@ -9,46 +9,224 @@ YOLO DNNs {#tutorial_dnn_yolo}
 
 | | |
 | -: | :- |
 | Original author | Alessandro de Oliveira Faria |
-| Compatibility | OpenCV >= 3.3.1 |
+| Extended by | Abduragim Shtanchaev |
+| Compatibility | OpenCV >= 4.9.0 |
 
-Introduction
-------------
-In this text you will learn how to use opencv_dnn module using yolo_object_detection (Sample of using OpenCV dnn module in real time with device capture, video and image).
+Running pre-trained YOLO model in OpenCV
+----------------------------------------
 
-We will demonstrate results of this example on the following picture.
-![Picture example](images/yolo.jpg)
+Deploying pre-trained models is a common task in machine learning, particularly when working with
+hardware that does not support certain frameworks like PyTorch. This guide provides a comprehensive
+overview of exporting pre-trained YOLO family models from PyTorch and deploying them using OpenCV's
+DNN framework. For demonstration purposes, we will focus on the [YOLOX](https://github.com/Megvii-BaseDetection/YOLOX/blob/main)
+model, but the methodology applies to other supported models.
 
-Examples
---------
+@note Currently, OpenCV supports the following YOLO models:
+- [YOLOX](https://github.com/Megvii-BaseDetection/YOLOX/blob/main),
+- [YoloNas](https://github.com/Deci-AI/super-gradients/tree/master),
+- [YOLOv8](https://github.com/ultralytics/ultralytics/tree/main),
+- [YOLOv7](https://github.com/WongKinYiu/yolov7/tree/main),
+- [YOLOv6](https://github.com/meituan/YOLOv6/blob/main),
+- [YOLOv5](https://github.com/ultralytics/yolov5),
+- [YOLOv4](https://github.com/Tianxiaomo/pytorch-YOLOv4).
+
+This support includes pre- and post-processing routines specific to these models. While other, older
+versions of YOLO are also supported by OpenCV in Darknet format, they are out of the scope of this tutorial.
+
+
+Assuming that we have a successfully trained YOLOX model, the subsequent step involves exporting and
+running this model with OpenCV. There are several critical considerations to address before
+proceeding with this process. Let's delve into these aspects.
+
+### YOLO's Pre-processing & Output
+
+Understanding the nature of inputs and outputs associated with YOLO family detectors is pivotal.
+These detectors, akin to most Deep Neural Networks (DNNs), typically exhibit variation in input
+sizes contingent upon the model's scale.
+
+| Model Scale | Input Size |
+|--------------|--------------|
+| Small Models [1](https://github.com/Megvii-BaseDetection/YOLOX/tree/main#standard-models)| 416x416 |
+| Midsize Models [2](https://github.com/Megvii-BaseDetection/YOLOX/tree/main#standard-models)| 640x640 |
+| Large Models [3](https://github.com/meituan/YOLOv6/tree/main#benchmark)| 1280x1280 |
+
+This table provides a quick reference to the input dimensions commonly used by various YOLO models.
+These are the standard input shapes. Make sure you use the input size that the model was trained with
+if it differs from the sizes mentioned in the table.
+
+The next critical element in the process involves understanding the specifics of image pre-processing
+for YOLO detectors. While the fundamental pre-processing approach remains consistent across the YOLO
+family, there are subtle yet crucial differences that must be accounted for to avoid any degradation
+in performance. Key among these are the `resize type` and the `padding value` applied post-resize.
+For instance, the [YOLOX model](https://github.com/Megvii-BaseDetection/YOLOX/blob/ac58e0a5e68e57454b7b9ac822aced493b553c53/yolox/data/data_augment.py#L142)
+utilizes a `LetterBox` resize method and a padding value of `114.0`. It is imperative to ensure that
+these parameters, along with the normalization constants, are appropriately matched to the model being
+exported.
+
+Regarding the model's output, it typically takes the form of a tensor with dimensions [BxNxC+5] or
+[BxNxC+4], where 'B' represents the batch size, 'N' denotes the number of anchors, and 'C' signifies
+the number of classes (for instance, 80 classes if the model is trained on the COCO dataset).
+The additional 5 in the former tensor structure corresponds to the bounding box coordinates
+(cx, cy, w, h) and the objectness score (obj); the per-class confidence scores make up the remaining
+'C' entries. Notably, the YOLOv8 model's output is shaped as [BxNxC+4], where there is no explicit
+objectness score, and the object score is directly inferred from the class scores. For the YOLOX model,
+specifically, it is also necessary to incorporate anchor points to rescale predictions back to the
+image domain. This step will be integrated into the ONNX graph, a process that we will detail further
+in the subsequent sections.
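+
+To make the pre-processing and the output layout above concrete, here is a minimal, illustrative
+NumPy sketch of a YOLOX-style `LetterBox` resize (padding value `114`) and of decoding a [BxNxC+5]
+output. The function names and the confidence threshold are hypothetical and are not part of the
+OpenCV API or of the sample shipped with this tutorial:
+
+@code{.py}
+import cv2
+import numpy as np
+
+def letterbox(img, new_size=(640, 640), pad_value=114):
+    # Resize while preserving the aspect ratio, then pad bottom/right with `pad_value`
+    h, w = img.shape[:2]
+    r = min(new_size[0] / h, new_size[1] / w)
+    nh, nw = int(round(h * r)), int(round(w * r))
+    resized = cv2.resize(img, (nw, nh), interpolation=cv2.INTER_LINEAR)
+    padded = np.full((new_size[0], new_size[1], 3), pad_value, dtype=np.uint8)
+    padded[:nh, :nw] = resized
+    return padded, r
+
+def decode_output(preds, conf_threshold=0.5):
+    # preds: [B x N x C+5] with rows (cx, cy, w, h, obj, class scores...)
+    preds = preds[0]                        # first batch element: [N, C+5]
+    obj = preds[:, 4]
+    cls_scores = preds[:, 5:]
+    class_ids = cls_scores.argmax(axis=1)
+    scores = obj * cls_scores.max(axis=1)   # final confidence = objectness * class score
+    keep = scores > conf_threshold
+    cx, cy, w, h = preds[keep, 0], preds[keep, 1], preds[keep, 2], preds[keep, 3]
+    # (cx, cy, w, h) -> (x1, y1, x2, y2); dividing by `r` maps boxes back to the original image
+    boxes = np.stack([cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2], axis=1)
+    return boxes, scores[keep], class_ids[keep]
+@endcode
+
+Non-maximum suppression over the surviving boxes (e.g. with `cv2.dnn.NMSBoxes`) is still required,
+exactly as the OpenCV sample described later in this tutorial does.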
+
+### PyTorch Model Export
+
+Now that we know the parameters of the pre-processing, we can go on and export the model from
+PyTorch to an ONNX graph. Since in this tutorial we are using YOLOX as our sample model, let's use its
+export for demonstration purposes (the process is identical for the rest of the YOLO detectors).
+To export YOLOX we can simply use the [export script](https://github.com/Megvii-BaseDetection/YOLOX/blob/ac58e0a5e68e57454b7b9ac822aced493b553c53/tools/export_onnx.py).
+In particular, we need the following commands:
+
+@code{.bash}
+git clone https://github.com/Megvii-BaseDetection/YOLOX.git
+cd YOLOX
+wget https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_s.pth # download pre-trained weights
+python3 -m tools.export_onnx --output-name yolox_s.onnx -n yolox-s -c yolox_s.pth --decode_in_inference
+@endcode
+
+**NOTE:** Here `--decode_in_inference` is to include anchor box creation in the ONNX graph itself.
+It sets [this value](https://github.com/Megvii-BaseDetection/YOLOX/blob/ac58e0a5e68e57454b7b9ac822aced493b553c53/yolox/models/yolo_head.py#L210C16-L210C39)
+to `True`, which subsequently includes the anchor generation function in the graph.
+
+Below we demonstrate a minimal version of the export script (which could be used for models other
+than YOLOX) in case it is needed. However, usually each YOLO repository has a predefined export script.
+
+@code{.py}
+    import onnx
+    import torch
+    from onnxsim import simplify
+
+    # load the model state dict
+    ckpt = torch.load(ckpt_file, map_location="cpu")
+    model.load_state_dict(ckpt)
+
+    # prepare dummy input
+    dummy_input = torch.randn(args.batch_size, 3, exp.test_size[0], exp.test_size[1])
+
+    # export the model
+    torch.onnx._export(
+        model,
+        dummy_input,
+        "yolox.onnx",
+        input_names=["input"],
+        output_names=["output"],
+        dynamic_axes={"input": {0: 'batch'},
+                      "output": {0: 'batch'}})
+
+    # use onnx-simplifier to remove redundant nodes from the model
+    onnx_model = onnx.load(args.output_name)
+    model_simp, check = simplify(onnx_model)
+    assert check, "Simplified ONNX model could not be validated"
+    onnx.save(model_simp, args.output_name)
+@endcode
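+
+Before wiring the exported graph into the C++ sample below, it can be useful to sanity-check it
+directly from Python. The following minimal sketch only verifies that OpenCV can load and run the
+graph; `input.jpg` is a placeholder, and the simplified pre-processing here (plain resize, no
+normalization) is intentionally rough. For real inference the parameters must match the
+pre-processing discussed above:
+
+@code{.py}
+import cv2
+
+net = cv2.dnn.readNet("yolox_s.onnx")                      # load the exported ONNX graph
+img = cv2.imread("input.jpg")
+blob = cv2.dnn.blobFromImage(img, scalefactor=1.0, size=(640, 640))  # NCHW float blob
+net.setInput(blob)
+outs = net.forward(net.getUnconnectedOutLayersNames())
+print([o.shape for o in outs])                             # expect something like (1, 8400, 85)
+@endcode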
+
+### Running Yolo ONNX detector with OpenCV Sample
+
+Once we have the ONNX graph of the model, we can simply run it with OpenCV's sample. To do that we need to make sure that:
+
+1. OpenCV is built with the `-DBUILD_EXAMPLES=ON` flag.
+2. You navigate to OpenCV's `build` directory.
+3. You run the following command:
+
+@code{.bash}
+./bin/example_dnn_yolo_detector --input=<path_to_input_file> \
+                                --classes=<path_to_class_names_file> \
+                                --thr=<confidence_threshold> \
+                                --nms=<non_maximum_suppression_threshold> \
+                                --mean=<mean_normalization_value> \
+                                --scale=<scale_factor> \
+                                --yolo=<yolo_model_version> \
+                                --padvalue=<padding_value> \
+                                --paddingmode=<padding_mode> \
+                                --backend=<computation_backend> \
+                                --target=<target_computation_device>
+@endcode
 
 VIDEO DEMO:
 @youtube{NHtRlndE2cg}
 
-Source Code
------------
+- --input: File path to your input image or video. If omitted, the sample will capture frames from a camera.
+- --classes: File path to a text file containing class names for object detection.
+- --thr: Confidence threshold for detection (e.g., 0.5).
+- --nms: Non-maximum suppression threshold (e.g., 0.4).
+- --mean: Mean normalization value (e.g., 0.0 for no mean normalization).
+- --scale: Scale factor for input normalization (e.g., 1.0).
+- --yolo: YOLO model version (e.g., yolox, yolov8, etc.).
+- --padvalue: Padding value used in pre-processing (e.g., 114.0).
+- --paddingmode: Method for handling image resizing and padding. Options: 0 (resize without extra processing), 1 (crop after resize), 2 (resize with aspect ratio preservation).
+- --backend: Selection of computation backend (0 for automatic, 1 for Halide, 2 for OpenVINO, etc.).
+- --target: Selection of target computation device (0 for CPU, 1 for OpenCL, etc.).
+- --device: Camera device number (0 for default camera). If `--input` is not provided, the camera with index 0 will be used by default.
 
-Use a universal sample for object detection models written
-[in C++](https://github.com/opencv/opencv/blob/4.x/samples/dnn/object_detection.cpp) and
-[in Python](https://github.com/opencv/opencv/blob/4.x/samples/dnn/object_detection.py) languages
+Here `mean`, `scale`, `padvalue` and `paddingmode` should exactly match those that we discussed
+in the pre-processing section in order for the model to match the results obtained in PyTorch.
 
-Usage examples
---------------
+To demonstrate how to run the OpenCV YOLO samples without your own pre-trained model, follow these instructions:
 
-Execute in webcam:
+1. Ensure Python is installed on your platform.
+2. Confirm that OpenCV is built with the `-DBUILD_EXAMPLES=ON` flag.
 
-@code{.bash}
-
-$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392 --rgb
+Run the YOLOX detector (with default values):
+@code{.sh}
+git clone https://github.com/opencv/opencv_extra.git
+cd opencv_extra/testdata/dnn
+python download_models.py yolox_s_inf_decoder
+cd ..
+export OPENCV_TEST_DATA_PATH=$(pwd)
+cd <build directory of OpenCV>
+./bin/example_dnn_yolo_detector
 @endcode
 
-Execute with image or video file:
+This will execute the YOLOX detector with your camera. For YOLOv8 (for instance), follow these additional steps:
 
-@code{.bash}
-
-$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392 --input=[PATH-TO-IMAGE-OR-VIDEO-FILE] --rgb
+@code{.sh}
+cd opencv_extra/testdata/dnn
+python download_models.py yolov8
+cd ..
+export OPENCV_TEST_DATA_PATH=$(pwd)
+cd <build directory of OpenCV>
+./bin/example_dnn_yolo_detector --model=onnx/models/yolov8n.onnx --yolo=yolov8 --mean=0.0 --scale=0.003921568627 --paddingmode=2 --padvalue=144.0 --thr=0.5 --nms=0.4 --rgb=0
 @endcode
 
-Questions and suggestions email to: Alessandro de Oliveira Faria cabelo@opensuse.org or OpenCV Team.
+
+### Building a Custom Pipeline
+
+Sometimes there is a need to make some custom adjustments in the inference pipeline. With the OpenCV DNN
+module this is also quite easy to achieve. Below we outline the sample implementation details:
+
+- Import required libraries
+
+@snippet samples/dnn/yolo_detector.cpp includes
+
+- Read ONNX graph and create the neural network model:
+
+@snippet samples/dnn/yolo_detector.cpp read_net
+
+- Read the image and pre-process it:
+
+@snippet samples/dnn/yolo_detector.cpp preprocess_params
+@snippet samples/dnn/yolo_detector.cpp preprocess_call
+@snippet samples/dnn/yolo_detector.cpp preprocess_call_func
+
+- Inference:
+
+@snippet samples/dnn/yolo_detector.cpp forward_buffers
+@snippet samples/dnn/yolo_detector.cpp forward
+
+- Post-Processing
+
+All post-processing steps are implemented in the function `yoloPostProcessing`. Please note
+that the NMS step is not included in the ONNX graph; the sample uses an OpenCV function for it.
+
+@snippet samples/dnn/yolo_detector.cpp postprocess
+
+- Draw predicted boxes
+
+@snippet samples/dnn/yolo_detector.cpp draw_boxes
diff --git a/samples/dnn/yolo_detector.cpp b/samples/dnn/yolo_detector.cpp
new file mode 100644
index 0000000000..b439b0d4bc
--- /dev/null
+++ b/samples/dnn/yolo_detector.cpp
@@ -0,0 +1,370 @@
+/**
+ * @file yolo_detector.cpp
+ * @brief Yolo Object Detection Sample
+ * @author OpenCV team
+ */
+
+//![includes]
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/imgcodecs.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/videoio.hpp>
+#include <opencv2/dnn.hpp>
+#include <fstream>
+#include <iostream>
+#include "common.hpp"
+//![includes]
+
+using namespace cv;
+using namespace cv::dnn;
+
+void getClasses(std::string classesFile);
+void drawPrediction(int classId, float conf, int left, int top, int right, int bottom, Mat& frame);
+void yoloPostProcessing(
+    std::vector<Mat>& outs,
+    std::vector<int>& keep_classIds,
+    std::vector<float>& keep_confidences,
+    std::vector<Rect2d>& keep_boxes,
+    float conf_threshold,
+    float iou_threshold,
+    const std::string& test_name
+);
+
+std::vector<std::string> classes;
+
+
+std::string keys =
+    "{ help  h     |   | Print help message. }"
+    "{ device      | 0 | camera device number. }"
+    "{ model       | onnx/models/yolox_s_inf_decoder.onnx | Default model. }"
+    "{ yolo        | yolox | yolo model version. }"
+    "{ input i     |   | Path to input image or video file. Skip this argument to capture frames from a camera. }"
+    "{ classes     |   | Optional path to a text file with names of classes to label detected objects. }"
+    "{ thr         | .5 | Confidence threshold. }"
+    "{ nms         | .4 | Non-maximum suppression threshold. }"
+    "{ mean        | 0.0 | Normalization constant. }"
+    "{ scale       | 1.0 | Preprocess input image by multiplying on a scale factor. }"
+    "{ width       | 640 | Preprocess input image by resizing to a specific width. }"
+    "{ height      | 640 | Preprocess input image by resizing to a specific height. }"
+    "{ rgb         | 1 | Indicate that model works with RGB input images instead of BGR ones. }"
+    "{ padvalue    | 114.0 | padding value. }"
+    "{ paddingmode | 2 | Choose one of padding modes: "
+        "0: resize to required input size without extra processing, "
+        "1: Image will be cropped after resize, "
+        "2: Resize image to the desired size while preserving the aspect ratio of original image }"
+    "{ backend     | 0 | Choose one of computation backends: "
+        "0: automatically (by default), "
+        "1: Halide language (http://halide-lang.org/), "
+        "2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
+        "3: OpenCV implementation, "
+        "4: VKCOM, "
+        "5: CUDA }"
+    "{ target      | 0 | Choose one of target computation devices: "
+        "0: CPU target (by default), "
+        "1: OpenCL, "
+        "2: OpenCL fp16 (half-float precision), "
+        "3: VPU, "
+        "4: Vulkan, "
+        "6: CUDA, "
+        "7: CUDA fp16 (half-float precision) }"
+    "{ async       | 0 | Number of asynchronous forwards at the same time. "
+        "Choose 0 for synchronous mode }";
" + "Choose 0 for synchronous mode }"; + +void getClasses(std::string classesFile) +{ + std::ifstream ifs(classesFile.c_str()); + if (!ifs.is_open()) + CV_Error(Error::StsError, "File " + classesFile + " not found"); + std::string line; + while (std::getline(ifs, line)) + classes.push_back(line); +} + +void drawPrediction(int classId, float conf, int left, int top, int right, int bottom, Mat& frame) +{ + rectangle(frame, Point(left, top), Point(right, bottom), Scalar(0, 255, 0)); + + std::string label = format("%.2f", conf); + if (!classes.empty()) + { + CV_Assert(classId < (int)classes.size()); + label = classes[classId] + ": " + label; + } + + int baseLine; + Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); + + top = max(top, labelSize.height); + rectangle(frame, Point(left, top - labelSize.height), + Point(left + labelSize.width, top + baseLine), Scalar::all(255), FILLED); + putText(frame, label, Point(left, top), FONT_HERSHEY_SIMPLEX, 0.5, Scalar()); +} + +void yoloPostProcessing( + std::vector& outs, + std::vector& keep_classIds, + std::vector& keep_confidences, + std::vector& keep_boxes, + float conf_threshold, + float iou_threshold, + const std::string& test_name) +{ + // Retrieve + std::vector classIds; + std::vector confidences; + std::vector boxes; + + if (test_name == "yolov8") + { + cv::transposeND(outs[0], {0, 2, 1}, outs[0]); + } + + if (test_name == "yolonas") + { + // outs contains 2 elemets of shape [1, 8400, 80] and [1, 8400, 4]. Concat them to get [1, 8400, 84] + Mat concat_out; + // squeeze the first dimension + outs[0] = outs[0].reshape(1, outs[0].size[1]); + outs[1] = outs[1].reshape(1, outs[1].size[1]); + cv::hconcat(outs[1], outs[0], concat_out); + outs[0] = concat_out; + // remove the second element + outs.pop_back(); + // unsqueeze the first dimension + outs[0] = outs[0].reshape(0, std::vector{1, 8400, 84}); + } + + for (auto preds : outs) + { + preds = preds.reshape(1, preds.size[1]); // [1, 8400, 85] -> [8400, 85] + for (int i = 0; i < preds.rows; ++i) + { + // filter out non object + float obj_conf = (test_name == "yolov8" || test_name == "yolonas") ? 1.0f : preds.at(i, 4) ; + if (obj_conf < conf_threshold) + continue; + + Mat scores = preds.row(i).colRange((test_name == "yolov8" || test_name == "yolonas") ? 4 : 5, preds.cols); + double conf; + Point maxLoc; + minMaxLoc(scores, 0, &conf, 0, &maxLoc); + + conf = (test_name == "yolov8" || test_name == "yolonas") ? 
+
+/**
+ * @function main
+ * @brief Main function
+ */
+int main(int argc, char** argv)
+{
+    CommandLineParser parser(argc, argv, keys);
+    parser.about("Use this script to run object detection deep learning networks using OpenCV.");
+    if (parser.has("help"))
+    {
+        parser.printMessage();
+        return 0;
+    }
+
+    CV_Assert(parser.has("model"));
+    CV_Assert(parser.has("yolo"));
+    // if model is default, use findFile to get the full path otherwise use the given path
+    std::string weightPath = findFile(parser.get<String>("model"));
+    std::string yolo_model = parser.get<String>("yolo");
+
+    float confThreshold = parser.get<float>("thr");
+    float nmsThreshold = parser.get<float>("nms");
+    //![preprocess_params]
+    float paddingValue = parser.get<float>("padvalue");
+    bool swapRB = parser.get<bool>("rgb");
+    int inpWidth = parser.get<int>("width");
+    int inpHeight = parser.get<int>("height");
+    Scalar scale = parser.get<float>("scale");
+    Scalar mean = parser.get<float>("mean");
+    ImagePaddingMode paddingMode = static_cast<ImagePaddingMode>(parser.get<int>("paddingmode"));
+    //![preprocess_params]
+
+    // check if yolo model is valid
+    if (yolo_model != "yolov5" && yolo_model != "yolov6"
+        && yolo_model != "yolov7" && yolo_model != "yolov8"
+        && yolo_model != "yolox" && yolo_model != "yolonas")
+        CV_Error(Error::StsError, "Invalid yolo model: " + yolo_model);
+
+    // get classes
+    if (parser.has("classes"))
+    {
+        getClasses(findFile(parser.get<String>("classes")));
+    }
+
+    // load model
+    //![read_net]
+    Net net = readNet(weightPath);
+    int backend = parser.get<int>("backend");
+    net.setPreferableBackend(backend);
+    net.setPreferableTarget(parser.get<int>("target"));
+    //![read_net]
+
+    VideoCapture cap;
+    Mat img;
+    bool isImage = false;
+    bool isCamera = false;
+
+    // Check if input is given
+    if (parser.has("input"))
+    {
+        String input = parser.get<String>("input");
+        // Check if the input is an image
+        if (input.find(".jpg") != String::npos || input.find(".png") != String::npos)
+        {
+            img = imread(findFile(input));
+            if (img.empty())
+            {
+                CV_Error(Error::StsError, "Cannot read image file: " + input);
+            }
+            isImage = true;
+        }
+        else
+        {
+            cap.open(input);
+            if (!cap.isOpened())
+            {
+                CV_Error(Error::StsError, "Cannot open video " + input);
+            }
+            isCamera = true;
+        }
+    }
+    else
+    {
+        int cameraIndex = parser.get<int>("device");
+        cap.open(cameraIndex);
+        if (!cap.isOpened())
+        {
+            CV_Error(Error::StsError, cv::format("Cannot open camera #%d", cameraIndex));
+        }
+        isCamera = true;
+    }
+
+    // image pre-processing
+    //![preprocess_call]
+    Size size(inpWidth, inpHeight);
+    Image2BlobParams imgParams(
+        scale,
+        size,
+        mean,
+        swapRB,
+        CV_32F,
+        DNN_LAYOUT_NCHW,
+        paddingMode,
+        paddingValue);
+
+    // used later to rescale predicted boxes back to the original image
+    Image2BlobParams paramNet;
+    paramNet.scalefactor = scale;
+    paramNet.size = size;
+    paramNet.mean = mean;
+    paramNet.swapRB = swapRB;
+    paramNet.paddingmode = paddingMode;
+    //![preprocess_call]
+
+    //![forward_buffers]
+    std::vector<Mat> outs;
+    std::vector<int> keep_classIds;
+    std::vector<float> keep_confidences;
+    std::vector<Rect2d> keep_boxes;
+    std::vector<Rect> boxes;
+    //![forward_buffers]
+
+    Mat inp;
+    while (waitKey(1) < 0)
+    {
+        if (isCamera)
+            cap >> img;
+        if (img.empty())
+        {
+            std::cout << "Empty frame" << std::endl;
+            waitKey();
+            break;
+        }
+        //![preprocess_call_func]
+        inp = blobFromImageWithParams(img, imgParams);
+        //![preprocess_call_func]
+
+        //![forward]
+        net.setInput(inp);
+        net.forward(outs, net.getUnconnectedOutLayersNames());
+        //![forward]
+
+        //![postprocess]
+        yoloPostProcessing(
+            outs, keep_classIds, keep_confidences, keep_boxes,
+            confThreshold, nmsThreshold,
+            yolo_model);
+        //![postprocess]
+
+        // convert Rect2d to Rect (keep_boxes stores [x1, y1, x2, y2] in the (x, y, width, height) fields)
+        //![draw_boxes]
+        for (auto box : keep_boxes)
+        {
+            boxes.push_back(Rect(cvFloor(box.x), cvFloor(box.y), cvFloor(box.width - box.x), cvFloor(box.height - box.y)));
+        }
+
+        // map boxes from the blob (letterboxed) coordinates back to the original image
+        paramNet.blobRectsToImageRects(boxes, boxes, img.size());
+
+        for (size_t idx = 0; idx < boxes.size(); ++idx)
+        {
+            Rect box = boxes[idx];
+            drawPrediction(keep_classIds[idx], keep_confidences[idx], box.x, box.y,
+                           box.width + box.x, box.height + box.y, img);
+        }
+
+        const std::string kWinName = "Yolo Object Detector";
+        namedWindow(kWinName, WINDOW_NORMAL);
+        imshow(kWinName, img);
+        //![draw_boxes]
+
+        outs.clear();
+        keep_classIds.clear();
+        keep_confidences.clear();
+        keep_boxes.clear();
+        boxes.clear();
+
+        if (isImage)
+        {
+            waitKey();
+            break;
+        }
+    }
+}