From efbe580ff32163eaa4e2d76b245b9ea9e96e9038 Mon Sep 17 00:00:00 2001 From: Gursimar Singh Date: Mon, 2 Dec 2024 11:35:25 +0530 Subject: [PATCH] Merge pull request #26486 from gursimarsingh:object_detection_engine_update Code Fixes and changed post processing based on models.yml in Object Detection Sample #26486 ## Major Changes 1. Changes to add findModel support for config file in models like yolov4, yolov4-tiny, yolov3, ssd_caffe, tiny-yolo-voc, ssd_tf and faster_rcnn_tf. 2. Added new model and config download links for ssd_caffe, as previous links were not working. 3. Switched to DNN ENGINE_CLASSIC for non-cpu configurations as the new engine does not support them. 4. Fixes in python sample related to yolov5 usage. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. 
- [x] The feature is well documented and sample code can be built with the project CMake --- samples/dnn/common.hpp | 9 +++- samples/dnn/common.py | 7 ++- samples/dnn/models.yml | 18 +++++-- samples/dnn/object_detection.cpp | 65 +++++++++++------------- samples/dnn/object_detection.py | 85 +++++++++++++++++++------------- 5 files changed, 110 insertions(+), 74 deletions(-) diff --git a/samples/dnn/common.hpp b/samples/dnn/common.hpp index a01b1ca40f..4e07a74778 100644 --- a/samples/dnn/common.hpp +++ b/samples/dnn/common.hpp @@ -62,7 +62,12 @@ std::string genArgument(const std::string& argName, const std::string& help, FileNode value = node[argName]; if (argName.find("sha1") != std::string::npos) { std::string prefix = argName.substr(0, argName.find("sha1")); - value = node[prefix+"load_info"][argName]; + if (prefix == "config_"){ + value = node[prefix+"load_info"]["sha1"]; + } + else{ + value = node[prefix+"load_info"][argName]; + } } if (argName.find("download_sha") != std::string::npos) { std::string prefix = argName.substr(0, argName.find("download_sha")); @@ -183,6 +188,8 @@ std::string genPreprocArguments(const std::string& modelName, const std::string& modelName, zooFile)+ genArgument(prefix + "sha1", "Optional path to hashsum of downloaded model to be loaded from models.yml", modelName, zooFile)+ + genArgument(prefix + "config_sha1", "Optional path to hashsum of downloaded config to be loaded from models.yml", + modelName, zooFile)+ genArgument(prefix + "download_sha", "Optional path to hashsum of downloaded model to be loaded from models.yml", modelName, zooFile); } diff --git a/samples/dnn/common.py b/samples/dnn/common.py index 4c5cefc8fc..5e211a30ca 100644 --- a/samples/dnn/common.py +++ b/samples/dnn/common.py @@ -19,7 +19,10 @@ def add_argument(zoo, parser, name, help, required=False, default=None, type=Non if "sha1" in name: prefix = name.replace("sha1", "") value = node.getNode(prefix + "load_info") - value = value.getNode(name) + if prefix == 
"config_": + value = value.getNode("sha1") + else: + value = value.getNode(name) if "download_sha" in name: prefix = name.replace("download_sha", "") value = node.getNode(prefix + "load_info") @@ -97,6 +100,8 @@ def add_preproc_args(zoo, parser, sample, alias=None, prefix=""): help='An index of background class in predictions. If not negative, exclude such class from list of classes.', alias=alias) add_argument(zoo, parser, prefix+'sha1', type=str, help='Optional path to hashsum of downloaded model to be loaded from models.yml', alias=alias) + add_argument(zoo, parser, prefix+'config_sha1', type=str, + help='Optional path to hashsum of downloaded config to be loaded from models.yml', alias=alias) add_argument(zoo, parser, prefix+'download_sha', type=str, help='Optional path to hashsum of downloaded model to be loaded from models.yml', alias=alias) diff --git a/samples/dnn/models.yml b/samples/dnn/models.yml index 41fe7862b8..c7325d7cb3 100644 --- a/samples/dnn/models.yml +++ b/samples/dnn/models.yml @@ -89,8 +89,8 @@ yolov5l: width: 640 height: 640 rgb: true - classes: "object_detection_classes_yolo.txt" - background_label_id: 0 + labels: "object_detection_classes_yolo.txt" + postprocessing: "yolov5" sample: "object_detection" # YOLO4 object detection family from Darknet (https://github.com/AlexeyAB/darknet) @@ -112,6 +112,7 @@ yolov4: rgb: true labels: "object_detection_classes_yolo.txt" background_label_id: 0 + postprocessing: "darknet" sample: "object_detection" yolov4-tiny: @@ -130,6 +131,7 @@ yolov4-tiny: rgb: true labels: "object_detection_classes_yolo.txt" background_label_id: 0 + postprocessing: "darknet" sample: "object_detection" yolov3: @@ -148,6 +150,7 @@ yolov3: rgb: true labels: "object_detection_classes_yolo.txt" background_label_id: 0 + postprocessing: "darknet" sample: "object_detection" tiny-yolo-voc: @@ -166,14 +169,18 @@ tiny-yolo-voc: rgb: true labels: "object_detection_classes_pascal_voc.txt" background_label_id: 0 + postprocessing: "darknet" 
sample: "object_detection" -# Caffe implementation of SSD model from https://github.com/chuanqi305/MobileNet-SSD +# Caffe implementation of SSD model from https://github.com/PINTO0309/MobileNet-SSD-RealSense ssd_caffe: load_info: - url: "https://drive.google.com/uc?export=download&id=0B3gersZ2cHIxRm5PMWRoTkdHdHc" + url: "https://github.com/PINTO0309/MobileNet-SSD-RealSense/raw/refs/heads/master/caffemodel/MobileNetSSD/MobileNetSSD_deploy.caffemodel" sha1: "994d30a8afaa9e754d17d2373b2d62a7dfbaaf7a" model: "MobileNetSSD_deploy.caffemodel" + config_load_info: + url: "https://github.com/PINTO0309/MobileNet-SSD-RealSense/raw/refs/heads/master/caffemodel/MobileNetSSD/MobileNetSSD_deploy.prototxt" + sha1: "25c8404cecdef638c2bd9ac7f3b46a8b96897deb" config: "MobileNetSSD_deploy.prototxt" mean: [127.5, 127.5, 127.5] scale: 0.007843 @@ -181,6 +188,7 @@ ssd_caffe: height: 300 rgb: false labels: "object_detection_classes_pascal_voc.txt" + postprocessing: "ssd" sample: "object_detection" # TensorFlow implementation of SSD model from https://github.com/tensorflow/models/tree/master/research/object_detection @@ -202,6 +210,7 @@ ssd_tf: height: 300 rgb: true labels: "object_detection_classes_coco.txt" + postprocessing: "ssd" sample: "object_detection" # TensorFlow implementation of Faster-RCNN model from https://github.com/tensorflow/models/tree/master/research/object_detection @@ -222,6 +231,7 @@ faster_rcnn_tf: width: 800 height: 600 rgb: true + postprocessing: "ssd" sample: "object_detection" ################################################################################ diff --git a/samples/dnn/object_detection.cpp b/samples/dnn/object_detection.cpp index abbeada29a..d2be2f1eff 100644 --- a/samples/dnn/object_detection.cpp +++ b/samples/dnn/object_detection.cpp @@ -76,7 +76,7 @@ string modelName, framework; static void preprocess(const Mat& frame, Net& net, Size inpSize); -static void postprocess(Mat& frame, const vector& outs, Net& net, int backend, vector& classIds, vector& 
confidences, vector& boxes, const string yolo_name); +static void postprocess(Mat& frame, const vector& outs, Net& net, int backend, vector& classIds, vector& confidences, vector& boxes, const string postprocessing); static void drawPred(vector& classIds, vector& confidences, vector& boxes, Mat& frame, FontFace& sans, int stdSize, int stdWeight, int stdImgSize, int stdThickness); @@ -91,7 +91,7 @@ static void yoloPostProcessing( vector& keep_boxes, float conf_threshold, float iou_threshold, - const string& yolo_name); + const string& postprocessing); static void printAliases(string& zooFile){ vector aliases = findAliases(zooFile, "object_detection"); @@ -195,12 +195,13 @@ int main(int argc, char** argv) inpHeight = parser.get("height"); int async = parser.get("async"); paddingValue = parser.get("padvalue"); - const string yolo_name = parser.get("postprocessing"); + const string postprocessing = parser.get("postprocessing"); paddingMode = static_cast(parser.get("paddingmode")); //![preprocess_params] String sha1 = parser.get("sha1"); + String config_sha1 = parser.get("config_sha1"); const string modelPath = findModel(parser.get("model"), sha1); - const string configPath = findFile(parser.get("config")); + const string configPath = findModel(parser.get("config"), config_sha1); framework = modelPath.substr(modelPath.rfind('.') + 1); if (parser.has("labels")) @@ -216,7 +217,11 @@ int main(int argc, char** argv) } } //![read_net] - Net net = readNet(modelPath, configPath); + EngineType engine = ENGINE_AUTO; + if ((parser.get("backend") != "default") || (parser.get("target") != "cpu")){ + engine = ENGINE_CLASSIC; + } + Net net = readNet(modelPath, configPath, "", engine); int backend = getBackendID(parser.get("backend")); net.setPreferableBackend(backend); net.setPreferableTarget(getTargetID(parser.get("target"))); @@ -230,7 +235,7 @@ int main(int argc, char** argv) // Open a video file or an image file or a camera stream. 
VideoCapture cap; - bool openSuccess = parser.has("input") ? cap.open(parser.get("input")) : cap.open(parser.get("device")); + bool openSuccess = parser.has("input") ? cap.open(findFile(parser.get("input"))) : cap.open(parser.get("device")); if (!openSuccess){ cout << "Could not open input file or camera device" << endl; return 0; @@ -324,7 +329,7 @@ int main(int argc, char** argv) classIds.clear(); confidences.clear(); boxes.clear(); - postprocess(frame, outs, net, backend, classIds, confidences, boxes, yolo_name); + postprocess(frame, outs, net, backend, classIds, confidences, boxes, postprocessing); drawPred(classIds, confidences, boxes, frame, sans, stdSize, stdWeight, stdImgSize, stdThickness); @@ -354,7 +359,7 @@ int main(int argc, char** argv) CV_Error(Error::StsNotImplemented, "Asynchronous forward is supported only with Inference Engine backend."); // Threading is disabled, run synchronously Mat frame, blob; - while (waitKey(100) < 0) { + while (waitKey(1) < 0) { cap >> frame; if (frame.empty()) { waitKey(); @@ -369,7 +374,7 @@ int main(int argc, char** argv) confidences.clear(); boxes.clear(); - postprocess(frame, outs, net, backend, classIds, confidences, boxes, yolo_name); + postprocess(frame, outs, net, backend, classIds, confidences, boxes, postprocessing); drawPred(classIds, confidences, boxes, frame, sans, stdSize, stdWeight, stdImgSize, stdThickness); @@ -379,7 +384,7 @@ int main(int argc, char** argv) int weight = static_cast((stdWeight * imgWidth) / (stdImgSize * 1.5)); double freq = getTickFrequency() / 1000; double t = net.getPerfProfile(layersTimes) / freq; - string label = format("Inference time: %.2f ms", t); + string label = format("FPS: %.2f", 1000/t); putText(frame, label, Point(0, size), Scalar(0, 255, 0), sans, size, weight); imshow(kWinName, frame); } @@ -414,15 +419,6 @@ void preprocess(const Mat& frame, Net& net, Size inpSize) // Set the blob as the network input net.setInput(inp); - - // Check if the model is Faster-RCNN or R-FCN - 
if (net.getLayer(0)->outputNameToIndex("im_info") != -1) - { - // Resize the frame and prepare imInfo - resize(frame, frame, size); - Mat imInfo = (Mat_(1, 3) << size.height, size.width, 1.6f); - net.setInput(imInfo, "im_info"); - } } void yoloPostProcessing( @@ -432,7 +428,7 @@ void yoloPostProcessing( vector& keep_boxes, float conf_threshold, float iou_threshold, - const string& yolo_name) + const string& postprocessing) { // Retrieve vector classIds; @@ -441,12 +437,12 @@ void yoloPostProcessing( vector outs_copy = outs; - if (yolo_name == "yolov8") + if (postprocessing == "yolov8") { transposeND(outs_copy[0], {0, 2, 1}, outs_copy[0]); } - if (yolo_name == "yolonas") + if (postprocessing == "yolonas") { // outs contains 2 elements of shape [1, 8400, 80] and [1, 8400, 4]. Concat them to get [1, 8400, 84] Mat concat_out; @@ -467,16 +463,16 @@ void yoloPostProcessing( for (int i = 0; i < preds.rows; ++i) { // filter out non-object - float obj_conf = (yolo_name == "yolov8" || yolo_name == "yolonas") ? 1.0f : preds.at(i, 4); + float obj_conf = (postprocessing == "yolov8" || postprocessing == "yolonas") ? 1.0f : preds.at(i, 4); if (obj_conf < conf_threshold) continue; - Mat scores = preds.row(i).colRange((yolo_name == "yolov8" || yolo_name == "yolonas") ? 4 : 5, preds.cols); + Mat scores = preds.row(i).colRange((postprocessing == "yolov8" || postprocessing == "yolonas") ? 4 : 5, preds.cols); double conf; Point maxLoc; minMaxLoc(scores, 0, &conf, 0, &maxLoc); - conf = (yolo_name == "yolov8" || yolo_name == "yolonas") ? conf : conf * obj_conf; + conf = (postprocessing == "yolov8" || postprocessing == "yolonas") ? 
conf : conf * obj_conf; if (conf < conf_threshold) continue; @@ -488,7 +484,7 @@ void yoloPostProcessing( double h = det[3]; // [x1, y1, x2, y2] - if (yolo_name == "yolonas") { + if (postprocessing == "yolonas") { boxes.push_back(Rect2d(cx, cy, w, h)); } else { boxes.push_back(Rect2d(cx - 0.5 * w, cy - 0.5 * h, @@ -511,12 +507,10 @@ void yoloPostProcessing( } } -void postprocess(Mat& frame, const vector& outs, Net& net, int backend, vector& classIds, vector& confidences, vector& boxes, const string yolo_name) +void postprocess(Mat& frame, const vector& outs, Net& net, int backend, vector& classIds, vector& confidences, vector& boxes, const string postprocessing) { static vector outLayers = net.getUnconnectedOutLayers(); - static string outLayerType = net.getLayer(outLayers[0])->type; - - if (outLayerType == "DetectionOutput") + if (postprocessing == "ssd") { // Network produces output blob with a shape 1x1xNx7 where N is a number of // detections and an every detection is a vector of values @@ -552,7 +546,7 @@ void postprocess(Mat& frame, const vector& outs, Net& net, int backend, vec } } } - else if (outLayerType == "Region") + else if (postprocessing == "darknet") { for (size_t i = 0; i < outs.size(); ++i) { @@ -582,7 +576,7 @@ void postprocess(Mat& frame, const vector& outs, Net& net, int backend, vec } } } - else if (outLayerType == "Identity") + else if (postprocessing == "yolov8" || postprocessing == "yolov5") { //![forward_buffers] vector keep_classIds; @@ -591,7 +585,7 @@ void postprocess(Mat& frame, const vector& outs, Net& net, int backend, vec //![forward_buffers] //![postprocess] - yoloPostProcessing(outs, keep_classIds, keep_confidences, keep_boxes, confThreshold, nmsThreshold, yolo_name); + yoloPostProcessing(outs, keep_classIds, keep_confidences, keep_boxes, confThreshold, nmsThreshold, postprocessing); //![postprocess] for (size_t i = 0; i < keep_classIds.size(); ++i) @@ -614,12 +608,13 @@ void postprocess(Mat& frame, const vector& outs, Net& net, 
int backend, vec } else { - CV_Error(Error::StsNotImplemented, "Unknown output layer type: " + outLayerType); + cout<< ("Unknown postprocessing method: " + postprocessing)< 1 - if (outLayers.size() > 1 || (outLayerType == "Region" && backend != DNN_BACKEND_OPENCV)) + if (outLayers.size() > 1 || (postprocessing == "darknet" && backend != DNN_BACKEND_OPENCV)) { map > class2indices; for (size_t i = 0; i < classIds.size(); i++) diff --git a/samples/dnn/object_detection.py b/samples/dnn/object_detection.py index f2171a5716..9d4f47ae8a 100644 --- a/samples/dnn/object_detection.py +++ b/samples/dnn/object_detection.py @@ -37,9 +37,6 @@ parser.add_argument('--out_tf_graph', default='graph.pbtxt', help='For models from TensorFlow Object Detection API, you may ' 'pass a .config file which was used for training through --config ' 'argument. This way an additional .pbtxt file with TensorFlow graph will be created.') -parser.add_argument('--framework', choices=['caffe', 'tensorflow', 'darknet', 'dldt', 'onnx'], - help='Optional name of an origin framework of the model. ' - 'Detect it automatically if it does not set.') parser.add_argument('--thr', type=float, default=0.5, help='Confidence threshold') parser.add_argument('--nms', type=float, default=0.4, help='Non-maximum suppression threshold') parser.add_argument('--backend', default="default", type=str, choices=backends, @@ -76,8 +73,9 @@ if args.alias is None or hasattr(args, 'help'): args.model = findModel(args.model, args.sha1) if args.config is not None: - args.config = findFile(args.config) -args.labels = findFile(args.labels) + args.config = findModel(args.config, args.config_sha1) +if args.labels is not None: + args.labels = findFile(args.labels) # If config specified, try to load it as TensorFlow Object Detection API's pipeline. 
config = readTextMessage(args.config) @@ -100,7 +98,10 @@ if args.labels: labels = f.read().rstrip('\n').split('\n') # Load a network -net = cv.dnn.readNet(args.model, args.config, args.framework) +engine = cv.dnn.ENGINE_AUTO +if args.backend != "default" or args.target != "cpu": + engine = cv.dnn.ENGINE_CLASSIC +net = cv.dnn.readNet(args.model, args.config, "", engine) net.setPreferableBackend(get_backend_id(args.backend)) net.setPreferableTarget(get_target_id(args.target)) outNames = net.getUnconnectedOutLayersNames() @@ -126,14 +127,10 @@ def postprocess(frame, outs): frameHeight = frame.shape[0] frameWidth = frame.shape[1] - layerNames = net.getLayerNames() - lastLayerId = net.getLayerId(layerNames[-1]) - lastLayer = net.getLayer(lastLayerId) - classIds = [] confidences = [] boxes = [] - if lastLayer.type == 'DetectionOutput': + if args.postprocessing == 'ssd': # Network produces output blob with a shape 1x1xNx7 where N is a number of # detections and an every detection is a vector of values # [batchId, classId, confidence, left, top, right, bottom] @@ -157,21 +154,12 @@ def postprocess(frame, outs): classIds.append(int(detection[1]) - 1) # Skip background label confidences.append(float(confidence)) boxes.append([left, top, width, height]) - elif lastLayer.type == 'Region' or args.postprocessing == 'yolov8': - # Network produces output blob with a shape NxC where N is a number of - # detected objects and C is a number of classes + 4 where the first 4 - # numbers are [center_x, center_y, width, height] - if args.postprocessing == 'yolov8': - box_scale_w = frameWidth / args.width - box_scale_h = frameHeight / args.height - else: - box_scale_w = frameWidth - box_scale_h = frameHeight + + elif args.postprocessing == 'darknet': + box_scale_w = frameWidth + box_scale_h = frameHeight for out in outs: - if args.postprocessing == 'yolov8': - out = out[0].transpose(1, 0) - for detection in out: scores = detection[4:] if args.background_label_id >= 0: @@ -188,13 +176,47 
@@ def postprocess(frame, outs): classIds.append(classId) confidences.append(float(confidence)) boxes.append([left, top, width, height]) + + elif args.postprocessing == 'yolov8' or args.postprocessing == 'yolov5': + # Network produces output blob with a shape NxC where N is a number of + # detected objects and C is a number of classes + 4 where the first 4 + # numbers are [center_x, center_y, width, height] + box_scale_w = frameWidth / args.width + box_scale_h = frameHeight / args.height + + for out in outs: + if args.postprocessing == 'yolov8': + out = out[0].transpose(1, 0) + else: # YOLOv5, no transposition needed + out = out[0] + + for detection in out: + if args.postprocessing == 'yolov8': + scores = detection[4:] + obj_conf = 1 + else: + scores = detection[5:] + obj_conf = detection[4] + + classId = np.argmax(scores) + confidence = scores[classId]*obj_conf + if confidence > confThreshold: + center_x = int(detection[0] * box_scale_w) + center_y = int(detection[1] * box_scale_h) + width = int(detection[2] * box_scale_w) + height = int(detection[3] * box_scale_h) + left = int(center_x - width / 2) + top = int(center_y - height / 2) + classIds.append(classId) + confidences.append(float(confidence)) + boxes.append([left, top, width, height]) else: - print('Unknown output layer type: ' + lastLayer.type) + print('Unknown postprocessing method: ' + args.postprocessing) exit() # NMS is used inside Region layer only on DNN_BACKEND_OPENCV for another backends we need NMS in sample # or NMS is required if number of outputs > 1 - if len(outNames) > 1 or (lastLayer.type == 'Region' or args.postprocessing == 'yolov8') and args.backend != cv.dnn.DNN_BACKEND_OPENCV: + if len(outNames) > 1 or (args.postprocessing == 'darknet' or args.postprocessing == 'yolov8' or args.postprocessing == 'yolov5') and args.backend != cv.dnn.DNN_BACKEND_OPENCV: indices = [] classIds = np.array(classIds) boxes = np.array(boxes) @@ -308,14 +330,11 @@ def processingThreadBody(): # Create a 4D blob 
from a frame. inpWidth = args.width if args.width else frameWidth inpHeight = args.height if args.height else frameHeight - blob = cv.dnn.blobFromImage(frame, size=(inpWidth, inpHeight), swapRB=args.rgb, ddepth=cv.CV_32F) + blob = cv.dnn.blobFromImage(frame, scalefactor=args.scale, mean=args.mean, size=(inpWidth, inpHeight), swapRB=args.rgb, ddepth=cv.CV_32F) processedFramesQueue.put(frame) # Run a model - net.setInput(blob, scalefactor=args.scale, mean=args.mean) - if net.getLayer(0).outputNameToIndex('im_info') != -1: # Faster-RCNN or R-FCN - frame = cv.resize(frame, (inpWidth, inpHeight)) - net.setInput(np.array([[inpHeight, inpWidth, 1.6]], dtype=np.float32), 'im_info') + net.setInput(blob) if asyncN: futureOutputs.append(net.forwardAsync()) @@ -385,9 +404,9 @@ else: inpWidth = args.width if args.width else frameWidth inpHeight = args.height if args.height else frameHeight - blob = cv.dnn.blobFromImage(frame, size=(inpWidth, inpHeight), swapRB=args.rgb, ddepth=cv.CV_32F) + blob = cv.dnn.blobFromImage(frame, scalefactor=args.scale, mean=args.mean, size=(inpWidth, inpHeight), swapRB=args.rgb, ddepth=cv.CV_32F) - net.setInput(blob, scalefactor=args.scale, mean=args.mean) + net.setInput(blob) outs = net.forward(outNames) boxes, classIds, confidences, indices = postprocess(frame, outs)