From 83d70b0f36904e8906dc3f446fc093dac9e6a590 Mon Sep 17 00:00:00 2001
From: Chia-Hsiang Tsai <84863554+Tsai-chia-hsiang@users.noreply.github.com>
Date: Thu, 16 Nov 2023 18:40:00 +0800
Subject: [PATCH] Merge pull request #24396 from Tsai-chia-hsiang:yolov8cv

Using cv2 dnn interface to run yolov8 model #24396

This is sample code that uses the OpenCV DNN interface to run the Ultralytics YOLOv8 model for object detection.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [X] I agree to contribute to the project under Apache 2 License.
- [X] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [X] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
---
 samples/dnn/common.py           |  4 ++++
 samples/dnn/models.yml          | 18 +++++++++++++++++
 samples/dnn/object_detection.py | 36 ++++++++++++++++++++++-----------
 3 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/samples/dnn/common.py b/samples/dnn/common.py
index db9283b5d8..4765506eac 100644
--- a/samples/dnn/common.py
+++ b/samples/dnn/common.py
@@ -79,6 +79,10 @@ def add_preproc_args(zoo, parser, sample):
                  help='Indicate that model works with RGB input images instead BGR ones.')
     add_argument(zoo, parser, 'classes',
                  help='Optional path to a text file with names of classes to label detected objects.')
+    add_argument(zoo, parser, 'postprocessing', type=str,
+                 help='Post-processing kind depends on model topology.')
+    add_argument(zoo, parser, 'background_label_id', type=int, default=-1,
+                 help='An index of background class in predictions. If not negative, exclude such class from list of classes.')
 
 
 def findFile(filename):
diff --git a/samples/dnn/models.yml b/samples/dnn/models.yml
index 53d8b8048f..4d2774c71e 100644
--- a/samples/dnn/models.yml
+++ b/samples/dnn/models.yml
@@ -33,6 +33,7 @@ yolov4:
   height: 416
   rgb: true
   classes: "object_detection_classes_yolo.txt"
+  background_label_id: 0
   sample: "object_detection"
 
 yolov4-tiny:
@@ -47,6 +48,7 @@ yolov4-tiny:
   height: 416
   rgb: true
   classes: "object_detection_classes_yolo.txt"
+  background_label_id: 0
   sample: "object_detection"
 
 yolov3:
@@ -61,6 +63,7 @@ yolov3:
   height: 416
   rgb: true
   classes: "object_detection_classes_yolo.txt"
+  background_label_id: 0
   sample: "object_detection"
 
 tiny-yolo-voc:
@@ -75,6 +78,21 @@ tiny-yolo-voc:
   height: 416
   rgb: true
   classes: "object_detection_classes_pascal_voc.txt"
+  background_label_id: 0
+  sample: "object_detection"
+
+yolov8:
+  load_info:
+    url: "https://github.com/CVHub520/X-AnyLabeling/releases/download/v0.1.0/yolov8n.onnx"
+    sha1: "68f864475d06e2ec4037181052739f268eeac38d"
+  model: "yolov8n.onnx"
+  mean: [0, 0, 0]
+  scale: 0.00392
+  width: 640
+  height: 640
+  rgb: true
+  postprocessing: "yolov8"
+  classes: "object_detection_classes_yolo.txt"
   sample: "object_detection"
 
 # Caffe implementation of SSD model from https://github.com/chuanqi305/MobileNet-SSD
diff --git a/samples/dnn/object_detection.py b/samples/dnn/object_detection.py
index 0ca5586159..875ed3929f 100644
--- a/samples/dnn/object_detection.py
+++ b/samples/dnn/object_detection.py
@@ -2,6 +2,7 @@ import cv2 as cv
 import argparse
 import numpy as np
 import sys
+import copy
 import time
 from threading import Thread
 if sys.version_info[0] == 2:
@@ -27,7 +28,7 @@ parser.add_argument('--out_tf_graph', default='graph.pbtxt',
                     help='For models from TensorFlow Object Detection API, you may '
                          'pass a .config file which was used for training through --config '
                          'argument. This way an additional .pbtxt file with TensorFlow graph will be created.')
-parser.add_argument('--framework', choices=['caffe', 'tensorflow', 'torch', 'darknet', 'dldt'],
+parser.add_argument('--framework', choices=['caffe', 'tensorflow', 'torch', 'darknet', 'dldt', 'onnx'],
                     help='Optional name of an origin framework of the model. '
                          'Detect it automatically if it does not set.')
 parser.add_argument('--thr', type=float, default=0.5, help='Confidence threshold')
@@ -86,7 +87,7 @@ if args.classes:
         classes = f.read().rstrip('\n').split('\n')
 
 # Load a network
-net = cv.dnn.readNet(cv.samples.findFile(args.model), cv.samples.findFile(args.config), args.framework)
+net = cv.dnn.readNet(args.model, args.config, args.framework)
 net.setPreferableBackend(args.backend)
 net.setPreferableTarget(args.target)
 outNames = net.getUnconnectedOutLayersNames()
@@ -145,20 +146,32 @@ def postprocess(frame, outs):
                     classIds.append(int(detection[1]) - 1)  # Skip background label
                     confidences.append(float(confidence))
                     boxes.append([left, top, width, height])
-    elif lastLayer.type == 'Region':
+    elif lastLayer.type == 'Region' or args.postprocessing == 'yolov8':
         # Network produces output blob with a shape NxC where N is a number of
         # detected objects and C is a number of classes + 4 where the first 4
         # numbers are [center_x, center_y, width, height]
+        if args.postprocessing == 'yolov8':
+            box_scale_w = frameWidth / args.width
+            box_scale_h = frameHeight / args.height
+        else:
+            box_scale_w = frameWidth
+            box_scale_h = frameHeight
+
         for out in outs:
+            if args.postprocessing == 'yolov8':
+                out = out[0].transpose(1, 0)
+
             for detection in out:
-                scores = detection[5:]
+                scores = detection[4:]
+                if args.background_label_id >= 0:
+                    scores = np.delete(scores, args.background_label_id)
                 classId = np.argmax(scores)
                 confidence = scores[classId]
                 if confidence > confThreshold:
-                    center_x = int(detection[0] * frameWidth)
-                    center_y = int(detection[1] * frameHeight)
-                    width = int(detection[2] * frameWidth)
-                    height = int(detection[3] * frameHeight)
+                    center_x = int(detection[0] * box_scale_w)
+                    center_y = int(detection[1] * box_scale_h)
+                    width = int(detection[2] * box_scale_w)
+                    height = int(detection[3] * box_scale_h)
                     left = int(center_x - width / 2)
                     top = int(center_y - height / 2)
                     classIds.append(classId)
@@ -170,7 +183,7 @@ def postprocess(frame, outs):
 
     # NMS is used inside Region layer only on DNN_BACKEND_OPENCV for another backends we need NMS in sample
     # or NMS is required if number of outputs > 1
-    if len(outNames) > 1 or lastLayer.type == 'Region' and args.backend != cv.dnn.DNN_BACKEND_OPENCV:
+    if len(outNames) > 1 or (lastLayer.type == 'Region' or args.postprocessing == 'yolov8') and args.backend != cv.dnn.DNN_BACKEND_OPENCV:
         indices = []
         classIds = np.array(classIds)
         boxes = np.array(boxes)
@@ -181,7 +194,6 @@ def postprocess(frame, outs):
             conf = confidences[class_indices]
             box  = boxes[class_indices].tolist()
             nms_indices = cv.dnn.NMSBoxes(box, conf, confThreshold, nmsThreshold)
-            nms_indices = nms_indices[:, 0] if len(nms_indices) else []
             indices.extend(class_indices[nms_indices])
     else:
         indices = np.arange(0, len(classIds))
@@ -282,11 +294,11 @@ def processingThreadBody():
                 futureOutputs.append(net.forwardAsync())
             else:
                 outs = net.forward(outNames)
-                predictionsQueue.put(np.copy(outs))
+                predictionsQueue.put(copy.deepcopy(outs))
 
         while futureOutputs and futureOutputs[0].wait_for(0):
             out = futureOutputs[0].get()
-            predictionsQueue.put(np.copy([out]))
+            predictionsQueue.put(copy.deepcopy([out]))
 
             del futureOutputs[0]