From 9d37cdaa6605d4b1512666fde7737e2bd80d839d Mon Sep 17 00:00:00 2001
From: Daniel Cauchi <33454325+CowKeyMan@users.noreply.github.com>
Date: Tue, 1 Dec 2020 14:50:24 +0100
Subject: [PATCH] Merge pull request #18891 from
 CowKeyMan:NMS_boxes_with_different_labels

Add option for NMS for boxes with different labels

* DetectionModel impl

* Add option for NMS for boxes with different labels

In the detect function in modules/dnn/include/opencv2/dnn/dnn.hpp, whose implementation can be found at modules/dnn/src/model.cpp, the Non Max Suppression (NMS) is applied only for objects of the same label. Thus, a flag
was added with the purpose to allow developers to choose if they want to keep the default implementation or wether they would like NMS to be applied to all the boxes, regardless of label.

The flag is called nmsDifferentLabels, and is given a default value of false, which applies the current default implementation, thus allowing existing projects to update opencv without disruption

Solves issue opencv#18832

* Change return type of set & Add default constr

* Add assertions due to default constructor
---
 modules/dnn/include/opencv2/dnn/dnn.hpp |  17 ++++
 modules/dnn/src/model.cpp               | 126 +++++++++++++++++-------
 modules/dnn/test/test_model.cpp         |  57 ++++++++++-
 3 files changed, 165 insertions(+), 35 deletions(-)
diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp
index 69b71f90ce..5467c989ac 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -1296,6 +1296,23 @@ CV__DNN_INLINE_NS_BEGIN
           */
          CV_WRAP DetectionModel(const Net& network);
 
+         CV_DEPRECATED_EXTERNAL  // avoid using in C++ code (need to fix bindings first)
+         DetectionModel();
+
+         /**
+          * @brief nmsAcrossClasses defaults to false,
+          * such that when non max suppression is used during the detect() function, it will do so per-class.
+          * This function allows you to toggle this behaviour.
+          * @param[in] value The new value for nmsAcrossClasses
+          */
+         CV_WRAP DetectionModel& setNmsAcrossClasses(bool value);
+
+         /**
+          * @brief Getter for nmsAcrossClasses. This variable defaults to false,
+          * such that when non max suppression is used during the detect() function, it will do so only per-class
+          */
+         CV_WRAP bool getNmsAcrossClasses();
+
          /** @brief Given the @p input frame, create input blob, run net and return result detections.
           *  @param[in]  frame  The input image.
           *  @param[out] classIds Class indexes in result detection.
diff --git a/modules/dnn/src/model.cpp b/modules/dnn/src/model.cpp
index aefeaa42b3..16f7d31a25 100644
--- a/modules/dnn/src/model.cpp
+++ b/modules/dnn/src/model.cpp
@@ -320,34 +320,78 @@ void SegmentationModel::segment(InputArray frame, OutputArray mask)
     }
 }
 
-void disableRegionNMS(Net& net)
+class DetectionModel_Impl : public Model::Impl
 {
-    for (String& name : net.getUnconnectedOutLayersNames())
+public:
+    virtual ~DetectionModel_Impl() {}
+    DetectionModel_Impl() : Impl() {}
+    DetectionModel_Impl(const DetectionModel_Impl&) = delete;
+    DetectionModel_Impl(DetectionModel_Impl&&) = delete;
+
+    void disableRegionNMS(Net& net)
     {
-        int layerId = net.getLayerId(name);
-        Ptr<RegionLayer> layer = net.getLayer(layerId).dynamicCast<RegionLayer>();
-        if (!layer.empty())
+        for (String& name : net.getUnconnectedOutLayersNames())
         {
-            layer->nmsThreshold = 0;
+            int layerId = net.getLayerId(name);
+            Ptr<RegionLayer> layer = net.getLayer(layerId).dynamicCast<RegionLayer>();
+            if (!layer.empty())
+            {
+                layer->nmsThreshold = 0;
+            }
         }
     }
-}
+
+    void setNmsAcrossClasses(bool value) {
+        nmsAcrossClasses = value;
+    }
+
+    bool getNmsAcrossClasses() {
+        return nmsAcrossClasses;
+    }
+
+private:
+    bool nmsAcrossClasses = false;
+};
 
 DetectionModel::DetectionModel(const String& model, const String& config)
-    : Model(model, config)
+    : DetectionModel(readNet(model, config))
 {
-    disableRegionNMS(getNetwork_());  // FIXIT Move to DetectionModel::Impl::initNet()
+    // nothing
 }
 
-DetectionModel::DetectionModel(const Net& network) : Model(network)
+DetectionModel::DetectionModel(const Net& network) : Model()
 {
-    disableRegionNMS(getNetwork_());  // FIXIT Move to DetectionModel::Impl::initNet()
+    impl = makePtr<DetectionModel_Impl>();
+    impl->initNet(network);
+    impl.dynamicCast<DetectionModel_Impl>()->disableRegionNMS(getNetwork_());  // FIXIT Move to DetectionModel::Impl::initNet()
+}
+
+DetectionModel::DetectionModel() : Model()
+{
+    // nothing
+}
+
+DetectionModel& DetectionModel::setNmsAcrossClasses(bool value)
+{
+    CV_Assert(impl != nullptr && impl.dynamicCast<DetectionModel_Impl>() != nullptr); // remove once default constructor is removed
+
+    impl.dynamicCast<DetectionModel_Impl>()->setNmsAcrossClasses(value);
+    return *this;
+}
+
+bool DetectionModel::getNmsAcrossClasses()
+{
+    CV_Assert(impl != nullptr && impl.dynamicCast<DetectionModel_Impl>() != nullptr); // remove once default constructor is removed
+
+    return impl.dynamicCast<DetectionModel_Impl>()->getNmsAcrossClasses();
 }
 
 void DetectionModel::detect(InputArray frame, CV_OUT std::vector<int>& classIds,
                             CV_OUT std::vector<float>& confidences, CV_OUT std::vector<Rect>& boxes,
                             float confThreshold, float nmsThreshold)
 {
+    CV_Assert(impl != nullptr && impl.dynamicCast<DetectionModel_Impl>() != nullptr); // remove once default constructor is removed
+
     std::vector<Mat> detections;
     impl->processFrame(frame, detections);
 
@@ -413,7 +457,7 @@ void DetectionModel::detect(InputArray frame, CV_OUT std::vector<int>& classIds,
     {
         std::vector<int> predClassIds;
         std::vector<Rect> predBoxes;
-        std::vector<float> predConf;
+        std::vector<float> predConfidences;
         for (int i = 0; i < detections.size(); ++i)
         {
             // Network produces output blob with a shape NxC where N is a number of
@@ -442,37 +486,51 @@ void DetectionModel::detect(InputArray frame, CV_OUT std::vector<int>& classIds,
                 height   = std::max(1, std::min(height, frameHeight - top));
 
                 predClassIds.push_back(classIdPoint.x);
-                predConf.push_back(static_cast<float>(conf));
+                predConfidences.push_back(static_cast<float>(conf));
                 predBoxes.emplace_back(left, top, width, height);
             }
         }
 
         if (nmsThreshold)
         {
-            std::map<int, std::vector<size_t> > class2indices;
-            for (size_t i = 0; i < predClassIds.size(); i++)
+            if (getNmsAcrossClasses())
             {
-                if (predConf[i] >= confThreshold)
-                {
-                    class2indices[predClassIds[i]].push_back(i);
-                }
-            }
-            for (const auto& it : class2indices)
-            {
-                std::vector<Rect> localBoxes;
-                std::vector<float> localConfidences;
-                for (size_t idx : it.second)
-                {
-                    localBoxes.push_back(predBoxes[idx]);
-                    localConfidences.push_back(predConf[idx]);
-                }
                 std::vector<int> indices;
-                NMSBoxes(localBoxes, localConfidences, confThreshold, nmsThreshold, indices);
-                classIds.resize(classIds.size() + indices.size(), it.first);
+                NMSBoxes(predBoxes, predConfidences, confThreshold, nmsThreshold, indices);
                 for (int idx : indices)
                 {
-                    boxes.push_back(localBoxes[idx]);
-                    confidences.push_back(localConfidences[idx]);
+                    boxes.push_back(predBoxes[idx]);
+                    confidences.push_back(predConfidences[idx]);
+                    classIds.push_back(predClassIds[idx]);
+                }
+            }
+            else
+            {
+                std::map<int, std::vector<size_t> > class2indices;
+                for (size_t i = 0; i < predClassIds.size(); i++)
+                {
+                    if (predConfidences[i] >= confThreshold)
+                    {
+                        class2indices[predClassIds[i]].push_back(i);
+                    }
+                }
+                for (const auto& it : class2indices)
+                {
+                    std::vector<Rect> localBoxes;
+                    std::vector<float> localConfidences;
+                    for (size_t idx : it.second)
+                    {
+                        localBoxes.push_back(predBoxes[idx]);
+                        localConfidences.push_back(predConfidences[idx]);
+                    }
+                    std::vector<int> indices;
+                    NMSBoxes(localBoxes, localConfidences, confThreshold, nmsThreshold, indices);
+                    classIds.resize(classIds.size() + indices.size(), it.first);
+                    for (int idx : indices)
+                    {
+                        boxes.push_back(localBoxes[idx]);
+                        confidences.push_back(localConfidences[idx]);
+                    }
                 }
             }
         }
@@ -480,7 +538,7 @@ void DetectionModel::detect(InputArray frame, CV_OUT std::vector<int>& classIds,
         {
             boxes       = std::move(predBoxes);
             classIds    = std::move(predClassIds);
-            confidences = std::move(predConf);
+            confidences = std::move(predConfidences);
         }
     }
     else
diff --git a/modules/dnn/test/test_model.cpp b/modules/dnn/test/test_model.cpp
index 7d516de73e..58a881488a 100644
--- a/modules/dnn/test/test_model.cpp
+++ b/modules/dnn/test/test_model.cpp
@@ -25,7 +25,8 @@ public:
                          double scoreDiff, double iouDiff,
                          double confThreshold = 0.24, double nmsThreshold = 0.0,
                          const Size& size = {-1, -1}, Scalar mean = Scalar(),
-                         double scale = 1.0, bool swapRB = false, bool crop = false)
+                         double scale = 1.0, bool swapRB = false, bool crop = false,
+                         bool nmsAcrossClasses = false)
     {
         checkBackend();
 
@@ -38,6 +39,8 @@ public:
         model.setPreferableBackend(backend);
         model.setPreferableTarget(target);
 
+        model.setNmsAcrossClasses(nmsAcrossClasses);
+
         std::vector<int> classIds;
         std::vector<float> confidences;
         std::vector<Rect> boxes;
@@ -177,6 +180,58 @@ TEST_P(Test_Model, DetectRegion)
                     Scalar(), scale, swapRB);
 }
 
+TEST_P(Test_Model, DetectRegionWithNmsAcrossClasses)
+{
+    applyTestTag(CV_TEST_TAG_LONG, CV_TEST_TAG_MEMORY_1GB);
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000)  // nGraph compilation failure
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16);
+#endif
+
+#if defined(INF_ENGINE_RELEASE)
+    if (target == DNN_TARGET_MYRIAD
+        && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
+#endif
+
+    std::vector<int> refClassIds = { 6, 11 };
+    std::vector<float> refConfidences = { 0.750469f, 0.901615f };
+    std::vector<Rect2d> refBoxes = { Rect2d(240, 53, 135, 72),
+                                    Rect2d(58, 141, 117, 249) };
+
+    std::string img_path = _tf("dog416.png");
+    std::string weights_file = _tf("yolo-voc.weights", false);
+    std::string config_file = _tf("yolo-voc.cfg");
+
+    double scale = 1.0 / 255.0;
+    Size size{ 416, 416 };
+    bool swapRB = true;
+    bool crop = false;
+    bool nmsAcrossClasses = true;
+
+    double confThreshold = 0.24;
+    double nmsThreshold = (target == DNN_TARGET_MYRIAD) ? 0.15: 0.15;
+    double scoreDiff = 8e-5, iouDiff = 1e-5;
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16)
+    {
+        scoreDiff = 1e-2;
+        iouDiff = 1.6e-2;
+    }
+
+    testDetectModel(weights_file, config_file, img_path, refClassIds, refConfidences,
+        refBoxes, scoreDiff, iouDiff, confThreshold, nmsThreshold, size,
+        Scalar(), scale, swapRB, crop,
+        nmsAcrossClasses);
+}
+
 TEST_P(Test_Model, DetectionOutput)
 {
 #if defined(INF_ENGINE_RELEASE)