upgrade FaceDetectorYN to v2

2025-08-06 06:26:29 +08:00 · 2022-12-23 11:12:43 +08:00 · 2022-12-23 11:12:43 +08:00 · da3a4dcbc1
commit da3a4dcbc1
parent c4226f0457
2 changed files with 91 additions and 132 deletions
--- a/modules/objdetect/src/face_detect.cpp
+++ b/modules/objdetect/src/face_detect.cpp
@ -6,6 +6,7 @@

 #include "opencv2/imgproc.hpp"
 #include "opencv2/core.hpp"
+
 #ifdef HAVE_OPENCV_DNN
 #include "opencv2/dnn.hpp"
 #endif
@ -27,6 +28,8 @@ public:
                       int top_k,
                       int backend_id,
                       int target_id)
+                       :divisor(32),
+                       strides({8, 16, 32})
    {
        net = dnn::readNet(model, config);
        CV_Assert(!net.empty());
@ -37,18 +40,20 @@ public:
        inputW = input_size.width;
        inputH = input_size.height;

+        padW = (int((inputW - 1) / divisor) + 1) * divisor;
+        padH = (int((inputH - 1) / divisor) + 1) * divisor;
+
        scoreThreshold = score_threshold;
        nmsThreshold = nms_threshold;
        topK = top_k;
-
-        generatePriors();
    }

    void setInputSize(const Size& input_size) override
    {
        inputW = input_size.width;
        inputH = input_size.height;
-        generatePriors();
+        padW = ((inputW - 1) / divisor + 1) * divisor;  // inputW rounded up to the nearest multiple of divisor (for inputW >= 1)
+        padH = ((inputH - 1) / divisor + 1) * divisor;  // inputH rounded up to the nearest multiple of divisor (for inputH >= 1)
    }

    Size getInputSize() override
@ -97,12 +102,14 @@ public:
            return 0;
        }
        CV_CheckEQ(input_image.size(), Size(inputW, inputH), "Size does not match. Call setInputSize(size) if input size does not match the preset size");
+        // Zero-pad input_image on the bottom and right so both dimensions are multiples of divisor (32)
+        Mat pad_image = padWithDivisor(input_image);

        // Build blob from input image
-        Mat input_blob = dnn::blobFromImage(input_image);
+        Mat input_blob = dnn::blobFromImage(pad_image);

        // Forward
-        std::vector<String> output_names = { "loc", "conf", "iou" };
+        std::vector<String> output_names = { "cls_8", "cls_16", "cls_32", "obj_8", "obj_16", "obj_32", "bbox_8", "bbox_16", "bbox_32", "kps_8", "kps_16", "kps_32" };
        std::vector<Mat> output_blobs;
        net.setInput(input_blob);
        net.forward(output_blobs, output_names);
@ -113,126 +120,70 @@ public:
        return 1;
    }
 private:
-    void generatePriors()
-    {
-        // Calculate shapes of different scales according to the shape of input image
-        Size feature_map_2nd = {
-            int(int((inputW+1)/2)/2), int(int((inputH+1)/2)/2)
-        };
-        Size feature_map_3rd = {
-            int(feature_map_2nd.width/2), int(feature_map_2nd.height/2)
-        };
-        Size feature_map_4th = {
-            int(feature_map_3rd.width/2), int(feature_map_3rd.height/2)
-        };
-        Size feature_map_5th = {
-            int(feature_map_4th.width/2), int(feature_map_4th.height/2)
-        };
-        Size feature_map_6th = {
-            int(feature_map_5th.width/2), int(feature_map_5th.height/2)
-        };
-
-        std::vector<Size> feature_map_sizes;
-        feature_map_sizes.push_back(feature_map_3rd);
-        feature_map_sizes.push_back(feature_map_4th);
-        feature_map_sizes.push_back(feature_map_5th);
-        feature_map_sizes.push_back(feature_map_6th);
-
-        // Fixed params for generating priors
-        const std::vector<std::vector<float>> min_sizes = {
-            {10.0f,  16.0f,  24.0f},
-            {32.0f,  48.0f},
-            {64.0f,  96.0f},
-            {128.0f, 192.0f, 256.0f}
-        };
-        CV_Assert(min_sizes.size() == feature_map_sizes.size()); // just to keep vectors in sync
-        const std::vector<int> steps = { 8, 16, 32, 64 };
-
-        // Generate priors
-        priors.clear();
-        for (size_t i = 0; i < feature_map_sizes.size(); ++i)
-        {
-            Size feature_map_size = feature_map_sizes[i];
-            std::vector<float> min_size = min_sizes[i];
-
-            for (int _h = 0; _h < feature_map_size.height; ++_h)
-            {
-                for (int _w = 0; _w < feature_map_size.width; ++_w)
-                {
-                    for (size_t j = 0; j < min_size.size(); ++j)
-                    {
-                        float s_kx = min_size[j] / inputW;
-                        float s_ky = min_size[j] / inputH;
-
-                        float cx = (_w + 0.5f) * steps[i] / inputW;
-                        float cy = (_h + 0.5f) * steps[i] / inputH;
-
-                        Rect2f prior = { cx, cy, s_kx, s_ky };
-                        priors.push_back(prior);
-                    }
-                }
-            }
-        }
-    }
-
    Mat postProcess(const std::vector<Mat>& output_blobs)
    {
-        // Extract from output_blobs
-        Mat loc = output_blobs[0];
-        Mat conf = output_blobs[1];
-        Mat iou = output_blobs[2];
-
-        // Decode from deltas and priors
-        const std::vector<float> variance = {0.1f, 0.2f};
-        float* loc_v = (float*)(loc.data);
-        float* conf_v = (float*)(conf.data);
-        float* iou_v = (float*)(iou.data);
        Mat faces;
-        // (tl_x, tl_y, w, h, re_x, re_y, le_x, le_y, nt_x, nt_y, rcm_x, rcm_y, lcm_x, lcm_y, score)
-        // 'tl': top left point of the bounding box
-        // 're': right eye, 'le': left eye
-        // 'nt':  nose tip
-        // 'rcm': right corner of mouth, 'lcm': left corner of mouth
-        Mat face(1, 15, CV_32FC1);
-        for (size_t i = 0; i < priors.size(); ++i) {
-            // Get score
-            float clsScore = conf_v[i*2+1];
-            float iouScore = iou_v[i];
-            // Clamp
-            if (iouScore < 0.f) {
-                iouScore = 0.f;
+        for (size_t i = 0; i < strides.size(); ++i) {
+            int cols = int(padW / strides[i]);
+            int rows = int(padH / strides[i]);
+
+            // Extract this stride's blobs; assumes output_blobs holds all cls, then all obj, bbox and kps blobs (one per stride)
+            Mat cls = output_blobs[i];
+            Mat obj = output_blobs[i + strides.size() * 1];
+            Mat bbox = output_blobs[i + strides.size() * 2];
+            Mat kps = output_blobs[i + strides.size() * 3];
+
+            // Get raw float pointers to the predictions (assumes continuous CV_32F blobs)
+            float* cls_v = (float*)(cls.data);
+            float* obj_v = (float*)(obj.data);
+            float* bbox_v = (float*)(bbox.data);
+            float* kps_v = (float*)(kps.data);
+
+            // (tl_x, tl_y, w, h, re_x, re_y, le_x, le_y, nt_x, nt_y, rcm_x, rcm_y, lcm_x, lcm_y, score)
+            // 'tl': top left point of the bounding box
+            // 're': right eye, 'le': left eye
+            // 'nt':  nose tip
+            // 'rcm': right corner of mouth, 'lcm': left corner of mouth
+            Mat face(1, 15, CV_32FC1);
+
+            for(int r = 0; r < rows; ++r) {
+                for(int c = 0; c < cols; ++c) {
+                    size_t idx = r * cols + c;
+
+                    // Get score
+                    float cls_score = cls_v[idx];
+                    float obj_score = obj_v[idx];
+
+                    // Clamp both scores to [0, 1] before taking their geometric mean
+                    cls_score = MIN(cls_score, 1.f);
+                    cls_score = MAX(cls_score, 0.f);
+                    obj_score = MIN(obj_score, 1.f);
+                    obj_score = MAX(obj_score, 0.f);
+                    float score = std::sqrt(cls_score * obj_score);
+                    face.at<float>(0, 14) = score;
+
+                    // Decode bounding box: center = (cell index + offset) * stride, size = exp(prediction) * stride
+                    float cx = ((c + bbox_v[idx * 4 + 0]) * strides[i]);
+                    float cy = ((r + bbox_v[idx * 4 + 1]) * strides[i]);
+                    float w = exp(bbox_v[idx * 4 + 2]) * strides[i];
+                    float h = exp(bbox_v[idx * 4 + 3]) * strides[i];
+
+                    float x1 = cx - w / 2.f;
+                    float y1 = cy - h / 2.f;
+
+                    face.at<float>(0, 0) = x1;
+                    face.at<float>(0, 1) = y1;
+                    face.at<float>(0, 2) = w;
+                    face.at<float>(0, 3) = h;
+
+                    // Decode the 5 landmarks: each coordinate = (cell index + offset) * stride
+                    for(int n = 0; n < 5; ++n) {
+                        face.at<float>(0, 4 + 2 * n) = (kps_v[idx * 10 + 2 * n] + c) * strides[i];
+                        face.at<float>(0, 4 + 2 * n + 1) = (kps_v[idx * 10 + 2 * n + 1]+ r) * strides[i];
+                    }
+                    faces.push_back(face);
+                }
            }
-            else if (iouScore > 1.f) {
-                iouScore = 1.f;
-            }
-            float score = std::sqrt(clsScore * iouScore);
-            face.at<float>(0, 14) = score;
-
-            // Get bounding box
-            float cx = (priors[i].x + loc_v[i*14+0] * variance[0] * priors[i].width)  * inputW;
-            float cy = (priors[i].y + loc_v[i*14+1] * variance[0] * priors[i].height) * inputH;
-            float w  = priors[i].width  * exp(loc_v[i*14+2] * variance[0]) * inputW;
-            float h  = priors[i].height * exp(loc_v[i*14+3] * variance[1]) * inputH;
-            float x1 = cx - w / 2;
-            float y1 = cy - h / 2;
-            face.at<float>(0, 0) = x1;
-            face.at<float>(0, 1) = y1;
-            face.at<float>(0, 2) = w;
-            face.at<float>(0, 3) = h;
-
-            // Get landmarks
-            face.at<float>(0, 4) = (priors[i].x + loc_v[i*14+ 4] * variance[0] * priors[i].width)  * inputW;  // right eye, x
-            face.at<float>(0, 5) = (priors[i].y + loc_v[i*14+ 5] * variance[0] * priors[i].height) * inputH;  // right eye, y
-            face.at<float>(0, 6) = (priors[i].x + loc_v[i*14+ 6] * variance[0] * priors[i].width)  * inputW;  // left eye, x
-            face.at<float>(0, 7) = (priors[i].y + loc_v[i*14+ 7] * variance[0] * priors[i].height) * inputH;  // left eye, y
-            face.at<float>(0, 8) = (priors[i].x + loc_v[i*14+ 8] * variance[0] * priors[i].width)  * inputW;  // nose tip, x
-            face.at<float>(0, 9) = (priors[i].y + loc_v[i*14+ 9] * variance[0] * priors[i].height) * inputH;  // nose tip, y
-            face.at<float>(0, 10) = (priors[i].x + loc_v[i*14+10] * variance[0] * priors[i].width)  * inputW; // right corner of mouth, x
-            face.at<float>(0, 11) = (priors[i].y + loc_v[i*14+11] * variance[0] * priors[i].height) * inputH; // right corner of mouth, y
-            face.at<float>(0, 12) = (priors[i].x + loc_v[i*14+12] * variance[0] * priors[i].width)  * inputW; // left corner of mouth, x
-            face.at<float>(0, 13) = (priors[i].y + loc_v[i*14+13] * variance[0] * priors[i].height) * inputH; // left corner of mouth, y
-
-            faces.push_back(face);
        }

        if (faces.rows > 1)
@ -265,16 +216,27 @@ private:
            return faces;
        }
    }
+
+    Mat padWithDivisor(InputArray& input_image)  // returns a zero-padded copy of input_image; the input itself is not modified
+    {
+        int bottom = padH - inputH;  // number of rows appended below the image
+        int right = padW - inputW;  // number of columns appended to the right of the image
+        Mat pad_image;
+        copyMakeBorder(input_image, pad_image, 0, bottom, 0, right, BORDER_CONSTANT, 0);  // no top/left padding, so existing pixel coordinates are preserved
+        return pad_image;
+    }
 private:
    dnn::Net net;

    int inputW;
    int inputH;
+    int padW;  // inputW rounded up to a multiple of divisor
+    int padH;  // inputH rounded up to a multiple of divisor
+    const int divisor;  // padding granularity; initialized to 32 in the constructor
+    int topK;
    float scoreThreshold;
    float nmsThreshold;
-    int topK;
-
-    std::vector<Rect2f> priors;
+    const std::vector<int> strides;  // strides used to decode each group of outputs in postProcess; initialized to {8, 16, 32}
 };
 #endif

--- a/modules/objdetect/test/test_face.cpp
+++ b/modules/objdetect/test/test_face.cpp
@ -65,20 +65,16 @@ TEST(Objdetect_face_detection, regression)
 {
    // Pre-set params
    float scoreThreshold = 0.7f;
-    float matchThreshold = 0.9f;
-    float l2disThreshold = 5.0f;
+    float matchThreshold = 0.7f;
+    float l2disThreshold = 15.0f;
    int numLM = 5;
    int numCoords = 4 + 2 * numLM;

    // Load ground truth labels
    std::map<std::string, Mat> gt = blobFromTXT(findDataFile("dnn_face/detection/cascades_labels.txt"), numCoords);
-    // for (auto item: gt)
-    // {
-    //     std::cout << item.first << " " << item.second.size() << std::endl;
-    // }

    // Initialize detector
-    std::string model = findDataFile("dnn/onnx/models/yunet-202202.onnx", false);
+    std::string model = findDataFile("dnn/onnx/models/yunet-202303.onnx", false);
    Ptr<FaceDetectorYN> faceDetector = FaceDetectorYN::create(model, "", Size(300, 300));
    faceDetector->setScoreThreshold(0.7f);

@ -137,6 +133,7 @@ TEST(Objdetect_face_detection, regression)
                        lmMatched[lmIdx] = true;
                    }
                }
+                break;
            }
            EXPECT_TRUE(boxMatched) << "In image " << item.first << ", cannot match resBox " << resBox << " with any ground truth.";
            if (boxMatched)
@ -178,7 +175,7 @@ TEST(Objdetect_face_recognition, regression)
    }

    // Initialize detector
-    std::string detect_model = findDataFile("dnn/onnx/models/yunet-202202.onnx", false);
+    std::string detect_model = findDataFile("dnn/onnx/models/yunet-202303.onnx", false);
    Ptr<FaceDetectorYN> faceDetector = FaceDetectorYN::create(detect_model, "", Size(150, 150), score_thresh, nms_thresh);

    std::string recog_model = findDataFile("dnn/onnx/models/face_recognizer_fast.onnx", false);