parent c2b6c67431
commit 5b0b59ecfb
@@ -1135,6 +1135,38 @@ CV__DNN_INLINE_NS_BEGIN
    CV_WRAP void classify(InputArray frame, CV_OUT int& classId, CV_OUT float& conf);
};

/** @brief This class represents the high-level API for keypoints models
 *
 * KeypointsModel allows setting params for preprocessing an input image.
 * KeypointsModel creates a net from a file with trained weights and config,
 * sets preprocessing input, runs a forward pass and returns the x and y coordinates of each detected keypoint.
 */
class CV_EXPORTS_W KeypointsModel : public Model
{
public:
    /**
     * @brief Create a keypoints model from a network represented in one of the supported formats.
     * The order of the @p model and @p config arguments does not matter.
     * @param[in] model Binary file containing trained weights.
     * @param[in] config Text file containing the network configuration.
     */
    CV_WRAP KeypointsModel(const String& model, const String& config = "");

    /**
     * @brief Create a model from a deep learning network.
     * @param[in] network Net object.
     */
    CV_WRAP KeypointsModel(const Net& network);

    /** @brief Given the input @p frame, create an input blob and run the net.
     * @param[in] frame The input image.
     * @param thresh Minimum confidence threshold to select a keypoint.
     * @returns A vector holding the x and y coordinates of each detected keypoint.
     */
    CV_WRAP std::vector<Point2f> estimate(InputArray frame, float thresh = 0.5);
};

/** @brief This class represents the high-level API for segmentation models
 *
 * SegmentationModel allows setting params for preprocessing an input image.
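For orientation, here is a minimal usage sketch of the API declared above. It is not part of the patch: the image name, model file name and preprocessing values are placeholders and should be replaced with ones that match your network.

// Usage sketch (illustration only; file names and preprocessing values are
// placeholders, not taken from the patch).
#include <opencv2/dnn.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
using namespace cv;
using namespace cv::dnn;

int main()
{
    Mat frame = imread("pose_input.png");            // any BGR input image
    KeypointsModel model("pose_model.onnx");         // weights file; config is optional
    model.setInputSize(Size(256, 256));
    model.setInputScale(1.0 / 255);
    model.setInputMean(Scalar(128, 128, 128));
    std::vector<Point2f> keypoints = model.estimate(frame, 0.5f);
    for (const Point2f& p : keypoints)
        circle(frame, p, 3, Scalar(0, 255, 0), -1);  // mark each detected keypoint
    imwrite("pose_output.png", frame);
    return 0;
}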
@@ -137,6 +137,64 @@ void ClassificationModel::classify(InputArray frame, int& classId, float& conf)
    std::tie(classId, conf) = classify(frame);
}

KeypointsModel::KeypointsModel(const String& model, const String& config)
    : Model(model, config) {};

KeypointsModel::KeypointsModel(const Net& network) : Model(network) {};

std::vector<Point2f> KeypointsModel::estimate(InputArray frame, float thresh)
{
    int frameHeight = frame.getMat().size[0];
    int frameWidth = frame.getMat().size[1];
    std::vector<Mat> outs;

    impl->predict(*this, frame.getMat(), outs);
    CV_Assert(outs.size() == 1);
    Mat output = outs[0];

    const int nPoints = output.size[1];
    std::vector<Point2f> points;

    // If output is a map, extract the keypoints
    if (output.dims == 4)
    {
        int height = output.size[2];
        int width = output.size[3];

        // find the position of the keypoints (ignore the background)
        for (int n = 0; n < nPoints - 1; n++)
        {
            // Probability map of corresponding keypoint
            Mat probMap(height, width, CV_32F, output.ptr(0, n));

            Point2f p(-1, -1);
            Point maxLoc;
            double prob;
            minMaxLoc(probMap, NULL, &prob, NULL, &maxLoc);
            if (prob > thresh)
            {
                p = maxLoc;
                p.x *= (float)frameWidth / width;
                p.y *= (float)frameHeight / height;
                points.push_back(p);
            }
        }
    }
    // Otherwise the output is a vector of keypoints and we can just return it
    else
    {
        for (int n = 0; n < nPoints; n++)
        {
            Point2f p;
            p.x = *output.ptr<float>(0, n, 0);
            p.y = *output.ptr<float>(0, n, 1);
            points.push_back(p);
        }
    }
    return points;
}

SegmentationModel::SegmentationModel(const String& model, const String& config)
    : Model(model, config) {};
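To summarize the heatmap branch above: when the net returns a 4-D blob of shape [1 x K x H x W], each of the first K-1 channels is treated as a probability map for one keypoint, the peak of each map is found with minMaxLoc, and the peak location is rescaled from heatmap resolution to frame resolution; keypoints whose peak probability does not exceed thresh are simply skipped. A standalone sketch of that per-channel decoding (names are illustrative, not from the patch; it assumes the cv namespace, as in the file above):

// Decode a single CV_32F H x W probability map the way estimate() does.
// Returns (-1, -1) when the peak is not above 'thresh' (estimate() instead
// omits such keypoints from its result).
static Point2f decodeHeatmapChannel(const Mat& probMap, int frameWidth, int frameHeight, float thresh)
{
    double prob;
    Point maxLoc;
    minMaxLoc(probMap, NULL, &prob, NULL, &maxLoc);          // peak of the map
    if (prob <= thresh)
        return Point2f(-1.f, -1.f);
    // rescale from heatmap coordinates back to frame coordinates
    return Point2f(maxLoc.x * (float)frameWidth  / probMap.cols,
                   maxLoc.y * (float)frameHeight / probMap.rows);
}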
@@ -70,6 +70,25 @@ public:
        ASSERT_NEAR(prediction.second, ref.second, norm);
    }

    void testKeypointsModel(const std::string& weights, const std::string& cfg,
                            const Mat& frame, const Mat& exp, float norm,
                            const Size& size = {-1, -1}, Scalar mean = Scalar(),
                            double scale = 1.0, bool swapRB = false, bool crop = false)
    {
        checkBackend();

        std::vector<Point2f> points;

        KeypointsModel model(weights, cfg);
        model.setInputSize(size).setInputMean(mean).setInputScale(scale)
             .setInputSwapRB(swapRB).setInputCrop(crop);

        points = model.estimate(frame, 0.5);

        Mat out = Mat(points).reshape(1);
        normAssert(exp, out, "", norm, norm);
    }

    void testSegmentationModel(const std::string& weights_file, const std::string& config_file,
                               const std::string& inImgPath, const std::string& outImgPath,
                               float norm, const Size& size = {-1, -1}, Scalar mean = Scalar(),
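A note on the comparison in testKeypointsModel above: wrapping the std::vector<Point2f> in a Mat yields an N x 1 two-channel matrix, and reshape(1) turns it into an N x 2 single-channel float matrix (column 0 = x, column 1 = y), which is the layout the reference .npy blobs are compared against. A tiny sketch, assuming the cv namespace used by the test file:

// Layout sketch: vector<Point2f> -> N x 2 CV_32F matrix.
std::vector<Point2f> pts = { Point2f(10.f, 20.f), Point2f(30.f, 40.f) };
Mat out = Mat(pts).reshape(1);   // 2 x 2, CV_32F: rows are points, cols are (x, y)
CV_Assert(out.rows == 2 && out.cols == 2 && out.type() == CV_32F);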
@@ -221,6 +240,38 @@ TEST_P(Test_Model, DetectionMobilenetSSD)
                         scoreDiff, iouDiff, confThreshold, nmsThreshold, size, mean, scale);
}

TEST_P(Test_Model, Keypoints_pose)
{
    Mat inp = imread(_tf("pose.png"));
    std::string weights = _tf("lightweight_pose_estimation.onnx");
    Mat exp = blobFromNPY(_tf("keypoints_exp.npy"));

    Size size{256, 256};
    float norm = 1e-4;
    double scale = 1.0/255;
    Scalar mean = Scalar(128, 128, 128);
    bool swapRB = false;

    testKeypointsModel(weights, "", inp, exp, norm, size, mean, scale, swapRB);
}
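For reference, the preprocessing values passed by Keypoints_pose above (256x256 input, scale 1/255, mean 128, no channel swap, no crop) roughly correspond to building the input blob by hand with blobFromImage. A sketch for illustration only, assuming the cv and cv::dnn namespaces as in the test file:

// Illustration only: an input blob built by hand with the same values the
// Keypoints_pose test hands to the model's setInput* configuration.
Mat frame = imread("pose.png");
Mat blob = blobFromImage(frame, 1.0 / 255, Size(256, 256),
                         Scalar(128, 128, 128), /*swapRB=*/false, /*crop=*/false);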
TEST_P(Test_Model, Keypoints_face)
{
    Mat inp = imread(_tf("gray_face.png"), 0);
    std::string weights = _tf("facial_keypoints.onnx");
    Mat exp = blobFromNPY(_tf("facial_keypoints_exp.npy"));

    Size size{224, 224};
    float norm = 1e-4;
    double scale = 1.0/255;
    Scalar mean = Scalar();
    bool swapRB = false;

    testKeypointsModel(weights, "", inp, exp, norm, size, mean, scale, swapRB);
}

TEST_P(Test_Model, Detection_normalized)
{
    std::string img_path = _tf("grace_hopper_227.png");