LBP classifier was refactored, added parameter for max size of detected object

2025-07-21 19:47:02 +08:00 · 2012-07-02 08:08:11 +00:00 · 2012-07-02 08:08:11 +00:00 · a9f2f522e7
commit a9f2f522e7
parent e6f7e4d83e
4 changed files with 113 additions and 86 deletions
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@ -1435,7 +1435,8 @@ public:
    bool load(const std::string& filename);
    void release();

-    int detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objectsBuf, double scaleFactor = 1.1, int minNeighbors = 4/*, Size minSize = Size()*/);
+    int detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objectsBuf, double scaleFactor = 1.1, int minNeighbors = 4,
+    cv::Size maxObjectSize = cv::Size()/*, Size minSize = Size()*/);
    void preallocateIntegralBuffer(cv::Size desired);

    bool findLargestObject;
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@ -48,20 +48,6 @@ using namespace cv;
 using namespace cv::gpu;
 using namespace std;

-struct Stage
-{
-    int    first;
-    int    ntrees;
-    float  threshold;
-};
-
-struct DTreeNode
-{
-    int   featureIdx;
-    int   left;
-    int   right;
-};
-
 #if !defined (HAVE_CUDA)
 // ============ old fashioned haar cascade ==============================================//
 cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU()               { throw_nogpu(); }
@ -128,6 +114,13 @@ bool cv::gpu::CascadeClassifier_GPU_LBP::load(const string& classifierAsXml)
 #define GPU_CC_FEATURES             "features"
 #define GPU_CC_RECT                 "rect"

+struct Stage
+{
+    int    first;
+    int    ntrees;
+    float  threshold;
+};
+
 // currently only stump based boost classifiers are supported
 bool CascadeClassifier_GPU_LBP::read(const FileNode &root)
 {
@ -279,12 +272,26 @@ namespace cv { namespace gpu { namespace device
 {
    namespace lbp
    {
-        void cascadeClassify(const DevMem2Db stages, const DevMem2Di trees, const DevMem2Db nodes, const DevMem2Df leaves, const DevMem2Di subsets, const DevMem2Db features,
-            const DevMem2Di integral, int workWidth, int workHeight, int clWidth, int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, int minNeighbors = 4, cudaStream_t stream = 0);
+        void classifyStump(const DevMem2Db mstages,
+                           const int nstages,
+                           const DevMem2Di mnodes,
+                           const DevMem2Df mleaves,
+                           const DevMem2Di msubsets,
+                           const DevMem2Db mfeatures,
+                           const DevMem2Di integral,
+                           const int workWidth,
+                           const int workHeight,
+                           const int clWidth,
+                           const int clHeight,
+                           float scale,
+                           int step,
+                           int subsetSize,
+                           DevMem2D_<int4> objects);
    }
 }}}

-int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objects, double scaleFactor, int minNeighbors /*, Size minSize=Size()*/)
+int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objects,
+                                                        double scaleFactor, int minNeighbors, cv::Size maxObjectSize /*, Size minSize=Size()*/)
 {
    CV_Assert( scaleFactor > 1 && image.depth() == CV_8U );
    CV_Assert(!empty());
@ -299,28 +306,35 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
    // temp solution
    objects.create(image.rows, image.cols, CV_32SC4);

-    scaledImageBuffer.create(image.size(), image.type());
+    if (maxObjectSize == cv::Size())
+        maxObjectSize = image.size();
+
+    scaledImageBuffer.create(image.rows + 1, image.cols + 1, CV_8U);

-    // TODO: specify max objects size
    for( double factor = 1; ; factor *= scaleFactor )
    {
        cv::Size windowSize(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
        cv::Size scaledImageSize(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
        cv::Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );

-        // nothing to do
        if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )
            break;
-        // TODO: min max object sizes cheching
-        cv::gpu::resize(image, scaledImageBuffer, scaledImageSize, 0, 0, INTER_NEAREST);
-        //prepare image for evaluation
+
+        if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height )
+            break;
+
+        // if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height )
+        //     continue;
+
+        cv::gpu::resize(image, scaledImageBuffer, scaledImageSize, 0, 0, CV_INTER_LINEAR);
+
        integral.create(cv::Size(scaledImageSize.width + 1, scaledImageSize.height + 1), CV_32SC1);
        cv::gpu::integral(scaledImageBuffer, integral);

        int step = (factor <= 2.) + 1;

-        cv::gpu::device::lbp::cascadeClassify(stage_mat, trees_mat, nodes_mat, leaves_mat, subsets_mat, features_mat,
-         integral, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, scaleFactor, step, subsetSize, objects, minNeighbors);
+        cv::gpu::device::lbp::classifyStump(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
+        integral, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, scaleFactor, step, subsetSize, objects);
    }
    // TODO: reject levels

--- a/modules/gpu/src/cuda/lbp.cu
+++ b/modules/gpu/src/cuda/lbp.cu
@ -46,54 +46,69 @@ namespace cv { namespace gpu { namespace device
 {
    namespace lbp
    {
-        __global__ void lbp_classify(const DevMem2D_< ::cv::gpu::device::Stage> stages, const DevMem2Di trees, const DevMem2D_< ::cv::gpu::device::ClNode> nodes,
-            const DevMem2Df leaves, const DevMem2Di subsets,
-            const DevMem2D_<uchar4> features, const DevMem2Di integral, float step, int subsetSize, DevMem2D_<int4> objects, float scale, int clWidth, int clHeight)
+        __global__ void lbp_classify_stump(Stage* stages, int nstages, ClNode* nodes, const float* leaves, const int* subsets, const uchar4* features,
+            const DevMem2Di integral, int workWidth, int workHeight, int clWidth, int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects)
        {
-            unsigned int x = threadIdx.x * step;
-            unsigned int y = blockIdx.x  * step;
-            int nodeOfs = 0, leafOfs = 0;
-            ::cv::gpu::device::Feature evaluator;
+            int y = threadIdx.x * scale;
+            int x = blockIdx.x * scale;

-            for (int s = 0; s < stages.cols; s++ )
+            int i = 0;
+
+            int current_node = 0;
+            int current_leave = 0;
+
+            LBP evaluator;
+            for (int s = 0; s < nstages; s++ )
            {
-                ::cv::gpu::device::Stage stage = stages(0, s);
-                int sum = 0;
-                for (int w = 0; w < stage.ntrees; w++)
-                {
-                    ::cv::gpu::device::ClNode node = nodes(0, nodeOfs);
-                    uchar4 feature = features(0, node.featureIdx);
+                float sum = 0;
+                Stage stage = stages[s];

-                    uchar c = evaluator(y, x, feature, integral);
-                    const int subsetIdx = (nodeOfs * subsetSize);
-                    int idx = subsetIdx + ((c >> 5) & ( 1 << (c & 31)) ? leafOfs : leafOfs + 1);
-                    sum += leaves(0, subsets(0, idx) );
-                    nodeOfs++;
-                    leafOfs += 2;
+                for (int t = 0; t < stage.ntrees; t++)
+                {
+                    ClNode node = nodes[current_node];
+
+                    uchar4 feature = features[node.featureIdx];
+                    int c = evaluator(y, x, feature, integral);
+                    const int* subsetIdx = subsets + (current_node * subsetSize);
+
+                    int idx =  (subsetIdx[c >> 5] & ( 1 << (c & 31))) ? current_leave : current_leave + 1;
+                    sum += leaves[idx];
+                    current_node += 1;
+                    current_leave += 2;
                }

+                i = s;
                if (sum < stage.threshold)
                    return;
            }
+
            int4 rect;
            rect.x = roundf(x * scale);
            rect.y = roundf(y * scale);
-            rect.z = roundf(clWidth * scale);
-            rect.w = roundf(clHeight * scale);
-            objects(blockIdx.x, threadIdx.x) = rect;
+            rect.z = roundf(clWidth);
+            rect.w = roundf(clHeight);
+
+            if(i >= 19)
+                printf( "GPU detected [%d, %d] - [%d, %d]\n", rect.x, rect.y, rect.z, rect.w);
+
        }

-        void cascadeClassify(const DevMem2Db bstages, const DevMem2Di trees, const DevMem2Db bnodes, const DevMem2Df leaves, const DevMem2Di subsets, const DevMem2Db bfeatures,
-            const DevMem2Di integral, int workWidth, int workHeight, int clWidth, int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, int minNeighbors, cudaStream_t stream)
+        void classifyStump(const DevMem2Db mstages, const int nstages, const DevMem2Di mnodes, const DevMem2Df mleaves, const DevMem2Di msubsets, const DevMem2Db mfeatures,
+                           const DevMem2Di integral, const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
+                           DevMem2D_<int4> objects)
        {
-            printf("CascadeClassify");
-            int blocks = ceilf(workHeight / (float)step);
+            int blocks  = ceilf(workHeight / (float)step);
            int threads = ceilf(workWidth / (float)step);
-            DevMem2D_< ::cv::gpu::device::Stage> stages = DevMem2D_< ::cv::gpu::device::Stage>(bstages);
-            DevMem2D_<uchar4> features = (DevMem2D_<uchar4>)bfeatures;
-            DevMem2D_< ::cv::gpu::device::ClNode> nodes = DevMem2D_< ::cv::gpu::device::ClNode>(bnodes);
+            printf("blocks %d, threads %d\n", blocks, threads);

-            lbp_classify<<<blocks, threads>>>(stages, trees, nodes, leaves, subsets, features, integral, step, subsetSize, objects, scale, clWidth, clHeight);
+            Stage* stages = (Stage*)(mstages.ptr());
+            ClNode* nodes = (ClNode*)(mnodes.ptr());
+            const float* leaves = mleaves.ptr();
+            const int* subsets = msubsets.ptr();
+            const uchar4* features = (uchar4*)(mfeatures.ptr());
+
+            lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, integral,
+                workWidth, workHeight, clWidth, clHeight, scale, step, subsetSize, objects);
        }
    }
 }}}
--- a/modules/gpu/src/opencv2/gpu/device/lbp.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
@ -44,62 +44,58 @@
 #define __OPENCV_GPU_DEVICE_LBP_HPP_

 #include "internal_shared.hpp"
-// #include "opencv2/gpu/device/border_interpolate.hpp"
-// #include "opencv2/gpu/device/vec_traits.hpp"
-// #include "opencv2/gpu/device/vec_math.hpp"
-// #include "opencv2/gpu/device/saturate_cast.hpp"
-// #include "opencv2/gpu/device/filters.hpp"
-
-// #define CALC_SUM_(p0, p1, p2, p3, offset) \
-//     ((p0)[offset] - (p1)[offset] - (p2)[offset] + (p3)[offset])
-
-// __device__ __forceinline__ int sum(p0, p1, p2, p3, offset)
-// {
-
-// }

 namespace cv { namespace gpu { namespace device {

+namespace lbp{
    struct Stage
    {
        int    first;
        int    ntrees;
        float  threshold;
-        __device__ __forceinline__ Stage(int f = 0, int n = 0, float t = 0.f) : first(f), ntrees(n), threshold(t) {}
-        __device__ __forceinline__ Stage(const Stage& other) : first(other.first), ntrees(other.ntrees), threshold(other.threshold) {}
    };

    struct ClNode
    {
-        int   featureIdx;
        int   left;
        int   right;
-        __device__ __forceinline__  ClNode(int f = 0, int l = 0, int r = 0) : featureIdx(f), left(l), right(r) {}
-        __device__ __forceinline__  ClNode(const ClNode& other) : featureIdx(other.featureIdx), left(other.left), right(other.right) {}
+        int   featureIdx;
    };

-    struct Feature
+    struct LBP
    {
-        __device__ __forceinline__ Feature(const Feature& other) {(void)other;}
-        __device__ __forceinline__ Feature() {}
+        __device__ __forceinline__ LBP(const LBP& other) {(void)other;}
+        __device__ __forceinline__ LBP() {}

        //feature as uchar x, y - left top, z,w - right bottom
-        __device__ __forceinline__ uchar operator() (unsigned int y, unsigned int x, uchar4 feature, const DevMem2Di integral) const
+        __device__ __forceinline__ int operator() (unsigned int y, unsigned int x, uchar4 feature, const DevMem2Di integral) const
        {
            int x_off = 2 * feature.z;
            int y_off = 2 * feature.w;
+            // printf("feature: %d %d %d %d\n", (int)feature.x, (int)feature.y, (int)feature.z, (int)feature.w);
+            feature.z += feature.x;
+            feature.w += feature.y;

            // load feature key points
            int anchors[16];
+            /*
+            P0-----P1-----P2-----P3
+            |      |      |       |
+            P4-----P5-----P6-----P7
+            |      |      |       |
+            P8-----P9-----P10----P11
+            |      |      |       |
+            P12----P13----P14----15
+            */
            anchors[0]  = integral(y + feature.y, x + feature.x);
            anchors[1]  = integral(y + feature.y, x + feature.z);
-            anchors[2]  = integral(y + feature.y, x + x_off + feature.x);
-            anchors[3]  = integral(y + feature.y, x + x_off + feature.z);
+            anchors[2]  = integral(y + feature.y, x + feature.x + x_off);
+            anchors[3]  = integral(y + feature.y, x + feature.z + x_off);

            anchors[4]  = integral(y + feature.w, x + feature.x);
            anchors[5]  = integral(y + feature.w, x + feature.z);
-            anchors[6]  = integral(y + feature.w, x + x_off + feature.x);
-            anchors[7]  = integral(y + feature.w, x + x_off + feature.z);
+            anchors[6]  = integral(y + feature.w, x + feature.x + x_off);
+            anchors[7]  = integral(y + feature.w, x + feature.z + x_off);

            anchors[8]  = integral(y + y_off + feature.y, x + feature.x);
            anchors[9]  = integral(y + y_off + feature.y, x + feature.z);
@ -114,7 +110,7 @@ namespace cv { namespace gpu { namespace device {
            // calculate feature
            int sum = anchors[5] - anchors[6] - anchors[9] + anchors[10];

-            uchar response = (( (anchors[ 0] - anchors[ 1] - anchors[ 4] + anchors[ 5]) >= sum )? 128 : 0)
+            int response =   (( (anchors[ 0] - anchors[ 1] - anchors[ 4] + anchors[ 5]) >= sum )? 128 : 0)
                            |(( (anchors[ 1] - anchors[ 2] - anchors[ 5] + anchors[ 6]) >= sum )? 64  : 0)
                            |(( (anchors[ 2] - anchors[ 3] - anchors[ 6] + anchors[ 7]) >= sum )? 32  : 0)
                            |(( (anchors[ 6] - anchors[ 7] - anchors[10] + anchors[11]) >= sum )? 16  : 0)
@ -122,11 +118,12 @@ namespace cv { namespace gpu { namespace device {
                            |(( (anchors[ 9] - anchors[10] - anchors[13] + anchors[14]) >= sum )? 4   : 0)
                            |(( (anchors[ 8] - anchors[ 9] - anchors[12] + anchors[13]) >= sum )? 2   : 0)
                            |(( (anchors[ 4] - anchors[ 5] - anchors[ 8] + anchors[ 9]) >= sum )? 1   : 0);
-
            return response;
        }
-
    };
+} // lbp
+
+
 } } }// namespaces

 #endif