From b52fea7fae6ec38096dcd57458e0ee6be87da996 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 1 Oct 2012 13:48:16 +0400
Subject: [PATCH] update soft cascade interface:  - add class Detection in
 interface,  - split sync- and async- versions,  - add support for detecting
 at the specific scale.

---
 modules/gpu/include/opencv2/gpu/gpu.hpp | 26 ++++++++++++-
 modules/gpu/src/cuda/isf-sc.cu          | 45 ++++++++++++++++++----
 modules/gpu/src/softcascade.cpp         | 50 ++++++++++++++++++++-----
 3 files changed, 103 insertions(+), 18 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 5008e10275..f171ad904d 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1537,6 +1537,18 @@ public:
 class CV_EXPORTS SoftCascade
 {
 public:
+
+    struct CV_EXPORTS Detection
+    {
+        ushort x;
+        ushort y;
+        ushort w;
+        ushort h;
+        float confidence;
+        int kind;
+
+        enum {PEDESTRIAN = 0};
+    };
     //! An empty cascade will be created.
     SoftCascade();
 
@@ -1559,9 +1571,19 @@ public:
     //! Param rois is a mask
     //! Param objects 4-channel matrix thet contain detected rectangles
     //! Param rejectfactor used for final object box computing
-    //! Param stream
     virtual void detectMultiScale(const GpuMat& image, const GpuMat& rois, GpuMat& objects,
-    int rejectfactor = 1, Stream stream = Stream::Null());
+    int rejectfactor = 1, int specificScale = -1);
+
+    //! detect specific objects on in the input frame for all scales computed flom minScale and maxscale values.
+    //! asynchronous version.
+    //! Param image is input frame for detector. Cascade will be applied to it.
+    //! Param rois is a mask
+    //! Param objects 4-channel matrix thet contain detected rectangles
+    //! Param rejectfactor used for final object box computing
+    //! Param ndet retrieves number of detections
+    //! Param stream wrapper for CUDA stream
+    virtual void detectMultiScale(const GpuMat& image, const GpuMat& rois, GpuMat& objects,
+    int rejectfactor, GpuMat& ndet, Stream stream);
 
 private:
     struct Filds;
diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index f3c92cc6ab..3d9a1e10f9 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -105,7 +105,7 @@ namespace icf {
         float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
 
         const float expected_new_area = farea * relScale * relScale;
-        float approx =  sarea / expected_new_area;
+        float approx = __fdividef(sarea, expected_new_area);
 
         dprintf("%d: new rect: %d box %d %d %d %d  rel areas %f %f\n",threadIdx.x, (node.threshold >> 28),
         scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
@@ -198,12 +198,13 @@ namespace icf {
 //     }
 
     __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages,
-        const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr)
+        const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr,
+        const int downscales)
     {
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
         const int x = blockIdx.x;
 
-        Level level = levels[blockIdx.z];
+        Level level = levels[downscales + blockIdx.z];
 
         if(x >= level.workRect.x || y >= level.workRect.y) return;
 
@@ -236,7 +237,7 @@ namespace icf {
             dprintf("%d: decided: %d (%d >= %f) %d %f\n\n" ,threadIdx.x, next, sum, threshold, lShift, impact);
             dprintf("%d: extracted stage: %f\n",threadIdx.x, stages[(st + threadIdx.x)]);
             dprintf("%d: computed  score: %f\n",threadIdx.x, impact);
-
+#pragma unroll
             // scan on shuffl functions
             for (int i = 1; i < 32; i *= 2)
             {
@@ -263,13 +264,13 @@ namespace icf {
 
     void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
                 const PtrStepSzb& nodes,  const PtrStepSzf& leaves,  const PtrStepSzi& hogluv,
-                PtrStepSz<uchar4> objects, PtrStepSzi counter)
+                PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales)
     {
         int fw = 160;
         int fh = 120;
 
         dim3 block(32, 8);
-        dim3 grid(fw, fh / 8, 47);
+        dim3 grid(fw, fh / 8, downscales);
 
         const Level* l = (const Level*)levels.ptr();
         const Octave* oct = ((const Octave*)octaves.ptr());
@@ -283,8 +284,38 @@ namespace icf {
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
         cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
 
-        test_kernel_warp<<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr);
+        test_kernel_warp<<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, 0);
+        cudaSafeCall( cudaGetLastError());
 
+        grid = dim3(fw, fh / 8, 47 - downscales);
+        test_kernel_warp<<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, downscales);
+        cudaSafeCall( cudaGetLastError());
+        cudaSafeCall( cudaDeviceSynchronize());
+    }
+
+    void detectAtScale(const int scale, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
+        const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
+        PtrStepSzi counter)
+    {
+        int fw = 160;
+        int fh = 120;
+
+        dim3 block(32, 8);
+        dim3 grid(fw, fh / 8, 1);
+
+        const Level* l = (const Level*)levels.ptr();
+        const Octave* oct = ((const Octave*)octaves.ptr());
+        const float* st = (const float*)stages.ptr();
+        const Node* nd = (const Node*)nodes.ptr();
+        const float* lf = (const float*)leaves.ptr();
+        uint* ctr = (uint*)counter.ptr();
+        Detection* det = (Detection*)objects.ptr();
+        uint max_det = objects.cols / sizeof(Detection);
+
+        cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
+        cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
+
+        test_kernel_warp<<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, scale);
         cudaSafeCall( cudaGetLastError());
         cudaSafeCall( cudaDeviceSynchronize());
     }
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 320fbb3437..fd94909cf0 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -49,7 +49,11 @@ cv::gpu::SoftCascade::SoftCascade() : filds(0) { throw_nogpu(); }
 cv::gpu::SoftCascade::SoftCascade( const string&, const float, const float) : filds(0) { throw_nogpu(); }
 cv::gpu::SoftCascade::~SoftCascade() { throw_nogpu(); }
 bool cv::gpu::SoftCascade::load( const string&, const float, const float) { throw_nogpu(); return false; }
-void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, const int, Stream) { throw_nogpu();}
+void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, const int, int) { throw_nogpu();}
+void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, int, GpuMat&, Stream)
+{
+    throw_nogpu();
+}
 
 #else
 
@@ -60,6 +64,9 @@ namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins);
     void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
+        const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
+        PtrStepSzi counter, const int downscales);
+    void detectAtScale(const int scale, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
         const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
         PtrStepSzi counter);
 }
@@ -86,6 +93,8 @@ struct cv::gpu::SoftCascade::Filds
     int origObjWidth;
     int origObjHeight;
 
+    int downscales;
+
     GpuMat octaves;
     GpuMat stages;
     GpuMat nodes;
@@ -120,7 +129,6 @@ struct cv::gpu::SoftCascade::Filds
         FRAME_WIDTH        = 640,
         FRAME_HEIGHT       = 480,
         TOTAL_SCALES       = 55,
-//         CLASSIFIERS        = 5,
         ORIG_OBJECT_WIDTH  = 64,
         ORIG_OBJECT_HEIGHT = 128,
         HOG_BINS           = 6,
@@ -132,7 +140,14 @@ struct cv::gpu::SoftCascade::Filds
     void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const
     {
         cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
-        device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter);
+        device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter, downscales);
+    }
+
+    void detectAtScale(int scale, cv::gpu::GpuMat objects, cudaStream_t stream) const
+    {
+        cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
+        device::icf::detectAtScale(scale, levels, octaves, stages, nodes, leaves, hogluv, objects,
+            detCounter);
     }
 
 private:
@@ -160,7 +175,7 @@ private:
     }
 };
 
-inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs)
+bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs)
 {
     using namespace device::icf;
     minScale = mins;
@@ -351,6 +366,7 @@ inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector<device::ic
     float logFactor = (::log(maxScale) - ::log(minScale)) / (nscales -1);
 
     float scale = minScale;
+    downscales = 0;
     for (int sc = 0; sc < nscales; ++sc)
     {
         int width  = ::std::max(0.0f, frameW - (origObjWidth  * scale));
@@ -366,7 +382,10 @@ inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector<device::ic
         if (!width || !height)
             break;
         else
+        {
             vlevels.push_back(level);
+            if (octs[fit].scale < 1) ++downscales;
+        }
 
         if (::fabs(scale - maxScale) < FLT_EPSILON) break;
         scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));
@@ -424,8 +443,11 @@ namespace {
         return s;
     }
 }
+
+//================================== synchronous version ============================================================//
+
 void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& /*rois*/,
-                                GpuMat& objects, const int /*rejectfactor*/, Stream s)
+                                GpuMat& objects, const int /*rejectfactor*/, int specificScale)
 {
     // only color images are supperted
     CV_Assert(colored.type() == CV_8UC3);
@@ -513,11 +535,21 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
     }
 #endif
 
-    cudaStream_t stream = StreamAccessor::getStream(s);
-    flds.detect(objects, stream);
+    if (specificScale == -1)
+        flds.detect(objects, 0);
+    else
+        flds.detectAtScale(specificScale, objects, 0);
 
-        //     cv::Mat out(flds.detCounter);
-        // std::cout << out << std::endl;
+    cv::Mat out(flds.detCounter);
+    int ndetections = *(out.data);
+
+    objects = GpuMat(objects, cv::Rect(0, 0, ndetections * sizeof(Detection), 1));
 }
 
+void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, int, GpuMat&, Stream)
+{
+    // cudaStream_t stream = StreamAccessor::getStream(s);
+}
+
+
 #endif
\ No newline at end of file