From c9f9f3877719380303dea1f9029e44e8160eb633 Mon Sep 17 00:00:00 2001
From: Alexey Spizhevoy <no@email>
Date: Mon, 13 Dec 2010 14:34:02 +0000
Subject: [PATCH] added gpu::sqrSum function

---
 modules/gpu/include/opencv2/gpu/gpu.hpp |  8 +++
 modules/gpu/src/arithm.cpp              | 40 +++++++++++++
 modules/gpu/src/cuda/mathfunc.cu        | 75 +++++++++++++++++++++++--
 tests/gpu/src/arithm.cpp                | 17 +++++-
 4 files changed, 132 insertions(+), 8 deletions(-)
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index f0d4dd3ebe..26f9d649e4 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -428,6 +428,14 @@ namespace cv
         //! supports only single channel images
         CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);
 
+        //! computes squared sum of array elements
+        //! supports only single channel images
+        CV_EXPORTS Scalar sqrSum(const GpuMat& src);
+
+        //! computes squared sum of array elements
+        //! supports only single channel images
+        CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf);
+
         //! finds global minimum and maximum array elements and returns their values
         CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());
 
diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index 049bfa4334..5f7dd6150d 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -66,6 +66,8 @@ double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return
 void cv::gpu::flip(const GpuMat&, GpuMat&, int) { throw_nogpu(); }
 Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); }
 Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
+Scalar cv::gpu::sqrSum(const GpuMat&) { throw_nogpu(); return Scalar(); }
+Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
 void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_nogpu(); }
 void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_nogpu(); }
@@ -489,6 +491,12 @@ namespace cv { namespace gpu { namespace mathfunc
     template <typename T>
     void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum);
 
+    template <typename T>
+    void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum);
+
+    template <typename T>
+    void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum);
+
     namespace sum
     {
         void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);
@@ -527,6 +535,38 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
     return result;
 }
 
+Scalar cv::gpu::sqrSum(const GpuMat& src) 
+{
+    GpuMat buf;
+    return sqrSum(src, buf);
+}
+
+Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf) 
+{
+    using namespace mathfunc;
+    CV_Assert(src.channels() == 1);
+
+    typedef void (*Caller)(const DevMem2D, PtrStep, double*);
+    static const Caller callers[2][7] = 
+        { { sqsum_multipass_caller<unsigned char>, sqsum_multipass_caller<char>, 
+            sqsum_multipass_caller<unsigned short>, sqsum_multipass_caller<short>, 
+            sqsum_multipass_caller<int>, sqsum_multipass_caller<float>, 0 },
+          { sqsum_caller<unsigned char>, sqsum_caller<char>, 
+            sqsum_caller<unsigned short>, sqsum_caller<short>, 
+            sqsum_caller<int>, sqsum_caller<float>, sqsum_caller<double> } };
+
+    Size bufSize;
+    sum::get_buf_size_required(src.cols, src.rows, bufSize.width, bufSize.height); 
+    buf.create(bufSize, CV_8U);
+
+    Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
+    if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type");
+
+    double result;
+    caller(src, buf, &result);
+    return result;
+}
+
 ////////////////////////////////////////////////////////////////////////
 // minMax
 
diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu
index 3d6e1f606a..c990228541 100644
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -1428,6 +1428,12 @@ namespace cv { namespace gpu { namespace mathfunc
     template <> struct SumType<float> { typedef float R; };
     template <> struct SumType<double> { typedef double R; };
 
+    template <typename R> 
+    struct IdentityOp { static __device__ R call(R x) { return x; } };
+
+    template <typename R> 
+    struct SqrOp { static __device__ R call(R x) { return x * x; } };
+
     __constant__ int ctwidth;
     __constant__ int ctheight;
     __device__ unsigned int blocks_finished = 0;
@@ -1462,7 +1468,7 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight))); 
     }
 
-    template <typename T, typename R, int nthreads>
+    template <typename T, typename R, typename Op, int nthreads>
     __global__ void sum_kernel(const DevMem2D_<T> src, R* result)
     {
         __shared__ R smem[nthreads];
@@ -1477,7 +1483,7 @@ namespace cv { namespace gpu { namespace mathfunc
         {
             const T* ptr = src.ptr(y0 + y * blockDim.y);
             for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
-                sum += ptr[x0 + x * blockDim.x];
+                sum += Op::call(ptr[x0 + x * blockDim.x]);
         }
 
         smem[tid] = sum;
@@ -1548,9 +1554,8 @@ namespace cv { namespace gpu { namespace mathfunc
 
         R* buf_ = (R*)buf.ptr(0);
 
-        sum_kernel<T, R, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
-        sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                buf_, grid.x * grid.y);
+        sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
+        sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(buf_, grid.x * grid.y);
         cudaSafeCall(cudaThreadSynchronize());
 
         R result = 0;
@@ -1566,6 +1571,35 @@ namespace cv { namespace gpu { namespace mathfunc
     template void sum_multipass_caller<float>(const DevMem2D, PtrStep, double*);
 
 
+    template <typename T>
+    void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum)
+    {
+        using namespace sum;
+        typedef typename SumType<T>::R R;
+
+        dim3 threads, grid;
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
+
+        R* buf_ = (R*)buf.ptr(0);
+
+        sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
+        sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(buf_, grid.x * grid.y);
+        cudaSafeCall(cudaThreadSynchronize());
+
+        R result = 0;
+        cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));
+        sum[0] = result;
+    }  
+
+    template void sqsum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*);
+    template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*);
+    template void sqsum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*);
+    template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*);
+    template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*);
+    template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*);
+
+
     template <typename T>
     void sum_caller(const DevMem2D src, PtrStep buf, double* sum)
     {
@@ -1578,7 +1612,7 @@ namespace cv { namespace gpu { namespace mathfunc
 
         R* buf_ = (R*)buf.ptr(0);
 
-        sum_kernel<T, R, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
+        sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
         cudaSafeCall(cudaThreadSynchronize());
 
         R result = 0;
@@ -1593,5 +1627,34 @@ namespace cv { namespace gpu { namespace mathfunc
     template void sum_caller<int>(const DevMem2D, PtrStep, double*);
     template void sum_caller<float>(const DevMem2D, PtrStep, double*);
     template void sum_caller<double>(const DevMem2D, PtrStep, double*);
+
+
+    template <typename T>
+    void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum)
+    {
+        using namespace sum;
+        typedef typename SumType<T>::R R;
+
+        dim3 threads, grid;
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
+
+        R* buf_ = (R*)buf.ptr(0);
+
+        sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
+        cudaSafeCall(cudaThreadSynchronize());
+
+        R result = 0;
+        cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));
+        sum[0] = result;
+    }  
+
+    template void sqsum_caller<unsigned char>(const DevMem2D, PtrStep, double*);
+    template void sqsum_caller<char>(const DevMem2D, PtrStep, double*);
+    template void sqsum_caller<unsigned short>(const DevMem2D, PtrStep, double*);
+    template void sqsum_caller<short>(const DevMem2D, PtrStep, double*);
+    template void sqsum_caller<int>(const DevMem2D, PtrStep, double*);
+    template void sqsum_caller<float>(const DevMem2D, PtrStep, double*);
+    template void sqsum_caller<double>(const DevMem2D, PtrStep, double*);
 }}}
 
diff --git a/tests/gpu/src/arithm.cpp b/tests/gpu/src/arithm.cpp
index 521120c082..abe76a1a37 100644
--- a/tests/gpu/src/arithm.cpp
+++ b/tests/gpu/src/arithm.cpp
@@ -940,7 +940,7 @@ struct CV_GpuSumTest: CvTest
         {
             Mat src;
             Scalar a, b;
-            double max_err = 1e-6;
+            double max_err = 1e-5;
 
             int typemax = hasNativeDoubleSupport(getDevice()) ? CV_64F : CV_32F;
             for (int type = CV_8U; type <= typemax; ++type) 
@@ -954,6 +954,19 @@ struct CV_GpuSumTest: CvTest
                     ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
                     return;
                 }
+                if (type != CV_8S)
+                {
+                    b = sqrSum(GpuMat(src));
+                    Mat sqrsrc;
+                    multiply(src, src, sqrsrc);
+                    a = sum(sqrsrc);
+                    if (abs(a[0] - b[0]) > src.size().area() * max_err)
+                    {
+                        ts->printf(CvTS::CONSOLE, "type: %d, cols: %d, rows: %d, expected: %f, actual: %f\n", type, src.cols, src.rows, a[0], b[0]);
+                        ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
+                        return;
+                    }
+                }
             }
         }
         catch (const Exception& e)
@@ -967,7 +980,7 @@ struct CV_GpuSumTest: CvTest
     {
         m.create(rows, cols, type);
         RNG rng;
-        rng.fill(m, RNG::UNIFORM, Scalar::all(0), Scalar::all(20));
+        rng.fill(m, RNG::UNIFORM, Scalar::all(0), Scalar::all(16));
 
     }
 };