From 49fa536c6259761e4e259fddecb55646f276e669 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Mon, 4 Oct 2010 11:42:40 +0000
Subject: [PATCH] added Sobel, GaussianBlur, Canny to gpu module. minor fix of
 matrix_operations.cpp.

---
 modules/gpu/include/opencv2/gpu/gpu.hpp |   9 +
 modules/gpu/src/filtering_npp.cpp       | 184 ++++++++++
 modules/gpu/src/imgproc_gpu.cpp         |  30 ++
 modules/gpu/src/matrix_operations.cpp   | 429 ++++++++++++++----------
 tests/gpu/src/gputest_main.cpp          |   3 +
 tests/gpu/src/imgproc_gpu.cpp           | 112 +++++++
 6 files changed, 589 insertions(+), 178 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 61a202c6a1..d8dc5bf6ff 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -533,9 +533,18 @@ namespace cv
         //! applies an advanced morphological operation to the image
         CV_EXPORTS void morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor, int iterations);
 
+        //! 1D mask Window Sum for 8 bit images
         CV_EXPORTS void sumWindowColumn(const GpuMat& src, GpuMat& dst, int ksize, int anchor = -1);
         CV_EXPORTS void sumWindowRow(const GpuMat& src, GpuMat& dst, int ksize, int anchor = -1);
 
+        //! applies generalized Sobel operator to the image
+        CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1);
+
+        //! smooths the image using Gaussian filter
+        CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0);
+
+        //! applies Canny edge detector and produces the edge map
+        CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double threshold1, double threshold2, int apertureSize = 3);
 
         //////////////////////////////// Image Labeling ////////////////////////////////
 
diff --git a/modules/gpu/src/filtering_npp.cpp b/modules/gpu/src/filtering_npp.cpp
index 4a9fd888ee..1100db72cc 100644
--- a/modules/gpu/src/filtering_npp.cpp
+++ b/modules/gpu/src/filtering_npp.cpp
@@ -54,6 +54,8 @@ void cv::gpu::morphologyEx( const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_nogpu(); }
 void cv::gpu::boxFilter(const GpuMat&, GpuMat&, Size, Point) { throw_nogpu(); }
 void cv::gpu::sumWindowColumn(const GpuMat&, GpuMat&, int, int) { throw_nogpu(); }
 void cv::gpu::sumWindowRow(const GpuMat&, GpuMat&, int, int) { throw_nogpu(); }
+void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double) { throw_nogpu(); }
+void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double) { throw_nogpu(); }
 
 #else
 
@@ -237,4 +239,186 @@ void cv::gpu::sumWindowRow(const GpuMat& src, GpuMat& dst, int ksize, int anchor)
     sumWindowCaller(nppiSumWindowRow_8u32f_C1R, src, dst, ksize, anchor);
 }
 
+////////////////////////////////////////////////////////////////////////
+// Filter Engine
+
+namespace
+{
+    typedef NppStatus (*nppFilter1D_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep, NppiSize oROI,
+                                       const Npp32s* pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor);
+    typedef NppStatus (*nppFilter2D_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep, NppiSize oSizeROI,
+                                       const Npp32s* pKernel, NppiSize oKernelSize, NppiPoint oAnchor, Npp32s nDivisor);
+
+    void applyRowFilter(const GpuMat& src, GpuMat& dst, const GpuMat& rowKernel, Npp32s anchor = -1, Npp32s nDivisor = 1)
+    {
+        static const nppFilter1D_t nppFilter1D_callers[] = {nppiFilterRow_8u_C1R, nppiFilterRow_8u_C4R};
+
+        CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4);
+
+        int kRowSize = rowKernel.cols;
+
+        dst.create(src.size(), src.type());
+        dst = Scalar();
+
+        NppiSize oROI;
+        oROI.width = src.cols - kRowSize + 1;
+        oROI.height = src.rows;
+
+        if (anchor < 0)
+            anchor = kRowSize >> 1;
+
+        GpuMat srcROI = src.colRange(kRowSize-1, oROI.width);
+        GpuMat dstROI = dst.colRange(kRowSize-1, oROI.width);
+
+        nppFilter1D_callers[src.channels() >> 2](srcROI.ptr<Npp8u>(), srcROI.step, dstROI.ptr<Npp8u>(), dstROI.step, oROI,
+            rowKernel.ptr<Npp32s>(), kRowSize, anchor, nDivisor);
+    }
+
+    void applyColumnFilter(const GpuMat& src, GpuMat& dst, const GpuMat& columnKernel, Npp32s anchor = -1, Npp32s nDivisor = 1)
+    {
+        static const nppFilter1D_t nppFilter1D_callers[] = {nppiFilterColumn_8u_C1R, nppiFilterColumn_8u_C4R};
+
+        CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4);
+
+        int kColSize = columnKernel.cols;
+
+        dst.create(src.size(), src.type());
+        dst = Scalar();
+
+        NppiSize oROI;
+        oROI.width = src.cols;
+        oROI.height = src.rows - kColSize + 1;
+
+        if (anchor < 0)
+            anchor = kColSize >> 1;
+
+        GpuMat srcROI = src.rowRange(kColSize-1, oROI.height);
+        GpuMat dstROI = dst.rowRange(kColSize-1, oROI.height);
+
+        nppFilter1D_callers[src.channels() >> 2](srcROI.ptr<Npp8u>(), srcROI.step, dstROI.ptr<Npp8u>(), dstROI.step, oROI,
+            columnKernel.ptr<Npp32s>(), kColSize, anchor, nDivisor);
+    }
+
+    inline void applySeparableFilter(const GpuMat& src, GpuMat& dst, const GpuMat& rowKernel, const GpuMat& columnKernel,
+                                     const cv::Point& anchor = cv::Point(-1, -1), Npp32s nDivisor = 1)
+    {
+        GpuMat dstBuf;
+        applyRowFilter(src, dstBuf, rowKernel, anchor.x, nDivisor);
+        applyColumnFilter(dstBuf, dst, columnKernel, anchor.y, nDivisor);
+    }
+
+    void makeNppKernel(Mat kernel, GpuMat& dst)
+    {
+        kernel.convertTo(kernel, CV_32S);
+        kernel = kernel.t();
+        int ksize = kernel.cols;
+        for (int i = 0; i < ksize / 2; ++i)
+        {
+            std::swap(kernel.at<int>(0, i), kernel.at<int>(0, ksize - 1 - i));
+        }
+        dst.upload(kernel);
+    }
+
+    void applyFilter2D(const GpuMat& src, GpuMat& dst, const GpuMat& kernel, cv::Point anchor = cv::Point(-1, -1), Npp32s nDivisor = 1)
+    {
+        static const nppFilter2D_t nppFilter2D_callers[] = {nppiFilter_8u_C1R, nppiFilter_8u_C4R};
+
+        CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4);
+
+        dst.create(src.size(), src.type());
+        dst = Scalar();
+
+        NppiSize oROI;
+        oROI.width = src.cols - kernel.cols + 1;
+        oROI.height = src.rows - kernel.rows + 1;
+
+        if (anchor.x < 0)
+            anchor.x = kernel.cols >> 1;
+        if (anchor.y < 0)
+            anchor.y = kernel.rows >> 1;
+
+        GpuMat srcROI = src(Range(kernel.rows-1, oROI.height), Range(kernel.cols-1, oROI.width));
+        GpuMat dstROI = dst(Range(kernel.rows-1, oROI.height), Range(kernel.cols-1, oROI.width));
+
+        NppiSize oKernelSize;
+        oKernelSize.height = kernel.rows;
+        oKernelSize.width = kernel.cols;
+        NppiPoint oAnchor;
+        oAnchor.x = anchor.x;
+        oAnchor.y = anchor.y;
+
+        nppFilter2D_callers[src.channels() >> 2](srcROI.ptr<Npp8u>(), srcROI.step, dstROI.ptr<Npp8u>(), dstROI.step, oROI,
+            kernel.ptr<Npp32s>(), oKernelSize, oAnchor, nDivisor);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////
+// Sobel
+
+void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale)
+{
+    Mat kx, ky;
+    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
+
+    if (scale != 1)
+    {
+        // usually the smoothing part is the slowest to compute,
+        // so try to scale it instead of the faster differentiating part
+        if (dx == 0)
+            kx *= scale;
+        else
+            ky *= scale;
+    }
+
+    GpuMat rowKernel; makeNppKernel(kx, rowKernel);
+    GpuMat columnKernel; makeNppKernel(ky, columnKernel);
+
+    applySeparableFilter(src, dst, rowKernel, columnKernel);
+}
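For context, a minimal usage sketch of the new cv::gpu::Sobel entry point (not part of the patch; assumes a CUDA-capable device, and the input file name is a placeholder):

    #include <opencv2/opencv.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::Mat img = cv::imread("input.png", 0);     // load as CV_8UC1
        cv::gpu::GpuMat d_src(img), d_dst;            // upload to the device
        cv::gpu::Sobel(d_src, d_dst, -1, 1, 0, 3);    // dx = 1, dy = 0, 3x3 kernel
        cv::Mat result;
        d_dst.download(result);                       // copy the result back to host
        return 0;
    }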
+
+////////////////////////////////////////////////////////////////////////
+// GaussianBlur
+
+void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2)
+{
+    if (ksize.width == 1 && ksize.height == 1)
+    {
+        src.copyTo(dst);
+        return;
+    }
+
+    int depth = src.depth();
+    if (sigma2 <= 0)
+        sigma2 = sigma1;
+
+    // automatic detection of kernel size from sigma
+    if (ksize.width <= 0 && sigma1 > 0)
+        ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1;
+    if (ksize.height <= 0 && sigma2 > 0)
+        ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1;
+
+    CV_Assert(ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1);
+
+    sigma1 = std::max(sigma1, 0.0);
+    sigma2 = std::max(sigma2, 0.0);
+
+    const int scaleFactor = 256;
+
+    Mat kx = getGaussianKernel(ksize.width, sigma1, std::max(depth, CV_32F));
+    kx.convertTo(kx, kx.depth(), scaleFactor);
+    Mat ky;
+    if (ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON)
+        ky = kx;
+    else
+    {
+        ky = getGaussianKernel(ksize.height, sigma2, std::max(depth, CV_32F));
+        ky.convertTo(ky, ky.depth(), scaleFactor);
+    }
+
+    GpuMat rowKernel; makeNppKernel(kx, rowKernel);
+    GpuMat columnKernel; makeNppKernel(ky, columnKernel);
+
+    applySeparableFilter(src, dst, rowKernel, columnKernel, cv::Point(-1, -1), scaleFactor);
+}
+
 #endif
diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp
index 815aa8686a..a786a620b9 100644
--- a/modules/gpu/src/imgproc_gpu.cpp
+++ b/modules/gpu/src/imgproc_gpu.cpp
@@ -62,6 +62,7 @@ void cv::gpu::warpAffine(const GpuMat&, GpuMat&, const Mat&, Size, int) { throw_
 void cv::gpu::warpPerspective(const GpuMat&, GpuMat&, const Mat&, Size, int) { throw_nogpu(); }
 void cv::gpu::rotate(const GpuMat&, GpuMat&, Size, double, double, double, int) { throw_nogpu(); }
 void cv::gpu::integral(GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int) { throw_nogpu(); }
 
 #else /* !defined (HAVE_CUDA) */
 
@@ -986,4 +987,33 @@ void cv::gpu::integral(GpuMat& src, GpuMat& sum, GpuMat& sqsum)
         sum.step, sqsum.ptr<Npp32f>(), sqsum.step, sz, 0, 0.0f, h) );
 }
 
+////////////////////////////////////////////////////////////////////////
+// Canny
+
+void cv::gpu::Canny(const GpuMat& image, GpuMat& edges, double threshold1, double threshold2, int apertureSize)
+{
+    CV_Assert(image.type() == CV_8UC1);
+
+    GpuMat srcDx, srcDy;
+
+    Sobel(image, srcDx, -1, 1, 0, apertureSize);
+    Sobel(image, srcDy, -1, 0, 1, apertureSize);
+
+    srcDx.convertTo(srcDx, CV_32F);
+    srcDy.convertTo(srcDy, CV_32F);
+
+    edges.create(image.size(), CV_8UC1);
+
+    NppiSize sz;
+    sz.height = image.rows;
+    sz.width = image.cols;
+
+    int bufsz;
+    nppSafeCall( nppiCannyGetBufferSize(sz, &bufsz) );
+    GpuMat buf(1, bufsz, CV_8UC1);
+
+    nppSafeCall( nppiCanny_32f8u_C1R(srcDx.ptr<Npp32f>(), srcDx.step, srcDy.ptr<Npp32f>(), srcDy.step,
+        edges.ptr<Npp8u>(), edges.step, sz, (Npp32f)threshold1, (Npp32f)threshold2, buf.ptr<Npp8u>()) );
+}
+
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpu/src/matrix_operations.cpp b/modules/gpu/src/matrix_operations.cpp
index 7b1837dfe5..7d58619b28 100644
--- a/modules/gpu/src/matrix_operations.cpp
+++ b/modules/gpu/src/matrix_operations.cpp
@@ -124,6 +124,61 @@ void cv::gpu::GpuMat::copyTo( GpuMat& mat, const GpuMat& mask ) const
     }
 }
 
+namespace
+{
+    template<int SDEPTH> struct NPPTypeTraits;
+    template<> struct NPPTypeTraits<CV_8U>  { typedef Npp8u  npp_type; };
+    template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
+    template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
+    template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };
+    template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };
+
+    template<int SDEPTH, int DDEPTH> struct NppConvertFunc
+    {
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
+
+        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);
+    };
+    template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>
+    {
+        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
+
+        typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode);
+    };
+
+    template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt
+    {
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
+
+        static void cvt(const GpuMat& src, GpuMat& dst)
+        {
+            NppiSize sz;
+            sz.width = src.cols;
+            sz.height = src.rows;
+            nppSafeCall( func(src.ptr<src_t>(), src.step, dst.ptr<dst_t>(), dst.step, sz) );
+        }
+    };
+    template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
+    {
+        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
+
+        static void cvt(const GpuMat& src, GpuMat& dst)
+        {
+            NppiSize sz;
+            sz.width = src.cols;
+            sz.height = src.rows;
+            nppSafeCall( func(src.ptr<Npp32f>(), src.step, dst.ptr<dst_t>(), dst.step, sz, NPP_RND_NEAR) );
+        }
+    };
+
+    void convertToKernelCaller(const GpuMat& src, GpuMat& dst)
+    {
+        matrix_operations::convert_to(src, src.depth(), dst, dst.depth(), src.channels(), 1.0, 0.0);
+    }
+}
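The helpers above pair a compile-time depth-to-type mapping (NPPTypeTraits) with static tables of function pointers indexed by runtime depth; zero entries mark combinations with no NPP routine. A stripped-down, self-contained illustration of the same pattern (hypothetical names, independent of NPP):

    #include <cstdio>

    // compile-time mapping from a runtime depth code to a concrete type
    template <int DEPTH> struct TypeTraits;
    template <> struct TypeTraits<0> { typedef unsigned char  value_type; };  // like CV_8U
    template <> struct TypeTraits<2> { typedef unsigned short value_type; };  // like CV_16U

    typedef void (*caller_t)();

    template <int DEPTH> void printSize()
    {
        std::printf("%d bytes\n", (int)sizeof(typename TypeTraits<DEPTH>::value_type));
    }

    // static table indexed by the runtime depth; 0 marks unsupported combinations
    static const caller_t callers[3] = { &printSize<0>, 0, &printSize<2> };

    int main()
    {
        int depth = 2;            // known only at run time
        if (callers[depth])
            callers[depth]();     // dispatches to printSize<2>
        return 0;
    }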
 
 void cv::gpu::GpuMat::convertTo( GpuMat& dst, int rtype, double alpha, double beta ) const
 {
     bool noScale = fabs(alpha-1) < std::numeric_limits<double>::epsilon() && fabs(beta) < std::numeric_limits<double>::epsilon();
@@ -133,7 +188,7 @@ void cv::gpu::GpuMat::convertTo( GpuMat& dst, int rtype, double alpha, double be
     else
         rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());
 
-    int stype = type();
+    int scn = channels();
     int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype);
     if( sdepth == ddepth && noScale )
     {
@@ -152,44 +207,85 @@ void cv::gpu::GpuMat::convertTo( GpuMat& dst, int rtype, double alpha, double be
         matrix_operations::convert_to(*psrc, sdepth, dst, ddepth, psrc->channels(), alpha, beta);
     else
     {
-        NppiSize sz;
-        sz.width = cols;
-        sz.height = rows;
+        typedef void (*convert_caller_t)(const GpuMat& src, GpuMat& dst);
+        static const convert_caller_t convert_callers[8][8][4] =
+        {
+            {
+                {0,0,0,0},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::cvt},
+                {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::cvt},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {0,0,0,0}
+            },
+            {
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {0,0,0,0},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {0,0,0,0}
+            },
+            {
+                {NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::cvt},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {0,0,0,0},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {0,0,0,0}
+            },
+            {
+                {NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::cvt},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {0,0,0,0},
+                {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {0,0,0,0}
+            },
+            {
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {0,0,0,0},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {0,0,0,0}
+            },
+            {
+                {NppCvt<CV_32F, CV_8U, nppiConvert_32f8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {0,0,0,0},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {0,0,0,0}
+            },
+            {
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
+                {0,0,0,0},
+                {0,0,0,0}
+            },
+            {
+                {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}
+            }
+        };
 
-        if (stype == CV_8UC1 && ddepth == CV_16U)
-            nppSafeCall( nppiConvert_8u16u_C1R(psrc->ptr<Npp8u>(), psrc->step, dst.ptr<Npp16u>(), dst.step, sz) );
-        else if (stype == CV_16UC1 && ddepth == CV_8U)
-            nppSafeCall( nppiConvert_16u8u_C1R(psrc->ptr<Npp16u>(), psrc->step, dst.ptr<Npp8u>(), dst.step, sz) );
-        else if (stype == CV_8UC4 && ddepth == CV_16U)
-            nppSafeCall( nppiConvert_8u16u_C4R(psrc->ptr<Npp8u>(), psrc->step, dst.ptr<Npp16u>(), dst.step, sz) );
-        else if (stype == CV_16UC4 && ddepth == CV_8U)
-            nppSafeCall( nppiConvert_16u8u_C4R(psrc->ptr<Npp16u>(), psrc->step, dst.ptr<Npp8u>(), dst.step, sz) );
-        else if (stype == CV_8UC1 && ddepth == CV_16S)
-            nppSafeCall( nppiConvert_8u16s_C1R(psrc->ptr<Npp8u>(), psrc->step, dst.ptr<Npp16s>(), dst.step, sz) );
-        else if (stype == CV_16SC1 && ddepth == CV_8U)
-            nppSafeCall( nppiConvert_16s8u_C1R(psrc->ptr<Npp16s>(), psrc->step, dst.ptr<Npp8u>(), dst.step, sz) );
-        else if (stype == CV_8UC4 && ddepth == CV_16S)
-            nppSafeCall( nppiConvert_8u16s_C4R(psrc->ptr<Npp8u>(), psrc->step, dst.ptr<Npp16s>(), dst.step, sz) );
-        else if (stype == CV_16SC4 && ddepth == CV_8U)
-            nppSafeCall( nppiConvert_16s8u_C4R(psrc->ptr<Npp16s>(), psrc->step, dst.ptr<Npp8u>(), dst.step, sz) );
-        else if (stype == CV_16SC1 && ddepth == CV_32F)
-            nppSafeCall( nppiConvert_16s32f_C1R(psrc->ptr<Npp16s>(), psrc->step, dst.ptr<Npp32f>(), dst.step, sz) );
-        else if (stype == CV_32FC1 && ddepth == CV_16S)
-            nppSafeCall( nppiConvert_32f16s_C1R(psrc->ptr<Npp32f>(), psrc->step, dst.ptr<Npp16s>(), dst.step, sz, NPP_RND_NEAR) );
-        else if (stype == CV_8UC1 && ddepth == CV_32F)
-            nppSafeCall( nppiConvert_8u32f_C1R(psrc->ptr<Npp8u>(), psrc->step, dst.ptr<Npp32f>(), dst.step, sz) );
-        else if (stype == CV_32FC1 && ddepth == CV_8U)
-            nppSafeCall( nppiConvert_32f8u_C1R(psrc->ptr<Npp32f>(), psrc->step, dst.ptr<Npp8u>(), dst.step, sz, NPP_RND_NEAR) );
-        else if (stype == CV_16UC1 && ddepth == CV_32F)
-            nppSafeCall( nppiConvert_16u32f_C1R(psrc->ptr<Npp16u>(), psrc->step, dst.ptr<Npp32f>(), dst.step, sz) );
-        else if (stype == CV_32FC1 && ddepth == CV_16U)
-            nppSafeCall( nppiConvert_32f16u_C1R(psrc->ptr<Npp32f>(), psrc->step, dst.ptr<Npp16u>(), dst.step, sz, NPP_RND_NEAR) );
-        else if (stype == CV_16UC1 && ddepth == CV_32S)
-            nppSafeCall( nppiConvert_16u32s_C1R(psrc->ptr<Npp16u>(), psrc->step, dst.ptr<Npp32s>(), dst.step, sz) );
-        else if (stype == CV_16SC1 && ddepth == CV_32S)
-            nppSafeCall( nppiConvert_16s32s_C1R(psrc->ptr<Npp16s>(), psrc->step, dst.ptr<Npp32s>(), dst.step, sz) );
-        else
-            matrix_operations::convert_to(*psrc, sdepth, dst, ddepth, psrc->channels(), 1.0, 0.0);
+        convert_callers[sdepth][ddepth][scn-1](*psrc, dst);
     }
 }
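A usage sketch of the resulting dispatch (not part of the patch): an unscaled 8U-to-32F conversion lands on the nppiConvert_8u32f_C1R table entry, while a scaled conversion takes the custom-kernel path chosen earlier in the function.

    // hypothetical host-side snippet
    cv::gpu::GpuMat d_src(cv::Mat::ones(64, 64, CV_8UC1)), d_dst1, d_dst2;
    d_src.convertTo(d_dst1, CV_32F);             // no scaling: NPP table entry
    d_src.convertTo(d_dst2, CV_32F, 1.0 / 255);  // alpha != 1: custom kernel path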
 
@@ -199,6 +295,99 @@ GpuMat& GpuMat::operator = (const Scalar& s)
     return *this;
 }
 
+namespace
+{
+    template<int SDEPTH, int SCN> struct NppSetFunc
+    {
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+        typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
+    };
+    template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>
+    {
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+        typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
+    };
+
+    template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
+    {
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+        static void set(GpuMat& src, const Scalar& s)
+        {
+            NppiSize sz;
+            sz.width = src.cols;
+            sz.height = src.rows;
+            Scalar_<src_t> nppS = s;
+            nppSafeCall( func(nppS.val, src.ptr<src_t>(), src.step, sz) );
+        }
+    };
+    template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
+    {
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+        static void set(GpuMat& src, const Scalar& s)
+        {
+            NppiSize sz;
+            sz.width = src.cols;
+            sz.height = src.rows;
+            Scalar_<src_t> nppS = s;
+            nppSafeCall( func(nppS[0], src.ptr<src_t>(), src.step, sz) );
+        }
+    };
+
+    void kernelSet(GpuMat& src, const Scalar& s)
+    {
+        matrix_operations::set_to_without_mask(src, src.depth(), s.val, src.channels());
+    }
+
+    template<int SDEPTH, int SCN> struct NppSetMaskFunc
+    {
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+        typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
+    };
+    template<int SDEPTH> struct NppSetMaskFunc<SDEPTH, 1>
+    {
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+        typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
+    };
+
+    template<int SDEPTH, int SCN, typename NppSetMaskFunc<SDEPTH, SCN>::func_ptr func> struct NppSetMask
+    {
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+        static void set(GpuMat& src, const Scalar& s, const GpuMat& mask)
+        {
+            NppiSize sz;
+            sz.width = src.cols;
+            sz.height = src.rows;
+            Scalar_<src_t> nppS = s;
+            nppSafeCall( func(nppS.val, src.ptr<src_t>(), src.step, sz, mask.ptr<Npp8u>(), mask.step) );
+        }
+    };
+    template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
+    {
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+        static void set(GpuMat& src, const Scalar& s, const GpuMat& mask)
+        {
+            NppiSize sz;
+            sz.width = src.cols;
+            sz.height = src.rows;
+            Scalar_<src_t> nppS = s;
+            nppSafeCall( func(nppS[0], src.ptr<src_t>(), src.step, sz, mask.ptr<Npp8u>(), mask.step) );
+        }
+    };
+
+    void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask)
+    {
+        matrix_operations::set_to_with_mask(src, src.depth(), s.val, mask, src.channels());
+    }
+}
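For reference, how these helpers are reached from user code (a sketch; d_img and d_mask are hypothetical device matrices):

    cv::gpu::GpuMat d_img(480, 640, CV_8UC4);
    cv::gpu::GpuMat d_mask(480, 640, CV_8UC1, cv::Scalar(255));
    d_img = cv::Scalar::all(0);                     // dispatches to NppSet<CV_8U, 4, nppiSet_8u_C4R>
    d_img.setTo(cv::Scalar(255, 0, 0, 0), d_mask);  // dispatches to NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>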
 
 GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask)
 {
     CV_Assert(mask.type() == CV_8UC1);
@@ -211,151 +400,35 @@ GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask)
 
     if (mask.empty())
     {
-        switch (type())
+        typedef void (*set_caller_t)(GpuMat& src, const Scalar& s);
+        static const set_caller_t set_callers[8][4] =
         {
-            case CV_8UC1:
-                {
-                    Npp8u nVal = (Npp8u)s[0];
-                    nppSafeCall( nppiSet_8u_C1R(nVal, ptr<Npp8u>(), step, sz) );
-                    break;
-                }
-            case CV_8UC4:
-                {
-                    Scalar_<Npp8u> nVal = s;
-                    nppSafeCall( nppiSet_8u_C4R(nVal.val, ptr<Npp8u>(), step, sz) );
-                    break;
-                }
-            case CV_16UC1:
-                {
-                    Npp16u nVal = (Npp16u)s[0];
-                    nppSafeCall( nppiSet_16u_C1R(nVal, ptr<Npp16u>(), step, sz) );
-                    break;
-                }
-            /*case CV_16UC2:
-                {
-                    Scalar_<Npp16u> nVal = s;
-                    nppSafeCall( nppiSet_16u_C2R(nVal.val, ptr<Npp16u>(), step, sz) );
-                    break;
-                }*/
-            case CV_16UC4:
-                {
-                    Scalar_<Npp16u> nVal = s;
-                    nppSafeCall( nppiSet_16u_C4R(nVal.val, ptr<Npp16u>(), step, sz) );
-                    break;
-                }
-            case CV_16SC1:
-                {
-                    Npp16s nVal = (Npp16s)s[0];
-                    nppSafeCall( nppiSet_16s_C1R(nVal, ptr<Npp16s>(), step, sz) );
-                    break;
-                }
-            /*case CV_16SC2:
-                {
-                    Scalar_<Npp16s> nVal = s;
-                    nppSafeCall( nppiSet_16s_C2R(nVal.val, ptr<Npp16s>(), step, sz) );
-                    break;
-                }*/
-            case CV_16SC4:
-                {
-                    Scalar_<Npp16s> nVal = s;
-                    nppSafeCall( nppiSet_16s_C4R(nVal.val, ptr<Npp16s>(), step, sz) );
-                    break;
-                }
-            case CV_32SC1:
-                {
-                    Npp32s nVal = (Npp32s)s[0];
-                    nppSafeCall( nppiSet_32s_C1R(nVal, ptr<Npp32s>(), step, sz) );
-                    break;
-                }
-            case CV_32SC4:
-                {
-                    Scalar_<Npp32s> nVal = s;
-                    nppSafeCall( nppiSet_32s_C4R(nVal.val, ptr<Npp32s>(), step, sz) );
-                    break;
-                }
-            case CV_32FC1:
-                {
-                    Npp32f nVal = (Npp32f)s[0];
-                    nppSafeCall( nppiSet_32f_C1R(nVal, ptr<Npp32f>(), step, sz) );
-                    break;
-                }
-            case CV_32FC4:
-                {
-                    Scalar_<Npp32f> nVal = s;
-                    nppSafeCall( nppiSet_32f_C4R(nVal.val, ptr<Npp32f>(), step, sz) );
-                    break;
-                }
-            default:
-                matrix_operations::set_to_without_mask( *this, depth(), s.val, channels());
-        }
+            {NppSet<CV_8U, 1, nppiSet_8u_C1R>::set,kernelSet,kernelSet,NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
+            {kernelSet,kernelSet,kernelSet,kernelSet},
+            {NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,kernelSet,kernelSet,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
+            {NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,kernelSet,kernelSet,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
+            {NppSet<CV_32S, 1, nppiSet_32s_C1R>::set,kernelSet,kernelSet,NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
+            {NppSet<CV_32F, 1, nppiSet_32f_C1R>::set,kernelSet,kernelSet,NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
+            {kernelSet,kernelSet,kernelSet,kernelSet},
+            {0,0,0,0}
+        };
+        set_callers[depth()][channels()-1](*this, s);
     }
     else
     {
-        switch (type())
+        typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask);
+        static const set_caller_t set_callers[8][4] =
         {
-            case CV_8UC1:
-                {
-                    Npp8u nVal = (Npp8u)s[0];
-                    nppSafeCall( nppiSet_8u_C1MR(nVal, ptr<Npp8u>(), step, sz, mask.ptr<Npp8u>(), mask.step) );
-                    break;
-                }
-            case CV_8UC4:
-                {
-                    Scalar_<Npp8u> nVal = s;
-                    nppSafeCall( nppiSet_8u_C4MR(nVal.val, ptr<Npp8u>(), step, sz, mask.ptr<Npp8u>(), mask.step) );
-                    break;
-                }
-            case CV_16UC1:
-                {
-                    Npp16u nVal = (Npp16u)s[0];
-                    nppSafeCall( nppiSet_16u_C1MR(nVal, ptr<Npp16u>(), step, sz, mask.ptr<Npp8u>(), mask.step) );
-                    break;
-                }
-            case CV_16UC4:
-                {
-                    Scalar_<Npp16u> nVal = s;
-                    nppSafeCall( nppiSet_16u_C4MR(nVal.val, ptr<Npp16u>(), step, sz, mask.ptr<Npp8u>(), mask.step) );
-                    break;
-                }
-            case CV_16SC1:
-                {
-                    Npp16s nVal = (Npp16s)s[0];
-                    nppSafeCall( nppiSet_16s_C1MR(nVal, ptr<Npp16s>(), step, sz, mask.ptr<Npp8u>(), mask.step) );
-                    break;
-                }
-            case CV_16SC4:
-                {
-                    Scalar_<Npp16s> nVal = s;
-                    nppSafeCall( nppiSet_16s_C4MR(nVal.val, ptr<Npp16s>(), step, sz, mask.ptr<Npp8u>(), mask.step) );
-                    break;
-                }
-            case CV_32SC1:
-                {
-                    Npp32s nVal = (Npp32s)s[0];
-                    nppSafeCall( nppiSet_32s_C1MR(nVal, ptr<Npp32s>(), step, sz, mask.ptr<Npp8u>(), mask.step) );
-                    break;
-                }
-            case CV_32SC4:
-                {
-                    Scalar_<Npp32s> nVal = s;
-                    nppSafeCall( nppiSet_32s_C4MR(nVal.val, ptr<Npp32s>(), step, sz, mask.ptr<Npp8u>(), mask.step) );
-                    break;
-                }
-            case CV_32FC1:
-                {
-                    Npp32f nVal = (Npp32f)s[0];
-                    nppSafeCall( nppiSet_32f_C1MR(nVal, ptr<Npp32f>(), step, sz, mask.ptr<Npp8u>(), mask.step) );
-                    break;
-                }
-            case CV_32FC4:
-                {
-                    Scalar_<Npp32f> nVal = s;
-                    nppSafeCall( nppiSet_32f_C4MR(nVal.val, ptr<Npp32f>(), step, sz, mask.ptr<Npp8u>(), mask.step) );
-                    break;
-                }
-            default:
-                matrix_operations::set_to_with_mask( *this, depth(), s.val, mask, channels());
-        }
+            {NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},
+            {kernelSetMask,kernelSetMask,kernelSetMask,kernelSetMask},
+            {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},
+            {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},
+            {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},
+            {NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},
+            {kernelSetMask,kernelSetMask,kernelSetMask,kernelSetMask},
+            {0,0,0,0}
+        };
+        set_callers[depth()][channels()-1](*this, s, mask);
     }
 
     return *this;
diff --git a/tests/gpu/src/gputest_main.cpp b/tests/gpu/src/gputest_main.cpp
index 0833ffe335..5012c4b419 100644
--- a/tests/gpu/src/gputest_main.cpp
+++ b/tests/gpu/src/gputest_main.cpp
@@ -61,6 +61,9 @@ const char* blacklist[] =
     //"GPU-NppImageLog",          // different precision
     //"GPU-NppImageMagnitude",    // different precision
     //"GPU-NppImageSumWindow",    // different border interpolation
+    //"GPU-NppImageSobel",        // ???
+ //"GPU-NppImageGaussianBlur", // different border interpolation + "GPU-NppImageCanny", // NPP_TEXTURE_BIND_ERROR 0 }; diff --git a/tests/gpu/src/imgproc_gpu.cpp b/tests/gpu/src/imgproc_gpu.cpp index ce6cebc98f..7b07312c11 100644 --- a/tests/gpu/src/imgproc_gpu.cpp +++ b/tests/gpu/src/imgproc_gpu.cpp @@ -492,6 +492,115 @@ struct CV_GpuNppImageSumWindowTest : public CV_GpuImageProcTest } }; +//////////////////////////////////////////////////////////////////////////////// +// Sobel +struct CV_GpuNppImageSobelTest : public CV_GpuImageProcTest +{ + CV_GpuNppImageSobelTest() : CV_GpuImageProcTest( "GPU-NppImageSobel", "Sobel" ) {} + + int test(const Mat& img) + { + if (img.type() != CV_8UC1 && img.type() != CV_8UC4) + { + ts->printf(CvTS::LOG, "\nUnsupported type\n"); + return CvTS::OK; + } + + int ksizes[] = {3, 5, 7}; + int ksizes_num = sizeof(ksizes) / sizeof(int); + + int dx = 1, dy = 0; + + int test_res = CvTS::OK; + + for (int i = 0; i < ksizes_num; ++i) + { + ts->printf(CvTS::LOG, "\nksize = %d\n", ksizes[i]); + + Mat cpudst; + cv::Sobel(img, cpudst, -1, dx, dy, ksizes[i]); + + GpuMat gpu1(img); + GpuMat gpudst; + cv::gpu::Sobel(gpu1, gpudst, -1, dx, dy, ksizes[i]); + + if (CheckNorm(cpudst, gpudst) != CvTS::OK) + test_res = CvTS::FAIL_GENERIC; + } + + return test_res; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// GaussianBlur +struct CV_GpuNppImageGaussianBlurTest : public CV_GpuImageProcTest +{ + CV_GpuNppImageGaussianBlurTest() : CV_GpuImageProcTest( "GPU-NppImageGaussianBlur", "GaussianBlur" ) {} + + int test(const Mat& img) + { + if (img.type() != CV_8UC1 && img.type() != CV_8UC4) + { + ts->printf(CvTS::LOG, "\nUnsupported type\n"); + return CvTS::OK; + } + + int ksizes[] = {3, 5, 7}; + int ksizes_num = sizeof(ksizes) / sizeof(int); + + int test_res = CvTS::OK; + + const double sigma1 = 3.0; + + for (int i = 0; i < ksizes_num; ++i) + { + for (int j = 0; j < ksizes_num; ++j) + { + ts->printf(CvTS::LOG, "\nksize = (%dx%d)\n", ksizes[i], ksizes[j]); + + Mat cpudst; + cv::GaussianBlur(img, cpudst, cv::Size(ksizes[i], ksizes[j]), sigma1); + + GpuMat gpu1(img); + GpuMat gpudst; + cv::gpu::GaussianBlur(gpu1, gpudst, cv::Size(ksizes[i], ksizes[j]), sigma1); + if (CheckNorm(cpudst, gpudst) != CvTS::OK) + test_res = CvTS::FAIL_GENERIC; + } + } + + return test_res; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Canny +struct CV_GpuNppImageCannyTest : public CV_GpuImageProcTest +{ + CV_GpuNppImageCannyTest() : CV_GpuImageProcTest( "GPU-NppImageCanny", "Canny" ) {} + + int test(const Mat& img) + { + if (img.type() != CV_8UC1) + { + ts->printf(CvTS::LOG, "\nUnsupported type\n"); + return CvTS::OK; + } + + const double threshold1 = 1.0, threshold2 = 10.0; + + Mat cpudst; + cv::Canny(img, cpudst, threshold1, threshold2); + + GpuMat gpu1(img); + GpuMat gpudst; + cv::gpu::Canny(gpu1, gpudst, threshold1, threshold2); + + return CheckNorm(cpudst, gpudst); + } +}; + //////////////////////////////////////////////////////////////////////////////// // cvtColor class CV_GpuCvtColorTest : public CvTest @@ -598,4 +707,7 @@ CV_GpuNppImageWarpPerspectiveTest CV_GpuNppImageWarpPerspective_test; CV_GpuNppImageIntegralTest CV_GpuNppImageIntegral_test; CV_GpuNppImageBlurTest CV_GpuNppImageBlur_test; CV_GpuNppImageSumWindowTest CV_GpuNppImageSumWindow_test; +CV_GpuNppImageSobelTest CV_GpuNppImageSobel_test; +CV_GpuNppImageGaussianBlurTest CV_GpuNppImageGaussianBlur_test; +CV_GpuNppImageCannyTest 
+CV_GpuNppImageCannyTest CV_GpuNppImageCanny_test;
 CV_GpuCvtColorTest CV_GpuCvtColor_test;
\ No newline at end of file
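
For completeness, an end-to-end usage sketch combining the new functions, mirroring the tests above (not part of the patch; the file name is a placeholder, and note that the Canny test is blacklisted above due to NPP_TEXTURE_BIND_ERROR on some setups):

    #include <opencv2/opencv.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::Mat img = cv::imread("input.png", 0);       // CV_8UC1 input
        cv::gpu::GpuMat d_src(img), d_blur, d_edges;

        cv::gpu::GaussianBlur(d_src, d_blur, cv::Size(5, 5), 3.0);
        cv::gpu::Canny(d_blur, d_edges, 1.0, 10.0);     // thresholds as in the test

        cv::Mat edges;
        d_edges.download(edges);                        // copy the edge map back to host
        return 0;
    }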