created wrappers for new NPP functions

removed void integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& stream): it fails with an NPP_NOT_IMPLEMENTED error
updated docs, accuracy and performance tests
Vladislav Vinogradov 2012-02-22 10:00:53 +00:00
parent e426dfc396
commit 2d30480982
37 changed files with 1984 additions and 566 deletions

View File

@ -48,6 +48,17 @@
#ifdef HAVE_CUDA
#include <cuda_runtime.h>
#include <npp.h>
#define CUDART_MINIMUM_REQUIRED_VERSION 4010
#define NPP_MINIMUM_REQUIRED_VERSION 4100
#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
#error "Insufficient Cuda Runtime library version, please update it."
#endif
#if (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION)
#error "Insufficient NPP version, please update it."
#endif
#endif
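// Sanity check on the composed key above (illustrative note, not part of the patch):
// NPP 4.1.0 gives NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD
// = 4 * 1000 + 1 * 100 + 0 = 4100, which is exactly the minimum required.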
using namespace std;
@ -460,15 +471,17 @@ namespace cv { namespace gpu
namespace
{
//////////////////////////////////////////////////////////////////////////
// Convert
template<int n> struct NPPTypeTraits;
template<> struct NPPTypeTraits<CV_8U> { typedef Npp8u npp_type; };
template<> struct NPPTypeTraits<CV_8S> { typedef Npp8s npp_type; };
template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };
template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };
template<> struct NPPTypeTraits<CV_64F> { typedef Npp64f npp_type; };
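// Illustration (hypothetical helper, not in the patch): the traits map OpenCV depth
// constants to NPP scalar types at compile time, e.g. NPPTypeTraits<CV_16U>::npp_type
// is Npp16u, so one template can recover the right pointer type for any depth.
template <int DEPTH>
typename NPPTypeTraits<DEPTH>::npp_type* nppPtr(GpuMat& m)
{
    return m.ptr<typename NPPTypeTraits<DEPTH>::npp_type>();
}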
//////////////////////////////////////////////////////////////////////////
// Convert
template<int SDEPTH, int DDEPTH> struct NppConvertFunc
{
@ -494,6 +507,7 @@ namespace
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );
cudaSafeCall( cudaDeviceSynchronize() );
@ -508,6 +522,7 @@ namespace
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );
cudaSafeCall( cudaDeviceSynchronize() );
@ -529,6 +544,14 @@ namespace
typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
};
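// NPP's 8s setters differ from the other depths: the multi-channel variants take an
// Npp8s values[] array, while the C1 variant takes a single Npp8s, hence the two
// extra specializations below.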
template<int SCN> struct NppSetFunc<CV_8S, SCN>
{
typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
};
template<> struct NppSetFunc<CV_8S, 1>
{
typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
};
template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
{
@ -613,6 +636,35 @@ namespace
}
};
//////////////////////////////////////////////////////////////////////////
// CopyMasked
template<int SDEPTH> struct NppCopyMaskedFunc
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
};
template<int SDEPTH, typename NppCopyMaskedFunc<SDEPTH>::func_ptr func> struct NppCopyMasked
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
static void copyMasked(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<src_t>(), static_cast<int>(dst.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
//////////////////////////////////////////////////////////////////////////
// CudaFuncTable
class CudaFuncTable : public GpuFuncTable
{
public:
@ -631,7 +683,26 @@ namespace
void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const
{
::cv::gpu::copyWithMask(src, dst, mask);
CV_Assert(src.size() == dst.size() && src.type() == dst.type());
CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()));
typedef void (*caller_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
static const caller_t callers[7][4] =
{
/* 8U */ {NppCopyMasked<CV_8U, nppiCopy_8u_C1MR>::copyMasked, cv::gpu::copyWithMask, NppCopyMasked<CV_8U, nppiCopy_8u_C3MR>::copyMasked, NppCopyMasked<CV_8U, nppiCopy_8u_C4MR>::copyMasked},
/* 8S */ {cv::gpu::copyWithMask, cv::gpu::copyWithMask, cv::gpu::copyWithMask, cv::gpu::copyWithMask},
/* 16U */ {NppCopyMasked<CV_16U, nppiCopy_16u_C1MR>::copyMasked, cv::gpu::copyWithMask, NppCopyMasked<CV_16U, nppiCopy_16u_C3MR>::copyMasked, NppCopyMasked<CV_16U, nppiCopy_16u_C4MR>::copyMasked},
/* 16S */ {NppCopyMasked<CV_16S, nppiCopy_16s_C1MR>::copyMasked, cv::gpu::copyWithMask, NppCopyMasked<CV_16S, nppiCopy_16s_C3MR>::copyMasked, NppCopyMasked<CV_16S, nppiCopy_16s_C4MR>::copyMasked},
/* 32S */ {NppCopyMasked<CV_32S, nppiCopy_32s_C1MR>::copyMasked, cv::gpu::copyWithMask, NppCopyMasked<CV_32S, nppiCopy_32s_C3MR>::copyMasked, NppCopyMasked<CV_32S, nppiCopy_32s_C4MR>::copyMasked},
/* 32F */ {NppCopyMasked<CV_32F, nppiCopy_32f_C1MR>::copyMasked, cv::gpu::copyWithMask, NppCopyMasked<CV_32F, nppiCopy_32f_C3MR>::copyMasked, NppCopyMasked<CV_32F, nppiCopy_32f_C4MR>::copyMasked},
/* 64F */ {cv::gpu::copyWithMask, cv::gpu::copyWithMask, cv::gpu::copyWithMask, cv::gpu::copyWithMask}
};
caller_t func = mask.channels() == src.channels() ? callers[src.depth()][src.channels() - 1] : cv::gpu::copyWithMask;
CV_DbgAssert(func != 0);
func(src, dst, mask, 0);
}
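// Illustrative call path (hypothetical, not in the patch): for CV_16UC1 data with a
// CV_8UC1 mask, src.copyTo(dst, mask) lands here and the table above resolves it to
// NppCopyMasked<CV_16U, nppiCopy_16u_C1MR>::copyMasked; unsupported combinations
// fall back to the cv::gpu::copyWithMask CUDA kernel.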
void convert(const GpuMat& src, GpuMat& dst) const
@ -641,65 +712,65 @@ namespace
{
{
/* 8U -> 8U */ {0, 0, 0, 0},
/* 8U -> 8S */ {::cv::gpu::convertTo, ::cv::gpu::convertTo, ::cv::gpu::convertTo, ::cv::gpu::convertTo},
/* 8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::cvt},
/* 8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::cvt},
/* 8U -> 32S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 8U -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
/* 8U -> 8S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
/* 8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::cvt},
/* 8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::cvt},
/* 8U -> 32S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 8U -> 64F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}
},
{
/* 8S -> 8U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 8S -> 8U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 8S -> 8S */ {0,0,0,0},
/* 8S -> 16U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 8S -> 16S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 8S -> 32S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 8S -> 32F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 8S -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
/* 8S -> 16U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 8S -> 16S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 8S -> 32S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 8S -> 32F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 8S -> 64F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}
},
{
/* 16U -> 8U */ {NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::cvt},
/* 16U -> 8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 16U -> 8U */ {NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::cvt},
/* 16U -> 8S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 16U -> 16U */ {0,0,0,0},
/* 16U -> 16S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 16U -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
/* 16U -> 16S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 16U -> 64F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}
},
{
/* 16S -> 8U */ {NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::cvt},
/* 16S -> 8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 16S -> 16U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 16S -> 8U */ {NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::cvt},
/* 16S -> 8S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 16S -> 16U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 16S -> 16S */ {0,0,0,0},
/* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 16S -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
/* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 16S -> 64F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}
},
{
/* 32S -> 8U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 32S -> 8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 32S -> 16U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 32S -> 16S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 32S -> 8U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 32S -> 8S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 32S -> 16U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 32S -> 16S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 32S -> 32S */ {0,0,0,0},
/* 32S -> 32F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 32S -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
/* 32S -> 32F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 32S -> 64F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}
},
{
/* 32F -> 8U */ {NppCvt<CV_32F, CV_8U, nppiConvert_32f8u_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 32F -> 8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 32F -> 32S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 32F -> 8U */ {NppCvt<CV_32F, CV_8U, nppiConvert_32f8u_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 32F -> 8S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 32F -> 32S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 32F -> 32F */ {0,0,0,0},
/* 32F -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
/* 32F -> 64F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}
},
{
/* 64F -> 8U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 64F -> 8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 64F -> 16U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 64F -> 16S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 64F -> 32S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 64F -> 32F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 64F -> 8U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 64F -> 8S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 64F -> 16U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 64F -> 16S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 64F -> 32S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 64F -> 32F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
/* 64F -> 64F */ {0,0,0,0}
}
};
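// Illustrative dispatch (hypothetical, not in the patch): a single-channel
// src.convertTo(dst, CV_16U) on CV_8U data resolves to
// NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::cvt; every cell marked
// cv::gpu::convertTo falls back to the generic CUDA kernel.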
@ -712,7 +783,7 @@ namespace
void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const
{
::cv::gpu::convertTo(src, dst, alpha, beta);
cv::gpu::convertTo(src, dst, alpha, beta);
}
void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const
@ -744,13 +815,13 @@ namespace
typedef void (*caller_t)(GpuMat& src, Scalar s);
static const caller_t callers[7][4] =
{
{NppSet<CV_8U, 1, nppiSet_8u_C1R>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
{::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo},
{NppSet<CV_16U, 1, nppiSet_16u_C1R>::set, NppSet<CV_16U, 2, nppiSet_16u_C2R>::set, ::cv::gpu::setTo, NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
{NppSet<CV_16S, 1, nppiSet_16s_C1R>::set, NppSet<CV_16S, 2, nppiSet_16s_C2R>::set, ::cv::gpu::setTo, NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
{NppSet<CV_32S, 1, nppiSet_32s_C1R>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
{NppSet<CV_32F, 1, nppiSet_32f_C1R>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
{::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo}
{NppSet<CV_8U, 1, nppiSet_8u_C1R>::set, cv::gpu::setTo, cv::gpu::setTo, NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
{NppSet<CV_8S, 1, nppiSet_8s_C1R>::set, NppSet<CV_8S, 2, nppiSet_8s_C2R>::set, NppSet<CV_8S, 3, nppiSet_8s_C3R>::set, NppSet<CV_8S, 4, nppiSet_8s_C4R>::set},
{NppSet<CV_16U, 1, nppiSet_16u_C1R>::set, NppSet<CV_16U, 2, nppiSet_16u_C2R>::set, cv::gpu::setTo, NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
{NppSet<CV_16S, 1, nppiSet_16s_C1R>::set, NppSet<CV_16S, 2, nppiSet_16s_C2R>::set, cv::gpu::setTo, NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
{NppSet<CV_32S, 1, nppiSet_32s_C1R>::set, cv::gpu::setTo, cv::gpu::setTo, NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
{NppSet<CV_32F, 1, nppiSet_32f_C1R>::set, cv::gpu::setTo, cv::gpu::setTo, NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
{cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo}
};
callers[m.depth()][m.channels() - 1](m, s);
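// Illustrative dispatch (hypothetical, not in the patch): m.setTo(Scalar(-1, 1)) on a
// CV_8SC2 matrix now resolves to NppSet<CV_8S, 2, nppiSet_8s_C2R>::set via the new
// CV_8S row, instead of falling back to cv::gpu::setTo.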
@ -761,13 +832,13 @@ namespace
static const caller_t callers[7][4] =
{
{NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},
{::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo},
{NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},
{NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},
{NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},
{NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},
{::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo}
{NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set, cv::gpu::setTo, cv::gpu::setTo, NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},
{cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo},
{NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set, cv::gpu::setTo, cv::gpu::setTo, NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},
{NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set, cv::gpu::setTo, cv::gpu::setTo, NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},
{NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set, cv::gpu::setTo, cv::gpu::setTo, NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},
{NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set, cv::gpu::setTo, cv::gpu::setTo, NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},
{cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo}
};
callers[m.depth()][m.channels() - 1](m, s, mask);

View File

@ -69,18 +69,14 @@ Performs a mean-shift segmentation of the source image and eliminates small segm
gpu::integral
-----------------
Computes an integral image and a squared integral image.
Computes an integral image.
.. ocv:function:: void gpu::integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& stream = Stream::Null())
:param src: Source image. Only ``CV_8UC1`` images are supported for now.
:param sum: Integral image containing 32-bit unsigned integer values packed into ``CV_32SC1`` .
:param sqsum: Squared integral image of the ``CV_32FC1`` type.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`integral`
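A minimal usage sketch of the remaining overload (hypothetical sizes, not part of this commit)::

    cv::gpu::GpuMat img(480, 640, CV_8UC1), sum;
    cv::gpu::integral(img, sum);   // sum is packed as CV_32SC1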
@ -380,6 +376,22 @@ Converts an image from one color space to another.
gpu::swapChannels
-----------------
Exchanges the color channels of an image in-place.
.. ocv:function:: void gpu::swapChannels(GpuMat& image, const int dstOrder[4], Stream& stream = Stream::Null())
:param image: Source image. Supports only ``CV_8UC4`` type.
:param dstOrder: Integer array describing how channel values are permuted. The n-th entry of the array contains the number of the channel that is stored in the n-th channel of the output image. E.g., given an RGBA image, dstOrder = [3,2,1,0] converts it to ABGR channel order.
:param stream: Stream for the asynchronous version.
The methods support arbitrary permutations of the original channels, including replication.
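A minimal usage sketch (hypothetical sizes, not part of this commit)::

    // Reverse R and B in place: RGBA -> BGRA.
    cv::gpu::GpuMat img(480, 640, CV_8UC4);
    const int order[4] = {2, 1, 0, 3};
    cv::gpu::swapChannels(img, order);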
gpu::threshold
------------------
Applies a fixed-level threshold to each array element.
@ -489,7 +501,7 @@ Rotates an image around the origin (0,0) and then shifts it.
.. ocv:function:: void gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null())
:param src: Source image. ``CV_8UC1`` and ``CV_8UC4`` types are supported.
:param src: Source image. Supports 1-, 3- or 4-channel images with ``CV_8U`` , ``CV_16U`` or ``CV_32F`` depth.
:param dst: Destination image with the same type as ``src`` . The size is ``dsize`` .
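A minimal usage sketch (hypothetical sizes, not part of this commit)::

    // Rotate a 3-channel 16-bit image by 30 degrees around the origin.
    cv::gpu::GpuMat src(480, 640, CV_16UC3), dst;
    cv::gpu::rotate(src, dst, src.size(), 30.0);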
@ -751,6 +763,38 @@ Performs linear blending of two images.
gpu::alphaComp
-------------------
Composites two images using alpha opacity values contained in each image.
.. ocv:function:: void gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int alpha_op, Stream& stream = Stream::Null())
:param img1: First image. Supports ``CV_8UC4`` , ``CV_16UC4`` , ``CV_32SC4`` and ``CV_32FC4`` types.
:param img2: Second image. Must have the same size and the same type as ``img1`` .
:param dst: Destination image.
:param alpha_op: Flag specifying the alpha-blending operation:
* **ALPHA_OVER**
* **ALPHA_IN**
* **ALPHA_OUT**
* **ALPHA_ATOP**
* **ALPHA_XOR**
* **ALPHA_PLUS**
* **ALPHA_OVER_PREMUL**
* **ALPHA_IN_PREMUL**
* **ALPHA_OUT_PREMUL**
* **ALPHA_ATOP_PREMUL**
* **ALPHA_XOR_PREMUL**
* **ALPHA_PLUS_PREMUL**
* **ALPHA_PREMUL**
:param stream: Stream for the asynchronous version.
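A minimal usage sketch (hypothetical sizes, not part of this commit)::

    // Standard "over" compositing of two RGBA images.
    cv::gpu::GpuMat img1(480, 640, CV_8UC4), img2(480, 640, CV_8UC4), dst;
    cv::gpu::alphaComp(img1, img2, dst, cv::gpu::ALPHA_OVER);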
gpu::Canny
-------------------
Finds edges in an image using the [Canny86]_ algorithm.

View File

@ -10,6 +10,7 @@ gpu::meanStdDev
Computes a mean value and a standard deviation of matrix elements.
.. ocv:function:: void gpu::meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev)
.. ocv:function:: void gpu::meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev, GpuMat& buf);
:param mtx: Source matrix. ``CV_8UC1`` matrices are supported for now.
@ -17,6 +18,8 @@ Computes a mean value and a standard deviation of matrix elements.
:param stddev: Standard deviation value.
:param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
.. seealso:: :ocv:func:`meanStdDev`
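A minimal usage sketch of the buffered overload (hypothetical sizes, not part of this commit)::

    cv::gpu::GpuMat src(480, 640, CV_8UC1), buf;
    cv::Scalar mean, stddev;
    cv::gpu::meanStdDev(src, mean, stddev, buf);   // buf is reused across calls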

View File

@ -63,7 +63,7 @@ Flips a 2D matrix around vertical, horizontal, or both axes.
.. ocv:function:: void gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream = Stream::Null())
:param src: Source matrix. Only ``CV_8UC1`` and ``CV_8UC4`` matrices are supported for now.
:param src: Source matrix. Supports 1-, 3- and 4-channel images with ``CV_8U``, ``CV_16U``, ``CV_32S`` or ``CV_32F`` depth.
:param dst: Destination matrix.

View File

@ -139,6 +139,50 @@ where ``I`` is a multi-dimensional index of array elements. In case of multi-cha
gpu::abs
------------
Computes the absolute value of each matrix element.
.. ocv:function:: void gpu::abs(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
:param src: Source matrix. Supports ``CV_16S`` and ``CV_32F`` depth.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`abs`
gpu::sqr
------------
Computes the square of each matrix element.
.. ocv:function:: void gpu::sqr(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
:param src: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
gpu::sqrt
------------
Computes the square root of each matrix element.
.. ocv:function:: void gpu::sqrt(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
:param src: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`sqrt`
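A minimal sketch covering the three new functions above (hypothetical sizes, not part of this commit)::

    cv::gpu::GpuMat src(480, 640, CV_32FC1), a, s, r;
    cv::gpu::abs(src, a);
    cv::gpu::sqr(src, s);
    cv::gpu::sqrt(src, r);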
gpu::exp
@ -147,7 +191,7 @@ Computes an exponent of each matrix element.
.. ocv:function:: void gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
:param src: Source matrix. ``CV_32FC1`` matrixes are supported for now.
:param src: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
:param dst: Destination matrix with the same size and type as ``src`` .
@ -157,6 +201,22 @@ Computes an exponent of each matrix element.
gpu::log
------------
Computes the natural logarithm of the absolute value of each matrix element.
.. ocv:function:: void gpu::log(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
:param src: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`log`
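A minimal sketch (hypothetical sizes, not part of this commit)::

    cv::gpu::GpuMat src(480, 640, CV_32FC1), e, l;
    cv::gpu::exp(src, e);
    cv::gpu::log(src, l);   // natural log of the absolute value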
gpu::pow
------------
Raises every matrix element to a power.
@ -181,22 +241,6 @@ The function ``pow`` raises every element of the input matrix to ``p`` :
gpu::log
------------
Computes a natural logarithm of absolute value of each matrix element.
.. ocv:function:: void gpu::log(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
:param src: Source matrix. ``CV_32FC1`` matrixes are supported for now.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`log`
gpu::absdiff
----------------
Computes per-element absolute difference of two matrices (or of a matrix and scalar).
@ -262,9 +306,10 @@ Performs a per-element bitwise inversion.
gpu::bitwise_or
-------------------
Performs a per-element bitwise disjunction of two matrices.
Performs a per-element bitwise disjunction of two matrices or of a matrix and a scalar.
.. ocv:function:: void gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
.. ocv:function:: void gpu::bitwise_or(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
:param src1: First source matrix.
@ -280,9 +325,10 @@ Performs a per-element bitwise disjunction of two matrices.
gpu::bitwise_and
--------------------
Performs a per-element bitwise conjunction of two matrices.
Performs a per-element bitwise conjunction of two matrices or of a matrix and a scalar.
.. ocv:function:: void gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
.. ocv:function:: void gpu::bitwise_and(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
:param src1: First source matrix.
@ -298,9 +344,10 @@ Performs a per-element bitwise conjunction of two matrices.
gpu::bitwise_xor
--------------------
Performs a per-element bitwise ``exclusive or`` operation of two matrices.
Performs a per-element bitwise ``exclusive or`` operation of two matrices or of a matrix and a scalar.
.. ocv:function:: void gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
.. ocv:function:: void gpu::bitwise_xor(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
:param src1: First source matrix.
@ -314,6 +361,38 @@ Performs a per-element bitwise ``exclusive or`` operation of two matrices.
gpu::rshift
--------------------
Performs a pixel-by-pixel right shift of an image by a constant value.
.. ocv:function:: void gpu::rshift(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
:param src: Source matrix. Supports 1-, 3- and 4-channel images with integer elements.
:param sc: Constant values, one per channel.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
gpu::lshift
--------------------
Performs a pixel-by-pixel left shift of an image by a constant value.
.. ocv:function:: void gpu::lshift(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
:param src: Source matrix. Supports 1-, 3- and 4-channel images with ``CV_8U`` , ``CV_16U`` or ``CV_32S`` depth.
:param sc: Constant values, one per channel.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
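A minimal sketch covering both shifts (hypothetical sizes, not part of this commit)::

    cv::gpu::GpuMat src(480, 640, CV_32SC1), r, l;
    cv::gpu::rshift(src, cv::Scalar::all(2), r);   // per-pixel src >> 2
    cv::gpu::lshift(src, cv::Scalar::all(2), l);   // per-pixel src << 2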
gpu::min
------------
Computes the per-element minimum of two matrices (or a matrix and a scalar).

View File

@ -498,7 +498,7 @@ CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha,
CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst, Stream& stream = Stream::Null());
//! reverses the order of the rows, columns or both in a matrix
//! supports CV_8UC1, CV_8UC4 types
//! supports 1-, 3- and 4-channel images with CV_8U, CV_16U, CV_32S or CV_32F depth
CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode, Stream& stream = Stream::Null());
//! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
@ -586,20 +586,32 @@ CV_EXPORTS void absdiff(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& str
//! computes element-wise absolute difference of array and scalar (c = abs(a - s))
CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream = Stream::Null());
//! computes absolute value of each matrix element
//! supports CV_16S and CV_32F depth
CV_EXPORTS void abs(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
//! computes square of each pixel in an image
//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
CV_EXPORTS void sqr(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
//! computes square root of each pixel in an image
//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
CV_EXPORTS void sqrt(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
//! computes exponent of each matrix element (b = e**a)
//! supports only CV_32FC1 type
//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
CV_EXPORTS void exp(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());
//! computes natural logarithm of absolute value of each matrix element: b = log(abs(a))
//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
CV_EXPORTS void log(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());
//! computes power of each matrix element:
// (dst(i,j) = pow( src(i,j) , power), if src.type() is integer
// (dst(i,j) = pow(fabs(src(i,j)), power), otherwise
//! supports all, except depth == CV_64F
CV_EXPORTS void pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream = Stream::Null());
//! computes natural logarithm of absolute value of each matrix element: b = log(abs(a))
//! supports only CV_32FC1 type
CV_EXPORTS void log(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());
//! compares elements of two arrays (c = a <cmpop> b)
CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
@ -608,12 +620,29 @@ CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=G
//! calculates per-element bit-wise disjunction of two arrays
CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
//! calculates per-element bit-wise disjunction of array and scalar
//! supports 1-, 3- and 4-channel images with CV_8U, CV_16U or CV_32S depth
CV_EXPORTS void bitwise_or(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
//! calculates per-element bit-wise conjunction of two arrays
CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
//! calculates per-element bit-wise conjunction of array and scalar
//! supports 1-, 3- and 4-channel images with CV_8U, CV_16U or CV_32S depth
CV_EXPORTS void bitwise_and(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
//! calculates per-element bit-wise "exclusive or" operation
CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
//! calculates per-element bit-wise "exclusive or" of array and scalar
//! supports 1-, 3- and 4-channel images with CV_8U, CV_16U or CV_32S depth
CV_EXPORTS void bitwise_xor(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
//! pixel by pixel right shift of an image by a constant value
//! supports 1-, 3- and 4-channel images with integer elements
CV_EXPORTS void rshift(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
//! pixel by pixel left shift of an image by a constant value
//! supports 1-, 3- and 4-channel images with CV_8U, CV_16U or CV_32S depth
CV_EXPORTS void lshift(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
//! computes per-element minimum of two arrays (dst = min(src1, src2))
CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());
@ -627,6 +656,13 @@ CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream&
//! computes per-element maximum of array and scalar (dst = max(src1, src2))
CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());
enum { ALPHA_OVER, ALPHA_IN, ALPHA_OUT, ALPHA_ATOP, ALPHA_XOR, ALPHA_PLUS, ALPHA_OVER_PREMUL, ALPHA_IN_PREMUL, ALPHA_OUT_PREMUL,
ALPHA_ATOP_PREMUL, ALPHA_XOR_PREMUL, ALPHA_PLUS_PREMUL, ALPHA_PREMUL};
//! Composite two images using alpha opacity values contained in each image
//! Supports CV_8UC4, CV_16UC4, CV_32SC4 and CV_32FC4 types
CV_EXPORTS void alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int alpha_op, Stream& stream = Stream::Null());
////////////////////////////// Image processing //////////////////////////////
@ -665,6 +701,13 @@ CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat&
//! converts image from one color space to another
CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0, Stream& stream = Stream::Null());
//! swap channels
//! dstOrder - Integer array describing how channel values are permuted. The n-th entry
//! of the array contains the number of the channel that is stored in the n-th channel of
//! the output image. E.g., given an RGBA image, dstOrder = [3,2,1,0] converts it to ABGR
//! channel order.
CV_EXPORTS void swapChannels(GpuMat& image, const int dstOrder[4], Stream& stream = Stream::Null());
//! applies fixed threshold to the image
CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null());
@ -692,9 +735,9 @@ CV_EXPORTS void buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat
CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,
GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null());
//! rotate 8bit single or four channel image
//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
//! supports CV_8UC1, CV_8UC4 types
//! rotates an image around the origin (0,0) and then shifts it
//! supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
//! supports 1-, 3- or 4-channel images with CV_8U, CV_16U or CV_32F depth
CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0,
int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());
@ -706,15 +749,9 @@ CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bott
//! sum will have CV_32S type, but will contain unsigned int values
//! supports only CV_8UC1 source type
CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null());
//! buffered version
CV_EXPORTS void integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& stream = Stream::Null());
//! computes the integral image and integral for the squared image
//! sum will have CV_32S type, sqsum - CV32F type
//! supports only CV_8UC1 source type
CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& stream = Stream::Null());
//! computes squared integral image
//! result matrix will have 64F type, but will contain 64U values
//! supports source images of 8UC1 type only
@ -859,6 +896,8 @@ private:
//! computes mean value and standard deviation of all or selected array elements
//! supports only CV_8UC1 type
CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev);
//! buffered version
CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev, GpuMat& buf);
//! computes norm of array
//! supports NORM_INF, NORM_L1, NORM_L2
@ -939,10 +978,16 @@ CV_EXPORTS void solvePnPRansac(const Mat& object, const Mat& image, const Mat& c
//////////////////////////////// Image Labeling ////////////////////////////////
//!performs labeling via graph cuts
//!performs labeling via graph cuts of a 2D regular 4-connected graph.
CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels,
GpuMat& buf, Stream& stream = Stream::Null());
//!performs labeling via graph cuts of a 2D regular 8-connected graph.
CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight,
GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight,
GpuMat& labels,
GpuMat& buf, Stream& stream = Stream::Null());
////////////////////////////////// Histograms //////////////////////////////////
//! Compute levels with even distribution. levels will have 1 row and nLevels cols and CV_32SC1 type.

View File

@ -59,7 +59,7 @@ GPU_PERF_TEST(Flip, cv::gpu::DeviceInfo, cv::Size, perf::MatType, FlipCode)
INSTANTIATE_TEST_CASE_P(Arithm, Flip, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(CV_8UC1, CV_8UC4),
testing::Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
testing::Values((int) HORIZONTAL_AXIS, (int) VERTICAL_AXIS, (int) BOTH_AXIS)));
//////////////////////////////////////////////////////////////////////
@ -363,6 +363,33 @@ INSTANTIATE_TEST_CASE_P(Arithm, BitwiseAnd, testing::Combine(
GPU_TYPICAL_MAT_SIZES,
testing::Values(CV_8UC1, CV_16UC1, CV_32SC1)));
GPU_PERF_TEST(BitwiseScalarAnd, cv::gpu::DeviceInfo, cv::Size, perf::MatType)
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
cv::gpu::setDevice(devInfo.deviceID());
cv::Mat src_host(size, type);
declare.in(src_host, WARMUP_RNG);
cv::gpu::GpuMat src(src_host);
cv::gpu::GpuMat dst;
cv::Scalar sc = cv::Scalar(123, 123, 123, 123);
TEST_CYCLE()
{
cv::gpu::bitwise_and(src, sc, dst);
}
}
INSTANTIATE_TEST_CASE_P(Arithm, BitwiseScalarAnd, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32SC1, CV_32SC3, CV_32SC4)));
//////////////////////////////////////////////////////////////////////
// Min
@ -411,10 +438,11 @@ GPU_PERF_TEST(MeanStdDev, cv::gpu::DeviceInfo, cv::Size)
cv::gpu::GpuMat src(src_host);
cv::Scalar mean;
cv::Scalar stddev;
cv::gpu::GpuMat buf;
TEST_CYCLE()
{
cv::gpu::meanStdDev(src, mean, stddev);
cv::gpu::meanStdDev(src, mean, stddev, buf);
}
}

View File

@ -201,7 +201,7 @@ GPU_PERF_TEST(CvtColor, cv::gpu::DeviceInfo, cv::Size, perf::MatType, CvtColorIn
declare.in(src_host, WARMUP_RNG);
cv::gpu::GpuMat src(src_host);
cv::gpu::GpuMat dst(size, CV_MAKETYPE(type, info.dcn));
cv::gpu::GpuMat dst;
TEST_CYCLE()
{
@ -218,6 +218,32 @@ INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor, testing::Combine(
CvtColorInfo(4, 4, cv::COLOR_BGR2XYZ), CvtColorInfo(4, 4, cv::COLOR_BGR2YCrCb), CvtColorInfo(4, 4, cv::COLOR_YCrCb2BGR),
CvtColorInfo(4, 4, cv::COLOR_BGR2HSV), CvtColorInfo(4, 4, cv::COLOR_HSV2BGR))));
//////////////////////////////////////////////////////////////////////
// SwapChannels
GPU_PERF_TEST(SwapChannels, cv::gpu::DeviceInfo, cv::Size)
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::Size size = GET_PARAM(1);
cv::gpu::setDevice(devInfo.deviceID());
cv::Mat src_host(size, CV_8UC4);
declare.in(src_host, WARMUP_RNG);
cv::gpu::GpuMat src(src_host);
const int dstOrder[] = {2, 1, 0, 3};
TEST_CYCLE()
{
cv::gpu::swapChannels(src, dstOrder);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, SwapChannels, testing::Combine(ALL_DEVICES, GPU_TYPICAL_MAT_SIZES));
//////////////////////////////////////////////////////////////////////
// Threshold
@ -457,7 +483,7 @@ GPU_PERF_TEST(Rotate, cv::gpu::DeviceInfo, cv::Size, perf::MatType, Interpolatio
INSTANTIATE_TEST_CASE_P(ImgProc, Rotate, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(CV_8UC1, CV_8UC4),
testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4),
testing::Values((int) cv::INTER_NEAREST, (int) cv::INTER_LINEAR, (int) cv::INTER_CUBIC)));
//////////////////////////////////////////////////////////////////////
@ -519,33 +545,6 @@ INSTANTIATE_TEST_CASE_P(ImgProc, Integral, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES));
//////////////////////////////////////////////////////////////////////
// IntegralBoth
GPU_PERF_TEST(IntegralBoth, cv::gpu::DeviceInfo, cv::Size)
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::Size size = GET_PARAM(1);
cv::gpu::setDevice(devInfo.deviceID());
cv::Mat src_host(size, CV_8UC1);
declare.in(src_host, WARMUP_RNG);
cv::gpu::GpuMat src(src_host);
cv::gpu::GpuMat sum, sqsum;
TEST_CYCLE()
{
cv::gpu::integral(src, sum, sqsum);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, IntegralBoth, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES));
//////////////////////////////////////////////////////////////////////
// IntegralSqr
@ -849,6 +848,39 @@ INSTANTIATE_TEST_CASE_P(ImgProc, BlendLinear, testing::Combine(
GPU_TYPICAL_MAT_SIZES,
testing::Values(CV_8UC1, CV_32FC1)));
//////////////////////////////////////////////////////////////////////
// AlphaComp
GPU_PERF_TEST(AlphaComp, cv::gpu::DeviceInfo, cv::Size, perf::MatType, AlphaOp)
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int alpha_op = GET_PARAM(3);
cv::gpu::setDevice(devInfo.deviceID());
cv::Mat img1_host(size, type);
cv::Mat img2_host(size, type);
declare.in(img1_host, img2_host, WARMUP_RNG);
cv::gpu::GpuMat img1(img1_host);
cv::gpu::GpuMat img2(img2_host);
cv::gpu::GpuMat dst;
TEST_CYCLE()
{
cv::gpu::alphaComp(img1, img2, dst, alpha_op);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, AlphaComp, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(CV_8UC4, CV_16UC4, CV_32SC4, CV_32FC4),
testing::Values((int)cv::gpu::ALPHA_OVER, (int)cv::gpu::ALPHA_IN, (int)cv::gpu::ALPHA_OUT, (int)cv::gpu::ALPHA_ATOP, (int)cv::gpu::ALPHA_XOR, (int)cv::gpu::ALPHA_PLUS, (int)cv::gpu::ALPHA_OVER_PREMUL, (int)cv::gpu::ALPHA_IN_PREMUL, (int)cv::gpu::ALPHA_OUT_PREMUL, (int)cv::gpu::ALPHA_ATOP_PREMUL, (int)cv::gpu::ALPHA_XOR_PREMUL, (int)cv::gpu::ALPHA_PLUS_PREMUL, (int)cv::gpu::ALPHA_PREMUL)));
//////////////////////////////////////////////////////////////////////
// Canny

View File

@ -11,7 +11,7 @@ int main(int argc, char **argv)
#else
int main(int argc, char** argv)
int main()
{
printf("OpenCV was built without CUDA support\n");
return 0;

View File

@ -11,6 +11,7 @@ CV_ENUM(FlipCode, HORIZONTAL_AXIS, VERTICAL_AXIS, BOTH_AXIS)
CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC)
CV_ENUM(MatchMethod, cv::TM_SQDIFF, cv::TM_SQDIFF_NORMED, cv::TM_CCORR, cv::TM_CCORR_NORMED, cv::TM_CCOEFF, cv::TM_CCOEFF_NORMED)
CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2)
CV_ENUM(AlphaOp, cv::gpu::ALPHA_OVER, cv::gpu::ALPHA_IN, cv::gpu::ALPHA_OUT, cv::gpu::ALPHA_ATOP, cv::gpu::ALPHA_XOR, cv::gpu::ALPHA_PLUS, cv::gpu::ALPHA_OVER_PREMUL, cv::gpu::ALPHA_IN_PREMUL, cv::gpu::ALPHA_OUT_PREMUL, cv::gpu::ALPHA_ATOP_PREMUL, cv::gpu::ALPHA_XOR_PREMUL, cv::gpu::ALPHA_PLUS_PREMUL, cv::gpu::ALPHA_PREMUL)
struct CvtColorInfo
{

View File

@ -52,8 +52,6 @@ void cv::gpu::gemm(const GpuMat&, const GpuMat&, double, const GpuMat&, double,
void cv::gpu::transpose(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::flip(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::exp(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::log(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::magnitude(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::magnitudeSqr(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::magnitude(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
@ -89,9 +87,9 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
CV_Assert(src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()));
bool tr1 = flags & GEMM_1_T;
bool tr2 = flags & GEMM_2_T;
bool tr3 = flags & GEMM_3_T;
bool tr1 = (flags & GEMM_1_T) != 0;
bool tr2 = (flags & GEMM_2_T) != 0;
bool tr3 = (flags & GEMM_3_T) != 0;
Size src1Size = tr1 ? Size(src1.rows, src1.cols) : src1.size();
Size src2Size = tr2 ? Size(src2.rows, src2.cols) : src2.size();
@ -243,35 +241,66 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
////////////////////////////////////////////////////////////////////////
// flip
void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& s)
namespace
{
CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4);
template<int DEPTH> struct NppTypeTraits;
template<> struct NppTypeTraits<CV_8U> { typedef Npp8u npp_t; };
template<> struct NppTypeTraits<CV_8S> { typedef Npp8s npp_t; };
template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; };
template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; };
template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; };
template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; };
dst.create( src.size(), src.type() );
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
if (src.type() == CV_8UC1)
template <int DEPTH> struct NppMirrorFunc
{
nppSafeCall( nppiMirror_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
}
else
{
nppSafeCall( nppiMirror_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
}
typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip);
};
template <int DEPTH, typename NppMirrorFunc<DEPTH>::func_t func> struct NppMirror
{
typedef typename NppMirrorFunc<DEPTH>::npp_t npp_t;
static void call(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream)
{
NppStreamHandler h(stream);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step),
dst.ptr<npp_t>(), static_cast<int>(dst.step), sz,
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
}
void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)
{
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream);
static const func_t funcs[6][4] =
{
{NppMirror<CV_8U, nppiMirror_8u_C1R>::call, 0, NppMirror<CV_8U, nppiMirror_8u_C3R>::call, NppMirror<CV_8U, nppiMirror_8u_C4R>::call},
{0,0,0,0},
{NppMirror<CV_16U, nppiMirror_16u_C1R>::call, 0, NppMirror<CV_16U, nppiMirror_16u_C3R>::call, NppMirror<CV_16U, nppiMirror_16u_C4R>::call},
{0,0,0,0},
{NppMirror<CV_32S, nppiMirror_32s_C1R>::call, 0, NppMirror<CV_32S, nppiMirror_32s_C3R>::call, NppMirror<CV_32S, nppiMirror_32s_C4R>::call},
{NppMirror<CV_32F, nppiMirror_32f_C1R>::call, 0, NppMirror<CV_32F, nppiMirror_32f_C3R>::call, NppMirror<CV_32F, nppiMirror_32f_C4R>::call}
};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F);
CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
dst.create(src.size(), src.type());
funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream));
}
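// Illustrative dispatch (hypothetical, not in the patch): flipping a CV_32FC3 image
// with flip(src, dst, -1) resolves to NppMirror<CV_32F, nppiMirror_32f_C3R>::call
// with NPP_BOTH_AXIS; the zero cells (CV_8S, CV_16S, 2-channel) stay unsupported.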
////////////////////////////////////////////////////////////////////////
@ -340,52 +369,6 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// exp
void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& s)
{
CV_Assert(src.type() == CV_32FC1);
dst.create(src.size(), src.type());
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
nppSafeCall( nppiExp_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// log
void cv::gpu::log(const GpuMat& src, GpuMat& dst, Stream& s)
{
CV_Assert(src.type() == CV_32FC1);
dst.create(src.size(), src.type());
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
nppSafeCall( nppiLn_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// NPP magnitude

View File

@ -48,6 +48,7 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA)
void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::swapChannels(GpuMat&, const int[], Stream&) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
@ -1423,4 +1424,19 @@ void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream
func(src, dst, dcn, stream);
}
void cv::gpu::swapChannels(GpuMat& image, const int dstOrder[4], Stream& s)
{
CV_Assert(image.type() == CV_8UC4);
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
NppiSize sz;
sz.width = image.cols;
sz.height = image.rows;
nppSafeCall( nppiSwapChannels_8u_C4IR(image.ptr<Npp8u>(), static_cast<int>(image.step), sz, dstOrder) );
}
#endif /* !defined (HAVE_CUDA) */

View File

@ -209,7 +209,7 @@ namespace cv { namespace gpu { namespace device
cv::gpu::error("Unsupported channels count", __FILE__, __LINE__, "bilateral_filter_caller");
}
if (stream != 0)
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

View File

@ -1104,9 +1104,9 @@ namespace cv { namespace gpu { namespace device
cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<T>)dst, op, WithOutMask(), stream);
}
template void absdiff_gpu<uchar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
//template void absdiff_gpu<uchar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
template void absdiff_gpu<schar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
template void absdiff_gpu<ushort>(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
//template void absdiff_gpu<ushort>(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
template void absdiff_gpu<short >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
template void absdiff_gpu<int >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
//template void absdiff_gpu<float >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);

File diff suppressed because it is too large

View File

@ -50,7 +50,7 @@ using namespace cv::gpu;
Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>&, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int, GpuMat& buf) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int, GpuMat&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); }
Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); }
Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }

View File

@ -45,12 +45,41 @@
#if !defined (HAVE_CUDA)
void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
namespace
{
typedef NppStatus (*init_func_t)(NppiSize oSize, NppiGraphcutState** ppState, Npp8u* pDeviceMem);
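// RAII wrapper around the NPP graphcut state: the constructor runs the supplied
// init function, and the destructor guarantees nppiGraphcutFree() is called even
// if a later NPP call throws via nppSafeCall.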
class NppiGraphcutStateHandler
{
public:
NppiGraphcutStateHandler(NppiSize sznpp, Npp8u* pDeviceMem, const init_func_t func)
{
nppSafeCall( func(sznpp, &pState, pDeviceMem) );
}
~NppiGraphcutStateHandler()
{
nppSafeCall( nppiGraphcutFree(pState) );
}
operator NppiGraphcutState*()
{
return pState;
}
private:
NppiGraphcutState* pState;
};
}
void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& s)
{
Size src_size = terminals.size();
CV_Assert(terminals.type() == CV_32S);
CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width));
CV_Assert(leftTransp.type() == CV_32S);
@ -70,30 +99,76 @@ void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTrans
int bufsz;
nppSafeCall( nppiGraphcutGetSize(sznpp, &bufsz) );
if ((size_t)bufsz > buf.cols * buf.rows * buf.elemSize())
buf.create(1, bufsz, CV_8U);
ensureSizeIsEnough(1, bufsz, CV_8U, buf);
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
#if CUDART_VERSION > 4000
NppiGraphcutState* pState;
nppSafeCall( nppiGraphcutInitAlloc(sznpp, &pState, buf.ptr<Npp8u>()) );
NppiGraphcutStateHandler state(sznpp, buf.ptr<Npp8u>(), nppiGraphcutInitAlloc);
nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), pState) );
nppSafeCall( nppiGraphcutFree(pState) );
#else
nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), buf.ptr<Npp8u>()) );
#endif
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight,
GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight, GpuMat& labels, GpuMat& buf, Stream& s)
{
Size src_size = terminals.size();
CV_Assert(terminals.type() == CV_32S);
CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width));
CV_Assert(leftTransp.type() == CV_32S);
CV_Assert(rightTransp.size() == Size(src_size.height, src_size.width));
CV_Assert(rightTransp.type() == CV_32S);
CV_Assert(top.size() == src_size);
CV_Assert(top.type() == CV_32S);
CV_Assert(topLeft.size() == src_size);
CV_Assert(topLeft.type() == CV_32S);
CV_Assert(topRight.size() == src_size);
CV_Assert(topRight.type() == CV_32S);
CV_Assert(bottom.size() == src_size);
CV_Assert(bottom.type() == CV_32S);
CV_Assert(bottomLeft.size() == src_size);
CV_Assert(bottomLeft.type() == CV_32S);
CV_Assert(bottomRight.size() == src_size);
CV_Assert(bottomRight.type() == CV_32S);
labels.create(src_size, CV_8U);
NppiSize sznpp;
sznpp.width = src_size.width;
sznpp.height = src_size.height;
int bufsz;
nppSafeCall( nppiGraphcut8GetSize(sznpp, &bufsz) );
ensureSizeIsEnough(1, bufsz, CV_8U, buf);
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
NppiGraphcutStateHandler state(sznpp, buf.ptr<Npp8u>(), nppiGraphcut8InitAlloc);
nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(),
top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(),
bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
#endif /* !defined (HAVE_CUDA) */
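For illustration, a hedged usage sketch of the 8-neighborhood overload. The sizes and edge-weight contents are placeholders, the shapes follow the CV_Assert checks above, and the call assumes the default null stream:
int rows = 480, cols = 640;  // illustrative
cv::gpu::GpuMat terminals(rows, cols, CV_32S);
cv::gpu::GpuMat top(rows, cols, CV_32S), topLeft(rows, cols, CV_32S), topRight(rows, cols, CV_32S);
cv::gpu::GpuMat bottom(rows, cols, CV_32S), bottomLeft(rows, cols, CV_32S), bottomRight(rows, cols, CV_32S);
cv::gpu::GpuMat leftTransp(cols, rows, CV_32S), rightTransp(cols, rows, CV_32S);  // transposed shapes, per the asserts
cv::gpu::GpuMat labels, buf;  // buf is grown once via ensureSizeIsEnough and can be reused across calls
cv::gpu::graphcut(terminals, leftTransp, rightTransp, top, topLeft, topRight,
                  bottom, bottomLeft, bottomRight, labels, buf);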

View File

@ -62,7 +62,6 @@ void cv::gpu::buildWarpSphericalMaps(Size, Rect, const Mat&, const Mat&, float,
void cv::gpu::rotate(const GpuMat&, GpuMat&, Size, double, double, double, int, Stream&) { throw_nogpu(); }
void cv::gpu::integral(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::integralBuffered(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::integral(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::sqrIntegral(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::columnSum(const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::rectStdDev(const GpuMat&, const GpuMat&, GpuMat&, const Rect&, Stream&) { throw_nogpu(); }
@ -91,7 +90,7 @@ void cv::gpu::mulAndScaleSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, f
void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int, Stream&) { throw_nogpu(); }
void cv::gpu::ConvolveBuf::create(Size, Size) { throw_nogpu(); }
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream& stream) { throw_nogpu(); }
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream&) { throw_nogpu(); }
void cv::gpu::pyrDown(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
void cv::gpu::pyrUp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
@ -780,44 +779,78 @@ void cv::gpu::buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K,
////////////////////////////////////////////////////////////////////////
// rotate
void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, Stream& s)
{
static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};
namespace
{
template<int DEPTH> struct NppTypeTraits;
template<> struct NppTypeTraits<CV_8U> { typedef Npp8u npp_t; };
template<> struct NppTypeTraits<CV_8S> { typedef Npp8s npp_t; };
template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; };
template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; };
template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; };
template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; };
CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4);
template <int DEPTH> struct NppRotateFunc
{
typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
typedef NppStatus (*func_t)(const npp_t* pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI,
npp_t* pDst, int nDstStep, NppiRect oDstROI,
double nAngle, double nShiftX, double nShiftY, int eInterpolation);
};
template <int DEPTH, typename NppRotateFunc<DEPTH>::func_t func> struct NppRotate
{
typedef typename NppRotateFunc<DEPTH>::npp_t npp_t;
static void call(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, cudaStream_t stream)
{
static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};
NppStreamHandler h(stream);
NppiSize srcsz;
srcsz.height = src.rows;
srcsz.width = src.cols;
NppiRect srcroi;
srcroi.x = srcroi.y = 0;
srcroi.height = src.rows;
srcroi.width = src.cols;
NppiRect dstroi;
dstroi.x = dstroi.y = 0;
dstroi.height = dst.rows;
dstroi.width = dst.cols;
nppSafeCall( func(src.ptr<npp_t>(), srcsz, static_cast<int>(src.step), srcroi,
dst.ptr<npp_t>(), static_cast<int>(dst.step), dstroi, angle, xShift, yShift, npp_inter[interpolation]) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
}
void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, Stream& stream)
{
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, cudaStream_t stream);
static const func_t funcs[6][4] =
{
{NppRotate<CV_8U, nppiRotate_8u_C1R>::call, 0, NppRotate<CV_8U, nppiRotate_8u_C3R>::call, NppRotate<CV_8U, nppiRotate_8u_C4R>::call},
{0,0,0,0},
{NppRotate<CV_16U, nppiRotate_16u_C1R>::call, 0, NppRotate<CV_16U, nppiRotate_16u_C3R>::call, NppRotate<CV_16U, nppiRotate_16u_C4R>::call},
{0,0,0,0},
{0,0,0,0},
{NppRotate<CV_32F, nppiRotate_32f_C1R>::call, 0, NppRotate<CV_32F, nppiRotate_32f_C3R>::call, NppRotate<CV_32F, nppiRotate_32f_C4R>::call}
};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
dst.create(dsize, src.type());
NppiSize srcsz;
srcsz.height = src.rows;
srcsz.width = src.cols;
NppiRect srcroi;
srcroi.x = srcroi.y = 0;
srcroi.height = src.rows;
srcroi.width = src.cols;
NppiRect dstroi;
dstroi.x = dstroi.y = 0;
dstroi.height = dst.rows;
dstroi.width = dst.cols;
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
if (src.type() == CV_8UC1)
{
nppSafeCall( nppiRotate_8u_C1R(src.ptr<Npp8u>(), srcsz, static_cast<int>(src.step), srcroi,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstroi, angle, xShift, yShift, npp_inter[interpolation]) );
}
else
{
nppSafeCall( nppiRotate_8u_C4R(src.ptr<Npp8u>(), srcsz, static_cast<int>(src.step), srcroi,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstroi, angle, xShift, yShift, npp_inter[interpolation]) );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
funcs[src.depth()][src.channels() - 1](src, dst, dsize, angle, xShift, yShift, interpolation, StreamAccessor::getStream(stream));
}
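A brief usage sketch of the widened dispatch: any depth/channel pair with a non-null entry in the funcs table is now accepted, not just CV_8UC1/CV_8UC4. The values here are illustrative:
cv::gpu::GpuMat src(480, 640, CV_32FC3);  // e.g. 3-channel float, rejected by the old asserts
cv::gpu::GpuMat dst;
cv::gpu::rotate(src, dst, src.size(), 30.0, 0.0, 0.0, cv::INTER_CUBIC);  // default stream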
////////////////////////////////////////////////////////////////////////
@ -857,30 +890,6 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S
cudaSafeCall( cudaDeviceSynchronize() );
}
void cv::gpu::integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& s)
{
CV_Assert(src.type() == CV_8UC1);
int width = src.cols + 1, height = src.rows + 1;
sum.create(height, width, CV_32S);
sqsum.create(height, width, CV_32F);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
nppSafeCall( nppiSqrIntegral_8u32s32f_C1R(const_cast<Npp8u*>(src.ptr<Npp8u>()), static_cast<int>(src.step),
sum.ptr<Npp32s>(), static_cast<int>(sum.step), sqsum.ptr<Npp32f>(), static_cast<int>(sqsum.step), sz, 0, 0.0f, height) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////////
// sqrIntegral
@ -935,7 +944,6 @@ void cv::gpu::columnSum(const GpuMat& src, GpuMat& dst)
void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& s)
{
#if CUDART_VERSION > 4000
CV_Assert(src.type() == CV_32SC1 && sqr.type() == CV_64FC1);
dst.create(src.size(), CV_32FC1);
@ -959,31 +967,6 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
#else
CV_Assert(src.type() == CV_32SC1 && sqr.type() == CV_32FC1);
dst.create(src.size(), CV_32FC1);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
NppiRect nppRect;
nppRect.height = rect.height;
nppRect.width = rect.width;
nppRect.x = rect.x;
nppRect.y = rect.y;
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
nppSafeCall( nppiRectStdDev_32s32f_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), sqr.ptr<Npp32f>(), static_cast<int>(sqr.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, nppRect) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
#endif
}
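Since the sum+sqsum overload of integral is gone (it failed with NPP_NOT_IMPLEMENTED, per the commit message), the two inputs rectStdDev expects on CUDA >= 4.1 are produced separately. A hedged sketch with placeholder sizes:
cv::gpu::GpuMat src(480, 640, CV_8UC1), sum, sqsum, dst;
cv::gpu::integral(src, sum);        // CV_32SC1 integral image
cv::gpu::sqrIntegral(src, sqsum);   // CV_64FC1 squared integral image
cv::gpu::rectStdDev(sum, sqsum, dst, cv::Rect(0, 0, 21, 21));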
@ -992,25 +975,19 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons
namespace
{
template<int n> struct NPPTypeTraits;
template<> struct NPPTypeTraits<CV_8U> { typedef Npp8u npp_type; };
template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };
typedef NppStatus (*get_buf_size_c1_t)(NppiSize oSizeROI, int nLevels, int* hpBufferSize);
typedef NppStatus (*get_buf_size_c4_t)(NppiSize oSizeROI, int nLevels[], int* hpBufferSize);
template<int SDEPTH> struct NppHistogramEvenFuncC1
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;
typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s * pHist,
int nLevels, Npp32s nLowerLevel, Npp32s nUpperLevel, Npp8u * pBuffer);
};
template<int SDEPTH> struct NppHistogramEvenFuncC4
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;
typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI,
Npp32s * pHist[4], int nLevels[4], Npp32s nLowerLevel[4], Npp32s nUpperLevel[4], Npp8u * pBuffer);
@ -1079,7 +1056,7 @@ namespace
template<int SDEPTH> struct NppHistogramRangeFuncC1
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;
typedef Npp32s level_t;
enum {LEVEL_TYPE_CODE=CV_32SC1};
@ -1097,7 +1074,7 @@ namespace
};
template<int SDEPTH> struct NppHistogramRangeFuncC4
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;
typedef Npp32s level_t;
enum {LEVEL_TYPE_CODE=CV_32SC1};

View File

@ -171,8 +171,8 @@ bool cv::gpu::DeviceInfo::supports(cv::gpu::FeatureSet) const { throw_nogpu(); r
bool cv::gpu::DeviceInfo::isCompatible() const { throw_nogpu(); return false; }
void cv::gpu::DeviceInfo::query() { throw_nogpu(); }
void cv::gpu::DeviceInfo::queryMemory(size_t&, size_t&) const { throw_nogpu(); }
void cv::gpu::printCudaDeviceInfo(int device) { throw_nogpu(); }
void cv::gpu::printShortCudaDeviceInfo(int device) { throw_nogpu(); }
void cv::gpu::printCudaDeviceInfo(int) { throw_nogpu(); }
void cv::gpu::printShortCudaDeviceInfo(int) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
@ -346,7 +346,6 @@ void cv::gpu::printCudaDeviceInfo(int device)
convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount);
printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f);
#if (CUDART_VERSION >= 4000)
// This is not available in the CUDA Runtime API, so we make the necessary calls to the driver API to support this output
int memoryClock, memBusWidth, L2CacheSize;
getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev );
@ -364,7 +363,7 @@ void cv::gpu::printCudaDeviceInfo(int device)
printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n",
prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1],
prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]);
#endif
printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem);
printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock);
printf(" Total number of registers available per block: %d\n", prop.regsPerBlock);
@ -375,11 +374,7 @@ void cv::gpu::printCudaDeviceInfo(int device)
printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch);
printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment);
#if CUDART_VERSION >= 4000
printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount);
#else
printf(" Concurrent copy and execution: %s\n", prop.deviceOverlap ? "Yes" : "No");
#endif
printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No");
printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No");
printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No");
@ -388,10 +383,8 @@ void cv::gpu::printCudaDeviceInfo(int device)
printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No");
printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? "Yes" : "No");
printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No");
#if CUDART_VERSION >= 4000
printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No");
printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID );
#endif
printf(" Compute Mode:\n");
printf(" %s \n", computeMode[prop.computeMode]);
}

View File

@ -48,6 +48,7 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA)
void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_nogpu(); }
void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&, GpuMat&) { throw_nogpu(); }
double cv::gpu::norm(const GpuMat&, int) { throw_nogpu(); return 0.0; }
double cv::gpu::norm(const GpuMat&, int, GpuMat&) { throw_nogpu(); return 0.0; }
double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; }
@ -108,6 +109,12 @@ namespace
// meanStdDev
void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
{
GpuMat buf;
meanStdDev(src, mean, stddev, buf);
}
void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev, GpuMat& buf)
{
CV_Assert(src.type() == CV_8UC1);
@ -117,15 +124,12 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
DeviceBuffer dbuf(2);
#if CUDART_VERSION > 4000
int bufSize;
nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
GpuMat buf(1, bufSize, CV_8UC1);
ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dbuf, (double*)dbuf + 1) );
#else
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, dbuf, (double*)dbuf + 1) );
#endif
cudaSafeCall( cudaDeviceSynchronize() );
@ -133,7 +137,6 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
dbuf.download(ptrs);
}
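The split into a forwarding overload plus a buffered one lets callers hoist the workspace out of a loop instead of paying the per-call GpuMat buf(1, bufSize, CV_8UC1) allocation the old code incurred. A hedged sketch, where frames is an illustrative std::vector<cv::gpu::GpuMat>:
cv::Scalar mean, stddev;
cv::gpu::GpuMat buf;  // grown once by ensureSizeIsEnough, reused on later iterations
for (size_t i = 0; i < frames.size(); ++i)
    cv::gpu::meanStdDev(frames[i], mean, stddev, buf);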
////////////////////////////////////////////////////////////////////////
// norm
@ -151,7 +154,7 @@ double cv::gpu::norm(const GpuMat& src, int normType, GpuMat& buf)
return absSum(src_single_channel, buf)[0];
if (normType == NORM_L2)
return sqrt(sqrSum(src_single_channel, buf)[0]);
return std::sqrt(sqrSum(src_single_channel, buf)[0]);
if (normType == NORM_INF)
{

View File

@ -228,7 +228,7 @@ void cv::gpu::createOpticalFlowNeedleMap(const GpuMat& u, const GpuMat& v, GpuMa
minMax(u_avg, 0, &uMax);
minMax(v_avg, 0, &vMax);
float max_flow = static_cast<float>(sqrt(uMax * uMax + vMax * vMax));
float max_flow = static_cast<float>(std::sqrt(uMax * uMax + vMax * vMax));
CreateOpticalFlowNeedleMap_gpu(u_avg, v_avg, vertex.ptr<float>(), colors.ptr<float>(), max_flow, 1.0f / u.cols, 1.0f / u.rows);

View File

@ -160,7 +160,7 @@ void cv::gpu::FarnebackOpticalFlow::setPolynomialExpansionConsts(int n, double s
double ig11, ig03, ig33, ig55;
prepareGaussian(n, sigma, g, xg, xxg, ig11, ig03, ig33, ig55);
device::optflow_farneback::setPolynomialExpansionConsts(n, g, xg, xxg, ig11, ig03, ig33, ig55);
device::optflow_farneback::setPolynomialExpansionConsts(n, g, xg, xxg, static_cast<float>(ig11), static_cast<float>(ig03), static_cast<float>(ig33), static_cast<float>(ig55));
}

View File

@ -429,11 +429,11 @@ void cv::gpu::ORB_GPU::setParams(size_t n_features, const ORB::CommonParams& det
// pre-compute the end of a row in a circular patch
int half_patch_size = params_.patch_size_ / 2;
vector<int> u_max(half_patch_size + 1);
for (int v = 0; v <= half_patch_size * sqrt(2.f) / 2 + 1; ++v)
u_max[v] = cvRound(sqrt(static_cast<float>(half_patch_size * half_patch_size - v * v)));
for (int v = 0; v <= half_patch_size * std::sqrt(2.f) / 2 + 1; ++v)
u_max[v] = cvRound(std::sqrt(static_cast<float>(half_patch_size * half_patch_size - v * v)));
// Make sure we are symmetric
for (int v = half_patch_size, v_0 = 0; v >= half_patch_size * sqrt(2.f) / 2; --v)
for (int v = half_patch_size, v_0 = 0; v >= half_patch_size * std::sqrt(2.f) / 2; --v)
{
while (u_max[v_0] == u_max[v_0 + 1])
++v_0;

View File

@ -91,8 +91,8 @@
#include "nvidia/NCVHaarObjectDetection.hpp"
#include "nvidia/NCVBroxOpticalFlow.hpp"
#define CUDART_MINIMUM_REQUIRED_VERSION 4000
#define NPP_MINIMUM_REQUIRED_VERSION 4000
#define CUDART_MINIMUM_REQUIRED_VERSION 4010
#define NPP_MINIMUM_REQUIRED_VERSION 4100
#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
#error "Insufficient Cuda Runtime library version, please update it."

View File

@ -69,10 +69,10 @@ PARAM_TEST_CASE(ArithmTestBase, cv::gpu::DeviceInfo, MatType, UseRoi)
size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200));
mat1 = randomMat(rng, size, type, 1, 16, false);
mat2 = randomMat(rng, size, type, 1, 16, false);
mat1 = randomMat(rng, size, type, 5, 16, false);
mat2 = randomMat(rng, size, type, 5, 16, false);
val = cv::Scalar(rng.uniform(0.1, 3.0), rng.uniform(0.1, 3.0), rng.uniform(0.1, 3.0), rng.uniform(0.1, 3.0));
val = cv::Scalar(rng.uniform(1, 3), rng.uniform(1, 3), rng.uniform(1, 3), rng.uniform(1, 3));
}
};
@ -115,7 +115,8 @@ TEST_P(Add, Scalar)
INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine(
ALL_DEVICES,
Values(CV_8UC1, CV_16UC1, CV_32SC1, CV_32FC1),
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_16SC1, CV_16SC2, CV_16SC3, CV_16SC4,
CV_32SC1, CV_32SC2, CV_32SC3, CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4),
USE_ROI));
////////////////////////////////////////////////////////////////////////////////
@ -157,7 +158,8 @@ TEST_P(Subtract, Scalar)
INSTANTIATE_TEST_CASE_P(Arithm, Subtract, Combine(
ALL_DEVICES,
Values(CV_8UC1, CV_16UC1, CV_32SC1, CV_32FC1),
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_16SC1, CV_16SC2, CV_16SC3, CV_16SC4,
CV_32SC1, CV_32SC2, CV_32SC3, CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4),
USE_ROI));
////////////////////////////////////////////////////////////////////////////////
@ -199,7 +201,8 @@ TEST_P(Multiply, Scalar)
INSTANTIATE_TEST_CASE_P(Arithm, Multiply, Combine(
ALL_DEVICES,
Values(CV_8UC1, CV_16UC1, CV_32SC1, CV_32FC1),
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_16SC1, CV_16SC3, CV_16SC4,
CV_32SC1, CV_32SC3, CV_32FC1, CV_32FC3, CV_32FC4),
USE_ROI));
////////////////////////////////////////////////////////////////////////////////
@ -220,7 +223,7 @@ TEST_P(Divide, Array)
gpuRes.download(dst);
EXPECT_MAT_NEAR(dst_gold, dst, 1.0);
EXPECT_MAT_NEAR(dst_gold, dst, mat1.depth() == CV_32F ? 1e-5 : 1);
}
TEST_P(Divide, Scalar)
@ -236,12 +239,13 @@ TEST_P(Divide, Scalar)
gpuRes.download(dst);
EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
EXPECT_MAT_NEAR(dst_gold, dst, mat1.depth() == CV_32F ? 1e-5 : 1);
}
INSTANTIATE_TEST_CASE_P(Arithm, Divide, Combine(
ALL_DEVICES,
Values(CV_8UC1, CV_16UC1, CV_32SC1, CV_32FC1),
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_16SC1, CV_16SC3, CV_16SC4,
CV_32SC1, CV_32SC3, CV_32FC1, CV_32FC3, CV_32FC4),
USE_ROI));
////////////////////////////////////////////////////////////////////////////////
@ -312,6 +316,83 @@ INSTANTIATE_TEST_CASE_P(Arithm, Absdiff, Combine(
Values(CV_8UC1, CV_16UC1, CV_32SC1, CV_32FC1),
USE_ROI));
////////////////////////////////////////////////////////////////////////////////
// abs
struct Abs : ArithmTestBase {};
TEST_P(Abs, Array)
{
cv::Mat dst_gold = cv::abs(mat1);
cv::Mat dst;
cv::gpu::GpuMat gpuRes;
cv::gpu::abs(loadMat(mat1, useRoi), gpuRes);
gpuRes.download(dst);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
INSTANTIATE_TEST_CASE_P(Arithm, Abs, Combine(
ALL_DEVICES,
Values(CV_16SC1, CV_32FC1),
USE_ROI));
////////////////////////////////////////////////////////////////////////////////
// Sqr
struct Sqr : ArithmTestBase {};
TEST_P(Sqr, Array)
{
cv::Mat dst_gold;
cv::multiply(mat1, mat1, dst_gold);
cv::Mat dst;
cv::gpu::GpuMat gpuRes;
cv::gpu::sqr(loadMat(mat1, useRoi), gpuRes);
gpuRes.download(dst);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
INSTANTIATE_TEST_CASE_P(Arithm, Sqr, Combine(
ALL_DEVICES,
Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1),
USE_ROI));
////////////////////////////////////////////////////////////////////////////////
// Sqrt
struct Sqrt : ArithmTestBase {};
TEST_P(Sqrt, Array)
{
cv::Mat dst_gold;
cv::sqrt(mat1, dst_gold);
cv::Mat dst;
cv::gpu::GpuMat gpuRes;
cv::gpu::sqrt(loadMat(mat1, useRoi), gpuRes);
gpuRes.download(dst);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
INSTANTIATE_TEST_CASE_P(Arithm, Sqrt, Combine(
ALL_DEVICES,
Values(MatType(CV_32FC1)),
USE_ROI));
////////////////////////////////////////////////////////////////////////////////
// compare
@ -513,7 +594,7 @@ TEST_P(Flip, Accuracy)
INSTANTIATE_TEST_CASE_P(Arithm, Flip, Combine(
ALL_DEVICES,
Values(CV_8UC1, CV_8UC4),
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
Values((int)FLIP_BOTH, (int)FLIP_X, (int)FLIP_Y),
USE_ROI));
@ -1329,6 +1410,90 @@ INSTANTIATE_TEST_CASE_P(Arithm, Bitwise, Combine(
ALL_DEVICES,
ALL_TYPES));
PARAM_TEST_CASE(BitwiseScalar, cv::gpu::DeviceInfo, MatType)
{
cv::gpu::DeviceInfo devInfo;
int type;
cv::Size size;
cv::Mat mat;
cv::Scalar sc;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
type = GET_PARAM(1);
cv::gpu::setDevice(devInfo.deviceID());
cv::RNG& rng = cvtest::TS::ptr()->get_rng();
size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200));
mat.create(size, type);
for (int i = 0; i < mat.rows; ++i)
{
cv::Mat row(1, static_cast<int>(mat.cols * mat.elemSize()), CV_8U, (void*)mat.ptr(i));
rng.fill(row, cv::RNG::UNIFORM, cv::Scalar(0), cv::Scalar(255));
}
sc = cv::Scalar(rng.uniform(0, 255), rng.uniform(0, 255), rng.uniform(0, 255), rng.uniform(0, 255));
}
};
TEST_P(BitwiseScalar, Or)
{
cv::Mat dst_gold;
cv::bitwise_or(mat, sc, dst_gold);
cv::Mat dst;
cv::gpu::GpuMat dev_dst;
cv::gpu::bitwise_or(loadMat(mat), sc, dev_dst);
dev_dst.download(dst);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
TEST_P(BitwiseScalar, And)
{
cv::Mat dst_gold;
cv::bitwise_and(mat, sc, dst_gold);
cv::Mat dst;
cv::gpu::GpuMat dev_dst;
cv::gpu::bitwise_and(loadMat(mat), sc, dev_dst);
dev_dst.download(dst);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
TEST_P(BitwiseScalar, Xor)
{
cv::Mat dst_gold;
cv::bitwise_xor(mat, sc, dst_gold);
cv::Mat dst;
cv::gpu::GpuMat dev_dst;
cv::gpu::bitwise_xor(loadMat(mat), sc, dev_dst);
dev_dst.download(dst);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
INSTANTIATE_TEST_CASE_P(Arithm, BitwiseScalar, Combine(
ALL_DEVICES,
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32SC1, CV_32SC3, CV_32SC4)));
//////////////////////////////////////////////////////////////////////////////
// addWeighted

View File

@ -2377,6 +2377,49 @@ INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor, Combine(
Values(CV_8U, CV_16U, CV_32F),
USE_ROI));
///////////////////////////////////////////////////////////////////////////////////////////////////////
// swapChannels
PARAM_TEST_CASE(SwapChannels, cv::gpu::DeviceInfo, UseRoi)
{
cv::gpu::DeviceInfo devInfo;
bool useRoi;
cv::Mat img;
cv::Mat dst_gold;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
useRoi = GET_PARAM(1);
cv::gpu::setDevice(devInfo.deviceID());
cv::Mat imgBase = readImage("stereobm/aloe-L.png");
ASSERT_FALSE(imgBase.empty());
cv::cvtColor(imgBase, img, cv::COLOR_BGR2BGRA);
cv::cvtColor(img, dst_gold, cv::COLOR_BGRA2RGBA);
}
};
TEST_P(SwapChannels, Accuracy)
{
cv::gpu::GpuMat gpuImage = loadMat(img, useRoi);
const int dstOrder[] = {2, 1, 0, 3};
cv::gpu::swapChannels(gpuImage, dstOrder);
cv::Mat dst;
gpuImage.download(dst);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
INSTANTIATE_TEST_CASE_P(ImgProc, SwapChannels, Combine(ALL_DEVICES, USE_ROI));
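As the test implies (semantics inferred from it, not from separate documentation), swapChannels works in place and dstOrder[i] selects the source channel written to output channel i:
const int dstOrder[] = {2, 1, 0, 3};        // out(R,G,B,A) <- src(B,G,R,A): BGRA -> RGBA
cv::gpu::swapChannels(gpuImage, dstOrder);  // gpuImage is modified in place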
///////////////////////////////////////////////////////////////////////////////////////////////////////
// histograms

View File

@ -136,7 +136,7 @@ int main(int argc, char** argv)
#else // HAVE_CUDA
int main(int argc, char** argv)
int main()
{
printf("OpenCV was built without CUDA support\n");
return 0;

View File

@ -146,7 +146,6 @@ CV_IMPL double cvGetWindowProperty(const char* name, int prop_id)
default:
return -1;
}
return -1;
}
void cv::namedWindow( const string& winname, int flags )

View File

@ -105,6 +105,8 @@ public:
GraphCutSeamFinder(int cost_type = COST_COLOR_GRAD, float terminal_cost = 10000.f,
float bad_region_penalty = 1000.f);
~GraphCutSeamFinder();
void find(const std::vector<Mat> &src, const std::vector<Point> &corners,
std::vector<Mat> &masks);

View File

@ -411,6 +411,8 @@ void GraphCutSeamFinder::Impl::findInPair(size_t first, size_t second, Rect roi)
GraphCutSeamFinder::GraphCutSeamFinder(int cost_type, float terminal_cost, float bad_region_penalty)
: impl_(new Impl(cost_type, terminal_cost, bad_region_penalty)) {}
GraphCutSeamFinder::~GraphCutSeamFinder() {}
void GraphCutSeamFinder::find(const vector<Mat> &src, const vector<Point> &corners,
vector<Mat> &masks)

View File

@ -247,7 +247,7 @@ inline int clamp(int val, int minVal, int maxVal)
return max(min(val, maxVal), minVal);
}
void PointCloudRenderer::onMouseEvent(int event, int x, int y, int flags)
void PointCloudRenderer::onMouseEvent(int event, int x, int y, int /*flags*/)
{
static int oldx = x;
static int oldy = y;

View File

@ -0,0 +1,68 @@
#include <iostream>
#include "opencv2/core/opengl_interop.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/gpu/gpu.hpp"
using namespace std;
using namespace cv;
using namespace cv::gpu;
int main()
{
cout << "This program demonstrates using alphaComp" << endl;
cout << "Press SPACE to change compositing operation" << endl;
cout << "Press ESC to exit" << endl;
namedWindow("First Image", WINDOW_NORMAL);
namedWindow("Second Image", WINDOW_NORMAL);
namedWindow("Result", WINDOW_OPENGL);
setGlDevice();
Mat src1(640, 480, CV_8UC4, Scalar::all(0));
Mat src2(640, 480, CV_8UC4, Scalar::all(0));
rectangle(src1, Rect(50, 50, 200, 200), Scalar(0, 0, 255, 128), 30);
rectangle(src2, Rect(100, 100, 200, 200), Scalar(255, 0, 0, 128), 30);
GpuMat d_src1(src1);
GpuMat d_src2(src2);
GpuMat d_res;
imshow("First Image", src1);
imshow("Second Image", src2);
int alpha_op = ALPHA_OVER;
const char* op_names[] =
{
"ALPHA_OVER", "ALPHA_IN", "ALPHA_OUT", "ALPHA_ATOP", "ALPHA_XOR", "ALPHA_PLUS", "ALPHA_OVER_PREMUL", "ALPHA_IN_PREMUL", "ALPHA_OUT_PREMUL",
"ALPHA_ATOP_PREMUL", "ALPHA_XOR_PREMUL", "ALPHA_PLUS_PREMUL", "ALPHA_PREMUL"
};
while (true)
{
cout << op_names[alpha_op] << endl;
alphaComp(d_src1, d_src2, d_res, alpha_op);
imshow("Result", d_res);
char key = static_cast<char>(waitKey());
if (key == 27)
break;
if (key == 32)
{
++alpha_op;
if (alpha_op > ALPHA_PREMUL)
alpha_op = ALPHA_OVER;
}
}
return 0;
}

View File

@ -226,7 +226,7 @@ int main(int argc, const char* argv[])
break;
case 'S':
if (currentFrame < frames.size() - 1)
if (currentFrame < static_cast<int>(frames.size()) - 1)
++currentFrame;
imshow("Interpolated frame", frames[currentFrame]);

View File

@ -26,7 +26,7 @@ void colorizeFlow(const Mat &u, const Mat &v, Mat &dst)
minMaxLoc(v, &vMin, &vMax, 0, 0);
uMin = ::abs(uMin); uMax = ::abs(uMax);
vMin = ::abs(vMin); vMax = ::abs(vMax);
float dMax = ::max(::max(uMin, uMax), ::max(vMin, vMax));
float dMax = static_cast<float>(::max(::max(uMin, uMax), ::max(vMin, vMax)));
dst.create(u.size(), CV_8UC3);
for (int y = 0; y < u.rows; ++y)
@ -111,11 +111,11 @@ int main(int argc, char **argv)
s.str("");
s << "opt. flow FPS: " << cvRound((getTickFrequency()/(tc1-tc0)));
putText(image, s.str(), Point(5, 65), FONT_HERSHEY_SIMPLEX, 1., Scalar(255,0,255), 2.);
putText(image, s.str(), Point(5, 65), FONT_HERSHEY_SIMPLEX, 1., Scalar(255,0,255), 2);
s.str("");
s << "total FPS: " << cvRound((getTickFrequency()/(t1-t0)));
putText(image, s.str(), Point(5, 105), FONT_HERSHEY_SIMPLEX, 1., Scalar(255,0,255), 2.);
putText(image, s.str(), Point(5, 105), FONT_HERSHEY_SIMPLEX, 1., Scalar(255,0,255), 2);
imshow("flow", image);

View File

@ -63,7 +63,7 @@ void TestSystem::finishCurrentSubtest()
double cpu_time = cpu_elapsed_ / getTickFrequency() * 1000.0;
double gpu_time = gpu_elapsed_ / getTickFrequency() * 1000.0;
double speedup = static_cast<double>(cpu_elapsed_) / std::max((int64)1, gpu_elapsed_);
double speedup = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_elapsed_);
speedup_total_ += speedup;
printMetrics(cpu_time, gpu_time, speedup);

View File

@ -127,8 +127,10 @@ private:
std::stringstream cur_subtest_description_;
bool cur_subtest_is_empty_;
int64 cpu_started_, cpu_elapsed_;
int64 gpu_started_, gpu_elapsed_;
int64 cpu_started_;
int64 gpu_started_;
double cpu_elapsed_;
double gpu_elapsed_;
double speedup_total_;
int num_subtests_called_;

View File

@ -1199,7 +1199,7 @@ TEST(FarnebackOpticalFlow)
if (frame1.empty()) throw runtime_error("can't open " + datasets[i] + "2.png");
gpu::FarnebackOpticalFlow calc;
calc.fastPyramids = fastPyramids;
calc.fastPyramids = fastPyramids != 0;
calc.flags |= useGaussianBlur ? OPTFLOW_FARNEBACK_GAUSSIAN : 0;
gpu::GpuMat d_frame0(frame0), d_frame1(frame1), d_flowx, d_flowy;