used new device layer for cv::gpu::add

2025-08-06 06:26:29 +08:00 · 2013-08-23 18:28:13 +04:00 · 2013-08-23 18:28:13 +04:00 · 9c5da2ea22
commit 9c5da2ea22
parent 32d578f5f0
5 changed files with 280 additions and 447 deletions
--- a/modules/cudaarithm/CMakeLists.txt
+++ b/modules/cudaarithm/CMakeLists.txt
@ -6,7 +6,7 @@ set(the_description "CUDA-accelerated Operations on Matrices")

 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)

-ocv_add_module(cudaarithm opencv_core OPTIONAL opencv_cudalegacy)
+ocv_add_module(cudaarithm opencv_core OPTIONAL opencv_cudev opencv_cudalegacy)

 ocv_module_include_directories()
 ocv_glob_module_sources()
--- a/modules/cudaarithm/src/cuda/add_mat.cu
+++ b/modules/cudaarithm/src/cuda/add_mat.cu
@ -40,146 +40,186 @@
 //
 //M*/

-#if !defined CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"

-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/simd_functions.hpp"
+#ifndef HAVE_OPENCV_CUDEV

-#include "arithm_func_traits.hpp"
+#error "opencv_cudev is required"

-using namespace cv::cuda;
-using namespace cv::cuda::device;
+#else

-namespace arithm
+#include "opencv2/cudev.hpp"
+
+using namespace cv::cudev;
+
+void addMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int);
+
+namespace
 {
-    struct VAdd4 : binary_function<uint, uint, uint>
-    {
-        __device__ __forceinline__ uint operator ()(uint a, uint b) const
-        {
-            return vadd4(a, b);
-        }
-
-        __host__ __device__ __forceinline__ VAdd4() {}
-        __host__ __device__ __forceinline__ VAdd4(const VAdd4&) {}
-    };
-
-    struct VAdd2 : binary_function<uint, uint, uint>
-    {
-        __device__ __forceinline__ uint operator ()(uint a, uint b) const
-        {
-            return vadd2(a, b);
-        }
-
-        __host__ __device__ __forceinline__ VAdd2() {}
-        __host__ __device__ __forceinline__ VAdd2(const VAdd2&) {}
-    };
-
-    template <typename T, typename D> struct AddMat : binary_function<T, T, D>
+    template <typename T, typename D> struct AddOp1 : binary_function<T, T, D> // Binary functor: adds two T values and converts the sum to D with saturate_cast.
    {
        __device__ __forceinline__ D operator ()(T a, T b) const
        {
            return saturate_cast<D>(a + b);
        }
-
-        __host__ __device__ __forceinline__ AddMat() {}
-        __host__ __device__ __forceinline__ AddMat(const AddMat&) {}
    };
-}
-
-namespace cv { namespace cuda { namespace device
-{
-    template <> struct TransformFunctorTraits< arithm::VAdd4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
-    {
-    };
-
-    template <> struct TransformFunctorTraits< arithm::VAdd2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
-    {
-    };
-
-    template <typename T, typename D> struct TransformFunctorTraits< arithm::AddMat<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
-    {
-    };
-}}}
-
-namespace arithm
-{
-    void addMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
-    {
-        device::transform(src1, src2, dst, VAdd4(), WithOutMask(), stream);
-    }
-
-    void addMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
-    {
-        device::transform(src1, src2, dst, VAdd2(), WithOutMask(), stream);
-    }

    template <typename T, typename D>
-    void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+    void addMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream) // Generic path: hands T-typed views of src1/src2 and a D-typed view of dst to gridTransformBinary with AddOp1<T, D>.
    {
        if (mask.data)
-            device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), mask, stream);
+            gridTransformBinary(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), AddOp1<T, D>(), globPtr<uchar>(mask), stream); // mask has data: forward it as a uchar view
        else
-            device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), WithOutMask(), stream);
+            gridTransformBinary(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), AddOp1<T, D>(), stream); // no mask data: use the overload without a mask argument
    }

-    template void addMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<uchar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<uchar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<uchar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    // Binary functor over 32-bit words; each call forwards both words to vadd2().
+    // Used by addMat_v2, which views pairs of 16-bit elements as one uint.
+    struct AddOp2 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint lhs, uint rhs) const
+        {
+            const uint packedSum = vadd2(lhs, rhs);
+            return packedSum;
+        }
+    };

-    template void addMat<schar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<schar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<schar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<schar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<schar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<schar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<schar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    void addMat_v2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) // Packed path: views the rows of all three matrices as uint words and combines them with AddOp2.
+    {
+        const int vcols = src1.cols >> 1; // width of the uint view: cols / 2

-    //template void addMat<ushort, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat<ushort, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<ushort, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<ushort, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<ushort, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<ushort, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<ushort, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+        GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols); // same data pointer and step, reinterpreted as uint
+        GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols); // rows and vcols are taken from src1 for every view
+        GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);

-    //template void addMat<short, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat<short, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<short, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<short, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<short, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<short, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<short, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+        gridTransformBinary(src1_, src2_, dst_, AddOp2(), stream); // no mask argument on this path
+    }

-    //template void addMat<int, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat<int, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat<int, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat<int, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<int, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<int, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    // Binary functor over 32-bit words; each call forwards both words to vadd4().
+    // Used by addMat_v4, which views groups of four 8-bit elements as one uint.
+    struct AddOp4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint lhs, uint rhs) const
+        {
+            const uint packedSum = vadd4(lhs, rhs);
+            return packedSum;
+        }
+    };

-    //template void addMat<float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat<float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    void addMat_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) // Packed path: views the rows of all three matrices as uint words and combines them with AddOp4.
+    {
+        const int vcols = src1.cols >> 2; // width of the uint view: cols / 4

-    //template void addMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat<double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat<double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat<double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+        GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols); // same data pointer and step, reinterpreted as uint
+        GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols); // rows and vcols are taken from src1 for every view
+        GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
+
+        gridTransformBinary(src1_, src2_, dst_, AddOp4(), stream); // no mask argument on this path
+    }
 }

-#endif // CUDA_DISABLER
+// Element-wise addition of two GpuMats into dst, with an optional mask.
+// The unnamed double/int parameters are not used by this implementation.
+//
+// Dispatch:
+//  * src1, src2 and dst are first reshaped to single-channel views.
+//  * Unmasked CV_8U -> CV_8U and CV_16U -> CV_16U inputs take a packed path
+//    (addMat_v4 / addMat_v2) that views four / two elements as one uint. The
+//    packed path is only taken when all three data pointers are 32-byte
+//    aligned, the column count divides evenly into uint words, and (for
+//    multi-row matrices) every row step is a multiple of 4 bytes.
+//  * Everything else is looked up in funcs[source depth][destination depth];
+//    a null entry raises cv::Error::StsUnsupportedFormat.
+//
+// The depth range is only verified by CV_DbgAssert.
+void addMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int)
+{
+    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream);
+    // Rows: source depth, columns: destination depth. 0 = combination not instantiated.
+    static const func_t funcs[7][7] =
+    {
+        {
+            addMat_v1<uchar, uchar>,
+            addMat_v1<uchar, schar>,
+            addMat_v1<uchar, ushort>,
+            addMat_v1<uchar, short>,
+            addMat_v1<uchar, int>,
+            addMat_v1<uchar, float>,
+            addMat_v1<uchar, double>
+        },
+        {
+            addMat_v1<schar, uchar>,
+            addMat_v1<schar, schar>,
+            addMat_v1<schar, ushort>,
+            addMat_v1<schar, short>,
+            addMat_v1<schar, int>,
+            addMat_v1<schar, float>,
+            addMat_v1<schar, double>
+        },
+        {
+            0 /*addMat_v1<ushort, uchar>*/,
+            0 /*addMat_v1<ushort, schar>*/,
+            addMat_v1<ushort, ushort>,
+            addMat_v1<ushort, short>,
+            addMat_v1<ushort, int>,
+            addMat_v1<ushort, float>,
+            addMat_v1<ushort, double>
+        },
+        {
+            0 /*addMat_v1<short, uchar>*/,
+            0 /*addMat_v1<short, schar>*/,
+            addMat_v1<short, ushort>,
+            addMat_v1<short, short>,
+            addMat_v1<short, int>,
+            addMat_v1<short, float>,
+            addMat_v1<short, double>
+        },
+        {
+            0 /*addMat_v1<int, uchar>*/,
+            0 /*addMat_v1<int, schar>*/,
+            0 /*addMat_v1<int, ushort>*/,
+            0 /*addMat_v1<int, short>*/,
+            addMat_v1<int, int>,
+            addMat_v1<int, float>,
+            addMat_v1<int, double>
+        },
+        {
+            0 /*addMat_v1<float, uchar>*/,
+            0 /*addMat_v1<float, schar>*/,
+            0 /*addMat_v1<float, ushort>*/,
+            0 /*addMat_v1<float, short>*/,
+            0 /*addMat_v1<float, int>*/,
+            addMat_v1<float, float>,
+            addMat_v1<float, double>
+        },
+        {
+            0 /*addMat_v1<double, uchar>*/,
+            0 /*addMat_v1<double, schar>*/,
+            0 /*addMat_v1<double, ushort>*/,
+            0 /*addMat_v1<double, short>*/,
+            0 /*addMat_v1<double, int>*/,
+            0 /*addMat_v1<double, float>*/,
+            addMat_v1<double, double>
+        }
+    };
+
+    const int sdepth = src1.depth();
+    const int ddepth = dst.depth();
+
+    CV_DbgAssert( sdepth < 7 && ddepth < 7 );
+
+    // Single-channel views: channels are independent for addition, so the
+    // kernels only ever see scalar element types.
+    GpuMat src1_ = src1.reshape(1);
+    GpuMat src2_ = src2.reshape(1);
+    GpuMat dst_ = dst.reshape(1);
+
+    if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
+    {
+        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
+        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
+        const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
+
+        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
+
+        // The packed kernels read and write whole uint words in every row.
+        // With more than one row, a step that is not a multiple of 4 bytes
+        // would make the rows after the first misaligned for 32-bit access,
+        // so such matrices must use the generic per-element path instead.
+        const bool isAllStepAligned = src1_.rows == 1 ||
+            ((src1_.step & 3) == 0 && (src2_.step & 3) == 0 && (dst_.step & 3) == 0);
+
+        if (isAllAligned && isAllStepAligned)
+        {
+            if (sdepth == CV_8U && (src1_.cols & 3) == 0)
+            {
+                addMat_v4(src1_, src2_, dst_, stream);
+                return;
+            }
+            else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
+            {
+                addMat_v2(src1_, src2_, dst_, stream);
+                return;
+            }
+        }
+    }
+
+    const func_t func = funcs[sdepth][ddepth];
+
+    if (!func)
+        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
+    func(src1_, src2_, dst_, mask, stream);
+}
+
+#endif
--- a/modules/cudaarithm/src/cuda/add_scalar.cu
+++ b/modules/cudaarithm/src/cuda/add_scalar.cu
@ -40,109 +40,141 @@
 //
 //M*/

-#if !defined CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"

-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/simd_functions.hpp"
+#ifndef HAVE_OPENCV_CUDEV

-#include "arithm_func_traits.hpp"
+#error "opencv_cudev is required"

-using namespace cv::cuda;
-using namespace cv::cuda::device;
+#else

-namespace arithm
+#include "opencv2/cudev.hpp"
+
+using namespace cv::cudev;
+
+void addScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int);
+
+namespace
 {
-    template <typename T, typename S, typename D> struct AddScalar : unary_function<T, D>
+    template <typename SrcType, typename ScalarType, typename DstType> struct AddScalarOp : unary_function<SrcType, DstType> // Unary functor: adds the stored val to a source element and converts the result to DstType.
    {
-        S val;
+        ScalarType val; // addend; assigned by the host code in addScalarImpl (op.val = ...)

-        __host__ explicit AddScalar(S val_) : val(val_) {}
-
-        __device__ __forceinline__ D operator ()(T a) const
+        __device__ __forceinline__ DstType operator ()(SrcType a) const
        {
-            return saturate_cast<D>(a + val);
+            return saturate_cast<DstType>(saturate_cast<ScalarType>(a) + val); // source is converted to ScalarType first, then the sum is converted to DstType
        }
    };
-}

-namespace cv { namespace cuda { namespace device
-{
-    template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::AddScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
+    template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy // Policy passed to gridTransformUnary_ by addScalarImpl; the primary template inherits DefaultTransformPolicy unchanged.
    {
    };
-}}}
-
-namespace arithm
-{
-    template <typename T, typename S, typename D>
-    void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+    template <> struct TransformPolicy<double> : DefaultTransformPolicy // Specialization selected when the scalar depth is double.
    {
-        AddScalar<T, S, D> op(static_cast<S>(val));
+        enum {
+            shift = 1 // NOTE(review): presumably overrides DefaultTransformPolicy::shift (elements handled per thread) - confirm against cudev
+        };
+    };
+
+    template <typename SrcType, typename ScalarDepth, typename DstType>
+    void addScalarImpl(const GpuMat& src, cv::Scalar value, GpuMat& dst, const GpuMat& mask, Stream& stream) // Builds an AddScalarOp from value and passes it with src, dst and the optional mask to gridTransformUnary_.
+    {
+        typedef typename MakeVec<ScalarDepth, VecTraits<SrcType>::cn>::type ScalarType; // ScalarDepth vector with the same channel count as SrcType
+
+        cv::Scalar_<ScalarDepth> value_ = value; // convert the incoming cv::Scalar to ScalarDepth components
+
+        AddScalarOp<SrcType, ScalarType, DstType> op;
+        op.val = VecTraits<ScalarType>::make(value_.val); // build the ScalarType addend from the component array

        if (mask.data)
-            device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
+            gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, globPtr<uchar>(mask), stream); // mask has data: forward it as a uchar view
        else
-            device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
+            gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, stream); // no mask data: use the overload without a mask argument
    }
-
-    template void addScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-
-    template void addScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-
-    //template void addScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-
-    //template void addScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-
-    //template void addScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-
-    //template void addScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-
-    //template void addScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 }

-#endif // CUDA_DISABLER
+void addScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int) // Host entry point: picks an addScalarImpl instantiation by source depth, destination depth and channel count, then calls it. The unnamed bool/double/int parameters are not used here.
+{
+    typedef void (*func_t)(const GpuMat& src, cv::Scalar val, GpuMat& dst, const GpuMat& mask, Stream& stream);
+    static const func_t funcs[7][7][4] = // indexed [source depth][destination depth][channels - 1]; 0 marks a combination that is not instantiated. Live entries with a double destination use a double scalar, all other live entries use float.
+    {
+        {
+            {addScalarImpl<uchar, float, uchar>, addScalarImpl<uchar2, float, uchar2>, addScalarImpl<uchar3, float, uchar3>, addScalarImpl<uchar4, float, uchar4>},
+            {addScalarImpl<uchar, float, schar>, addScalarImpl<uchar2, float, char2>, addScalarImpl<uchar3, float, char3>, addScalarImpl<uchar4, float, char4>},
+            {addScalarImpl<uchar, float, ushort>, addScalarImpl<uchar2, float, ushort2>, addScalarImpl<uchar3, float, ushort3>, addScalarImpl<uchar4, float, ushort4>},
+            {addScalarImpl<uchar, float, short>, addScalarImpl<uchar2, float, short2>, addScalarImpl<uchar3, float, short3>, addScalarImpl<uchar4, float, short4>},
+            {addScalarImpl<uchar, float, int>, addScalarImpl<uchar2, float, int2>, addScalarImpl<uchar3, float, int3>, addScalarImpl<uchar4, float, int4>},
+            {addScalarImpl<uchar, float, float>, addScalarImpl<uchar2, float, float2>, addScalarImpl<uchar3, float, float3>, addScalarImpl<uchar4, float, float4>},
+            {addScalarImpl<uchar, double, double>, addScalarImpl<uchar2, double, double2>, addScalarImpl<uchar3, double, double3>, addScalarImpl<uchar4, double, double4>}
+        },
+        {
+            {addScalarImpl<schar, float, uchar>, addScalarImpl<char2, float, uchar2>, addScalarImpl<char3, float, uchar3>, addScalarImpl<char4, float, uchar4>},
+            {addScalarImpl<schar, float, schar>, addScalarImpl<char2, float, char2>, addScalarImpl<char3, float, char3>, addScalarImpl<char4, float, char4>},
+            {addScalarImpl<schar, float, ushort>, addScalarImpl<char2, float, ushort2>, addScalarImpl<char3, float, ushort3>, addScalarImpl<char4, float, ushort4>},
+            {addScalarImpl<schar, float, short>, addScalarImpl<char2, float, short2>, addScalarImpl<char3, float, short3>, addScalarImpl<char4, float, short4>},
+            {addScalarImpl<schar, float, int>, addScalarImpl<char2, float, int2>, addScalarImpl<char3, float, int3>, addScalarImpl<char4, float, int4>},
+            {addScalarImpl<schar, float, float>, addScalarImpl<char2, float, float2>, addScalarImpl<char3, float, float3>, addScalarImpl<char4, float, float4>},
+            {addScalarImpl<schar, double, double>, addScalarImpl<char2, double, double2>, addScalarImpl<char3, double, double3>, addScalarImpl<char4, double, double4>}
+        },
+        {
+            {0 /*addScalarImpl<ushort, float, uchar>*/, 0 /*addScalarImpl<ushort2, float, uchar2>*/, 0 /*addScalarImpl<ushort3, float, uchar3>*/, 0 /*addScalarImpl<ushort4, float, uchar4>*/},
+            {0 /*addScalarImpl<ushort, float, schar>*/, 0 /*addScalarImpl<ushort2, float, char2>*/, 0 /*addScalarImpl<ushort3, float, char3>*/, 0 /*addScalarImpl<ushort4, float, char4>*/},
+            {addScalarImpl<ushort, float, ushort>, addScalarImpl<ushort2, float, ushort2>, addScalarImpl<ushort3, float, ushort3>, addScalarImpl<ushort4, float, ushort4>},
+            {addScalarImpl<ushort, float, short>, addScalarImpl<ushort2, float, short2>, addScalarImpl<ushort3, float, short3>, addScalarImpl<ushort4, float, short4>},
+            {addScalarImpl<ushort, float, int>, addScalarImpl<ushort2, float, int2>, addScalarImpl<ushort3, float, int3>, addScalarImpl<ushort4, float, int4>},
+            {addScalarImpl<ushort, float, float>, addScalarImpl<ushort2, float, float2>, addScalarImpl<ushort3, float, float3>, addScalarImpl<ushort4, float, float4>},
+            {addScalarImpl<ushort, double, double>, addScalarImpl<ushort2, double, double2>, addScalarImpl<ushort3, double, double3>, addScalarImpl<ushort4, double, double4>}
+        },
+        {
+            {0 /*addScalarImpl<short, float, uchar>*/, 0 /*addScalarImpl<short2, float, uchar2>*/, 0 /*addScalarImpl<short3, float, uchar3>*/, 0 /*addScalarImpl<short4, float, uchar4>*/},
+            {0 /*addScalarImpl<short, float, schar>*/, 0 /*addScalarImpl<short2, float, char2>*/, 0 /*addScalarImpl<short3, float, char3>*/, 0 /*addScalarImpl<short4, float, char4>*/},
+            {addScalarImpl<short, float, ushort>, addScalarImpl<short2, float, ushort2>, addScalarImpl<short3, float, ushort3>, addScalarImpl<short4, float, ushort4>},
+            {addScalarImpl<short, float, short>, addScalarImpl<short2, float, short2>, addScalarImpl<short3, float, short3>, addScalarImpl<short4, float, short4>},
+            {addScalarImpl<short, float, int>, addScalarImpl<short2, float, int2>, addScalarImpl<short3, float, int3>, addScalarImpl<short4, float, int4>},
+            {addScalarImpl<short, float, float>, addScalarImpl<short2, float, float2>, addScalarImpl<short3, float, float3>, addScalarImpl<short4, float, float4>},
+            {addScalarImpl<short, double, double>, addScalarImpl<short2, double, double2>, addScalarImpl<short3, double, double3>, addScalarImpl<short4, double, double4>}
+        },
+        {
+            {0 /*addScalarImpl<int, float, uchar>*/, 0 /*addScalarImpl<int2, float, uchar2>*/, 0 /*addScalarImpl<int3, float, uchar3>*/, 0 /*addScalarImpl<int4, float, uchar4>*/},
+            {0 /*addScalarImpl<int, float, schar>*/, 0 /*addScalarImpl<int2, float, char2>*/, 0 /*addScalarImpl<int3, float, char3>*/, 0 /*addScalarImpl<int4, float, char4>*/},
+            {0 /*addScalarImpl<int, float, ushort>*/, 0 /*addScalarImpl<int2, float, ushort2>*/, 0 /*addScalarImpl<int3, float, ushort3>*/, 0 /*addScalarImpl<int4, float, ushort4>*/},
+            {0 /*addScalarImpl<int, float, short>*/, 0 /*addScalarImpl<int2, float, short2>*/, 0 /*addScalarImpl<int3, float, short3>*/, 0 /*addScalarImpl<int4, float, short4>*/},
+            {addScalarImpl<int, float, int>, addScalarImpl<int2, float, int2>, addScalarImpl<int3, float, int3>, addScalarImpl<int4, float, int4>},
+            {addScalarImpl<int, float, float>, addScalarImpl<int2, float, float2>, addScalarImpl<int3, float, float3>, addScalarImpl<int4, float, float4>},
+            {addScalarImpl<int, double, double>, addScalarImpl<int2, double, double2>, addScalarImpl<int3, double, double3>, addScalarImpl<int4, double, double4>}
+        },
+        {
+            {0 /*addScalarImpl<float, float, uchar>*/, 0 /*addScalarImpl<float2, float, uchar2>*/, 0 /*addScalarImpl<float3, float, uchar3>*/, 0 /*addScalarImpl<float4, float, uchar4>*/},
+            {0 /*addScalarImpl<float, float, schar>*/, 0 /*addScalarImpl<float2, float, char2>*/, 0 /*addScalarImpl<float3, float, char3>*/, 0 /*addScalarImpl<float4, float, char4>*/},
+            {0 /*addScalarImpl<float, float, ushort>*/, 0 /*addScalarImpl<float2, float, ushort2>*/, 0 /*addScalarImpl<float3, float, ushort3>*/, 0 /*addScalarImpl<float4, float, ushort4>*/},
+            {0 /*addScalarImpl<float, float, short>*/, 0 /*addScalarImpl<float2, float, short2>*/, 0 /*addScalarImpl<float3, float, short3>*/, 0 /*addScalarImpl<float4, float, short4>*/},
+            {0 /*addScalarImpl<float, float, int>*/, 0 /*addScalarImpl<float2, float, int2>*/, 0 /*addScalarImpl<float3, float, int3>*/, 0 /*addScalarImpl<float4, float, int4>*/},
+            {addScalarImpl<float, float, float>, addScalarImpl<float2, float, float2>, addScalarImpl<float3, float, float3>, addScalarImpl<float4, float, float4>},
+            {addScalarImpl<float, double, double>, addScalarImpl<float2, double, double2>, addScalarImpl<float3, double, double3>, addScalarImpl<float4, double, double4>}
+        },
+        {
+            {0 /*addScalarImpl<double, double, uchar>*/, 0 /*addScalarImpl<double2, double, uchar2>*/, 0 /*addScalarImpl<double3, double, uchar3>*/, 0 /*addScalarImpl<double4, double, uchar4>*/},
+            {0 /*addScalarImpl<double, double, schar>*/, 0 /*addScalarImpl<double2, double, char2>*/, 0 /*addScalarImpl<double3, double, char3>*/, 0 /*addScalarImpl<double4, double, char4>*/},
+            {0 /*addScalarImpl<double, double, ushort>*/, 0 /*addScalarImpl<double2, double, ushort2>*/, 0 /*addScalarImpl<double3, double, ushort3>*/, 0 /*addScalarImpl<double4, double, ushort4>*/},
+            {0 /*addScalarImpl<double, double, short>*/, 0 /*addScalarImpl<double2, double, short2>*/, 0 /*addScalarImpl<double3, double, short3>*/, 0 /*addScalarImpl<double4, double, short4>*/},
+            {0 /*addScalarImpl<double, double, int>*/, 0 /*addScalarImpl<double2, double, int2>*/, 0 /*addScalarImpl<double3, double, int3>*/, 0 /*addScalarImpl<double4, double, int4>*/},
+            {0 /*addScalarImpl<double, double, float>*/, 0 /*addScalarImpl<double2, double, float2>*/, 0 /*addScalarImpl<double3, double, float3>*/, 0 /*addScalarImpl<double4, double, float4>*/},
+            {addScalarImpl<double, double, double>, addScalarImpl<double2, double, double2>, addScalarImpl<double3, double, double3>, addScalarImpl<double4, double, double4>}
+        }
+    };
+
+    const int sdepth = src.depth();
+    const int ddepth = dst.depth();
+    const int cn = src.channels();
+
+    CV_DbgAssert( sdepth < 7 && ddepth < 7 && cn <= 4 ); // the table bounds are only checked through CV_DbgAssert
+
+    const func_t func = funcs[sdepth][ddepth][cn - 1]; // cn - 1: the innermost index is zero-based
+
+    if (!func)
+        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); // null table entry: this depth pair is not instantiated
+
+    func(src, val, dst, mask, stream);
+}
+
+#endif
--- a/modules/cudaarithm/src/element_operations.cpp
+++ b/modules/cudaarithm/src/element_operations.cpp
@ -336,248 +336,9 @@ namespace
 ////////////////////////////////////////////////////////////////////////
 // add

-namespace arithm
-{
-    void addMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
-    void addMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
+void addMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int);

-    template <typename T, typename D>
-    void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-}
-
-static void addMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int)
-{
-    typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    static const func_t funcs[7][7] =
-    {
-        {
-            arithm::addMat<unsigned char, unsigned char>,
-            arithm::addMat<unsigned char, signed char>,
-            arithm::addMat<unsigned char, unsigned short>,
-            arithm::addMat<unsigned char, short>,
-            arithm::addMat<unsigned char, int>,
-            arithm::addMat<unsigned char, float>,
-            arithm::addMat<unsigned char, double>
-        },
-        {
-            arithm::addMat<signed char, unsigned char>,
-            arithm::addMat<signed char, signed char>,
-            arithm::addMat<signed char, unsigned short>,
-            arithm::addMat<signed char, short>,
-            arithm::addMat<signed char, int>,
-            arithm::addMat<signed char, float>,
-            arithm::addMat<signed char, double>
-        },
-        {
-            0 /*arithm::addMat<unsigned short, unsigned char>*/,
-            0 /*arithm::addMat<unsigned short, signed char>*/,
-            arithm::addMat<unsigned short, unsigned short>,
-            arithm::addMat<unsigned short, short>,
-            arithm::addMat<unsigned short, int>,
-            arithm::addMat<unsigned short, float>,
-            arithm::addMat<unsigned short, double>
-        },
-        {
-            0 /*arithm::addMat<short, unsigned char>*/,
-            0 /*arithm::addMat<short, signed char>*/,
-            arithm::addMat<short, unsigned short>,
-            arithm::addMat<short, short>,
-            arithm::addMat<short, int>,
-            arithm::addMat<short, float>,
-            arithm::addMat<short, double>
-        },
-        {
-            0 /*arithm::addMat<int, unsigned char>*/,
-            0 /*arithm::addMat<int, signed char>*/,
-            0 /*arithm::addMat<int, unsigned short>*/,
-            0 /*arithm::addMat<int, short>*/,
-            arithm::addMat<int, int>,
-            arithm::addMat<int, float>,
-            arithm::addMat<int, double>
-        },
-        {
-            0 /*arithm::addMat<float, unsigned char>*/,
-            0 /*arithm::addMat<float, signed char>*/,
-            0 /*arithm::addMat<float, unsigned short>*/,
-            0 /*arithm::addMat<float, short>*/,
-            0 /*arithm::addMat<float, int>*/,
-            arithm::addMat<float, float>,
-            arithm::addMat<float, double>
-        },
-        {
-            0 /*arithm::addMat<double, unsigned char>*/,
-            0 /*arithm::addMat<double, signed char>*/,
-            0 /*arithm::addMat<double, unsigned short>*/,
-            0 /*arithm::addMat<double, short>*/,
-            0 /*arithm::addMat<double, int>*/,
-            0 /*arithm::addMat<double, float>*/,
-            arithm::addMat<double, double>
-        }
-    };
-
-    const int sdepth = src1.depth();
-    const int ddepth = dst.depth();
-    const int cn = src1.channels();
-
-    cudaStream_t stream = StreamAccessor::getStream(_stream);
-
-    PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
-    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
-    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
-
-    if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
-    {
-        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
-        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
-        const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
-
-        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
-
-        if (isAllAligned)
-        {
-            if (sdepth == CV_8U && (src1_.cols & 3) == 0)
-            {
-                const int vcols = src1_.cols >> 2;
-
-                arithm::addMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                                  PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                                  PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                                  stream);
-
-                return;
-            }
-            else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
-            {
-                const int vcols = src1_.cols >> 1;
-
-                arithm::addMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                                  PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                                  PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                                  stream);
-
-                return;
-            }
-        }
-    }
-
-    const func_t func = funcs[sdepth][ddepth];
-
-    if (!func)
-        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
-
-    func(src1_, src2_, dst_, mask, stream);
-}
-
-namespace arithm
-{
-    template <typename T, typename S, typename D>
-    void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-}
-
-static void addScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int)
-{
-    typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    static const func_t funcs[7][7] =
-    {
-        {
-            arithm::addScalar<unsigned char, float, unsigned char>,
-            arithm::addScalar<unsigned char, float, signed char>,
-            arithm::addScalar<unsigned char, float, unsigned short>,
-            arithm::addScalar<unsigned char, float, short>,
-            arithm::addScalar<unsigned char, float, int>,
-            arithm::addScalar<unsigned char, float, float>,
-            arithm::addScalar<unsigned char, double, double>
-        },
-        {
-            arithm::addScalar<signed char, float, unsigned char>,
-            arithm::addScalar<signed char, float, signed char>,
-            arithm::addScalar<signed char, float, unsigned short>,
-            arithm::addScalar<signed char, float, short>,
-            arithm::addScalar<signed char, float, int>,
-            arithm::addScalar<signed char, float, float>,
-            arithm::addScalar<signed char, double, double>
-        },
-        {
-            0 /*arithm::addScalar<unsigned short, float, unsigned char>*/,
-            0 /*arithm::addScalar<unsigned short, float, signed char>*/,
-            arithm::addScalar<unsigned short, float, unsigned short>,
-            arithm::addScalar<unsigned short, float, short>,
-            arithm::addScalar<unsigned short, float, int>,
-            arithm::addScalar<unsigned short, float, float>,
-            arithm::addScalar<unsigned short, double, double>
-        },
-        {
-            0 /*arithm::addScalar<short, float, unsigned char>*/,
-            0 /*arithm::addScalar<short, float, signed char>*/,
-            arithm::addScalar<short, float, unsigned short>,
-            arithm::addScalar<short, float, short>,
-            arithm::addScalar<short, float, int>,
-            arithm::addScalar<short, float, float>,
-            arithm::addScalar<short, double, double>
-        },
-        {
-            0 /*arithm::addScalar<int, float, unsigned char>*/,
-            0 /*arithm::addScalar<int, float, signed char>*/,
-            0 /*arithm::addScalar<int, float, unsigned short>*/,
-            0 /*arithm::addScalar<int, float, short>*/,
-            arithm::addScalar<int, float, int>,
-            arithm::addScalar<int, float, float>,
-            arithm::addScalar<int, double, double>
-        },
-        {
-            0 /*arithm::addScalar<float, float, unsigned char>*/,
-            0 /*arithm::addScalar<float, float, signed char>*/,
-            0 /*arithm::addScalar<float, float, unsigned short>*/,
-            0 /*arithm::addScalar<float, float, short>*/,
-            0 /*arithm::addScalar<float, float, int>*/,
-            arithm::addScalar<float, float, float>,
-            arithm::addScalar<float, double, double>
-        },
-        {
-            0 /*arithm::addScalar<double, double, unsigned char>*/,
-            0 /*arithm::addScalar<double, double, signed char>*/,
-            0 /*arithm::addScalar<double, double, unsigned short>*/,
-            0 /*arithm::addScalar<double, double, short>*/,
-            0 /*arithm::addScalar<double, double, int>*/,
-            0 /*arithm::addScalar<double, double, float>*/,
-            arithm::addScalar<double, double, double>
-        }
-    };
-
-    typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream);
-    static const npp_func_t npp_funcs[7][4] =
-    {
-        {NppArithmScalar<CV_8U , 1, nppiAddC_8u_C1RSfs >::call, 0                                                     , NppArithmScalar<CV_8U , 3, nppiAddC_8u_C3RSfs >::call, NppArithmScalar<CV_8U , 4, nppiAddC_8u_C4RSfs >::call},
-        {0                                                    , 0                                                     , 0                                                    , 0                                                    },
-        {NppArithmScalar<CV_16U, 1, nppiAddC_16u_C1RSfs>::call, 0                                                     , NppArithmScalar<CV_16U, 3, nppiAddC_16u_C3RSfs>::call, NppArithmScalar<CV_16U, 4, nppiAddC_16u_C4RSfs>::call},
-        {NppArithmScalar<CV_16S, 1, nppiAddC_16s_C1RSfs>::call, NppArithmScalar<CV_16S, 2, nppiAddC_16sc_C1RSfs>::call, NppArithmScalar<CV_16S, 3, nppiAddC_16s_C3RSfs>::call, NppArithmScalar<CV_16S, 4, nppiAddC_16s_C4RSfs>::call},
-        {NppArithmScalar<CV_32S, 1, nppiAddC_32s_C1RSfs>::call, NppArithmScalar<CV_32S, 2, nppiAddC_32sc_C1RSfs>::call, NppArithmScalar<CV_32S, 3, nppiAddC_32s_C3RSfs>::call, 0                                                    },
-        {NppArithmScalar<CV_32F, 1, nppiAddC_32f_C1R   >::call, NppArithmScalar<CV_32F, 2, nppiAddC_32fc_C1R   >::call, NppArithmScalar<CV_32F, 3, nppiAddC_32f_C3R   >::call, NppArithmScalar<CV_32F, 4, nppiAddC_32f_C4R   >::call},
-        {0                                                    , 0                                                     , 0                                                    , 0                                                    }
-    };
-
-    const int sdepth = src.depth();
-    const int ddepth = dst.depth();
-    const int cn = src.channels();
-
-    cudaStream_t stream = StreamAccessor::getStream(_stream);
-
-    const npp_func_t npp_func = npp_funcs[sdepth][cn - 1];
-    if (ddepth == sdepth && cn > 1 && npp_func != 0)
-    {
-        npp_func(src, val, dst, stream);
-        return;
-    }
-
-    CV_Assert( cn == 1 );
-
-    const func_t func = funcs[sdepth][ddepth];
-
-    if (!func)
-        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
-
-    func(src, val[0], dst, mask, stream);
-}
+void addScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int);

 void cv::cuda::add(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, int dtype, Stream& stream)
 {
--- a/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
@ -594,7 +594,7 @@ namespace integral_detail
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }

-    __host__ static void integral(const GlobPtr<uchar> src, GlobPtr<uint> dst, int rows, int cols, cudaStream_t stream)
+    __host__ static void integral(const GlobPtr<uchar>& src, const GlobPtr<uint>& dst, int rows, int cols, cudaStream_t stream)
    {
        if (deviceSupports(FEATURE_SET_COMPUTE_30)
            && (cols % 16 == 0)
@ -614,7 +614,7 @@ namespace integral_detail
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }

-    __host__ static void integral(const GlobPtr<uchar> src, GlobPtr<int> dst, int rows, int cols, cudaStream_t stream)
+    __host__ __forceinline__ void integral(const GlobPtr<uchar>& src, const GlobPtr<int>& dst, int rows, int cols, cudaStream_t stream)
    {
        GlobPtr<uint> dstui = globPtr((uint*) dst.data, dst.step);
        integral(src, dstui, rows, cols, stream);