Removed PtrElemStep, Marked DevMem1D as deprecated, now should use PtrStepSz now

2024-11-24 19:20:28 +08:00 · 2012-08-23 17:45:50 +04:00 · 2012-08-23 17:45:50 +04:00 · 70204a8e68
commit 70204a8e68
parent 9be63c50dd
95 changed files with 2597 additions and 2563 deletions
--- a/modules/core/include/opencv2/core/cuda_devptrs.hpp
+++ b/modules/core/include/opencv2/core/cuda_devptrs.hpp
@ -40,8 +40,8 @@
 //
 //M*/

-#ifndef __OPENCV_CORE_DevMem2D_HPP__
-#define __OPENCV_CORE_DevMem2D_HPP__
+#ifndef __OPENCV_CORE_DEVPTRS_HPP__
+#define __OPENCV_CORE_DEVPTRS_HPP__

 #ifdef __cplusplus

@ -107,55 +107,54 @@ namespace cv
            __CV_GPU_HOST_DEVICE__ PtrStepSz(int rows_, int cols_, T* data_, size_t step_)
                : PtrStep<T>(data_, step_), cols(cols_), rows(rows_) {}

+            template <typename U>
+            explicit PtrStepSz(const PtrStepSz<U>& d) : PtrStep<T>((T*)d.data, d.step), cols(d.cols), rows(d.rows){}
+
            int cols;
            int rows;
        };

-        template <typename T> struct DevMem2D_ : public PtrStepSz<T>
+        typedef PtrStepSz<unsigned char> PtrStepSzb;
+        typedef PtrStepSz<float> PtrStepSzf;
+        typedef PtrStepSz<int> PtrStepSzi;
+
+        typedef PtrStep<unsigned char> PtrStepb;
+        typedef PtrStep<float> PtrStepf;
+        typedef PtrStep<int> PtrStepi;
+
+
+#if defined __GNUC__ 
+    #define __CV_GPU_DEPR_BEFORE__
+    #define __CV_GPU_DEPR_AFTER__ __attribute__ ((deprecated))
+#elif defined(__MSVC__) //|| defined(__CUDACC__)
+    #pragma deprecated(DevMem2D_)
+    #define __CV_GPU_DEPR_BEFORE__ __declspec(deprecated)
+    #define __CV_GPU_DEPR_AFTER__
+#else
+    #define __CV_GPU_DEPR_BEFORE__
+    #define __CV_GPU_DEPR_AFTER__
+#endif
+
+        template <typename T> struct __CV_GPU_DEPR_BEFORE__ DevMem2D_ : public PtrStepSz<T>
        {
            DevMem2D_() {}
            DevMem2D_(int rows_, int cols_, T* data_, size_t step_) : PtrStepSz<T>(rows_, cols_, data_, step_) {}

            template <typename U> 
-            explicit DevMem2D_(const DevMem2D_<U>& d) : PtrStepSz<T>(d.rows, d.cols, (T*)d.data, d.step) {}
-        };
-
-        template<typename T> struct PtrElemStep_ : public PtrStep<T>
-        {
-            PtrElemStep_(const DevMem2D_<T>& mem) : PtrStep<T>(mem.data, mem.step)
-            {
-                StaticAssert<256 % sizeof(T) == 0>::check();
-
-                PtrStep<T>::step /= PtrStep<T>::elem_size;
-            }
-            __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return PtrStep<T>::data + y * PtrStep<T>::step; }
-            __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return PtrStep<T>::data + y * PtrStep<T>::step; }
-
-            __CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
-            __CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
-        };
-
-        template<typename T> struct PtrStep_ : public PtrStep<T>
-        {
-            PtrStep_() {}
-            PtrStep_(const DevMem2D_<T>& mem) : PtrStep<T>(mem.data, mem.step) {}
-        };
+            explicit __CV_GPU_DEPR_BEFORE__ DevMem2D_(const DevMem2D_<U>& d) : PtrStepSz<T>(d.rows, d.cols, (T*)d.data, d.step) {}
+        } __CV_GPU_DEPR_AFTER__ ;

        typedef DevMem2D_<unsigned char> DevMem2Db;
        typedef DevMem2Db DevMem2D;
        typedef DevMem2D_<float> DevMem2Df;
        typedef DevMem2D_<int> DevMem2Di;

-        typedef PtrStep<unsigned char> PtrStepb;
-        typedef PtrStep<float> PtrStepf;
-        typedef PtrStep<int> PtrStepi;
+//#undef __CV_GPU_DEPR_BEFORE__
+//#undef __CV_GPU_DEPR_AFTER__

-        typedef PtrElemStep_<unsigned char> PtrElemStep;
-        typedef PtrElemStep_<float> PtrElemStepf;
-        typedef PtrElemStep_<int> PtrElemStepi;
    }
 }

 #endif // __cplusplus

-#endif /* __OPENCV_GPU_DevMem2D_HPP__ */
+#endif /* __OPENCV_CORE_DEVPTRS_HPP__ */
--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@ -46,7 +46,7 @@
 #ifdef __cplusplus

 #include "opencv2/core/core.hpp"
-#include "opencv2/core/devmem2d.hpp"
+#include "opencv2/core/cuda_devptrs.hpp"

 namespace cv { namespace gpu
 {
@ -268,10 +268,14 @@ namespace cv { namespace gpu
        template<typename _Tp> _Tp* ptr(int y = 0);
        template<typename _Tp> const _Tp* ptr(int y = 0) const;

-        template <typename _Tp> operator DevMem2D_<_Tp>() const;
-        template <typename _Tp> operator PtrStep_<_Tp>() const;
+        template <typename _Tp> operator PtrStepSz<_Tp>() const;
        template <typename _Tp> operator PtrStep<_Tp>() const;

+        // Deprecated function
+        __CV_GPU_DEPR_BEFORE__ template <typename _Tp> operator DevMem2D_<_Tp>() const __CV_GPU_DEPR_AFTER__;
+        #undef __CV_GPU_DEPR_BEFORE__
+        #undef __CV_GPU_DEPR_AFTER__
+
        /*! includes several bit-fields:
        - the magic signature
        - continuity flag
@ -502,14 +506,9 @@ namespace cv { namespace gpu
        return *this;
    }

-    template <class T> inline GpuMat::operator DevMem2D_<T>() const
+    template <class T> inline GpuMat::operator PtrStepSz<T>() const
    {
-        return DevMem2D_<T>(rows, cols, (T*)data, step);
-    }
-
-    template <class T> inline GpuMat::operator PtrStep_<T>() const
-    {
-        return PtrStep_<T>(static_cast< DevMem2D_<T> >(*this));
+        return PtrStepSz<T>(rows, cols, (T*)data, step);
    }

    template <class T> inline GpuMat::operator PtrStep<T>() const
@ -517,6 +516,11 @@ namespace cv { namespace gpu
        return PtrStep<T>((T*)data, step);
    }

+    template <class T> inline GpuMat::operator DevMem2D_<T>() const
+    {
+        return DevMem2D_<T>(rows, cols, (T*)data, step);
+    }
+
    inline GpuMat createContinuous(int rows, int cols, int type)
    {
        GpuMat m;
--- a/modules/core/src/cuda/matrix_operations.cu
+++ b/modules/core/src/cuda/matrix_operations.cu
@ -44,6 +44,18 @@
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/functional.hpp"

+namespace cv { namespace gpu { namespace device
+{
+    void writeScalar(const uchar*);
+    void writeScalar(const schar*);
+    void writeScalar(const ushort*);
+    void writeScalar(const short int*);
+    void writeScalar(const int*);
+    void writeScalar(const float*);
+    void writeScalar(const double*);
+    void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t);
+}}}
+
 namespace cv { namespace gpu { namespace device
 {
    template <typename T> struct shift_and_sizeof;
@ -59,17 +71,17 @@ namespace cv { namespace gpu { namespace device
    ////////////////////////////////// CopyTo /////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////

-    template <typename T> void copyToWithMask(DevMem2Db src, DevMem2Db dst, int cn, DevMem2Db mask, bool colorMask, cudaStream_t stream)
+    template <typename T> void copyToWithMask(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream)
    {
        if (colorMask)
-            cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<T>)dst, identity<T>(), SingleMask(mask), stream);
+            cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, identity<T>(), SingleMask(mask), stream);
        else
-            cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<T>)dst, identity<T>(), SingleMaskChannels(mask, cn), stream);
+            cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, identity<T>(), SingleMaskChannels(mask, cn), stream);
    }

-    void copyToWithMask_gpu(DevMem2Db src, DevMem2Db dst, size_t elemSize1, int cn, DevMem2Db mask, bool colorMask, cudaStream_t stream)
+    void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream)
    {
-        typedef void (*func_t)(DevMem2Db src, DevMem2Db dst, int cn, DevMem2Db mask, bool colorMask, cudaStream_t stream);
+        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);

        static func_t tab[] =
        {
@ -164,7 +176,7 @@ namespace cv { namespace gpu { namespace device
            }
    }
    template <typename T>
-    void set_to_gpu(DevMem2Db mat, const T* scalar, DevMem2Db mask, int channels, cudaStream_t stream)
+    void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream)
    {
        writeScalar(scalar);

@ -178,16 +190,16 @@ namespace cv { namespace gpu { namespace device
            cudaSafeCall ( cudaDeviceSynchronize() );
    }

-    template void set_to_gpu<uchar >(DevMem2Db mat, const uchar*  scalar, DevMem2Db mask, int channels, cudaStream_t stream);
-    template void set_to_gpu<schar >(DevMem2Db mat, const schar*  scalar, DevMem2Db mask, int channels, cudaStream_t stream);
-    template void set_to_gpu<ushort>(DevMem2Db mat, const ushort* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
-    template void set_to_gpu<short >(DevMem2Db mat, const short*  scalar, DevMem2Db mask, int channels, cudaStream_t stream);
-    template void set_to_gpu<int   >(DevMem2Db mat, const int*    scalar, DevMem2Db mask, int channels, cudaStream_t stream);
-    template void set_to_gpu<float >(DevMem2Db mat, const float*  scalar, DevMem2Db mask, int channels, cudaStream_t stream);
-    template void set_to_gpu<double>(DevMem2Db mat, const double* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
+    template void set_to_gpu<uchar >(PtrStepSzb mat, const uchar*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+    template void set_to_gpu<schar >(PtrStepSzb mat, const schar*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+    template void set_to_gpu<ushort>(PtrStepSzb mat, const ushort* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+    template void set_to_gpu<short >(PtrStepSzb mat, const short*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+    template void set_to_gpu<int   >(PtrStepSzb mat, const int*    scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+    template void set_to_gpu<float >(PtrStepSzb mat, const float*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+    template void set_to_gpu<double>(PtrStepSzb mat, const double* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);

    template <typename T>
-    void set_to_gpu(DevMem2Db mat, const T* scalar, int channels, cudaStream_t stream)
+    void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream)
    {
        writeScalar(scalar);

@ -201,13 +213,13 @@ namespace cv { namespace gpu { namespace device
            cudaSafeCall ( cudaDeviceSynchronize() );
    }

-    template void set_to_gpu<uchar >(DevMem2Db mat, const uchar*  scalar, int channels, cudaStream_t stream);
-    template void set_to_gpu<schar >(DevMem2Db mat, const schar*  scalar, int channels, cudaStream_t stream);
-    template void set_to_gpu<ushort>(DevMem2Db mat, const ushort* scalar, int channels, cudaStream_t stream);
-    template void set_to_gpu<short >(DevMem2Db mat, const short*  scalar, int channels, cudaStream_t stream);
-    template void set_to_gpu<int   >(DevMem2Db mat, const int*    scalar, int channels, cudaStream_t stream);
-    template void set_to_gpu<float >(DevMem2Db mat, const float*  scalar, int channels, cudaStream_t stream);
-    template void set_to_gpu<double>(DevMem2Db mat, const double* scalar, int channels, cudaStream_t stream);
+    template void set_to_gpu<uchar >(PtrStepSzb mat, const uchar*  scalar, int channels, cudaStream_t stream);
+    template void set_to_gpu<schar >(PtrStepSzb mat, const schar*  scalar, int channels, cudaStream_t stream);
+    template void set_to_gpu<ushort>(PtrStepSzb mat, const ushort* scalar, int channels, cudaStream_t stream);
+    template void set_to_gpu<short >(PtrStepSzb mat, const short*  scalar, int channels, cudaStream_t stream);
+    template void set_to_gpu<int   >(PtrStepSzb mat, const int*    scalar, int channels, cudaStream_t stream);
+    template void set_to_gpu<float >(PtrStepSzb mat, const float*  scalar, int channels, cudaStream_t stream);
+    template void set_to_gpu<double>(PtrStepSzb mat, const double* scalar, int channels, cudaStream_t stream);

    ///////////////////////////////////////////////////////////////////////////
    //////////////////////////////// ConvertTo ////////////////////////////////
@ -274,12 +286,12 @@ namespace cv { namespace gpu { namespace device
    };

    template<typename T, typename D>
-    void cvt_(DevMem2Db src, DevMem2Db dst, double alpha, double beta, cudaStream_t stream)
+    void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream)
    {
        cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
        cudaSafeCall( cudaSetDoubleForDevice(&beta) );
        Convertor<T, D> op(alpha, beta);
-        cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, WithOutMask(), stream);
+        cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
    }

 #if defined  __clang__
@ -287,9 +299,9 @@ namespace cv { namespace gpu { namespace device
 # pragma clang diagnostic ignored "-Wmissing-declarations"
 #endif

-    void convert_gpu(DevMem2Db src, int sdepth, DevMem2Db dst, int ddepth, double alpha, double beta, cudaStream_t stream)
+    void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream)
    {
-        typedef void (*caller_t)(DevMem2Db src, DevMem2Db dst, double alpha, double beta, cudaStream_t stream);
+        typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream);

        static const caller_t tab[8][8] =
        {
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@ -761,15 +761,15 @@ namespace

 namespace cv { namespace gpu { namespace device
 {
-    void copyToWithMask_gpu(DevMem2Db src, DevMem2Db dst, size_t elemSize1, int cn, DevMem2Db mask, bool colorMask, cudaStream_t stream);
+    void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);

    template <typename T>
-    void set_to_gpu(DevMem2Db mat, const T* scalar, int channels, cudaStream_t stream);
+    void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream);

    template <typename T>
-    void set_to_gpu(DevMem2Db mat, const T* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
+    void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);

-    void convert_gpu(DevMem2Db src, int sdepth, DevMem2Db dst, int ddepth, double alpha, double beta, cudaStream_t stream);
+    void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream);
 }}}

 namespace
@ -787,9 +787,22 @@ namespace
    }
 }

+
 namespace cv { namespace gpu
 {
-    CV_EXPORTS void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0)
+    CV_EXPORTS void copyWithMask(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, const cv::gpu::GpuMat&, CUstream_st*);
+    CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&);
+    CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, double, double, CUstream_st*);
+    CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*);
+    CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*);
+    CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar);
+    CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&);
+}}
+
+
+namespace cv { namespace gpu
+{
+    void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0)
    {
        CV_Assert(src.size() == dst.size() && src.type() == dst.type());
        CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()));
@ -797,17 +810,17 @@ namespace cv { namespace gpu
        cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream);
    }

-    CV_EXPORTS void convertTo(const GpuMat& src, GpuMat& dst)
+    void convertTo(const GpuMat& src, GpuMat& dst)
    {
        cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0);
    }

-    CV_EXPORTS void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0)
+    void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0)
    {
        cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream);
    }

-    CV_EXPORTS void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
+    void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
    {
        typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream);

@ -820,7 +833,7 @@ namespace cv { namespace gpu
        callers[src.depth()](src, s, stream);
    }

-    CV_EXPORTS void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
+    void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
    {
        typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);

@ -833,12 +846,12 @@ namespace cv { namespace gpu
        callers[src.depth()](src, s, mask, stream);
    }

-    CV_EXPORTS void setTo(GpuMat& src, Scalar s)
+    void setTo(GpuMat& src, Scalar s)
    {
        setTo(src, s, 0);
    }

-    CV_EXPORTS void setTo(GpuMat& src, Scalar s, const GpuMat& mask)
+    void setTo(GpuMat& src, Scalar s, const GpuMat& mask)
    {
        setTo(src, s, mask, 0);
    }
--- a/modules/gpu/doc/data_structures.rst
+++ b/modules/gpu/doc/data_structures.rst
@ -5,24 +5,24 @@ Data Structures



-gpu::DevMem2D\_
+gpu::PtrStepSz
 ---------------
-.. ocv:class:: gpu::DevMem2D\_
+.. ocv:class:: gpu::PtrStepSz

 Lightweight class encapsulating pitched memory on a GPU and passed to nvcc-compiled code (CUDA kernels). Typically, it is used internally by OpenCV and by users who write device code. You can call its members from both host and device code. ::

-    template <typename T> struct DevMem2D_
+    template <typename T> struct PtrStepSz
    {
        int cols;
        int rows;
        T* data;
        size_t step;

-        DevMem2D_() : cols(0), rows(0), data(0), step(0){};
-        DevMem2D_(int rows, int cols, T *data, size_t step);
+        PtrStepSz() : cols(0), rows(0), data(0), step(0){};
+        PtrStepSz(int rows, int cols, T *data, size_t step);

        template <typename U>
-        explicit DevMem2D_(const DevMem2D_<U>& d);
+        explicit PtrStepSz(const PtrStepSz<U>& d);

        typedef T elem_type;
        enum { elem_size = sizeof(elem_type) };
@ -34,25 +34,25 @@ Lightweight class encapsulating pitched memory on a GPU and passed to nvcc-compi
        __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const;
    };

-    typedef DevMem2D_<unsigned char> DevMem2D;
-    typedef DevMem2D_<float> DevMem2Df;
-    typedef DevMem2D_<int> DevMem2Di;
+    typedef PtrStepSz<unsigned char> PtrStepSzb;
+    typedef PtrStepSz<float> PtrStepSzf;
+    typedef PtrStepSz<int> PtrStepSzi;



-gpu::PtrStep\_
+gpu::PtrStep
 --------------
-.. ocv:class:: gpu::PtrStep\_
+.. ocv:class:: gpu::PtrStep

-Structure similar to :ocv:class:`gpu::DevMem2D_` but containing only a pointer and row step. Width and height fields are excluded due to performance reasons. The structure is intended for internal use or for users who write device code. ::
+Structure similar to :ocv:class:`gpu::PtrStepSz` but containing only a pointer and row step. Width and height fields are excluded due to performance reasons. The structure is intended for internal use or for users who write device code. ::

-    template<typename T> struct PtrStep_
+    template<typename T> struct PtrStep
    {
            T* data;
            size_t step;

-            PtrStep_();
-            PtrStep_(const DevMem2D_<T>& mem);
+            PtrStep();
+            PtrStep(const PtrStepSz<T>& mem);

            typedef T elem_type;
            enum { elem_size = sizeof(elem_type) };
@ -62,25 +62,9 @@ Structure similar to :ocv:class:`gpu::DevMem2D_` but containing only a pointer a
            __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const;
    };

-    typedef PtrStep_<unsigned char> PtrStep;
-    typedef PtrStep_<float> PtrStepf;
-    typedef PtrStep_<int> PtrStepi;
-
-
-
-gpu::PtrElemStep\_
------------------
-.. ocv:class:: gpu::PtrElemStep\_
-
-Structure similar to :ocv:class:`gpu::DevMem2D_` but containing only a pointer and a row step in elements. Width and height fields are excluded due to performance reasons. This class can only be constructed if ``sizeof(T)`` is a multiple of 256. The structure is intended for internal use or for users who write device code. ::
-
-    template<typename T> struct PtrElemStep_ : public PtrStep_<T>
-    {
-            PtrElemStep_(const DevMem2D_<T>& mem);
-            __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0);
-            __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const;
-    };
-
+    typedef PtrStep<unsigned char> PtrStep;
+    typedef PtrStep<float> PtrStepf;
+    typedef PtrStep<int> PtrStepi;


 gpu::GpuMat
@ -93,7 +77,7 @@ Base storage class for GPU memory with reference counting. Its interface matches
 * no functions that return references to their data (because references on GPU are not valid for CPU)
 * no expression templates technique support

-Beware that the latter limitation may lead to overloaded matrix operators that cause memory allocations. The ``GpuMat`` class is convertible to :ocv:class:`gpu::DevMem2D_` and :ocv:class:`gpu::PtrStep_` so it can be passed directly to the kernel.
+Beware that the latter limitation may lead to overloaded matrix operators that cause memory allocations. The ``GpuMat`` class is convertible to :ocv:class:`gpu::PtrStepSz` and :ocv:class:`gpu::PtrStep` so it can be passed directly to the kernel.

 .. note:: In contrast with :ocv:class:`Mat`, in most cases ``GpuMat::isContinuous() == false`` . This means that rows are aligned to a size depending on the hardware. Single-row ``GpuMat`` is always a continuous matrix.

@ -113,10 +97,10 @@ Beware that the latter limitation may lead to overloaded matrix operators that c
            //! builds GpuMat from Mat. Blocks uploading to device.
            explicit GpuMat (const Mat& m);

-            //! returns lightweight DevMem2D_ structure for passing
+            //! returns lightweight PtrStepSz structure for passing
            //to nvcc-compiled code. Contains size, data ptr and step.
-            template <class T> operator DevMem2D_<T>() const;
-            template <class T> operator PtrStep_<T>() const;
+            template <class T> operator PtrStepSz<T>() const;
+            template <class T> operator PtrStep<T>() const;

            //! blocks uploading data to GpuMat.
            void upload(const cv::Mat& m);
--- a/modules/gpu/include/opencv2/gpu/devmem2d.hpp
+++ b/modules/gpu/include/opencv2/gpu/devmem2d.hpp
@ -40,4 +40,4 @@
 //
 //M*/

-#include "opencv2/core/devmem2d.hpp"
+#include "opencv2/core/cuda_devptrs.hpp"
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@ -454,8 +454,8 @@ namespace cv { namespace gpu { namespace device
 {
    namespace mathfunc
    {
-        void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
-        void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
+        void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream);
+        void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream);
    }
 }}}

--- a/modules/gpu/src/bgfg_gmg.cpp
+++ b/modules/gpu/src/bgfg_gmg.cpp
@ -58,7 +58,7 @@ namespace cv { namespace gpu { namespace device {
                           float decisionThreshold, int maxFeatures, int numInitializationFrames);

        template <typename SrcT>
-        void update_gpu(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures,
+        void update_gpu(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures,
                        int frameNum,  float learningRate, bool updateBackgroundModel, cudaStream_t stream);
    }
 }}}
@ -109,7 +109,7 @@ void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat
 {
    using namespace cv::gpu::device::bgfg_gmg;

-    typedef void (*func_t)(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures,
+    typedef void (*func_t)(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures,
                           int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
    static const func_t funcs[6][4] =
    {
--- a/modules/gpu/src/bgfg_mog.cpp
+++ b/modules/gpu/src/bgfg_mog.cpp
@ -62,14 +62,14 @@ namespace cv { namespace gpu { namespace device
 {
    namespace mog
    {
-        void mog_gpu(DevMem2Db frame, int cn, DevMem2Db fgmask, DevMem2Df weight, DevMem2Df sortKey, DevMem2Db mean, DevMem2Db var,
+        void mog_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var,
                     int nmixtures, float varThreshold, float learningRate, float backgroundRatio, float noiseSigma,
                     cudaStream_t stream);
-        void getBackgroundImage_gpu(int cn, DevMem2Df weight, DevMem2Db mean, DevMem2Db dst, int nmixtures, float backgroundRatio, cudaStream_t stream);
+        void getBackgroundImage_gpu(int cn, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream);

        void loadConstants(int nmixtures, float Tb, float TB, float Tg, float varInit, float varMin, float varMax, float tau, unsigned char shadowVal);
-        void mog2_gpu(DevMem2Db frame, int cn, DevMem2Db fgmask, DevMem2Db modesUsed, DevMem2Df weight, DevMem2Df variance, DevMem2Db mean, float alphaT, float prune, bool detectShadows, cudaStream_t stream);
-        void getBackgroundImage2_gpu(int cn, DevMem2Db modesUsed, DevMem2Df weight, DevMem2Db mean, DevMem2Db dst, cudaStream_t stream);
+        void mog2_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean, float alphaT, float prune, bool detectShadows, cudaStream_t stream);
+        void getBackgroundImage2_gpu(int cn, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream);
    }
 }}}

--- a/modules/gpu/src/bgfg_vibe.cpp
+++ b/modules/gpu/src/bgfg_vibe.cpp
@ -57,9 +57,9 @@ namespace cv { namespace gpu { namespace device
    {
        void loadConstants(int nbSamples, int reqMatches, int radius, int subsamplingFactor);

-        void init_gpu(DevMem2Db frame, int cn, DevMem2Db samples, DevMem2D_<unsigned int> randStates, cudaStream_t stream);
+        void init_gpu(PtrStepSzb frame, int cn, PtrStepSzb samples, PtrStepSz<unsigned int> randStates, cudaStream_t stream);

-        void update_gpu(DevMem2Db frame, int cn, DevMem2Db fgmask, DevMem2Db samples, DevMem2D_<unsigned int> randStates, cudaStream_t stream);
+        void update_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<unsigned int> randStates, cudaStream_t stream);
    }
 }}}

--- a/modules/gpu/src/bilateral_filter.cpp
+++ b/modules/gpu/src/bilateral_filter.cpp
@ -59,10 +59,10 @@ namespace cv { namespace gpu { namespace device
 {
    namespace bilateral_filter
    {
-        void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc);
+        void load_constants(float* table_color, PtrStepSzf table_space, int ndisp, int radius, short edge_disc, short max_disc);

-        void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
-        void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
+        void bilateral_filter_gpu(PtrStepSzb disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
+        void bilateral_filter_gpu(PtrStepSz<short> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
    }
 }}}

@ -120,7 +120,7 @@ namespace
                disp.copyTo(dst);
        }

-        bilateral_filter_gpu((DevMem2D_<T>)dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
+        bilateral_filter_gpu((PtrStepSz<T>)dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
    }

    typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold, 
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
@ -86,72 +86,72 @@ namespace cv { namespace gpu { namespace device
 {
    namespace bf_match
    {
-        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
-            const DevMem2Di& trainIdx, const DevMem2Df& distance,
+        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
+            const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
            int cc, cudaStream_t stream);
-        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
-            const DevMem2Di& trainIdx, const DevMem2Df& distance,
+        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
+            const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
            int cc, cudaStream_t stream);
-        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
-            const DevMem2Di& trainIdx, const DevMem2Df& distance,
+        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
+            const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
            int cc, cudaStream_t stream);

-        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
            int cc, cudaStream_t stream);
-        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
            int cc, cudaStream_t stream);
-        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
            int cc, cudaStream_t stream);
    }

    namespace bf_knnmatch
    {
-        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
-            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
+        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
            int cc, cudaStream_t stream);
-        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
-            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
+        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
            int cc, cudaStream_t stream);
-        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
-            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
+        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
            int cc, cudaStream_t stream);

-        template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-            const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
+        template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
            int cc, cudaStream_t stream);
-        template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-            const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
+        template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
            int cc, cudaStream_t stream);
-        template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-            const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
+        template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
            int cc, cudaStream_t stream);
    }

    namespace bf_radius_match
    {
-        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
-            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
+            const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
            int cc, cudaStream_t stream);
-        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
-            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
+            const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
            int cc, cudaStream_t stream);
-        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
-            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
+            const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
            int cc, cudaStream_t stream);

-        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
-            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
+            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
            int cc, cudaStream_t stream);

-        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
-            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
+            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
            int cc, cudaStream_t stream);

-        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
-            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
+            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
            int cc, cudaStream_t stream);
    }
 }}}
@ -200,8 +200,8 @@ void cv::gpu::BFMatcher_GPU::matchSingle(const GpuMat& query, const GpuMat& trai

    using namespace cv::gpu::device::bf_match;

-    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
-                             const DevMem2Di& trainIdx, const DevMem2Df& distance,
+    typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
+                             const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                             int cc, cudaStream_t stream);

    static const caller_t callersL1[] =
@ -301,9 +301,9 @@ void cv::gpu::BFMatcher_GPU::makeGpuCollection(GpuMat& trainCollection, GpuMat&

    if (masks.empty())
    {
-        Mat trainCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(DevMem2Db)));
+        Mat trainCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(PtrStepSzb)));

-        DevMem2Db* trainCollectionCPU_ptr = trainCollectionCPU.ptr<DevMem2Db>();
+        PtrStepSzb* trainCollectionCPU_ptr = trainCollectionCPU.ptr<PtrStepSzb>();

        for (size_t i = 0, size = trainDescCollection.size(); i < size; ++i, ++trainCollectionCPU_ptr)
            *trainCollectionCPU_ptr = trainDescCollection[i];
@ -315,10 +315,10 @@ void cv::gpu::BFMatcher_GPU::makeGpuCollection(GpuMat& trainCollection, GpuMat&
    {
        CV_Assert(masks.size() == trainDescCollection.size());

-        Mat trainCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(DevMem2Db)));
+        Mat trainCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(PtrStepSzb)));
        Mat maskCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(PtrStepb)));

-        DevMem2Db* trainCollectionCPU_ptr = trainCollectionCPU.ptr<DevMem2Db>();
+        PtrStepSzb* trainCollectionCPU_ptr = trainCollectionCPU.ptr<PtrStepSzb>();
        PtrStepb* maskCollectionCPU_ptr = maskCollectionCPU.ptr<PtrStepb>();

        for (size_t i = 0, size = trainDescCollection.size(); i < size; ++i, ++trainCollectionCPU_ptr, ++maskCollectionCPU_ptr)
@ -346,8 +346,8 @@ void cv::gpu::BFMatcher_GPU::matchCollection(const GpuMat& query, const GpuMat&

    using namespace cv::gpu::device::bf_match;

-    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-                             const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+    typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+                             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                             int cc, cudaStream_t stream);

    static const caller_t callersL1[] =
@ -460,8 +460,8 @@ void cv::gpu::BFMatcher_GPU::knnMatchSingle(const GpuMat& query, const GpuMat& t

    using namespace cv::gpu::device::bf_knnmatch;

-    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
-                             const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
+    typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
+                             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                             int cc, cudaStream_t stream);

    static const caller_t callersL1[] =
@ -592,8 +592,8 @@ void cv::gpu::BFMatcher_GPU::knnMatch2Collection(const GpuMat& query, const GpuM

    using namespace cv::gpu::device::bf_knnmatch;

-    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-                             const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
+    typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+                             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                             int cc, cudaStream_t stream);

    static const caller_t callersL1[] =
@ -776,8 +776,8 @@ void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat& query, const GpuMat

    using namespace cv::gpu::device::bf_radius_match;

-    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
-                             const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+    typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
+                             const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
                             int cc, cudaStream_t stream);

    static const caller_t callersL1[] =
@ -911,8 +911,8 @@ void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat&

    using namespace cv::gpu::device::bf_radius_match;

-    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
-                             const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+    typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
+                             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
                             int cc, cudaStream_t stream);

    static const caller_t callersL1[] =
@ -964,8 +964,8 @@ void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat&
    caller_t func = callers[query.depth()];
    CV_Assert(func != 0);

-    vector<DevMem2Db> trains_(trainDescCollection.begin(), trainDescCollection.end());
-    vector<DevMem2Db> masks_(masks.begin(), masks.end());
+    vector<PtrStepSzb> trains_(trainDescCollection.begin(), trainDescCollection.end());
+    vector<PtrStepSzb> masks_(masks.begin(), masks.end());

    func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0],
        trainIdx, imgIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
--- a/modules/gpu/src/calib3d.cpp
+++ b/modules/gpu/src/calib3d.cpp
@ -60,12 +60,12 @@ namespace cv { namespace gpu { namespace device
 {
    namespace transform_points
    {
-        void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream);
+        void call(const PtrStepSz<float3> src, const float* rot, const float* transl, PtrStepSz<float3> dst, cudaStream_t stream);
    }

    namespace project_points
    {
-        void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
+        void call(const PtrStepSz<float3> src, const float* rot, const float* transl, const float* proj, PtrStepSz<float2> dst, cudaStream_t stream);
    }

    namespace solve_pnp_ransac
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@ -352,18 +352,18 @@ namespace cv { namespace gpu { namespace device
                             float initalScale,
                             float factor,
                             int total,
-                             const DevMem2Db& mstages,
+                             const PtrStepSzb& mstages,
                             const int nstages,
-                             const DevMem2Di& mnodes,
-                             const DevMem2Df& mleaves,
-                             const DevMem2Di& msubsets,
-                             const DevMem2Db& mfeatures,
+                             const PtrStepSzi& mnodes,
+                             const PtrStepSzf& mleaves,
+                             const PtrStepSzi& msubsets,
+                             const PtrStepSzb& mfeatures,
                             const int subsetSize,
-                             DevMem2D_<int4> objects,
+                             PtrStepSz<int4> objects,
                             unsigned int* classified,
-                             DevMem2Di integral);
+                             PtrStepSzi integral);

-        void connectedConmonents(DevMem2D_<int4>  candidates, int ncandidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
+        void connectedConmonents(PtrStepSz<int4>  candidates, int ncandidates, PtrStepSz<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
    }
 }}}

--- a/modules/gpu/src/color.cpp
+++ b/modules/gpu/src/color.cpp
@ -59,9 +59,9 @@ namespace cv { namespace gpu {
    namespace device
    {
        template <int cn>
-        void Bayer2BGR_8u_gpu(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+        void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
        template <int cn>
-        void Bayer2BGR_16u_gpu(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+        void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
    }
 }}

@ -69,7 +69,7 @@ using namespace ::cv::gpu::device;

 namespace
 {
-    typedef void (*gpu_func_t)(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
+    typedef void (*gpu_func_t)(const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);

    void bgr_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
    {
@ -1336,7 +1336,7 @@ namespace

    void bayer_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, bool blue_last, bool start_with_green, Stream& stream)
    {
-        typedef void (*func_t)(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
        static const func_t funcs[3][4] =
        {
            {0,0,Bayer2BGR_8u_gpu<3>, Bayer2BGR_8u_gpu<4>},
--- a/modules/gpu/src/cuda/NV12ToARGB.cu
+++ b/modules/gpu/src/cuda/NV12ToARGB.cu
@ -191,7 +191,7 @@ namespace cv { namespace gpu { namespace device {
            dstImage[y * dstImagePitch + x + 1 ] = RGBAPACK_10bit(red[1], green[1], blue[1], constAlpha);
        }

-        void NV12ToARGB_gpu(const PtrStepb decodedFrame, DevMem2D_<uint> interopFrame, cudaStream_t stream)
+        void NV12ToARGB_gpu(const PtrStepb decodedFrame, PtrStepSz<uint> interopFrame, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(interopFrame.cols, 2 * block.x), divUp(interopFrame.rows, block.y));
--- a/modules/gpu/src/cuda/bf_knnmatch.cu
+++ b/modules/gpu/src/cuda/bf_knnmatch.cu
@ -209,7 +209,7 @@ namespace cv { namespace gpu { namespace device
        // Match Unrolled Cached

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
-        __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query)
+        __device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
        {
            #pragma unroll
            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
@ -220,7 +220,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
+        __device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                           typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                           float& bestDistance1, float& bestDistance2,
                                           int& bestTrainIdx1, int& bestTrainIdx2,
@ -281,7 +281,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
+        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

@ -314,8 +314,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
-                                 const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
+        void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
+                                 const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
                                 cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
@ -331,7 +331,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
+        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

@ -353,7 +353,7 @@ namespace cv { namespace gpu { namespace device

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
-                const DevMem2D_<T> train = trains[imgIdx];
+                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
            }
@ -375,8 +375,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
-                                 const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance,
+        void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
+                                 const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
                                 cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
@ -395,7 +395,7 @@ namespace cv { namespace gpu { namespace device
        // Match Unrolled

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        __device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
+        __device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                     typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                     float& bestDistance1, float& bestDistance2,
                                     int& bestTrainIdx1, int& bestTrainIdx2,
@ -460,7 +460,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
+        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

@ -491,8 +491,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
-                           const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
+        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
+                           const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
                           cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
@ -508,7 +508,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
+        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

@ -528,7 +528,7 @@ namespace cv { namespace gpu { namespace device

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
-                const DevMem2D_<T> train = trains[imgIdx];
+                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
            }
@ -550,8 +550,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
-                           const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance,
+        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
+                           const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
                           cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
@ -570,7 +570,7 @@ namespace cv { namespace gpu { namespace device
        // Match

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
-        __device__ void loop(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
+        __device__ void loop(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                             typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                             float& bestDistance1, float& bestDistance2,
                             int& bestTrainIdx1, int& bestTrainIdx2,
@ -634,7 +634,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
-        __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
+        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

@ -665,8 +665,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
-        void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
-                   const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
+        void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
+                   const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
                   cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
@ -682,7 +682,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
-        __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
+        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

@ -702,7 +702,7 @@ namespace cv { namespace gpu { namespace device

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
-                const DevMem2D_<T> train = trains[imgIdx];
+                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
            }
@ -724,8 +724,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
-        void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
-                   const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance,
+        void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
+                   const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
                   cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
@ -744,66 +744,66 @@ namespace cv { namespace gpu { namespace device
        // knnMatch 2 dispatcher

        template <typename Dist, typename T, typename Mask>
-        void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
-                              const DevMem2Db& trainIdx, const DevMem2Db& distance,
+        void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
+                              const PtrStepSzb& trainIdx, const PtrStepSzb& distance,
                              int cc, cudaStream_t stream)
        {
            (void)cc;
            if (query.cols <= 64)
            {
-                matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
+                matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 128)
            {
-                matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
+                matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            /*else if (query.cols <= 256)
            {
-                matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
+                matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 512)
            {
-                matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
+                matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 1024)
            {
-                matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
+                matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }*/
            else
            {
-                match<16, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
+                match<16, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
        }

        template <typename Dist, typename T, typename Mask>
-        void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
-                              const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
+        void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
+                              const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                              int cc, cudaStream_t stream)
        {
            (void)cc;
            if (query.cols <= 64)
            {
-                matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
+                matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 128)
            {
-                matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
+                matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            /*else if (query.cols <= 256)
            {
-                matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
+                matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 512)
            {
-                matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
+                matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 1024)
            {
-                matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
+                matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }*/
            else
            {
-                match<16, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
+                match<16, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
        }

@ -811,7 +811,7 @@ namespace cv { namespace gpu { namespace device
        // Calc distance kernel

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        __global__ void calcDistanceUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)
+        __global__ void calcDistanceUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
        {
            extern __shared__ int smem[];

@ -860,7 +860,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        void calcDistanceUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)
+        void calcDistanceUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
@ -875,7 +875,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
-        __global__ void calcDistance(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)
+        __global__ void calcDistance(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
        {
            extern __shared__ int smem[];

@ -923,7 +923,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
-        void calcDistance(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)
+        void calcDistance(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
@ -941,8 +941,8 @@ namespace cv { namespace gpu { namespace device
        // Calc Distance dispatcher

        template <typename Dist, typename T, typename Mask>
-        void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
-                                    const DevMem2Df& allDist,
+        void calcDistanceDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
+                                    const PtrStepSzf& allDist,
                                    int cc, cudaStream_t stream)
        {
            (void)cc;
@ -976,7 +976,7 @@ namespace cv { namespace gpu { namespace device
        // find knn match kernel

        template <int BLOCK_SIZE>
-        __global__ void findBestMatch(DevMem2Df allDist, int i, PtrStepi trainIdx, PtrStepf distance)
+        __global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance)
        {
            const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
            __shared__ float s_dist[SMEM_SIZE];
@ -1017,7 +1017,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE>
-        void findKnnMatch(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
+        void findKnnMatch(int k, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSzf& allDist, cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, 1, 1);
            const dim3 grid(trainIdx.rows, 1, 1);
@ -1032,17 +1032,17 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        void findKnnMatchDispatcher(int k, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream)
+        void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream)
        {
-            findKnnMatch<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), allDist, stream);
+            findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream);
        }

        ///////////////////////////////////////////////////////////////////////////////
        // knn match Dispatcher

        template <typename Dist, typename T, typename Mask>
-        void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const Mask& mask,
-            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
+        void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, int k, const Mask& mask,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
            int cc, cudaStream_t stream)
        {
            if (k == 2)
@ -1059,104 +1059,104 @@ namespace cv { namespace gpu { namespace device
        ///////////////////////////////////////////////////////////////////////////////
        // knn match caller

-        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
-            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
+        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
            int cc, cudaStream_t stream)
        {
            if (mask.data)
-                matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
+                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
            else
-                matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
+                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
        }

-        template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
-        //template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
-        template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
-        template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
-        template void matchL1_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
-        template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
+        template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
+        //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
+        template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
+        template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
+        template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
+        template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);

-        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
-            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
+        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
            int cc, cudaStream_t stream)
        {
            if (mask.data)
-                matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
+                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
            else
-                matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
+                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
        }

-        //template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
-        //template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
-        //template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
-        //template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
-        //template void matchL2_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
-        template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
+        template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);

-        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
-            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
+        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
            int cc, cudaStream_t stream)
        {
            if (mask.data)
-                matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
+                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
            else
-                matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
+                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
        }

-        template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
-        //template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
-        template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
-        //template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
-        template void matchHamming_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
+        template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
+        //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
+        template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
+        //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
+        template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);

-        template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-            const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
+        template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
            int cc, cudaStream_t stream)
        {
            if (masks.data)
-                match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
+                match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
            else
-                match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
+                match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
        }

-        template void match2L1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
-        //template void match2L1_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
-        template void match2L1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
-        template void match2L1_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
-        template void match2L1_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
-        template void match2L1_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
+        template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
+        //template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
+        template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
+        template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
+        template void match2L1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
+        template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);

-        template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-            const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
+        template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
            int cc, cudaStream_t stream)
        {
            if (masks.data)
-                match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
+                match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
            else
-                match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
+                match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
        }

-        //template void match2L2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
-        //template void match2L2_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
-        //template void match2L2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
-        //template void match2L2_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
-        //template void match2L2_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Di& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
-        template void match2L2_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
+        //template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
+        //template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
+        //template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
+        //template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
+        //template void match2L2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
+        template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);

-        template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-            const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
+        template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
            int cc, cudaStream_t stream)
        {
            if (masks.data)
-                match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
+                match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
            else
-                match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
+                match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
        }

-        template void match2Hamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
-        //template void match2Hamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
-        template void match2Hamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
-        //template void match2Hamming_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
-        template void match2Hamming_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
+        template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
+        //template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
+        template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
+        //template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
+        template void match2Hamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    } // namespace bf_knnmatch
 }}} // namespace cv { namespace gpu { namespace device {
--- a/modules/gpu/src/cuda/bf_match.cu
+++ b/modules/gpu/src/cuda/bf_match.cu
@ -86,7 +86,7 @@ namespace cv { namespace gpu { namespace device
        // Match Unrolled Cached

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
-        __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query)
+        __device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
        {
            #pragma unroll
            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
@ -97,7 +97,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query,volatile int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
+        __device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query,volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                           typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                           float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
        {
@ -143,7 +143,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
+        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

@ -174,8 +174,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
-                                 const DevMem2Di& trainIdx, const DevMem2Df& distance,
+        void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
+                                 const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                                 cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
@ -191,7 +191,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask,
+        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                                            int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
            extern __shared__ int smem[];
@ -211,7 +211,7 @@ namespace cv { namespace gpu { namespace device

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
-                const DevMem2D_<T> train = trains[imgIdx];
+                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
            }
@ -233,8 +233,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
-                                 const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+        void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
+                                 const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                                 cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
@ -253,7 +253,7 @@ namespace cv { namespace gpu { namespace device
        // Match Unrolled

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        __device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query,volatile int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
+        __device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query,volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                     typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                     float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
        {
@ -303,7 +303,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
+        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

@ -332,8 +332,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
-                           const DevMem2Di& trainIdx, const DevMem2Df& distance,
+        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
+                           const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                           cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
@ -349,7 +349,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask,
+        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                                      int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
            extern __shared__ int smem[];
@ -367,7 +367,7 @@ namespace cv { namespace gpu { namespace device

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
-                const DevMem2D_<T> train = trains[imgIdx];
+                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
            }
@ -389,8 +389,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
-                           const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
+                           const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                           cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
@ -409,7 +409,7 @@ namespace cv { namespace gpu { namespace device
        // Match

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
-        __device__ void loop(int queryIdx, const DevMem2D_<T>& query, volatile int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
+        __device__ void loop(int queryIdx, const PtrStepSz<T>& query, volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                             typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                             float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
        {
@ -458,7 +458,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
-        __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
+        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

@ -487,8 +487,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
-        void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
-                   const DevMem2Di& trainIdx, const DevMem2Df& distance,
+        void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
+                   const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                   cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
@ -504,7 +504,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
-        __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask,
+        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                              int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
            extern __shared__ int smem[];
@ -521,7 +521,7 @@ namespace cv { namespace gpu { namespace device
            Mask m = mask;
            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
-                const DevMem2D_<T> train = trains[imgIdx];
+                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
            }
@ -543,8 +543,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
-        void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
-                   const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+        void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
+                   const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                   cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
@ -563,8 +563,8 @@ namespace cv { namespace gpu { namespace device
        // Match dispatcher

        template <typename Dist, typename T, typename Mask>
-        void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
-                             const DevMem2Di& trainIdx, const DevMem2Df& distance,
+        void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
+                             const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                             int cc, cudaStream_t stream)
        {
            (void)cc;
@ -595,8 +595,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename Dist, typename T, typename Mask>
-        void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
-                             const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+        void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
+                             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                             int cc, cudaStream_t stream)
        {
            (void)cc;
@ -629,152 +629,152 @@ namespace cv { namespace gpu { namespace device
        ///////////////////////////////////////////////////////////////////////////////
        // Match caller

-        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
-                                               const DevMem2Di& trainIdx, const DevMem2Df& distance,
+        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
+                                               const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                                               int cc, cudaStream_t stream)
        {
            if (mask.data)
            {
-                matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask),
+                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
                    trainIdx, distance,
                    cc, stream);
            }
            else
            {
-                matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(),
+                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
                    trainIdx, distance,
                    cc, stream);
            }
        }

-        template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        //template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        template void matchL1_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+        template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);

-        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
-                                               const DevMem2Di& trainIdx, const DevMem2Df& distance,
+        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
+                                               const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                                               int cc, cudaStream_t stream)
        {
            if (mask.data)
            {
-                matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask),
+                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
                    trainIdx, distance,
                    cc, stream);
            }
            else
            {
-                matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(),
+                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
                    trainIdx, distance,
                    cc, stream);
            }
        }

-        //template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        //template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        //template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        //template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        //template void matchL2_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);

-        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
-                                                    const DevMem2Di& trainIdx, const DevMem2Df& distance,
+        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
+                                                    const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                                                    int cc, cudaStream_t stream)
        {
            if (mask.data)
            {
-                matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask),
+                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
                    trainIdx, distance,
                    cc, stream);
            }
            else
            {
-                matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(),
+                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
                    trainIdx, distance,
                    cc, stream);
            }
        }

-        template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        //template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        //template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        template void matchHamming_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+        template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);

-        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-                                               const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+                                               const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                                               int cc, cudaStream_t stream)
        {
            if (masks.data)
            {
-                matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
+                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
                    trainIdx, imgIdx, distance,
                    cc, stream);
            }
            else
            {
-                matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(),
+                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
                    trainIdx, imgIdx, distance,
                    cc, stream);
            }
        }

-        template void matchL1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        //template void matchL1_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        template void matchL1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        template void matchL1_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        template void matchL1_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        template void matchL1_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+        template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        //template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        template void matchL1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);

-        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-                                               const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+                                               const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                                               int cc, cudaStream_t stream)
        {
            if (masks.data)
            {
-                matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
+                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
                    trainIdx, imgIdx, distance,
                    cc, stream);
            }
            else
            {
-                matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(),
+                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
                    trainIdx, imgIdx, distance,
                    cc, stream);
            }
        }

-        //template void matchL2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        //template void matchL2_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        //template void matchL2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        //template void matchL2_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        //template void matchL2_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        template void matchL2_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);

-        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-                                                    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+                                                    const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                                                    int cc, cudaStream_t stream)
        {
            if (masks.data)
            {
-                matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
+                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
                    trainIdx, imgIdx, distance,
                    cc, stream);
            }
            else
            {
-                matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(),
+                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
                    trainIdx, imgIdx, distance,
                    cc, stream);
            }
        }

-        template void matchHamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        //template void matchHamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        template void matchHamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        //template void matchHamming_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-        template void matchHamming_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+        template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        //template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        //template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
+        template void matchHamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
    } // namespace bf_match
 }}} // namespace cv { namespace gpu { namespace device {
--- a/modules/gpu/src/cuda/bf_radius_match.cu
+++ b/modules/gpu/src/cuda/bf_radius_match.cu
@ -53,7 +53,7 @@ namespace cv { namespace gpu { namespace device
        // Match Unrolled

        template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
-        __global__ void matchUnrolled(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
+        __global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
            PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
        {
            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
@ -113,8 +113,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
-            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, cudaStream_t stream)
+        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
+            const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
@ -130,8 +130,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T>
-        void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
-            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
+            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
            cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
@ -140,7 +140,7 @@ namespace cv { namespace gpu { namespace device

            for (int i = 0; i < n; ++i)
            {
-                const DevMem2D_<T> train = trains[i];
+                const PtrStepSz<T> train = trains[i];

                const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

@ -165,7 +165,7 @@ namespace cv { namespace gpu { namespace device
        // Match

        template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
-        __global__ void match(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
+        __global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
            PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
        {
            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
@ -224,8 +224,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
-        void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
-            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
+            const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
            cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
@ -242,8 +242,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <int BLOCK_SIZE, typename Dist, typename T>
-        void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
-            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
+            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
            cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
@ -252,7 +252,7 @@ namespace cv { namespace gpu { namespace device

            for (int i = 0; i < n; ++i)
            {
-                const DevMem2D_<T> train = trains[i];
+                const PtrStepSz<T> train = trains[i];

                const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

@ -277,8 +277,8 @@ namespace cv { namespace gpu { namespace device
        // Match dispatcher

        template <typename Dist, typename T, typename Mask>
-        void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
-                             const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
+                             const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
                             int cc, cudaStream_t stream)
        {
            (void)cc;
@ -309,8 +309,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename Dist, typename T>
-        void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
-                             const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
+                             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
                             int cc, cudaStream_t stream)
        {
            (void)cc;
@ -343,125 +343,125 @@ namespace cv { namespace gpu { namespace device
        ///////////////////////////////////////////////////////////////////////////////
        // Radius Match caller

-        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
-            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
+            const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
            int cc, cudaStream_t stream)
        {
            if (mask.data)
            {
-                matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
+                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
                    trainIdx, distance, nMatches,
                    cc, stream);
            }
            else
            {
-                matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
+                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
                    trainIdx, distance, nMatches,
                    cc, stream);
            }
        }

-        template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        //template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        template void matchL1_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);

-        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
-            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
+            const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
            int cc, cudaStream_t stream)
        {
            if (mask.data)
            {
-                matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
+                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
                    trainIdx, distance, nMatches,
                    cc, stream);
            }
            else
            {
-                matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
+                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
                    trainIdx, distance, nMatches,
                    cc, stream);
            }
        }

-        //template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        //template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        //template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        //template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        //template void matchL2_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);

-        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
-            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
+            const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
            int cc, cudaStream_t stream)
        {
            if (mask.data)
            {
-                matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
+                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
                    trainIdx, distance, nMatches,
                    cc, stream);
            }
            else
            {
-                matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
+                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
                    trainIdx, distance, nMatches,
                    cc, stream);
            }
        }

-        template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        //template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        //template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        template void matchHamming_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);

-        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
-            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
+            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
            int cc, cudaStream_t stream)
        {
-            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
+            matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
                trainIdx, imgIdx, distance, nMatches,
                cc, stream);
        }

-        template void matchL1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        //template void matchL1_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        template void matchL1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        template void matchL1_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        template void matchL1_gpu<int   >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        template void matchL1_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);

-        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
-            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
+            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
            int cc, cudaStream_t stream)
        {
-            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
+            matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
                trainIdx, imgIdx, distance, nMatches,
                cc, stream);
        }

-        //template void matchL2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        //template void matchL2_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        //template void matchL2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        //template void matchL2_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        //template void matchL2_gpu<int   >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        template void matchL2_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);

-        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
-            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
+            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
            int cc, cudaStream_t stream)
        {
-            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
+            matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
                trainIdx, imgIdx, distance, nMatches,
                cc, stream);
        }

-        template void matchHamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        //template void matchHamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        template void matchHamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        //template void matchHamming_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-        template void matchHamming_gpu<int   >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchHamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
    } // namespace bf_radius_match
 }}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/bgfg_gmg.cu
+++ b/modules/gpu/src/cuda/bgfg_gmg.cu
@ -168,7 +168,7 @@ namespace cv { namespace gpu { namespace device {
        template <typename T> struct Quantization : detail::Quantization<VecTraits<T>::cn> {};

        template <typename SrcT>
-        __global__ void update(const PtrStep_<SrcT> frame, PtrStepb fgmask, PtrStepi colors_, PtrStepf weights_, PtrStepi nfeatures_,
+        __global__ void update(const PtrStep<SrcT> frame, PtrStepb fgmask, PtrStepi colors_, PtrStepf weights_, PtrStepi nfeatures_,
                               const int frameNum, const float learningRate, const bool updateBackgroundModel)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -222,7 +222,7 @@ namespace cv { namespace gpu { namespace device {
        }

        template <typename SrcT>
-        void update_gpu(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures,
+        void update_gpu(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures,
                        int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream)
        {
            const dim3 block(32, 8);
@ -230,7 +230,7 @@ namespace cv { namespace gpu { namespace device {

            cudaSafeCall( cudaFuncSetCacheConfig(update<SrcT>, cudaFuncCachePreferL1) );

-            update<SrcT><<<grid, block, 0, stream>>>((DevMem2D_<SrcT>) frame, fgmask, colors, weights, nfeatures, frameNum, learningRate, updateBackgroundModel);
+            update<SrcT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, colors, weights, nfeatures, frameNum, learningRate, updateBackgroundModel);

            cudaSafeCall( cudaGetLastError() );

@ -238,16 +238,16 @@ namespace cv { namespace gpu { namespace device {
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        template void update_gpu<uchar  >(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
-        template void update_gpu<uchar3 >(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
-        template void update_gpu<uchar4 >(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
+        template void update_gpu<uchar  >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
+        template void update_gpu<uchar3 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
+        template void update_gpu<uchar4 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);

-        template void update_gpu<ushort >(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
-        template void update_gpu<ushort3>(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
-        template void update_gpu<ushort4>(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
+        template void update_gpu<ushort >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
+        template void update_gpu<ushort3>(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
+        template void update_gpu<ushort4>(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);

-        template void update_gpu<float  >(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
-        template void update_gpu<float3 >(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
-        template void update_gpu<float4 >(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
+        template void update_gpu<float  >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
+        template void update_gpu<float3 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
+        template void update_gpu<float4 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
    }
 }}}
--- a/modules/gpu/src/cuda/bgfg_mog.cu
+++ b/modules/gpu/src/cuda/bgfg_mog.cu
@ -121,8 +121,8 @@ namespace cv { namespace gpu { namespace device
        // MOG without learning

        template <typename SrcT, typename WorkT>
-        __global__ void mog_withoutLearning(const DevMem2D_<SrcT> frame, PtrStepb fgmask,
-                                            const PtrStepf gmm_weight, const PtrStep_<WorkT> gmm_mean, const PtrStep_<WorkT> gmm_var,
+        __global__ void mog_withoutLearning(const PtrStepSz<SrcT> frame, PtrStepb fgmask,
+                                            const PtrStepf gmm_weight, const PtrStep<WorkT> gmm_mean, const PtrStep<WorkT> gmm_var,
                                            const int nmixtures, const float varThreshold, const float backgroundRatio)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -172,7 +172,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename SrcT, typename WorkT>
-        void mog_withoutLearning_caller(DevMem2Db frame, DevMem2Db fgmask, DevMem2Df weight, DevMem2Db mean, DevMem2Db var,
+        void mog_withoutLearning_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb var,
                                        int nmixtures, float varThreshold, float backgroundRatio, cudaStream_t stream)
        {
            dim3 block(32, 8);
@ -180,8 +180,8 @@ namespace cv { namespace gpu { namespace device

            cudaSafeCall( cudaFuncSetCacheConfig(mog_withoutLearning<SrcT, WorkT>, cudaFuncCachePreferL1) );

-            mog_withoutLearning<SrcT, WorkT><<<grid, block, 0, stream>>>((DevMem2D_<SrcT>) frame, fgmask,
-                                                                         weight, (DevMem2D_<WorkT>) mean, (DevMem2D_<WorkT>) var,
+            mog_withoutLearning<SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask,
+                                                                         weight, (PtrStepSz<WorkT>) mean, (PtrStepSz<WorkT>) var,
                                                                         nmixtures, varThreshold, backgroundRatio);

            cudaSafeCall( cudaGetLastError() );
@ -194,8 +194,8 @@ namespace cv { namespace gpu { namespace device
        // MOG with learning

        template <typename SrcT, typename WorkT>
-        __global__ void mog_withLearning(const DevMem2D_<SrcT> frame, PtrStepb fgmask,
-                                         PtrStepf gmm_weight, PtrStepf gmm_sortKey, PtrStep_<WorkT> gmm_mean, PtrStep_<WorkT> gmm_var,
+        __global__ void mog_withLearning(const PtrStepSz<SrcT> frame, PtrStepb fgmask,
+                                         PtrStepf gmm_weight, PtrStepf gmm_sortKey, PtrStep<WorkT> gmm_mean, PtrStep<WorkT> gmm_var,
                                         const int nmixtures, const float varThreshold, const float backgroundRatio, const float learningRate, const float minVar)
        {
            const float w0 = 0.05f;
@ -324,7 +324,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename SrcT, typename WorkT>
-        void mog_withLearning_caller(DevMem2Db frame, DevMem2Db fgmask, DevMem2Df weight, DevMem2Df sortKey, DevMem2Db mean, DevMem2Db var,
+        void mog_withLearning_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var,
                                     int nmixtures, float varThreshold, float backgroundRatio, float learningRate, float minVar,
                                     cudaStream_t stream)
        {
@ -333,8 +333,8 @@ namespace cv { namespace gpu { namespace device

            cudaSafeCall( cudaFuncSetCacheConfig(mog_withLearning<SrcT, WorkT>, cudaFuncCachePreferL1) );

-            mog_withLearning<SrcT, WorkT><<<grid, block, 0, stream>>>((DevMem2D_<SrcT>) frame, fgmask,
-                                                                      weight, sortKey, (DevMem2D_<WorkT>) mean, (DevMem2D_<WorkT>) var,
+            mog_withLearning<SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask,
+                                                                      weight, sortKey, (PtrStepSz<WorkT>) mean, (PtrStepSz<WorkT>) var,
                                                                      nmixtures, varThreshold, backgroundRatio, learningRate, minVar);

            cudaSafeCall( cudaGetLastError() );
@ -346,10 +346,10 @@ namespace cv { namespace gpu { namespace device
        ///////////////////////////////////////////////////////////////
        // MOG

-        void mog_gpu(DevMem2Db frame, int cn, DevMem2Db fgmask, DevMem2Df weight, DevMem2Df sortKey, DevMem2Db mean, DevMem2Db var, int nmixtures, float varThreshold, float learningRate, float backgroundRatio, float noiseSigma, cudaStream_t stream)
+        void mog_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var, int nmixtures, float varThreshold, float learningRate, float backgroundRatio, float noiseSigma, cudaStream_t stream)
        {
-            typedef void (*withoutLearning_t)(DevMem2Db frame, DevMem2Db fgmask, DevMem2Df weight, DevMem2Db mean, DevMem2Db var, int nmixtures, float varThreshold, float backgroundRatio, cudaStream_t stream);
-            typedef void (*withLearning_t)(DevMem2Db frame, DevMem2Db fgmask, DevMem2Df weight, DevMem2Df sortKey, DevMem2Db mean, DevMem2Db var, int nmixtures, float varThreshold, float backgroundRatio, float learningRate, float minVar, cudaStream_t stream);
+            typedef void (*withoutLearning_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb var, int nmixtures, float varThreshold, float backgroundRatio, cudaStream_t stream);
+            typedef void (*withLearning_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var, int nmixtures, float varThreshold, float backgroundRatio, float learningRate, float minVar, cudaStream_t stream);

            static const withoutLearning_t withoutLearning[] =
            {
@ -369,7 +369,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename WorkT, typename OutT>
-        __global__ void getBackgroundImage(const PtrStepf gmm_weight, const PtrStep_<WorkT> gmm_mean, DevMem2D_<OutT> dst, const int nmixtures, const float backgroundRatio)
+        __global__ void getBackgroundImage(const PtrStepf gmm_weight, const PtrStep<WorkT> gmm_mean, PtrStepSz<OutT> dst, const int nmixtures, const float backgroundRatio)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -399,23 +399,23 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename WorkT, typename OutT>
-        void getBackgroundImage_caller(DevMem2Df weight, DevMem2Db mean, DevMem2Db dst, int nmixtures, float backgroundRatio, cudaStream_t stream)
+        void getBackgroundImage_caller(PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(getBackgroundImage<WorkT, OutT>, cudaFuncCachePreferL1) );

-            getBackgroundImage<WorkT, OutT><<<grid, block, 0, stream>>>(weight, (DevMem2D_<WorkT>) mean, (DevMem2D_<OutT>) dst, nmixtures, backgroundRatio);
+            getBackgroundImage<WorkT, OutT><<<grid, block, 0, stream>>>(weight, (PtrStepSz<WorkT>) mean, (PtrStepSz<OutT>) dst, nmixtures, backgroundRatio);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        void getBackgroundImage_gpu(int cn, DevMem2Df weight, DevMem2Db mean, DevMem2Db dst, int nmixtures, float backgroundRatio, cudaStream_t stream)
+        void getBackgroundImage_gpu(int cn, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream)
        {
-            typedef void (*func_t)(DevMem2Df weight, DevMem2Db mean, DevMem2Db dst, int nmixtures, float backgroundRatio, cudaStream_t stream);
+            typedef void (*func_t)(PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream);

            static const func_t funcs[] =
            {
@ -455,8 +455,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <bool detectShadows, typename SrcT, typename WorkT>
-        __global__ void mog2(const DevMem2D_<SrcT> frame, PtrStepb fgmask, PtrStepb modesUsed,
-                             PtrStepf gmm_weight, PtrStepf gmm_variance, PtrStep_<WorkT> gmm_mean,
+        __global__ void mog2(const PtrStepSz<SrcT> frame, PtrStepb fgmask, PtrStepb modesUsed,
+                             PtrStepf gmm_weight, PtrStepf gmm_variance, PtrStep<WorkT> gmm_mean,
                             const float alphaT, const float alpha1, const float prune)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -653,7 +653,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename SrcT, typename WorkT>
-        void mog2_caller(DevMem2Db frame, DevMem2Db fgmask, DevMem2Db modesUsed, DevMem2Df weight, DevMem2Df variance, DevMem2Db mean,
+        void mog2_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean,
                         float alphaT, float prune, bool detectShadows, cudaStream_t stream)
        {
            dim3 block(32, 8);
@ -665,16 +665,16 @@ namespace cv { namespace gpu { namespace device
            {
                cudaSafeCall( cudaFuncSetCacheConfig(mog2<true, SrcT, WorkT>, cudaFuncCachePreferL1) );

-                mog2<true, SrcT, WorkT><<<grid, block, 0, stream>>>((DevMem2D_<SrcT>) frame, fgmask, modesUsed,
-                                                                    weight, variance, (DevMem2D_<WorkT>) mean,
+                mog2<true, SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, modesUsed,
+                                                                    weight, variance, (PtrStepSz<WorkT>) mean,
                                                                    alphaT, alpha1, prune);
            }
            else
            {
                cudaSafeCall( cudaFuncSetCacheConfig(mog2<false, SrcT, WorkT>, cudaFuncCachePreferL1) );

-                mog2<false, SrcT, WorkT><<<grid, block, 0, stream>>>((DevMem2D_<SrcT>) frame, fgmask, modesUsed,
-                                                                    weight, variance, (DevMem2D_<WorkT>) mean,
+                mog2<false, SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, modesUsed,
+                                                                    weight, variance, (PtrStepSz<WorkT>) mean,
                                                                    alphaT, alpha1, prune);
            }

@ -684,10 +684,10 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        void mog2_gpu(DevMem2Db frame, int cn, DevMem2Db fgmask, DevMem2Db modesUsed, DevMem2Df weight, DevMem2Df variance, DevMem2Db mean,
+        void mog2_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean,
                      float alphaT, float prune, bool detectShadows, cudaStream_t stream)
        {
-            typedef void (*func_t)(DevMem2Db frame, DevMem2Db fgmask, DevMem2Db modesUsed, DevMem2Df weight, DevMem2Df variance, DevMem2Db mean, float alphaT, float prune, bool detectShadows, cudaStream_t stream);
+            typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean, float alphaT, float prune, bool detectShadows, cudaStream_t stream);

            static const func_t funcs[] =
            {
@ -698,7 +698,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename WorkT, typename OutT>
-        __global__ void getBackgroundImage2(const DevMem2Db modesUsed, const PtrStepf gmm_weight, const PtrStep_<WorkT> gmm_mean, PtrStep_<OutT> dst)
+        __global__ void getBackgroundImage2(const PtrStepSzb modesUsed, const PtrStepf gmm_weight, const PtrStep<WorkT> gmm_mean, PtrStep<OutT> dst)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -730,23 +730,23 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename WorkT, typename OutT>
-        void getBackgroundImage2_caller(DevMem2Db modesUsed, DevMem2Df weight, DevMem2Db mean, DevMem2Db dst, cudaStream_t stream)
+        void getBackgroundImage2_caller(PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(modesUsed.cols, block.x), divUp(modesUsed.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(getBackgroundImage2<WorkT, OutT>, cudaFuncCachePreferL1) );

-            getBackgroundImage2<WorkT, OutT><<<grid, block, 0, stream>>>(modesUsed, weight, (DevMem2D_<WorkT>) mean, (DevMem2D_<OutT>) dst);
+            getBackgroundImage2<WorkT, OutT><<<grid, block, 0, stream>>>(modesUsed, weight, (PtrStepSz<WorkT>) mean, (PtrStepSz<OutT>) dst);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        void getBackgroundImage2_gpu(int cn, DevMem2Db modesUsed, DevMem2Df weight, DevMem2Db mean, DevMem2Db dst, cudaStream_t stream)
+        void getBackgroundImage2_gpu(int cn, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream)
        {
-            typedef void (*func_t)(DevMem2Db modesUsed, DevMem2Df weight, DevMem2Db mean, DevMem2Db dst, cudaStream_t stream);
+            typedef void (*func_t)(PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream);

            static const func_t funcs[] =
            {
--- a/modules/gpu/src/cuda/bgfg_vibe.cu
+++ b/modules/gpu/src/cuda/bgfg_vibe.cu
@ -90,7 +90,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename SrcT, typename SampleT>
-        __global__ void init(const DevMem2D_<SrcT> frame, PtrStep_<SampleT> samples, PtrStep_<uint> randStates)
+        __global__ void init(const PtrStepSz<SrcT> frame, PtrStep<SampleT> samples, PtrStep<uint> randStates)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -116,23 +116,23 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename SrcT, typename SampleT>
-        void init_caller(DevMem2Db frame, DevMem2Db samples, DevMem2D_<uint> randStates, cudaStream_t stream)
+        void init_caller(PtrStepSzb frame, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(init<SrcT, SampleT>, cudaFuncCachePreferL1) );

-            init<SrcT, SampleT><<<grid, block, 0, stream>>>((DevMem2D_<SrcT>) frame, (DevMem2D_<SampleT>) samples, randStates);
+            init<SrcT, SampleT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, (PtrStepSz<SampleT>) samples, randStates);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        void init_gpu(DevMem2Db frame, int cn, DevMem2Db samples, DevMem2D_<uint> randStates, cudaStream_t stream)
+        void init_gpu(PtrStepSzb frame, int cn, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
        {
-            typedef void (*func_t)(DevMem2Db frame, DevMem2Db samples, DevMem2D_<uint> randStates, cudaStream_t stream);
+            typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream);
            static const func_t funcs[] =
            {
                0, init_caller<uchar, uchar>, 0, init_caller<uchar3, uchar4>, init_caller<uchar4, uchar4>
@ -155,7 +155,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename SrcT, typename SampleT>
-        __global__ void update(const DevMem2D_<SrcT> frame, PtrStepb fgmask, PtrStep_<SampleT> samples, PtrStep_<uint> randStates)
+        __global__ void update(const PtrStepSz<SrcT> frame, PtrStepb fgmask, PtrStep<SampleT> samples, PtrStep<uint> randStates)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -225,23 +225,23 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename SrcT, typename SampleT>
-        void update_caller(DevMem2Db frame, DevMem2Db fgmask, DevMem2Db samples, DevMem2D_<uint> randStates, cudaStream_t stream)
+        void update_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(update<SrcT, SampleT>, cudaFuncCachePreferL1) );

-            update<SrcT, SampleT><<<grid, block, 0, stream>>>((DevMem2D_<SrcT>) frame, fgmask, (DevMem2D_<SampleT>) samples, randStates);
+            update<SrcT, SampleT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, (PtrStepSz<SampleT>) samples, randStates);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        void update_gpu(DevMem2Db frame, int cn, DevMem2Db fgmask, DevMem2Db samples, DevMem2D_<uint> randStates, cudaStream_t stream)
+        void update_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
        {
-            typedef void (*func_t)(DevMem2Db frame, DevMem2Db fgmask, DevMem2Db samples, DevMem2D_<uint> randStates, cudaStream_t stream);
+            typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream);
            static const func_t funcs[] =
            {
                0, update_caller<uchar, uchar>, 0, update_caller<uchar3, uchar4>, update_caller<uchar4, uchar4>
--- a/modules/gpu/src/cuda/bilateral_filter.cu
+++ b/modules/gpu/src/cuda/bilateral_filter.cu
@ -57,7 +57,7 @@ namespace cv { namespace gpu { namespace device
        __constant__ short cedge_disc;
        __constant__ short cmax_disc;

-        void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc)
+        void load_constants(float* table_color, PtrStepSzf table_space, int ndisp, int radius, short edge_disc, short max_disc)
        {
            cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) );
            cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) );
@ -176,7 +176,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T>
-        void bilateral_filter_caller(DevMem2D_<T> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
+        void bilateral_filter_caller(PtrStepSz<T> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);
@ -213,12 +213,12 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
+        void bilateral_filter_gpu(PtrStepSzb disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream)
        {
            bilateral_filter_caller(disp, img, channels, iters, stream);
        }

-        void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
+        void bilateral_filter_gpu(PtrStepSz<short> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream)
        {
            bilateral_filter_caller(disp, img, channels, iters, stream);
        }
--- a/modules/gpu/src/cuda/calib3d.cu
+++ b/modules/gpu/src/cuda/calib3d.cu
@ -66,8 +66,8 @@ namespace cv { namespace gpu { namespace device
            }
        };

-        void call(const DevMem2D_<float3> src, const float* rot,
-                  const float* transl, DevMem2D_<float3> dst,
+        void call(const PtrStepSz<float3> src, const float* rot,
+                  const float* transl, PtrStepSz<float3> dst,
                  cudaStream_t stream)
        {
            cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
@ -103,8 +103,8 @@ namespace cv { namespace gpu { namespace device
            }
        };

-        void call(const DevMem2D_<float3> src, const float* rot,
-                  const float* transl, const float* proj, DevMem2D_<float2> dst,
+        void call(const PtrStepSz<float3> src, const float* rot,
+                  const float* transl, const float* proj, PtrStepSz<float2> dst,
                  cudaStream_t stream)
        {
            cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
--- a/modules/gpu/src/cuda/ccomponetns.cu
+++ b/modules/gpu/src/cuda/ccomponetns.cu
@ -176,7 +176,7 @@ namespace cv { namespace gpu { namespace device


        template<typename T, typename F>
-        __global__ void computeConnectivity(const DevMem2D_<T> image, DevMem2D components, F connected)
+        __global__ void computeConnectivity(const PtrStepSz<T> image, PtrStepSzb components, F connected)
        {
            int x = threadIdx.x + blockIdx.x * blockDim.x;
            int y = threadIdx.y + blockIdx.y * blockDim.y;
@ -202,7 +202,7 @@ namespace cv { namespace gpu { namespace device
        }

        template< typename T>
-        void computeEdges(const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream)
+        void computeEdges(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream)
        {
            dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
            dim3 grid(divUp(image.cols, block.x), divUp(image.rows, block.y));
@ -210,23 +210,23 @@ namespace cv { namespace gpu { namespace device
            typedef InInterval<typename IntervalsTraits<T>::dist_type, IntervalsTraits<T>::ch> Int_t;

            Int_t inInt(lo, hi);
-            computeConnectivity<T, Int_t><<<grid, block, 0, stream>>>(static_cast<const DevMem2D_<T> >(image), edges, inInt);
+            computeConnectivity<T, Int_t><<<grid, block, 0, stream>>>(static_cast<const PtrStepSz<T> >(image), edges, inInt);

            cudaSafeCall( cudaGetLastError() );
            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        template void computeEdges<uchar>  (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
-        template void computeEdges<uchar3> (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
-        template void computeEdges<uchar4> (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
-        template void computeEdges<ushort> (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
-        template void computeEdges<ushort3>(const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
-        template void computeEdges<ushort4>(const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
-        template void computeEdges<int>    (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
-        template void computeEdges<float>  (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<uchar>  (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<uchar3> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<uchar4> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<ushort> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<ushort3>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<ushort4>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<int>    (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<float>  (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);

-        __global__ void lableTiles(const DevMem2D edges, DevMem2Di comps)
+        __global__ void lableTiles(const PtrStepSzb edges, PtrStepSzi comps)
        {
            int x = threadIdx.x + blockIdx.x * TILE_COLS;
            int y = threadIdx.y + blockIdx.y * TILE_ROWS;
@ -360,7 +360,7 @@ namespace cv { namespace gpu { namespace device
                }
        }

-        __device__ __forceinline__ int root(const DevMem2Di& comps, int label)
+        __device__ __forceinline__ int root(const PtrStepSzi& comps, int label)
        {
            while(1)
            {
@ -376,7 +376,7 @@ namespace cv { namespace gpu { namespace device
            return label;
        }

-        __device__ __forceinline__ void isConnected(DevMem2Di& comps, int l1, int l2, bool& changed)
+        __device__ __forceinline__ void isConnected(PtrStepSzi& comps, int l1, int l2, bool& changed)
        {
            int r1 = root(comps, l1);
            int r2 = root(comps, l2);
@ -394,7 +394,7 @@ namespace cv { namespace gpu { namespace device
        }

        __global__ void crossMerge(const int tilesNumY, const int tilesNumX, int tileSizeY, int tileSizeX,
-            const DevMem2D edges, DevMem2Di comps, const int yIncomplete, int xIncomplete)
+            const PtrStepSzb edges, PtrStepSzi comps, const int yIncomplete, int xIncomplete)
        {
            int tid = threadIdx.y * blockDim.x + threadIdx.x;
            int stride = blockDim.y * blockDim.x;
@ -482,7 +482,7 @@ namespace cv { namespace gpu { namespace device
            } while (Emulation::syncthreadsOr(changed));
        }

-        __global__ void flatten(const DevMem2D edges, DevMem2Di comps)
+        __global__ void flatten(const PtrStepSzb edges, PtrStepSzi comps)
        {
            int x = threadIdx.x + blockIdx.x * blockDim.x;
            int y = threadIdx.y + blockIdx.y * blockDim.y;
@ -493,7 +493,7 @@ namespace cv { namespace gpu { namespace device

        enum {CC_NO_COMPACT = 0, CC_COMPACT_LABELS = 1};

-        void labelComponents(const DevMem2D& edges, DevMem2Di comps, int flags, cudaStream_t stream)
+        void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream)
        {
            dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
            dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));
--- a/modules/gpu/src/cuda/color.cu
+++ b/modules/gpu/src/cuda/color.cu
@ -222,12 +222,12 @@ namespace cv { namespace gpu { namespace device
    };

 #define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \
-    void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream) \
+    void name(const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream) \
    { \
        traits::functor_type functor = traits::create_functor(); \
        typedef typename traits::functor_type::argument_type src_t; \
        typedef typename traits::functor_type::result_type   dst_t; \
-        cv::gpu::device::transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, functor, WithOutMask(), stream); \
+        cv::gpu::device::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream); \
    }

 #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
--- a/modules/gpu/src/cuda/column_filter.cu
+++ b/modules/gpu/src/cuda/column_filter.cu
@ -62,7 +62,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int KSIZE, typename T, typename D, typename B>
-        __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, const int anchor, const B brd)
+        __global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
        {
            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
                const int BLOCK_DIM_X = 16;
@ -125,7 +125,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int KSIZE, typename T, typename D, template<typename> class B>
-        void linearColumnFilter_caller(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream)
+        void linearColumnFilter_caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
        {
            int BLOCK_DIM_X;
            int BLOCK_DIM_Y;
@ -158,9 +158,9 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T, typename D>
-        void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
+        void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
        {
-            typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream);
+            typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);

            static const caller_t callers[5][33] =
            {
@ -343,13 +343,13 @@ namespace cv { namespace gpu { namespace device

            loadKernel(kernel, ksize);

-            callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, cc, stream);
+            callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
        }

-        template void linearColumnFilter_gpu<float , uchar >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-        template void linearColumnFilter_gpu<float4, uchar4>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-        template void linearColumnFilter_gpu<float3, short3>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-        template void linearColumnFilter_gpu<float , int   >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-        template void linearColumnFilter_gpu<float , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float , uchar >(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float4, uchar4>(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float , int   >(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
    } // namespace column_filter
 }}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/copy_make_border.cu
+++ b/modules/gpu/src/cuda/copy_make_border.cu
@ -47,7 +47,7 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_<T> dst, int top, int left)
+        template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, PtrStepSz<T> dst, int top, int left)
        {
            const int x = blockDim.x * blockIdx.x + threadIdx.x;
            const int y = blockDim.y * blockIdx.y + threadIdx.y;
@ -58,7 +58,7 @@ namespace cv { namespace gpu { namespace device

        template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher
        {
-            static void call(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, int top, int left,
+            static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, int top, int left,
                const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)
            {
                dim3 block(32, 8);
@ -75,12 +75,12 @@ namespace cv { namespace gpu { namespace device
            }
        };

-        template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode,
+        template <typename T, int cn> void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode,
            const T* borderValue, cudaStream_t stream)
        {
            typedef typename TypeVec<T, cn>::vec_type vec_type;

-            typedef void (*caller_t)(const DevMem2D_<vec_type>& src, const DevMem2D_<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);
+            typedef void (*caller_t)(const PtrStepSz<vec_type>& src, const PtrStepSz<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);

            static const caller_t callers[5] =
            {
@ -91,37 +91,37 @@ namespace cv { namespace gpu { namespace device
                CopyMakeBorderDispatcher<BrdWrap, vec_type>::call
            };

-            callers[borderMode](DevMem2D_<vec_type>(src), DevMem2D_<vec_type>(dst), top, left, borderValue, stream);
+            callers[borderMode](PtrStepSz<vec_type>(src), PtrStepSz<vec_type>(dst), top, left, borderValue, stream);
        }

-        template void copyMakeBorder_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
+        template void copyMakeBorder_gpu<uchar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<uchar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
+        template void copyMakeBorder_gpu<uchar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
+        template void copyMakeBorder_gpu<uchar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);

-        //template void copyMakeBorder_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<schar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<schar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<schar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<schar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);

-        template void copyMakeBorder_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
+        template void copyMakeBorder_gpu<ushort, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<ushort, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
+        template void copyMakeBorder_gpu<ushort, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
+        template void copyMakeBorder_gpu<ushort, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);

-        template void copyMakeBorder_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
+        template void copyMakeBorder_gpu<short, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<short, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
+        template void copyMakeBorder_gpu<short, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
+        template void copyMakeBorder_gpu<short, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);

-        //template void copyMakeBorder_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<int, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<int, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<int, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<int, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);

-        template void copyMakeBorder_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void copyMakeBorder_gpu<float, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<float, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void copyMakeBorder_gpu<float, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void copyMakeBorder_gpu<float, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
    } // namespace imgproc
 }}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/debayer.cu
+++ b/modules/gpu/src/cuda/debayer.cu
@ -49,7 +49,7 @@ namespace cv { namespace gpu {
    namespace device
    {
        template <typename D>
-        __global__ void Bayer2BGR_8u(const PtrStepb src, DevMem2D_<D> dst, const bool blue_last, const bool start_with_green)
+        __global__ void Bayer2BGR_8u(const PtrStepb src, PtrStepSz<D> dst, const bool blue_last, const bool start_with_green)
        {
            const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
            int s_y = blockIdx.y * blockDim.y + threadIdx.y;
@ -193,7 +193,7 @@ namespace cv { namespace gpu {
        }

        template <typename D>
-        __global__ void Bayer2BGR_16u(const PtrStepb src, DevMem2D_<D> dst, const bool blue_last, const bool start_with_green)
+        __global__ void Bayer2BGR_16u(const PtrStepb src, PtrStepSz<D> dst, const bool blue_last, const bool start_with_green)
        {
            const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
            int s_y = blockIdx.y * blockDim.y + threadIdx.y;
@ -287,7 +287,7 @@ namespace cv { namespace gpu {
        }

        template <int cn>
-        void Bayer2BGR_8u_gpu(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream)
+        void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
        {
            typedef typename TypeVec<uchar, cn>::vec_type dst_t;

@ -296,14 +296,14 @@ namespace cv { namespace gpu {

            cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u<dst_t>, cudaFuncCachePreferL1) );

-            Bayer2BGR_8u<dst_t><<<grid, block, 0, stream>>>(src, (DevMem2D_<dst_t>)dst, blue_last, start_with_green);
+            Bayer2BGR_8u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
        template <int cn>
-        void Bayer2BGR_16u_gpu(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream)
+        void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
        {
            typedef typename TypeVec<ushort, cn>::vec_type dst_t;

@ -312,16 +312,16 @@ namespace cv { namespace gpu {

            cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u<dst_t>, cudaFuncCachePreferL1) );

-            Bayer2BGR_16u<dst_t><<<grid, block, 0, stream>>>(src, (DevMem2D_<dst_t>)dst, blue_last, start_with_green);
+            Bayer2BGR_16u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        template void Bayer2BGR_8u_gpu<3>(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
-        template void Bayer2BGR_8u_gpu<4>(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
-        template void Bayer2BGR_16u_gpu<3>(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
-        template void Bayer2BGR_16u_gpu<4>(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+        template void Bayer2BGR_8u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+        template void Bayer2BGR_8u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+        template void Bayer2BGR_16u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+        template void Bayer2BGR_16u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
    }
 }}
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
--- a/modules/gpu/src/cuda/fast.cu
+++ b/modules/gpu/src/cuda/fast.cu
@ -221,7 +221,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <bool calcScore, class Mask>
-        __global__ void calcKeypoints(const DevMem2Db img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold)
+        __global__ void calcKeypoints(const PtrStepSzb img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold)
        {
            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)

@ -282,7 +282,7 @@ namespace cv { namespace gpu { namespace device
            #endif
        }

-        int calcKeypoints_gpu(DevMem2Db img, DevMem2Db mask, short2* kpLoc, int maxKeypoints, DevMem2Di score, int threshold)
+        int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold)
        {
            void* counter_ptr;
            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
@ -323,7 +323,7 @@ namespace cv { namespace gpu { namespace device
        ///////////////////////////////////////////////////////////////////////////
        // nonmaxSupression

-        __global__ void nonmaxSupression(const short2* kpLoc, int count, const DevMem2Di scoreMat, short2* locFinal, float* responseFinal)
+        __global__ void nonmaxSupression(const short2* kpLoc, int count, const PtrStepSzi scoreMat, short2* locFinal, float* responseFinal)
        {
            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)

@ -359,7 +359,7 @@ namespace cv { namespace gpu { namespace device
            #endif
        }

-        int nonmaxSupression_gpu(const short2* kpLoc, int count, DevMem2Di score, short2* loc, float* response)
+        int nonmaxSupression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response)
        {
            void* counter_ptr;
            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
--- a/modules/gpu/src/cuda/fgd_bgfg.cu
+++ b/modules/gpu/src/cuda/fgd_bgfg.cu
@ -81,7 +81,7 @@ namespace bgfg


    template <typename PT, typename CT>
-    __global__ void calcPartialHistogram(const DevMem2D_<PT> prevFrame, const PtrStep_<CT> curFrame, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2)
+    __global__ void calcPartialHistogram(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2)
    {
 #if (__CUDA_ARCH__ < 200)
        const int HISTOGRAM_WARP_COUNT = 4;
@ -240,7 +240,7 @@ namespace bgfg
    }

    template <typename PT, typename CT>
-    void calcDiffHistogram_gpu(DevMem2Db prevFrame, DevMem2Db curFrame,
+    void calcDiffHistogram_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame,
                               unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
                               unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
                               int cc, cudaStream_t stream)
@ -249,7 +249,7 @@ namespace bgfg
        const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE;

        calcPartialHistogram<PT, CT><<<PARTIAL_HISTOGRAM_COUNT, HISTOGRAM_THREADBLOCK_SIZE, 0, stream>>>(
-                (DevMem2D_<PT>)prevFrame, (DevMem2D_<CT>)curFrame, partialBuf0, partialBuf1, partialBuf2);
+                (PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame, partialBuf0, partialBuf1, partialBuf2);
        cudaSafeCall( cudaGetLastError() );

        mergeHistogram<<<HISTOGRAM_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(partialBuf0, partialBuf1, partialBuf2, hist0, hist1, hist2);
@ -259,16 +259,16 @@ namespace bgfg
            cudaSafeCall( cudaDeviceSynchronize() );
    }

-    template void calcDiffHistogram_gpu<uchar3, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
-    template void calcDiffHistogram_gpu<uchar3, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
-    template void calcDiffHistogram_gpu<uchar4, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
-    template void calcDiffHistogram_gpu<uchar4, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
+    template void calcDiffHistogram_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
+    template void calcDiffHistogram_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
+    template void calcDiffHistogram_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
+    template void calcDiffHistogram_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);

    /////////////////////////////////////////////////////////////////////////
    // calcDiffThreshMask

    template <typename PT, typename CT>
-    __global__ void calcDiffThreshMask(const DevMem2D_<PT> prevFrame, const PtrStep_<CT> curFrame, uchar3 bestThres, PtrStepb changeMask)
+    __global__ void calcDiffThreshMask(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame, uchar3 bestThres, PtrStepb changeMask)
    {
        const int y = blockIdx.y * blockDim.y + threadIdx.y;
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -290,22 +290,22 @@ namespace bgfg
    }

    template <typename PT, typename CT>
-    void calcDiffThreshMask_gpu(DevMem2Db prevFrame, DevMem2Db curFrame, uchar3 bestThres, DevMem2Db changeMask, cudaStream_t stream)
+    void calcDiffThreshMask_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream)
    {
        dim3 block(32, 8);
        dim3 grid(divUp(prevFrame.cols, block.x), divUp(prevFrame.rows, block.y));

-        calcDiffThreshMask<PT, CT><<<grid, block, 0, stream>>>((DevMem2D_<PT>)prevFrame, (DevMem2D_<CT>)curFrame, bestThres, changeMask);
+        calcDiffThreshMask<PT, CT><<<grid, block, 0, stream>>>((PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame, bestThres, changeMask);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

-    template void calcDiffThreshMask_gpu<uchar3, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, uchar3 bestThres, DevMem2Db changeMask, cudaStream_t stream);
-    template void calcDiffThreshMask_gpu<uchar3, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, uchar3 bestThres, DevMem2Db changeMask, cudaStream_t stream);
-    template void calcDiffThreshMask_gpu<uchar4, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, uchar3 bestThres, DevMem2Db changeMask, cudaStream_t stream);
-    template void calcDiffThreshMask_gpu<uchar4, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, uchar3 bestThres, DevMem2Db changeMask, cudaStream_t stream);
+    template void calcDiffThreshMask_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
+    template void calcDiffThreshMask_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
+    template void calcDiffThreshMask_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
+    template void calcDiffThreshMask_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);

    /////////////////////////////////////////////////////////////////////////
    // bgfgClassification
@ -334,7 +334,7 @@ namespace bgfg
    };

    template <typename PT, typename CT, typename OT>
-    __global__ void bgfgClassification(const DevMem2D_<PT> prevFrame, const PtrStep_<CT> curFrame,
+    __global__ void bgfgClassification(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame,
                                       const PtrStepb Ftd, const PtrStepb Fbd, PtrStepb foreground,
                                       int deltaC, int deltaCC, float alpha2, int N1c, int N1cc)
    {
@ -413,7 +413,7 @@ namespace bgfg
    }

    template <typename PT, typename CT, typename OT>
-    void bgfgClassification_gpu(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground,
+    void bgfgClassification_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground,
                                int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream)
    {
        dim3 block(32, 8);
@ -421,7 +421,7 @@ namespace bgfg

        cudaSafeCall( cudaFuncSetCacheConfig(bgfgClassification<PT, CT, OT>, cudaFuncCachePreferL1) );

-        bgfgClassification<PT, CT, OT><<<grid, block, 0, stream>>>((DevMem2D_<PT>)prevFrame, (DevMem2D_<CT>)curFrame,
+        bgfgClassification<PT, CT, OT><<<grid, block, 0, stream>>>((PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame,
                                                                   Ftd, Fbd, foreground,
                                                                   deltaC, deltaCC, alpha2, N1c, N1cc);
        cudaSafeCall( cudaGetLastError() );
@ -430,21 +430,21 @@ namespace bgfg
            cudaSafeCall( cudaDeviceSynchronize() );
    }

-    template void bgfgClassification_gpu<uchar3, uchar3, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
-    template void bgfgClassification_gpu<uchar3, uchar3, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
-    template void bgfgClassification_gpu<uchar3, uchar4, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
-    template void bgfgClassification_gpu<uchar3, uchar4, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
-    template void bgfgClassification_gpu<uchar4, uchar3, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
-    template void bgfgClassification_gpu<uchar4, uchar3, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
-    template void bgfgClassification_gpu<uchar4, uchar4, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
-    template void bgfgClassification_gpu<uchar4, uchar4, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
+    template void bgfgClassification_gpu<uchar3, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
+    template void bgfgClassification_gpu<uchar3, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
+    template void bgfgClassification_gpu<uchar3, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
+    template void bgfgClassification_gpu<uchar3, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
+    template void bgfgClassification_gpu<uchar4, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
+    template void bgfgClassification_gpu<uchar4, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
+    template void bgfgClassification_gpu<uchar4, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
+    template void bgfgClassification_gpu<uchar4, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);

    ////////////////////////////////////////////////////////////////////////////
    // updateBackgroundModel

    template <typename PT, typename CT, typename OT, class PrevFramePtr2D, class CurFramePtr2D, class FtdPtr2D, class FbdPtr2D>
    __global__ void updateBackgroundModel(int cols, int rows, const PrevFramePtr2D prevFrame, const CurFramePtr2D curFrame, const FtdPtr2D Ftd, const FbdPtr2D Fbd,
-                                          PtrStepb foreground, PtrStep_<OT> background,
+                                          PtrStepb foreground, PtrStep<OT> background,
                                          int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T)
    {
        const int i = blockIdx.y * blockDim.y + threadIdx.y;
@ -803,16 +803,16 @@ namespace bgfg
    template <typename PT, typename CT, typename OT>
    struct UpdateBackgroundModel
    {
-        static void call(DevMem2D_<PT> prevFrame, DevMem2D_<CT> curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, DevMem2D_<OT> background,
+        static void call(PtrStepSz<PT> prevFrame, PtrStepSz<CT> curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSz<OT> background,
                         int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
                         cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(prevFrame.cols, block.x), divUp(prevFrame.rows, block.y));

-            cudaSafeCall( cudaFuncSetCacheConfig(updateBackgroundModel<PT, CT, OT, PtrStep_<PT>, PtrStep_<CT>, PtrStepb, PtrStepb>, cudaFuncCachePreferL1) );
+            cudaSafeCall( cudaFuncSetCacheConfig(updateBackgroundModel<PT, CT, OT, PtrStep<PT>, PtrStep<CT>, PtrStepb, PtrStepb>, cudaFuncCachePreferL1) );

-            updateBackgroundModel<PT, CT, OT, PtrStep_<PT>, PtrStep_<CT>, PtrStepb, PtrStepb><<<grid, block, 0, stream>>>(
+            updateBackgroundModel<PT, CT, OT, PtrStep<PT>, PtrStep<CT>, PtrStepb, PtrStepb><<<grid, block, 0, stream>>>(
                prevFrame.cols, prevFrame.rows,
                prevFrame, curFrame,
                Ftd, Fbd, foreground, background,
@ -825,20 +825,20 @@ namespace bgfg
    };

    template <typename PT, typename CT, typename OT>
-    void updateBackgroundModel_gpu(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, DevMem2Db background,
+    void updateBackgroundModel_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background,
                                   int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
                                   cudaStream_t stream)
    {
-        UpdateBackgroundModel<PT, CT, OT>::call(DevMem2D_<PT>(prevFrame), DevMem2D_<CT>(curFrame), Ftd, Fbd, foreground, DevMem2D_<OT>(background),
+        UpdateBackgroundModel<PT, CT, OT>::call(PtrStepSz<PT>(prevFrame), PtrStepSz<CT>(curFrame), Ftd, Fbd, foreground, PtrStepSz<OT>(background),
                                                deltaC, deltaCC, alpha1, alpha2, alpha3, N1c, N1cc, N2c, N2cc, T, stream);
    }

-    template void updateBackgroundModel_gpu<uchar3, uchar3, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, DevMem2Db background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
-    template void updateBackgroundModel_gpu<uchar3, uchar3, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, DevMem2Db background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
-    template void updateBackgroundModel_gpu<uchar3, uchar4, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, DevMem2Db background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
-    template void updateBackgroundModel_gpu<uchar3, uchar4, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, DevMem2Db background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
-    template void updateBackgroundModel_gpu<uchar4, uchar3, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, DevMem2Db background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
-    template void updateBackgroundModel_gpu<uchar4, uchar3, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, DevMem2Db background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
-    template void updateBackgroundModel_gpu<uchar4, uchar4, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, DevMem2Db background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
-    template void updateBackgroundModel_gpu<uchar4, uchar4, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, DevMem2Db background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
+    template void updateBackgroundModel_gpu<uchar3, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
+    template void updateBackgroundModel_gpu<uchar3, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
+    template void updateBackgroundModel_gpu<uchar3, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
+    template void updateBackgroundModel_gpu<uchar3, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
+    template void updateBackgroundModel_gpu<uchar4, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
+    template void updateBackgroundModel_gpu<uchar4, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
+    template void updateBackgroundModel_gpu<uchar4, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
+    template void updateBackgroundModel_gpu<uchar4, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
 }
--- a/modules/gpu/src/cuda/fgd_bgfg_common.hpp
+++ b/modules/gpu/src/cuda/fgd_bgfg_common.hpp
@ -1,7 +1,7 @@
 #ifndef __FGD_BGFG_COMMON_HPP__
 #define __FGD_BGFG_COMMON_HPP__

-#include "opencv2/core/devmem2d.hpp"
+#include "opencv2/core/cuda_devptrs.hpp"

 namespace bgfg
 {
@ -122,24 +122,24 @@ namespace bgfg
    const int HISTOGRAM_BIN_COUNT = 256;

    template <typename PT, typename CT>
-    void calcDiffHistogram_gpu(cv::gpu::DevMem2Db prevFrame, cv::gpu::DevMem2Db curFrame,
+    void calcDiffHistogram_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
                               unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
                               unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
                               int cc, cudaStream_t stream);

    template <typename PT, typename CT>
-    void calcDiffThreshMask_gpu(cv::gpu::DevMem2Db prevFrame, cv::gpu::DevMem2Db curFrame, uchar3 bestThres, cv::gpu::DevMem2Db changeMask, cudaStream_t stream);
+    void calcDiffThreshMask_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, uchar3 bestThres, cv::gpu::PtrStepSzb changeMask, cudaStream_t stream);

    void setBGPixelStat(const BGPixelStat& stat);

    template <typename PT, typename CT, typename OT>
-    void bgfgClassification_gpu(cv::gpu::DevMem2Db prevFrame, cv::gpu::DevMem2Db curFrame,
-                                cv::gpu::DevMem2Db Ftd, cv::gpu::DevMem2Db Fbd, cv::gpu::DevMem2Db foreground,
+    void bgfgClassification_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
+                                cv::gpu::PtrStepSzb Ftd, cv::gpu::PtrStepSzb Fbd, cv::gpu::PtrStepSzb foreground,
                                int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);

    template <typename PT, typename CT, typename OT>
-    void updateBackgroundModel_gpu(cv::gpu::DevMem2Db prevFrame, cv::gpu::DevMem2Db curFrame,
-                                   cv::gpu::DevMem2Db Ftd, cv::gpu::DevMem2Db Fbd, cv::gpu::DevMem2Db foreground, cv::gpu::DevMem2Db background,
+    void updateBackgroundModel_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
+                                   cv::gpu::PtrStepSzb Ftd, cv::gpu::PtrStepSzb Fbd, cv::gpu::PtrStepSzb foreground, cv::gpu::PtrStepSzb background,
                                   int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
                                   cudaStream_t stream);
 }
--- a/modules/gpu/src/cuda/gftt.cu
+++ b/modules/gpu/src/cuda/gftt.cu
@ -97,7 +97,7 @@ namespace cv { namespace gpu { namespace device
            #endif // __CUDA_ARCH__ >= 110
        }

-        int findCorners_gpu(DevMem2Df eig, float threshold, DevMem2Db mask, float2* corners, int max_count)
+        int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count)
        {
            void* counter_ptr;
            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
@ -134,7 +134,7 @@ namespace cv { namespace gpu { namespace device
        };


-        void sortCorners_gpu(DevMem2Df eig, float2* corners, int count)
+        void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count)
        {
            bindTexture(&eigTex, eig);

--- a/modules/gpu/src/cuda/global_motion.cu
+++ b/modules/gpu/src/cuda/global_motion.cu
@ -68,7 +68,7 @@ int compactPoints(int N, float *points0, float *points1, const uchar *mask)

 __global__ void calcWobbleSuppressionMapsKernel(
        const int left, const int idx, const int right, const int width, const int height,
-        PtrElemStepf mapx, PtrElemStepf mapy)
+        PtrStepf mapx, PtrStepf mapy)
 {
    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;
@ -97,7 +97,7 @@ __global__ void calcWobbleSuppressionMapsKernel(

 void calcWobbleSuppressionMaps(
        int left, int idx, int right, int width, int height,
-        const float *ml, const float *mr, DevMem2Df mapx, DevMem2Df mapy)
+        const float *ml, const float *mr, PtrStepSzf mapx, PtrStepSzf mapy)
 {
    cudaSafeCall(cudaMemcpyToSymbol(cml, ml, 9*sizeof(float)));
    cudaSafeCall(cudaMemcpyToSymbol(cmr, mr, 9*sizeof(float)));
--- a/modules/gpu/src/cuda/hist.cu
+++ b/modules/gpu/src/cuda/hist.cu
@ -169,10 +169,10 @@ namespace cv { namespace gpu { namespace device
                d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);
        }

-        void histogram256_gpu(DevMem2Db src, int* hist, uint* buf, cudaStream_t stream)
+        void histogram256_gpu(PtrStepSzb src, int* hist, uint* buf, cudaStream_t stream)
        {
            histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
-                DevMem2D_<uint>(src),
+                PtrStepSz<uint>(src),
                buf,
                static_cast<uint>(src.rows * src.step / sizeof(uint)),
                src.cols);
@ -189,7 +189,7 @@ namespace cv { namespace gpu { namespace device

        __constant__ int c_lut[256];

-        __global__ void equalizeHist(const DevMem2Db src, PtrStepb dst)
+        __global__ void equalizeHist(const PtrStepSzb src, PtrStepb dst)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -202,7 +202,7 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream)
+        void equalizeHist_gpu(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
        {
            dim3 block(16, 16);
            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
--- a/modules/gpu/src/cuda/hog.cu
+++ b/modules/gpu/src/cuda/hog.cu
@ -110,8 +110,8 @@ namespace cv { namespace gpu { namespace device


        template <int nblocks> // Number of histogram blocks processed by single GPU thread block
-        __global__ void compute_hists_kernel_many_blocks(const int img_block_width, const PtrElemStepf grad,
-                                                         const PtrElemStep qangle, float scale, float* block_hists)
+        __global__ void compute_hists_kernel_many_blocks(const int img_block_width, const PtrStepf grad,
+                                                         const PtrStepb qangle, float scale, float* block_hists)
        {
            const int block_x = threadIdx.z;
            const int cell_x = threadIdx.x / 16;
@ -149,8 +149,8 @@ namespace cv { namespace gpu { namespace device
                    float2 vote = *(const float2*)grad_ptr;
                    uchar2 bin = *(const uchar2*)qangle_ptr;

-                    grad_ptr += grad.step;
-                    qangle_ptr += qangle.step;
+                    grad_ptr += grad.step/grad.elemSize();
+                    qangle_ptr += qangle.step/qangle.elemSize();

                    int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
                    int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
@ -188,8 +188,8 @@ namespace cv { namespace gpu { namespace device


        void compute_hists(int nbins, int block_stride_x, int block_stride_y,
-                           int height, int width, const DevMem2Df& grad,
-                           const DevMem2Db& qangle, float sigma, float* block_hists)
+                           int height, int width, const PtrStepSzf& grad,
+                           const PtrStepSzb& qangle, float sigma, float* block_hists)
        {
            const int nblocks = 1;

@ -512,7 +512,7 @@ namespace cv { namespace gpu { namespace device

        template <int nthreads>
        __global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
-											          const float* block_hists, PtrElemStepf descriptors)
+											          const float* block_hists, PtrStepf descriptors)
        {
            // Get left top corner of the window in src
            const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
@ -532,7 +532,7 @@ namespace cv { namespace gpu { namespace device


        void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
-							        int height, int width, float* block_hists, DevMem2Df descriptors)
+							        int height, int width, float* block_hists, PtrStepSzf descriptors)
        {
            const int nthreads = 256;

@ -555,7 +555,7 @@ namespace cv { namespace gpu { namespace device
        template <int nthreads>
        __global__ void extract_descrs_by_cols_kernel(const int img_block_width, const int win_block_stride_x,
                                                      const int win_block_stride_y, const float* block_hists,
-                                                      PtrElemStepf descriptors)
+                                                      PtrStepf descriptors)
        {
            // Get left top corner of the window in src
            const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
@ -581,7 +581,7 @@ namespace cv { namespace gpu { namespace device

        void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
-                                    DevMem2Df descriptors)
+                                    PtrStepSzf descriptors)
        {
            const int nthreads = 256;

@ -605,8 +605,8 @@ namespace cv { namespace gpu { namespace device


        template <int nthreads, int correct_gamma>
-        __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrElemStep img,
-                                                      float angle_scale, PtrElemStepf grad, PtrElemStep qangle)
+        __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrStepb img,
+                                                      float angle_scale, PtrStepf grad, PtrStepb qangle)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;

@ -707,8 +707,8 @@ namespace cv { namespace gpu { namespace device
        }


-        void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2Db& img,
-                                    float angle_scale, DevMem2Df grad, DevMem2Db qangle, bool correct_gamma)
+        void compute_gradients_8UC4(int nbins, int height, int width, const PtrStepSzb& img,
+                                    float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma)
        {
            (void)nbins;
            const int nthreads = 256;
@ -727,8 +727,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <int nthreads, int correct_gamma>
-        __global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrElemStep img,
-                                                      float angle_scale, PtrElemStepf grad, PtrElemStep qangle)
+        __global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrStepb img,
+                                                      float angle_scale, PtrStepf grad, PtrStepb qangle)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;

@ -780,8 +780,8 @@ namespace cv { namespace gpu { namespace device
        }


-        void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2Db& img,
-                                    float angle_scale, DevMem2Df grad, DevMem2Db qangle, bool correct_gamma)
+        void compute_gradients_8UC1(int nbins, int height, int width, const PtrStepSzb& img,
+                                    float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma)
        {
            (void)nbins;
            const int nthreads = 256;
@ -807,7 +807,7 @@ namespace cv { namespace gpu { namespace device
        texture<uchar4, 2, cudaReadModeNormalizedFloat> resize8UC4_tex;
        texture<uchar,  2, cudaReadModeNormalizedFloat> resize8UC1_tex;

-        __global__ void resize_for_hog_kernel(float sx, float sy, DevMem2D_<uchar> dst, int colOfs)
+        __global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar> dst, int colOfs)
        {
            unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
            unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -816,7 +816,7 @@ namespace cv { namespace gpu { namespace device
                dst.ptr(y)[x] = tex2D(resize8UC1_tex, x * sx + colOfs, y * sy) * 255;
        }

-        __global__ void resize_for_hog_kernel(float sx, float sy, DevMem2D_<uchar4> dst, int colOfs)
+        __global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar4> dst, int colOfs)
        {
            unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
            unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -829,7 +829,7 @@ namespace cv { namespace gpu { namespace device
        }

        template<class T, class TEX>
-        static void resize_for_hog(const DevMem2Db& src, DevMem2Db dst, TEX& tex)
+        static void resize_for_hog(const PtrStepSzb& src, PtrStepSzb dst, TEX& tex)
        {
            tex.filterMode = cudaFilterModeLinear;

@ -852,7 +852,7 @@ namespace cv { namespace gpu { namespace device
 	        float sx = static_cast<float>(src.cols) / dst.cols;
            float sy = static_cast<float>(src.rows) / dst.rows;

-            resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (DevMem2D_<T>)dst, colOfs);
+            resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (PtrStepSz<T>)dst, colOfs);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
@ -860,7 +860,7 @@ namespace cv { namespace gpu { namespace device
            cudaSafeCall( cudaUnbindTexture(tex) );
        }

-        void resize_8UC1(const DevMem2Db& src, DevMem2Db dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
-        void resize_8UC4(const DevMem2Db& src, DevMem2Db dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }
+        void resize_8UC1(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
+        void resize_8UC4(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }
    } // namespace hog
 }}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/hough.cu
+++ b/modules/gpu/src/cuda/hough.cu
@ -55,7 +55,7 @@ namespace cv { namespace gpu { namespace device

        const int PIXELS_PER_THREAD = 16;

-        __global__ void buildPointList(const DevMem2Db src, unsigned int* list)
+        __global__ void buildPointList(const PtrStepSzb src, unsigned int* list)
        {
            __shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
            __shared__ int s_qsize[4];
@ -112,7 +112,7 @@ namespace cv { namespace gpu { namespace device
                list[gidx] = s_queues[threadIdx.y][i];
        }

-        int buildPointList_gpu(DevMem2Db src, unsigned int* list)
+        int buildPointList_gpu(PtrStepSzb src, unsigned int* list)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
@ -206,7 +206,7 @@ namespace cv { namespace gpu { namespace device
                accumRow[i] = smem[i];
        }

-        void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
+        void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
        {
            const dim3 block(has20 ? 1024 : 512);
            const dim3 grid(accum.rows - 2);
@ -226,7 +226,7 @@ namespace cv { namespace gpu { namespace device
        ////////////////////////////////////////////////////////////////////////
        // linesGetResult

-        __global__ void linesGetResult(const DevMem2Di accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho)
+        __global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho)
        {
            const int r = blockIdx.x * blockDim.x + threadIdx.x;
            const int n = blockIdx.y * blockDim.y + threadIdx.y;
@ -254,7 +254,7 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        int linesGetResult_gpu(DevMem2Di accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort)
+        int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
@ -341,7 +341,7 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, DevMem2Di accum, int minRadius, int maxRadius, float idp)
+        void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp)
        {
            const dim3 block(256);
            const dim3 grid(divUp(count, block.x));
@ -357,7 +357,7 @@ namespace cv { namespace gpu { namespace device
        ////////////////////////////////////////////////////////////////////////
        // buildCentersList

-        __global__ void buildCentersList(const DevMem2Di accum, unsigned int* centers, const int threshold)
+        __global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -381,7 +381,7 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        int buildCentersList_gpu(DevMem2Di accum, unsigned int* centers, int threshold)
+        int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
@ -467,7 +467,7 @@ namespace cv { namespace gpu { namespace device
            const dim3 block(has20 ? 1024 : 512);
            const dim3 grid(centersCount);

-            const int histSize = ::ceil(maxRadius - minRadius + 1);
+            const int histSize = maxRadius - minRadius + 1;
            size_t smemSize = (histSize + 2) * sizeof(int);

            circlesAccumRadius<<<grid, block, smemSize>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold);
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@ -143,7 +143,7 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
+        void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
        {
            dim3 grid(1, 1, 1);
            dim3 threads(32, 8, 1);
@ -162,7 +162,7 @@ namespace cv { namespace gpu { namespace device
            //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
        }

-        void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
+        void meanShiftProc_gpu(const PtrStepSzb& src, PtrStepSzb dstr, PtrStepSzb dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
        {
            dim3 grid(1, 1, 1);
            dim3 threads(32, 8, 1);
@ -284,7 +284,7 @@ namespace cv { namespace gpu { namespace device
        }


-        void drawColorDisp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream)
+        void drawColorDisp_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream)
        {
            dim3 threads(16, 16, 1);
            dim3 grid(1, 1, 1);
@ -298,7 +298,7 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream)
+        void drawColorDisp_gpu(const PtrStepSz<short>& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);
@ -317,7 +317,7 @@ namespace cv { namespace gpu { namespace device
        __constant__ float cq[16];

        template <typename T, typename D>
-        __global__ void reprojectImageTo3D(const DevMem2D_<T> disp, PtrStep<D> xyz)
+        __global__ void reprojectImageTo3D(const PtrStepSz<T> disp, PtrStep<D> xyz)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -343,31 +343,31 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T, typename D>
-        void reprojectImageTo3D_gpu(const DevMem2Db disp, DevMem2Db xyz, const float* q, cudaStream_t stream)
+        void reprojectImageTo3D_gpu(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(disp.cols, block.x), divUp(disp.rows, block.y));

            cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );

-            reprojectImageTo3D<T, D><<<grid, block, 0, stream>>>((DevMem2D_<T>)disp, (DevMem2D_<D>)xyz);
+            reprojectImageTo3D<T, D><<<grid, block, 0, stream>>>((PtrStepSz<T>)disp, (PtrStepSz<D>)xyz);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        template void reprojectImageTo3D_gpu<uchar, float3>(const DevMem2Db disp, DevMem2Db xyz, const float* q, cudaStream_t stream);
-        template void reprojectImageTo3D_gpu<uchar, float4>(const DevMem2Db disp, DevMem2Db xyz, const float* q, cudaStream_t stream);
-        template void reprojectImageTo3D_gpu<short, float3>(const DevMem2Db disp, DevMem2Db xyz, const float* q, cudaStream_t stream);
-        template void reprojectImageTo3D_gpu<short, float4>(const DevMem2Db disp, DevMem2Db xyz, const float* q, cudaStream_t stream);
+        template void reprojectImageTo3D_gpu<uchar, float3>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
+        template void reprojectImageTo3D_gpu<uchar, float4>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
+        template void reprojectImageTo3D_gpu<short, float3>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
+        template void reprojectImageTo3D_gpu<short, float4>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);

        /////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////

        texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
        texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);

-        __global__ void cornerHarris_kernel(const int block_size, const float k, DevMem2Df dst)
+        __global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -401,7 +401,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename BR, typename BC>
-        __global__ void cornerHarris_kernel(const int block_size, const float k, DevMem2Df dst, const BR border_row, const BC border_col)
+        __global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst, const BR border_row, const BC border_col)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -438,7 +438,7 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        void cornerHarris_gpu(int block_size, float k, DevMem2Df Dx, DevMem2Df Dy, DevMem2Df dst, int border_type, cudaStream_t stream)
+        void cornerHarris_gpu(int block_size, float k, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
@ -472,7 +472,7 @@ namespace cv { namespace gpu { namespace device
        texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
        texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);

-        __global__ void cornerMinEigenVal_kernel(const int block_size, DevMem2Df dst)
+        __global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -510,7 +510,7 @@ namespace cv { namespace gpu { namespace device


        template <typename BR, typename BC>
-        __global__ void cornerMinEigenVal_kernel(const int block_size, DevMem2Df dst, const BR border_row, const BC border_col)
+        __global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst, const BR border_row, const BC border_col)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -550,7 +550,7 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        void cornerMinEigenVal_gpu(int block_size, DevMem2Df Dx, DevMem2Df Dy, DevMem2Df dst, int border_type, cudaStream_t stream)
+        void cornerMinEigenVal_gpu(int block_size, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
@ -602,7 +602,7 @@ namespace cv { namespace gpu { namespace device
        }


-        void columnSum_32F(const DevMem2Db src, const DevMem2Db dst)
+        void columnSum_32F(const PtrStepSzb src, const PtrStepSzb dst)
        {
            dim3 threads(256);
            dim3 grid(divUp(src.cols, threads.x));
@ -617,7 +617,7 @@ namespace cv { namespace gpu { namespace device
        //////////////////////////////////////////////////////////////////////////
        // mulSpectrums

-        __global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)
+        __global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -629,7 +629,7 @@ namespace cv { namespace gpu { namespace device
        }


-        void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream)
+        void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
        {
            dim3 threads(256);
            dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
@ -645,7 +645,7 @@ namespace cv { namespace gpu { namespace device
        //////////////////////////////////////////////////////////////////////////
        // mulSpectrums_CONJ

-        __global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)
+        __global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -657,7 +657,7 @@ namespace cv { namespace gpu { namespace device
        }


-        void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream)
+        void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
        {
            dim3 threads(256);
            dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
@ -673,7 +673,7 @@ namespace cv { namespace gpu { namespace device
        //////////////////////////////////////////////////////////////////////////
        // mulAndScaleSpectrums

-        __global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c)
+        __global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -686,7 +686,7 @@ namespace cv { namespace gpu { namespace device
        }


-        void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream)
+        void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
        {
            dim3 threads(256);
            dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
@ -702,7 +702,7 @@ namespace cv { namespace gpu { namespace device
        //////////////////////////////////////////////////////////////////////////
        // mulAndScaleSpectrums_CONJ

-        __global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c)
+        __global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -715,7 +715,7 @@ namespace cv { namespace gpu { namespace device
        }


-        void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream)
+        void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
        {
            dim3 threads(256);
            dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
@ -830,7 +830,7 @@ namespace cv { namespace gpu { namespace device
        }


-        void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
+        void buildWarpPlaneMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
                                const float k_rinv[9], const float r_kinv[9], const float t[3],
                                float scale, cudaStream_t stream)
        {
@ -852,7 +852,7 @@ namespace cv { namespace gpu { namespace device
        }


-        void buildWarpCylindricalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
+        void buildWarpCylindricalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
                                      const float k_rinv[9], const float r_kinv[9], float scale,
                                      cudaStream_t stream)
        {
@ -873,7 +873,7 @@ namespace cv { namespace gpu { namespace device
        }


-        void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
+        void buildWarpSphericalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
                                    const float k_rinv[9], const float r_kinv[9], float scale,
                                    cudaStream_t stream)
        {
@ -901,7 +901,7 @@ namespace cv { namespace gpu { namespace device
        __constant__ float c_filter2DKernel[FILTER2D_MAX_KERNEL_SIZE * FILTER2D_MAX_KERNEL_SIZE];

        template <class SrcT, typename D>
-        __global__ void filter2D(const SrcT src, DevMem2D_<D> dst, const int kWidth, const int kHeight, const int anchorX, const int anchorY)
+        __global__ void filter2D(const SrcT src, PtrStepSz<D> dst, const int kWidth, const int kHeight, const int anchorX, const int anchorY)
        {
            typedef typename TypeVec<float, VecTraits<D>::cn>::vec_type sum_t;

@ -941,7 +941,7 @@ namespace cv { namespace gpu { namespace device
            }; \
            template <typename D, template <typename> class Brd> struct Filter2DCaller< type , D, Brd> \
            { \
-                static void call(const DevMem2D_< type > srcWhole, int xoff, int yoff, DevMem2D_<D> dst, \
+                static void call(const PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz<D> dst, \
                    int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream) \
                { \
                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
@ -970,11 +970,11 @@ namespace cv { namespace gpu { namespace device
        #undef IMPLEMENT_FILTER2D_TEX_READER

        template <typename T, typename D>
-        void filter2D_gpu(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst,
+        void filter2D_gpu(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
                          int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
                          int borderMode, const float* borderValue, cudaStream_t stream)
        {
-            typedef void (*func_t)(const DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2D_<D> dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
+            typedef void (*func_t)(const PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<D> dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
            static const func_t funcs[] =
            {
                Filter2DCaller<T, D, BrdReflect101>::call,
@ -986,14 +986,14 @@ namespace cv { namespace gpu { namespace device

            cudaSafeCall(cudaMemcpyToSymbol(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );

-            funcs[borderMode](static_cast< DevMem2D_<T> >(srcWhole), ofsX, ofsY, static_cast< DevMem2D_<D> >(dst), kWidth, kHeight, anchorX, anchorY, borderValue, stream);
+            funcs[borderMode](static_cast< PtrStepSz<T> >(srcWhole), ofsX, ofsY, static_cast< PtrStepSz<D> >(dst), kWidth, kHeight, anchorX, anchorY, borderValue, stream);
        }

-        template void filter2D_gpu<uchar, uchar>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<uchar4, uchar4>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<ushort, ushort>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<ushort4, ushort4>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<float, float>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<float4, float4>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void filter2D_gpu<uchar, uchar>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void filter2D_gpu<uchar4, uchar4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void filter2D_gpu<ushort, ushort>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void filter2D_gpu<ushort4, ushort4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void filter2D_gpu<float, float>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void filter2D_gpu<float4, float4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
    } // namespace imgproc
 }}} // namespace cv { namespace gpu { namespace device {
--- a/modules/gpu/src/cuda/integral_image.cu
+++ b/modules/gpu/src/cuda/integral_image.cu
@ -57,7 +57,7 @@ namespace cv { namespace gpu { namespace device
            return bytes;
        }

-        __global__ void shfl_integral_horizontal(const PtrStep_<uint4> img, PtrStep_<uint4> integral)
+        __global__ void shfl_integral_horizontal(const PtrStep<uint4> img, PtrStep<uint4> integral)
        {
        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
            __shared__ int sums[128];
@ -297,7 +297,7 @@ namespace cv { namespace gpu { namespace device
        // The final set of sums from the block is then propgated, with the block
        // computing "down" the image and adding the running sum to the local
        // block sums.
-        __global__ void shfl_integral_vertical(DevMem2D_<unsigned int> integral)
+        __global__ void shfl_integral_vertical(PtrStepSz<unsigned int> integral)
        {
        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
            __shared__ unsigned int sums[32][9];
@ -355,7 +355,7 @@ namespace cv { namespace gpu { namespace device
        #endif
        }

-        void shfl_integral_gpu(DevMem2Db img, DevMem2D_<unsigned int> integral, cudaStream_t stream)
+        void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream)
        {
            {
                // each thread handles 16 values, use 1 block/row
@ -366,7 +366,7 @@ namespace cv { namespace gpu { namespace device

                cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );

-                shfl_integral_horizontal<<<grid, block, 0, stream>>>((DevMem2D_<uint4>) img, (DevMem2D_<uint4>) integral);
+                shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral);
                cudaSafeCall( cudaGetLastError() );
            }

--- a/modules/gpu/src/cuda/lbp.cu
+++ b/modules/gpu/src/cuda/lbp.cu
@ -181,7 +181,7 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
+        void connectedConmonents(PtrStepSz<int4> candidates, int ncandidates, PtrStepSz<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
        {
            int block = ncandidates;
            int smem  = block * ( sizeof(int) + sizeof(int4) );
@ -240,7 +240,7 @@ namespace cv { namespace gpu { namespace device

        // stepShift, scale, width_k, sum_prev => y =  sum_prev + tid_k / width_k, x = tid_k - tid_k / width_k
        __global__ void lbp_cascade(const Cascade cascade, int frameW, int frameH, int windowW, int windowH, float scale, const float factor,
-            const int total, int* integral, const int pitch, DevMem2D_<int4> objects, unsigned int* classified)
+            const int total, int* integral, const int pitch, PtrStepSz<int4> objects, unsigned int* classified)
        {
            int ftid = blockIdx.x * blockDim.x + threadIdx.x;
            if (ftid >= total) return;
@ -285,8 +285,8 @@ namespace cv { namespace gpu { namespace device
        }

        void classifyPyramid(int frameW, int frameH, int windowW, int windowH, float initialScale, float factor, int workAmount,
-            const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
-            const int subsetSize, DevMem2D_<int4> objects, unsigned int* classified, DevMem2Di integral)
+            const PtrStepSzb& mstages, const int nstages, const PtrStepSzi& mnodes, const PtrStepSzf& mleaves, const PtrStepSzi& msubsets, const PtrStepSzb& mfeatures,
+            const int subsetSize, PtrStepSz<int4> objects, unsigned int* classified, PtrStepSzi integral)
        {
            const int block = 128;
            int grid = divUp(workAmount, block);
--- a/modules/gpu/src/cuda/match_template.cu
+++ b/modules/gpu/src/cuda/match_template.cu
@ -81,7 +81,7 @@ namespace cv { namespace gpu { namespace device
        // Naive_CCORR

        template <typename T, int cn>
-        __global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result)
+        __global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
        {
            typedef typename TypeVec<T, cn>::vec_type Type;
            typedef typename TypeVec<float, cn>::vec_type Typef;
@ -106,7 +106,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T, int cn>
-        void matchTemplateNaive_CCORR(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream)
+        void matchTemplateNaive_CCORR(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -118,9 +118,9 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)
+        void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
-            typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);
+            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
@ -131,9 +131,9 @@ namespace cv { namespace gpu { namespace device
        }


-        void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)
+        void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
-            typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);
+            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
@ -147,7 +147,7 @@ namespace cv { namespace gpu { namespace device
        // Naive_SQDIFF

        template <typename T, int cn>
-        __global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result)
+        __global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
        {
            typedef typename TypeVec<T, cn>::vec_type Type;
            typedef typename TypeVec<float, cn>::vec_type Typef;
@ -176,7 +176,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T, int cn>
-        void matchTemplateNaive_SQDIFF(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream)
+        void matchTemplateNaive_SQDIFF(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -188,9 +188,9 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)
+        void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
-            typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);
+            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
@ -200,9 +200,9 @@ namespace cv { namespace gpu { namespace device
            callers[cn](image, templ, result, stream);
        }

-        void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)
+        void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
-            typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);
+            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
@ -216,7 +216,7 @@ namespace cv { namespace gpu { namespace device
        // Prepared_SQDIFF

        template <int cn>
-        __global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned long long templ_sqsum, DevMem2Df result)
+        __global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -232,7 +232,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int cn>
-        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned long long templ_sqsum, DevMem2Df result, cudaStream_t stream)
+        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -244,10 +244,10 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned long long templ_sqsum, DevMem2Df result, int cn,
+        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, int cn,
                                             cudaStream_t stream)
        {
-            typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned long long templ_sqsum, DevMem2Df result, cudaStream_t stream);
+            typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
@ -286,7 +286,7 @@ namespace cv { namespace gpu { namespace device
        template <int cn>
        __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
                int w, int h, const PtrStep<unsigned long long> image_sqsum,
-                unsigned long long templ_sqsum, DevMem2Df result)
+                unsigned long long templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -303,8 +303,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <int cn>
-        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
-                                                    DevMem2Df result, cudaStream_t stream)
+        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
+                                                    PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -317,10 +317,10 @@ namespace cv { namespace gpu { namespace device
        }


-        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
-                                                    DevMem2Df result, int cn, cudaStream_t stream)
+        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
+                                                    PtrStepSzf result, int cn, cudaStream_t stream)
        {
-            typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned long long templ_sqsum, DevMem2Df result, cudaStream_t stream);
+            typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
            static const caller_t callers[] =
            {
                0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
@ -332,7 +332,7 @@ namespace cv { namespace gpu { namespace device
        //////////////////////////////////////////////////////////////////////
        // Prepared_CCOFF

-        __global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, DevMem2Df result)
+        __global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -347,7 +347,7 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_<unsigned int> image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream)
+        void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -365,7 +365,7 @@ namespace cv { namespace gpu { namespace device
                int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
                const PtrStep<unsigned int> image_sum_r,
                const PtrStep<unsigned int> image_sum_g,
-                DevMem2Df result)
+                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -386,10 +386,10 @@ namespace cv { namespace gpu { namespace device

        void matchTemplatePrepared_CCOFF_8UC2(
                int w, int h,
-                const DevMem2D_<unsigned int> image_sum_r,
-                const DevMem2D_<unsigned int> image_sum_g,
+                const PtrStepSz<unsigned int> image_sum_r,
+                const PtrStepSz<unsigned int> image_sum_g,
                unsigned int templ_sum_r, unsigned int templ_sum_g,
-                DevMem2Df result, cudaStream_t stream)
+                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -413,7 +413,7 @@ namespace cv { namespace gpu { namespace device
                const PtrStep<unsigned int> image_sum_r,
                const PtrStep<unsigned int> image_sum_g,
                const PtrStep<unsigned int> image_sum_b,
-                DevMem2Df result)
+                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -438,13 +438,13 @@ namespace cv { namespace gpu { namespace device

        void matchTemplatePrepared_CCOFF_8UC3(
                int w, int h,
-                const DevMem2D_<unsigned int> image_sum_r,
-                const DevMem2D_<unsigned int> image_sum_g,
-                const DevMem2D_<unsigned int> image_sum_b,
+                const PtrStepSz<unsigned int> image_sum_r,
+                const PtrStepSz<unsigned int> image_sum_g,
+                const PtrStepSz<unsigned int> image_sum_b,
                unsigned int templ_sum_r,
                unsigned int templ_sum_g,
                unsigned int templ_sum_b,
-                DevMem2Df result, cudaStream_t stream)
+                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -473,7 +473,7 @@ namespace cv { namespace gpu { namespace device
                const PtrStep<unsigned int> image_sum_g,
                const PtrStep<unsigned int> image_sum_b,
                const PtrStep<unsigned int> image_sum_a,
-                DevMem2Df result)
+                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -502,15 +502,15 @@ namespace cv { namespace gpu { namespace device

        void matchTemplatePrepared_CCOFF_8UC4(
                int w, int h,
-                const DevMem2D_<unsigned int> image_sum_r,
-                const DevMem2D_<unsigned int> image_sum_g,
-                const DevMem2D_<unsigned int> image_sum_b,
-                const DevMem2D_<unsigned int> image_sum_a,
+                const PtrStepSz<unsigned int> image_sum_r,
+                const PtrStepSz<unsigned int> image_sum_g,
+                const PtrStepSz<unsigned int> image_sum_b,
+                const PtrStepSz<unsigned int> image_sum_a,
                unsigned int templ_sum_r,
                unsigned int templ_sum_g,
                unsigned int templ_sum_b,
                unsigned int templ_sum_a,
-                DevMem2Df result, cudaStream_t stream)
+                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -537,7 +537,7 @@ namespace cv { namespace gpu { namespace device
                float templ_sum_scale, float templ_sqsum_scale,
                const PtrStep<unsigned int> image_sum,
                const PtrStep<unsigned long long> image_sqsum,
-                DevMem2Df result)
+                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -557,10 +557,10 @@ namespace cv { namespace gpu { namespace device
        }

        void matchTemplatePrepared_CCOFF_NORMED_8U(
-                    int w, int h, const DevMem2D_<unsigned int> image_sum,
-                    const DevMem2D_<unsigned long long> image_sqsum,
+                    int w, int h, const PtrStepSz<unsigned int> image_sum,
+                    const PtrStepSz<unsigned long long> image_sqsum,
                    unsigned int templ_sum, unsigned long long templ_sqsum,
-                    DevMem2Df result, cudaStream_t stream)
+                    PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -586,7 +586,7 @@ namespace cv { namespace gpu { namespace device
                float templ_sqsum_scale,
                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
-                DevMem2Df result)
+                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -616,11 +616,11 @@ namespace cv { namespace gpu { namespace device

        void matchTemplatePrepared_CCOFF_NORMED_8UC2(
                    int w, int h,
-                    const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
-                    const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
+                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
+                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
-                    DevMem2Df result, cudaStream_t stream)
+                    PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -653,7 +653,7 @@ namespace cv { namespace gpu { namespace device
                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
                const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
-                DevMem2Df result)
+                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -691,13 +691,13 @@ namespace cv { namespace gpu { namespace device

        void matchTemplatePrepared_CCOFF_NORMED_8UC3(
                    int w, int h,
-                    const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
-                    const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
-                    const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,
+                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
+                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
+                    const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                    unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
-                    DevMem2Df result, cudaStream_t stream)
+                    PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -734,7 +734,7 @@ namespace cv { namespace gpu { namespace device
                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
                const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
                const PtrStep<unsigned int> image_sum_a, const PtrStep<unsigned long long> image_sqsum_a,
-                DevMem2Df result)
+                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -778,15 +778,15 @@ namespace cv { namespace gpu { namespace device

        void matchTemplatePrepared_CCOFF_NORMED_8UC4(
                    int w, int h,
-                    const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
-                    const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
-                    const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,
-                    const DevMem2D_<unsigned int> image_sum_a, const DevMem2D_<unsigned long long> image_sqsum_a,
+                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
+                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
+                    const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
+                    const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                    unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
                    unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
-                    DevMem2Df result, cudaStream_t stream)
+                    PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -822,7 +822,7 @@ namespace cv { namespace gpu { namespace device
        template <int cn>
        __global__ void normalizeKernel_8U(
                int w, int h, const PtrStep<unsigned long long> image_sqsum,
-                unsigned long long templ_sqsum, DevMem2Df result)
+                unsigned long long templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -836,8 +836,8 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
-                          unsigned long long templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream)
+        void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
+                          unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -868,7 +868,7 @@ namespace cv { namespace gpu { namespace device
        // extractFirstChannel

        template <int cn>
-        __global__ void extractFirstChannel_32F(const PtrStepb image, DevMem2Df result)
+        __global__ void extractFirstChannel_32F(const PtrStepb image, PtrStepSzf result)
        {
            typedef typename TypeVec<float, cn>::vec_type Typef;

@ -882,7 +882,7 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream)
+        void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@ -130,7 +130,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename Mag, typename Angle>
-        void cartToPolar_caller(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
+        void cartToPolar_caller(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);
@ -149,9 +149,9 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
+        void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
        {
-            typedef void (*caller_t)(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
+            typedef void (*caller_t)(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream);
            static const caller_t callers[2][2][2] =
            {
                {
@ -180,7 +180,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename Mag>
-        void polarToCart_caller(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
+        void polarToCart_caller(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);
@ -198,9 +198,9 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
+        void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
        {
-            typedef void (*caller_t)(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
+            typedef void (*caller_t)(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream);
            static const caller_t callers[2] =
            {
                polarToCart_caller<NonEmptyMag>,
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
@ -175,7 +175,7 @@ namespace cv { namespace gpu { namespace device


            template <int nthreads, typename T, typename Mask>
-            __global__ void minMaxKernel(const DevMem2Db src, Mask mask, T* minval, T* maxval)
+            __global__ void minMaxKernel(const PtrStepSzb src, Mask mask, T* minval, T* maxval)
            {
                typedef typename MinMaxTypeTraits<T>::best_type best_type;
                __shared__ best_type sminval[nthreads];
@ -258,7 +258,7 @@ namespace cv { namespace gpu { namespace device


            template <typename T>
-            void minMaxMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)
+            void minMaxMaskCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)
            {
                dim3 threads, grid;
                estimateThreadCfg(src.cols, src.rows, threads, grid);
@ -279,17 +279,17 @@ namespace cv { namespace gpu { namespace device
                *maxval = maxval_;
            }

-            template void minMaxMaskCaller<uchar>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
-            template void minMaxMaskCaller<char>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
-            template void minMaxMaskCaller<ushort>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
-            template void minMaxMaskCaller<short>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
-            template void minMaxMaskCaller<int>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
-            template void minMaxMaskCaller<float>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
-            template void minMaxMaskCaller<double>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
+            template void minMaxMaskCaller<uchar>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
+            template void minMaxMaskCaller<char>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
+            template void minMaxMaskCaller<ushort>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
+            template void minMaxMaskCaller<short>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
+            template void minMaxMaskCaller<int>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
+            template void minMaxMaskCaller<float>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
+            template void minMaxMaskCaller<double>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);


            template <typename T>
-            void minMaxCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf)
+            void minMaxCaller(const PtrStepSzb src, double* minval, double* maxval, PtrStepb buf)
            {
                dim3 threads, grid;
                estimateThreadCfg(src.cols, src.rows, threads, grid);
@ -310,13 +310,13 @@ namespace cv { namespace gpu { namespace device
                *maxval = maxval_;
            }

-            template void minMaxCaller<uchar>(const DevMem2Db, double*, double*, PtrStepb);
-            template void minMaxCaller<char>(const DevMem2Db, double*, double*, PtrStepb);
-            template void minMaxCaller<ushort>(const DevMem2Db, double*, double*, PtrStepb);
-            template void minMaxCaller<short>(const DevMem2Db, double*, double*, PtrStepb);
-            template void minMaxCaller<int>(const DevMem2Db, double*, double*, PtrStepb);
-            template void minMaxCaller<float>(const DevMem2Db, double*,double*, PtrStepb);
-            template void minMaxCaller<double>(const DevMem2Db, double*, double*, PtrStepb);
+            template void minMaxCaller<uchar>(const PtrStepSzb, double*, double*, PtrStepb);
+            template void minMaxCaller<char>(const PtrStepSzb, double*, double*, PtrStepb);
+            template void minMaxCaller<ushort>(const PtrStepSzb, double*, double*, PtrStepb);
+            template void minMaxCaller<short>(const PtrStepSzb, double*, double*, PtrStepb);
+            template void minMaxCaller<int>(const PtrStepSzb, double*, double*, PtrStepb);
+            template void minMaxCaller<float>(const PtrStepSzb, double*,double*, PtrStepb);
+            template void minMaxCaller<double>(const PtrStepSzb, double*, double*, PtrStepb);


            template <int nthreads, typename T>
@ -344,7 +344,7 @@ namespace cv { namespace gpu { namespace device


            template <typename T>
-            void minMaxMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)
+            void minMaxMaskMultipassCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)
            {
                dim3 threads, grid;
                estimateThreadCfg(src.cols, src.rows, threads, grid);
@ -367,16 +367,16 @@ namespace cv { namespace gpu { namespace device
                *maxval = maxval_;
            }

-            template void minMaxMaskMultipassCaller<uchar>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
-            template void minMaxMaskMultipassCaller<char>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
-            template void minMaxMaskMultipassCaller<ushort>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
-            template void minMaxMaskMultipassCaller<short>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
-            template void minMaxMaskMultipassCaller<int>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
-            template void minMaxMaskMultipassCaller<float>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
+            template void minMaxMaskMultipassCaller<uchar>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
+            template void minMaxMaskMultipassCaller<char>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
+            template void minMaxMaskMultipassCaller<ushort>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
+            template void minMaxMaskMultipassCaller<short>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
+            template void minMaxMaskMultipassCaller<int>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
+            template void minMaxMaskMultipassCaller<float>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);


            template <typename T>
-            void minMaxMultipassCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf)
+            void minMaxMultipassCaller(const PtrStepSzb src, double* minval, double* maxval, PtrStepb buf)
            {
                dim3 threads, grid;
                estimateThreadCfg(src.cols, src.rows, threads, grid);
@ -399,12 +399,12 @@ namespace cv { namespace gpu { namespace device
                *maxval = maxval_;
            }

-            template void minMaxMultipassCaller<uchar>(const DevMem2Db, double*, double*, PtrStepb);
-            template void minMaxMultipassCaller<char>(const DevMem2Db, double*, double*, PtrStepb);
-            template void minMaxMultipassCaller<ushort>(const DevMem2Db, double*, double*, PtrStepb);
-            template void minMaxMultipassCaller<short>(const DevMem2Db, double*, double*, PtrStepb);
-            template void minMaxMultipassCaller<int>(const DevMem2Db, double*, double*, PtrStepb);
-            template void minMaxMultipassCaller<float>(const DevMem2Db, double*, double*, PtrStepb);
+            template void minMaxMultipassCaller<uchar>(const PtrStepSzb, double*, double*, PtrStepb);
+            template void minMaxMultipassCaller<char>(const PtrStepSzb, double*, double*, PtrStepb);
+            template void minMaxMultipassCaller<ushort>(const PtrStepSzb, double*, double*, PtrStepb);
+            template void minMaxMultipassCaller<short>(const PtrStepSzb, double*, double*, PtrStepb);
+            template void minMaxMultipassCaller<int>(const PtrStepSzb, double*, double*, PtrStepb);
+            template void minMaxMultipassCaller<float>(const PtrStepSzb, double*, double*, PtrStepb);
        } // namespace minmax

        ///////////////////////////////////////////////////////////////////////////////
@ -493,7 +493,7 @@ namespace cv { namespace gpu { namespace device


            template <int nthreads, typename T, typename Mask>
-            __global__ void minMaxLocKernel(const DevMem2Db src, Mask mask, T* minval, T* maxval,
+            __global__ void minMaxLocKernel(const PtrStepSzb src, Mask mask, T* minval, T* maxval,
                                            uint* minloc, uint* maxloc)
            {
                typedef typename MinMaxTypeTraits<T>::best_type best_type;
@ -586,7 +586,7 @@ namespace cv { namespace gpu { namespace device


            template <typename T>
-            void minMaxLocMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,
+            void minMaxLocMaskCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval,
                                     int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)
            {
                dim3 threads, grid;
@ -617,17 +617,17 @@ namespace cv { namespace gpu { namespace device
                maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
            }

-            template void minMaxLocMaskCaller<uchar>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocMaskCaller<char>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocMaskCaller<ushort>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocMaskCaller<short>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocMaskCaller<int>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocMaskCaller<float>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocMaskCaller<double>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocMaskCaller<uchar>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocMaskCaller<char>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocMaskCaller<ushort>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocMaskCaller<short>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocMaskCaller<int>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocMaskCaller<float>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocMaskCaller<double>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);


            template <typename T>
-            void minMaxLocCaller(const DevMem2Db src, double* minval, double* maxval,
+            void minMaxLocCaller(const PtrStepSzb src, double* minval, double* maxval,
                                 int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)
            {
                dim3 threads, grid;
@ -658,13 +658,13 @@ namespace cv { namespace gpu { namespace device
                maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
            }

-            template void minMaxLocCaller<uchar>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocCaller<char>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocCaller<ushort>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocCaller<short>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocCaller<int>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocCaller<float>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocCaller<double>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocCaller<uchar>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocCaller<char>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocCaller<ushort>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocCaller<short>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocCaller<int>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocCaller<float>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocCaller<double>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);


            // This kernel will be used only when compute capability is 1.0
@ -699,7 +699,7 @@ namespace cv { namespace gpu { namespace device


            template <typename T>
-            void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,
+            void minMaxLocMaskMultipassCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval,
                                              int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)
            {
                dim3 threads, grid;
@ -732,16 +732,16 @@ namespace cv { namespace gpu { namespace device
                maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
            }

-            template void minMaxLocMaskMultipassCaller<uchar>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocMaskMultipassCaller<char>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocMaskMultipassCaller<ushort>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocMaskMultipassCaller<short>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocMaskMultipassCaller<int>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocMaskMultipassCaller<float>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocMaskMultipassCaller<uchar>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocMaskMultipassCaller<char>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocMaskMultipassCaller<ushort>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocMaskMultipassCaller<short>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocMaskMultipassCaller<int>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocMaskMultipassCaller<float>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);


            template <typename T>
-            void minMaxLocMultipassCaller(const DevMem2Db src, double* minval, double* maxval,
+            void minMaxLocMultipassCaller(const PtrStepSzb src, double* minval, double* maxval,
                                          int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)
            {
                dim3 threads, grid;
@ -774,12 +774,12 @@ namespace cv { namespace gpu { namespace device
                maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
            }

-            template void minMaxLocMultipassCaller<uchar>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocMultipassCaller<char>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocMultipassCaller<ushort>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocMultipassCaller<short>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocMultipassCaller<int>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-            template void minMaxLocMultipassCaller<float>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocMultipassCaller<uchar>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocMultipassCaller<char>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocMultipassCaller<ushort>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocMultipassCaller<short>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocMultipassCaller<int>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+            template void minMaxLocMultipassCaller<float>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
        } // namespace minmaxloc

        //////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -820,7 +820,7 @@ namespace cv { namespace gpu { namespace device


            template <int nthreads, typename T>
-            __global__ void countNonZeroKernel(const DevMem2Db src, volatile uint* count)
+            __global__ void countNonZeroKernel(const PtrStepSzb src, volatile uint* count)
            {
                __shared__ uint scount[nthreads];

@ -875,7 +875,7 @@ namespace cv { namespace gpu { namespace device


            template <typename T>
-            int countNonZeroCaller(const DevMem2Db src, PtrStepb buf)
+            int countNonZeroCaller(const PtrStepSzb src, PtrStepb buf)
            {
                dim3 threads, grid;
                estimateThreadCfg(src.cols, src.rows, threads, grid);
@ -894,13 +894,13 @@ namespace cv { namespace gpu { namespace device
                return count;
            }

-            template int countNonZeroCaller<uchar>(const DevMem2Db, PtrStepb);
-            template int countNonZeroCaller<char>(const DevMem2Db, PtrStepb);
-            template int countNonZeroCaller<ushort>(const DevMem2Db, PtrStepb);
-            template int countNonZeroCaller<short>(const DevMem2Db, PtrStepb);
-            template int countNonZeroCaller<int>(const DevMem2Db, PtrStepb);
-            template int countNonZeroCaller<float>(const DevMem2Db, PtrStepb);
-            template int countNonZeroCaller<double>(const DevMem2Db, PtrStepb);
+            template int countNonZeroCaller<uchar>(const PtrStepSzb, PtrStepb);
+            template int countNonZeroCaller<char>(const PtrStepSzb, PtrStepb);
+            template int countNonZeroCaller<ushort>(const PtrStepSzb, PtrStepb);
+            template int countNonZeroCaller<short>(const PtrStepSzb, PtrStepb);
+            template int countNonZeroCaller<int>(const PtrStepSzb, PtrStepb);
+            template int countNonZeroCaller<float>(const PtrStepSzb, PtrStepb);
+            template int countNonZeroCaller<double>(const PtrStepSzb, PtrStepb);


            template <int nthreads, typename T>
@ -920,7 +920,7 @@ namespace cv { namespace gpu { namespace device


            template <typename T>
-            int countNonZeroMultipassCaller(const DevMem2Db src, PtrStepb buf)
+            int countNonZeroMultipassCaller(const PtrStepSzb src, PtrStepb buf)
            {
                dim3 threads, grid;
                estimateThreadCfg(src.cols, src.rows, threads, grid);
@ -941,12 +941,12 @@ namespace cv { namespace gpu { namespace device
                return count;
            }

-            template int countNonZeroMultipassCaller<uchar>(const DevMem2Db, PtrStepb);
-            template int countNonZeroMultipassCaller<char>(const DevMem2Db, PtrStepb);
-            template int countNonZeroMultipassCaller<ushort>(const DevMem2Db, PtrStepb);
-            template int countNonZeroMultipassCaller<short>(const DevMem2Db, PtrStepb);
-            template int countNonZeroMultipassCaller<int>(const DevMem2Db, PtrStepb);
-            template int countNonZeroMultipassCaller<float>(const DevMem2Db, PtrStepb);
+            template int countNonZeroMultipassCaller<uchar>(const PtrStepSzb, PtrStepb);
+            template int countNonZeroMultipassCaller<char>(const PtrStepSzb, PtrStepb);
+            template int countNonZeroMultipassCaller<ushort>(const PtrStepSzb, PtrStepb);
+            template int countNonZeroMultipassCaller<short>(const PtrStepSzb, PtrStepb);
+            template int countNonZeroMultipassCaller<int>(const PtrStepSzb, PtrStepb);
+            template int countNonZeroMultipassCaller<float>(const PtrStepSzb, PtrStepb);

        } // namespace countnonzero

@ -1012,7 +1012,7 @@ namespace cv { namespace gpu { namespace device
            }

            template <typename T, typename R, typename Op, int nthreads>
-            __global__ void sumKernel(const DevMem2Db src, R* result)
+            __global__ void sumKernel(const PtrStepSzb src, R* result)
            {
                __shared__ R smem[nthreads];

@ -1084,7 +1084,7 @@ namespace cv { namespace gpu { namespace device


            template <typename T, typename R, typename Op, int nthreads>
-            __global__ void sumKernel_C2(const DevMem2Db src, typename TypeVec<R, 2>::vec_type* result)
+            __global__ void sumKernel_C2(const PtrStepSzb src, typename TypeVec<R, 2>::vec_type* result)
            {
                typedef typename TypeVec<T, 2>::vec_type SrcType;
                typedef typename TypeVec<R, 2>::vec_type DstType;
@ -1189,7 +1189,7 @@ namespace cv { namespace gpu { namespace device


            template <typename T, typename R, typename Op, int nthreads>
-            __global__ void sumKernel_C3(const DevMem2Db src, typename TypeVec<R, 3>::vec_type* result)
+            __global__ void sumKernel_C3(const PtrStepSzb src, typename TypeVec<R, 3>::vec_type* result)
            {
                typedef typename TypeVec<T, 3>::vec_type SrcType;
                typedef typename TypeVec<R, 3>::vec_type DstType;
@ -1303,7 +1303,7 @@ namespace cv { namespace gpu { namespace device
            }

            template <typename T, typename R, typename Op, int nthreads>
-            __global__ void sumKernel_C4(const DevMem2Db src, typename TypeVec<R, 4>::vec_type* result)
+            __global__ void sumKernel_C4(const PtrStepSzb src, typename TypeVec<R, 4>::vec_type* result)
            {
                typedef typename TypeVec<T, 4>::vec_type SrcType;
                typedef typename TypeVec<R, 4>::vec_type DstType;
@ -1428,7 +1428,7 @@ namespace cv { namespace gpu { namespace device
            }

            template <typename T>
-            void sumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)
+            void sumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn)
            {
                typedef typename SumType<T>::R R;

@ -1490,16 +1490,16 @@ namespace cv { namespace gpu { namespace device
                sum[3] = result[3];
            }

-            template void sumMultipassCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);
-            template void sumMultipassCaller<char>(const DevMem2Db, PtrStepb, double*, int);
-            template void sumMultipassCaller<ushort>(const DevMem2Db, PtrStepb, double*, int);
-            template void sumMultipassCaller<short>(const DevMem2Db, PtrStepb, double*, int);
-            template void sumMultipassCaller<int>(const DevMem2Db, PtrStepb, double*, int);
-            template void sumMultipassCaller<float>(const DevMem2Db, PtrStepb, double*, int);
+            template void sumMultipassCaller<uchar>(const PtrStepSzb, PtrStepb, double*, int);
+            template void sumMultipassCaller<char>(const PtrStepSzb, PtrStepb, double*, int);
+            template void sumMultipassCaller<ushort>(const PtrStepSzb, PtrStepb, double*, int);
+            template void sumMultipassCaller<short>(const PtrStepSzb, PtrStepb, double*, int);
+            template void sumMultipassCaller<int>(const PtrStepSzb, PtrStepb, double*, int);
+            template void sumMultipassCaller<float>(const PtrStepSzb, PtrStepb, double*, int);


            template <typename T>
-            void sumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)
+            void sumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn)
            {
                typedef typename SumType<T>::R R;

@ -1539,16 +1539,16 @@ namespace cv { namespace gpu { namespace device
                sum[3] = result[3];
            }

-            template void sumCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);
-            template void sumCaller<char>(const DevMem2Db, PtrStepb, double*, int);
-            template void sumCaller<ushort>(const DevMem2Db, PtrStepb, double*, int);
-            template void sumCaller<short>(const DevMem2Db, PtrStepb, double*, int);
-            template void sumCaller<int>(const DevMem2Db, PtrStepb, double*, int);
-            template void sumCaller<float>(const DevMem2Db, PtrStepb, double*, int);
+            template void sumCaller<uchar>(const PtrStepSzb, PtrStepb, double*, int);
+            template void sumCaller<char>(const PtrStepSzb, PtrStepb, double*, int);
+            template void sumCaller<ushort>(const PtrStepSzb, PtrStepb, double*, int);
+            template void sumCaller<short>(const PtrStepSzb, PtrStepb, double*, int);
+            template void sumCaller<int>(const PtrStepSzb, PtrStepb, double*, int);
+            template void sumCaller<float>(const PtrStepSzb, PtrStepb, double*, int);


            template <typename T>
-            void absSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)
+            void absSumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn)
            {
                typedef typename SumType<T>::R R;

@ -1610,16 +1610,16 @@ namespace cv { namespace gpu { namespace device
                sum[3] = result[3];
            }

-            template void absSumMultipassCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);
-            template void absSumMultipassCaller<char>(const DevMem2Db, PtrStepb, double*, int);
-            template void absSumMultipassCaller<ushort>(const DevMem2Db, PtrStepb, double*, int);
-            template void absSumMultipassCaller<short>(const DevMem2Db, PtrStepb, double*, int);
-            template void absSumMultipassCaller<int>(const DevMem2Db, PtrStepb, double*, int);
-            template void absSumMultipassCaller<float>(const DevMem2Db, PtrStepb, double*, int);
+            template void absSumMultipassCaller<uchar>(const PtrStepSzb, PtrStepb, double*, int);
+            template void absSumMultipassCaller<char>(const PtrStepSzb, PtrStepb, double*, int);
+            template void absSumMultipassCaller<ushort>(const PtrStepSzb, PtrStepb, double*, int);
+            template void absSumMultipassCaller<short>(const PtrStepSzb, PtrStepb, double*, int);
+            template void absSumMultipassCaller<int>(const PtrStepSzb, PtrStepb, double*, int);
+            template void absSumMultipassCaller<float>(const PtrStepSzb, PtrStepb, double*, int);


            template <typename T>
-            void absSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)
+            void absSumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn)
            {
                typedef typename SumType<T>::R R;

@ -1659,16 +1659,16 @@ namespace cv { namespace gpu { namespace device
                sum[3] = result[3];
            }

-            template void absSumCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);
-            template void absSumCaller<char>(const DevMem2Db, PtrStepb, double*, int);
-            template void absSumCaller<ushort>(const DevMem2Db, PtrStepb, double*, int);
-            template void absSumCaller<short>(const DevMem2Db, PtrStepb, double*, int);
-            template void absSumCaller<int>(const DevMem2Db, PtrStepb, double*, int);
-            template void absSumCaller<float>(const DevMem2Db, PtrStepb, double*, int);
+            template void absSumCaller<uchar>(const PtrStepSzb, PtrStepb, double*, int);
+            template void absSumCaller<char>(const PtrStepSzb, PtrStepb, double*, int);
+            template void absSumCaller<ushort>(const PtrStepSzb, PtrStepb, double*, int);
+            template void absSumCaller<short>(const PtrStepSzb, PtrStepb, double*, int);
+            template void absSumCaller<int>(const PtrStepSzb, PtrStepb, double*, int);
+            template void absSumCaller<float>(const PtrStepSzb, PtrStepb, double*, int);


            template <typename T>
-            void sqrSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)
+            void sqrSumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn)
            {
                typedef typename SumType<T>::R R;

@ -1730,16 +1730,16 @@ namespace cv { namespace gpu { namespace device
                sum[3] = result[3];
            }

-            template void sqrSumMultipassCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);
-            template void sqrSumMultipassCaller<char>(const DevMem2Db, PtrStepb, double*, int);
-            template void sqrSumMultipassCaller<ushort>(const DevMem2Db, PtrStepb, double*, int);
-            template void sqrSumMultipassCaller<short>(const DevMem2Db, PtrStepb, double*, int);
-            template void sqrSumMultipassCaller<int>(const DevMem2Db, PtrStepb, double*, int);
-            template void sqrSumMultipassCaller<float>(const DevMem2Db, PtrStepb, double*, int);
+            template void sqrSumMultipassCaller<uchar>(const PtrStepSzb, PtrStepb, double*, int);
+            template void sqrSumMultipassCaller<char>(const PtrStepSzb, PtrStepb, double*, int);
+            template void sqrSumMultipassCaller<ushort>(const PtrStepSzb, PtrStepb, double*, int);
+            template void sqrSumMultipassCaller<short>(const PtrStepSzb, PtrStepb, double*, int);
+            template void sqrSumMultipassCaller<int>(const PtrStepSzb, PtrStepb, double*, int);
+            template void sqrSumMultipassCaller<float>(const PtrStepSzb, PtrStepb, double*, int);


            template <typename T>
-            void sqrSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)
+            void sqrSumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn)
            {
                typedef double R;

@ -1779,12 +1779,12 @@ namespace cv { namespace gpu { namespace device
                sum[3] = result[3];
            }

-            template void sqrSumCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);
-            template void sqrSumCaller<char>(const DevMem2Db, PtrStepb, double*, int);
-            template void sqrSumCaller<ushort>(const DevMem2Db, PtrStepb, double*, int);
-            template void sqrSumCaller<short>(const DevMem2Db, PtrStepb, double*, int);
-            template void sqrSumCaller<int>(const DevMem2Db, PtrStepb, double*, int);
-            template void sqrSumCaller<float>(const DevMem2Db, PtrStepb, double*, int);
+            template void sqrSumCaller<uchar>(const PtrStepSzb, PtrStepb, double*, int);
+            template void sqrSumCaller<char>(const PtrStepSzb, PtrStepb, double*, int);
+            template void sqrSumCaller<ushort>(const PtrStepSzb, PtrStepb, double*, int);
+            template void sqrSumCaller<short>(const PtrStepSzb, PtrStepb, double*, int);
+            template void sqrSumCaller<int>(const PtrStepSzb, PtrStepb, double*, int);
+            template void sqrSumCaller<float>(const PtrStepSzb, PtrStepb, double*, int);
        } // namespace sum

        //////////////////////////////////////////////////////////////////////////////
@ -1882,7 +1882,7 @@ namespace cv { namespace gpu { namespace device
            }
        };

-        template <class Op, typename T, typename S, typename D> __global__ void reduceRows(const DevMem2D_<T> src, D* dst, const Op op)
+        template <class Op, typename T, typename S, typename D> __global__ void reduceRows(const PtrStepSz<T> src, D* dst, const Op op)
        {
            __shared__ S smem[16 * 16];

@ -1913,7 +1913,7 @@ namespace cv { namespace gpu { namespace device
                dst[x] = saturate_cast<D>(op.result(smem[threadIdx.x * 16], src.rows));
        }

-        template <template <typename> class Op, typename T, typename S, typename D> void reduceRows_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)
+        template <template <typename> class Op, typename T, typename S, typename D> void reduceRows_caller(const PtrStepSz<T>& src, PtrStepSz<D> dst, cudaStream_t stream)
        {
            const dim3 block(16, 16);
            const dim3 grid(divUp(src.cols, block.x));
@ -1927,9 +1927,9 @@ namespace cv { namespace gpu { namespace device

        }

-        template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream)
+        template <typename T, typename S, typename D> void reduceRows_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream)
        {
-            typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);
+            typedef void (*caller_t)(const PtrStepSz<T>& src, PtrStepSz<D> dst, cudaStream_t stream);

            static const caller_t callers[] =
            {
@ -1939,29 +1939,29 @@ namespace cv { namespace gpu { namespace device
                reduceRows_caller<MinReductor, T, S, D>
            };

-            callers[reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);
+            callers[reduceOp](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<D> >(dst), stream);
        }

-        template void reduceRows_gpu<uchar, int, uchar>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
-        template void reduceRows_gpu<uchar, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
-        template void reduceRows_gpu<uchar, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
+        template void reduceRows_gpu<uchar, int, uchar>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+        template void reduceRows_gpu<uchar, int, int>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+        template void reduceRows_gpu<uchar, int, float>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);

-        template void reduceRows_gpu<ushort, int, ushort>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
-        template void reduceRows_gpu<ushort, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
-        template void reduceRows_gpu<ushort, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
+        template void reduceRows_gpu<ushort, int, ushort>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+        template void reduceRows_gpu<ushort, int, int>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+        template void reduceRows_gpu<ushort, int, float>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);

-        template void reduceRows_gpu<short, int, short>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
-        template void reduceRows_gpu<short, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
-        template void reduceRows_gpu<short, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
+        template void reduceRows_gpu<short, int, short>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+        template void reduceRows_gpu<short, int, int>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+        template void reduceRows_gpu<short, int, float>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);

-        template void reduceRows_gpu<int, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
-        template void reduceRows_gpu<int, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
+        template void reduceRows_gpu<int, int, int>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+        template void reduceRows_gpu<int, int, float>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);

-        template void reduceRows_gpu<float, float, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
+        template void reduceRows_gpu<float, float, float>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);



-        template <int cn, class Op, typename T, typename S, typename D> __global__ void reduceCols(const DevMem2D_<T> src, D* dst, const Op op)
+        template <int cn, class Op, typename T, typename S, typename D> __global__ void reduceCols(const PtrStepSz<T> src, D* dst, const Op op)
        {
            __shared__ S smem[256 * cn];

@ -2050,7 +2050,7 @@ namespace cv { namespace gpu { namespace device
                dst[y * cn + threadIdx.x] = saturate_cast<D>(op.result(smem[threadIdx.x * 256], src.cols));
        }

-        template <int cn, template <typename> class Op, typename T, typename S, typename D> void reduceCols_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)
+        template <int cn, template <typename> class Op, typename T, typename S, typename D> void reduceCols_caller(const PtrStepSz<T>& src, PtrStepSz<D> dst, cudaStream_t stream)
        {
            const dim3 block(256);
            const dim3 grid(src.rows);
@ -2064,9 +2064,9 @@ namespace cv { namespace gpu { namespace device

        }

-        template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream)
+        template <typename T, typename S, typename D> void reduceCols_gpu(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream)
        {
-            typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);
+            typedef void (*caller_t)(const PtrStepSz<T>& src, PtrStepSz<D> dst, cudaStream_t stream);

            static const caller_t callers[4][4] =
            {
@ -2076,24 +2076,24 @@ namespace cv { namespace gpu { namespace device
                {reduceCols_caller<4, SumReductor, T, S, D>, reduceCols_caller<4, AvgReductor, T, S, D>, reduceCols_caller<4, MaxReductor, T, S, D>, reduceCols_caller<4, MinReductor, T, S, D>},
            };

-            callers[cn - 1][reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);
+            callers[cn - 1][reduceOp](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<D> >(dst), stream);
        }

-        template void reduceCols_gpu<uchar, int, uchar>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
-        template void reduceCols_gpu<uchar, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
-        template void reduceCols_gpu<uchar, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
+        template void reduceCols_gpu<uchar, int, uchar>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+        template void reduceCols_gpu<uchar, int, int>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+        template void reduceCols_gpu<uchar, int, float>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);

-        template void reduceCols_gpu<ushort, int, ushort>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
-        template void reduceCols_gpu<ushort, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
-        template void reduceCols_gpu<ushort, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
+        template void reduceCols_gpu<ushort, int, ushort>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+        template void reduceCols_gpu<ushort, int, int>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+        template void reduceCols_gpu<ushort, int, float>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);

-        template void reduceCols_gpu<short, int, short>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
-        template void reduceCols_gpu<short, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
-        template void reduceCols_gpu<short, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
+        template void reduceCols_gpu<short, int, short>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+        template void reduceCols_gpu<short, int, int>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+        template void reduceCols_gpu<short, int, float>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);

-        template void reduceCols_gpu<int, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
-        template void reduceCols_gpu<int, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
+        template void reduceCols_gpu<int, int, int>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+        template void reduceCols_gpu<int, int, float>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);

-        template void reduceCols_gpu<float, float, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
+        template void reduceCols_gpu<float, float, float>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
    } // namespace mattrix_reductions
 }}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/optical_flow.cu
+++ b/modules/gpu/src/cuda/optical_flow.cu
@ -49,7 +49,7 @@ namespace cv { namespace gpu { namespace device
        #define NEEDLE_MAP_SCALE 16
        #define NUM_VERTS_PER_ARROW 6

-        __global__ void NeedleMapAverageKernel(const DevMem2Df u, const PtrStepf v, PtrStepf u_avg, PtrStepf v_avg)
+        __global__ void NeedleMapAverageKernel(const PtrStepSzf u, const PtrStepf v, PtrStepf u_avg, PtrStepf v_avg)
        {
            __shared__ float smem[2 * NEEDLE_MAP_SCALE];

@ -111,7 +111,7 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        void NeedleMapAverage_gpu(DevMem2Df u, DevMem2Df v, DevMem2Df u_avg, DevMem2Df v_avg)
+        void NeedleMapAverage_gpu(PtrStepSzf u, PtrStepSzf v, PtrStepSzf u_avg, PtrStepSzf v_avg)
        {
            const dim3 block(NEEDLE_MAP_SCALE);
            const dim3 grid(u_avg.cols, u_avg.rows);
@ -122,7 +122,7 @@ namespace cv { namespace gpu { namespace device
            cudaSafeCall( cudaDeviceSynchronize() );
        }

-        __global__ void NeedleMapVertexKernel(const DevMem2Df u_avg, const PtrStepf v_avg, float* vertex_data, float* color_data, float max_flow, float xscale, float yscale)
+        __global__ void NeedleMapVertexKernel(const PtrStepSzf u_avg, const PtrStepf v_avg, float* vertex_data, float* color_data, float max_flow, float xscale, float yscale)
        {
            // test - just draw a triangle at each pixel
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -202,7 +202,7 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        void CreateOpticalFlowNeedleMap_gpu(DevMem2Df u_avg, DevMem2Df v_avg, float* vertex_buffer, float* color_data, float max_flow, float xscale, float yscale)
+        void CreateOpticalFlowNeedleMap_gpu(PtrStepSzf u_avg, PtrStepSzf v_avg, float* vertex_buffer, float* color_data, float max_flow, float xscale, float yscale)
        {
            const dim3 block(16);
            const dim3 grid(divUp(u_avg.cols, block.x), divUp(u_avg.rows, block.y));
--- a/modules/gpu/src/cuda/optical_flow_farneback.cu
+++ b/modules/gpu/src/cuda/optical_flow_farneback.cu
@ -135,7 +135,7 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
    }


-    void polynomialExpansionGpu(const DevMem2Df &src, int polyN, DevMem2Df dst, cudaStream_t stream)
+    void polynomialExpansionGpu(const PtrStepSzf &src, int polyN, PtrStepSzf dst, cudaStream_t stream)
    {
        dim3 block(256);
        dim3 grid(divUp(src.cols, block.x - 2*polyN), src.rows);
@ -251,8 +251,8 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback


    void updateMatricesGpu(
-            const DevMem2Df flowx, const DevMem2Df flowy, const DevMem2Df R0, const DevMem2Df R1,
-            DevMem2Df M, cudaStream_t stream)
+            const PtrStepSzf flowx, const PtrStepSzf flowy, const PtrStepSzf R0, const PtrStepSzf R1,
+            PtrStepSzf M, cudaStream_t stream)
    {
        dim3 block(32, 8);
        dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
@ -288,7 +288,7 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
    }


-    void updateFlowGpu(const DevMem2Df M, DevMem2Df flowx, DevMem2Df flowy, cudaStream_t stream)
+    void updateFlowGpu(const PtrStepSzf M, PtrStepSzf flowx, PtrStepSzf flowy, cudaStream_t stream)
    {
        dim3 block(32, 8);
        dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
@ -340,7 +340,7 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
    }


-    void boxFilterGpu(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
+    void boxFilterGpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
    {
        dim3 block(256);
        dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
@ -414,7 +414,7 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
    }


-    void boxFilter5Gpu(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
+    void boxFilter5Gpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
    {
        int height = src.rows / 5;
        int width = src.cols;
@ -433,7 +433,7 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
    }


-    void boxFilter5Gpu_CC11(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
+    void boxFilter5Gpu_CC11(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
    {
        int height = src.rows / 5;
        int width = src.cols;
@ -501,7 +501,7 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback


    template <typename Border>
-    void gaussianBlurCaller(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
+    void gaussianBlurCaller(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
    {
        int height = src.rows;
        int width = src.cols;
@ -521,9 +521,9 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback


    void gaussianBlurGpu(
-            const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderMode, cudaStream_t stream)
+            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
    {
-        typedef void (*caller_t)(const DevMem2Df, int, DevMem2Df, cudaStream_t);
+        typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);

        static const caller_t callers[] =
        {
@ -596,7 +596,7 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback

    template <typename Border, int blockDimX>
    void gaussianBlur5Caller(
-            const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
+            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
    {
        int height = src.rows / 5;
        int width = src.cols;
@ -616,9 +616,9 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback


    void gaussianBlur5Gpu(
-            const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderMode, cudaStream_t stream)
+            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
    {
-        typedef void (*caller_t)(const DevMem2Df, int, DevMem2Df, cudaStream_t);
+        typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);

        static const caller_t callers[] =
        {
@ -630,9 +630,9 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
    }

    void gaussianBlur5Gpu_CC11(
-            const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderMode, cudaStream_t stream)
+            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
    {
-        typedef void (*caller_t)(const DevMem2Df, int, DevMem2Df, cudaStream_t);
+        typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);

        static const caller_t callers[] =
        {
--- a/modules/gpu/src/cuda/orb.cu
+++ b/modules/gpu/src/cuda/orb.cu
@ -122,7 +122,7 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        void HarrisResponses_gpu(DevMem2Db img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream)
+        void HarrisResponses_gpu(PtrStepSzb img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream)
        {
            dim3 block(32, 8);

@ -201,7 +201,7 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        void IC_Angle_gpu(DevMem2Db image, const short2* loc, float* angle, int npoints, int half_k, cudaStream_t stream)
+        void IC_Angle_gpu(PtrStepSzb image, const short2* loc, float* angle, int npoints, int half_k, cudaStream_t stream)
        {
            dim3 block(32, 8);

--- a/modules/gpu/src/cuda/pyr_down.cu
+++ b/modules/gpu/src/cuda/pyr_down.cu
@ -124,7 +124,7 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        template <typename T, template <typename> class B> void pyrDown_caller(DevMem2D_<T> src, DevMem2D_<T> dst, cudaStream_t stream)
+        template <typename T, template <typename> class B> void pyrDown_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
        {
            const dim3 block(256);
            const dim3 grid(divUp(src.cols, block.x), dst.rows);
@ -138,39 +138,39 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        template <typename T> void pyrDown_gpu(DevMem2Db src, DevMem2Db dst, cudaStream_t stream)
+        template <typename T> void pyrDown_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
        {
-            pyrDown_caller<T, BrdReflect101>(static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<T> >(dst), stream);
+            pyrDown_caller<T, BrdReflect101>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
        }

-        template void pyrDown_gpu<uchar>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        //template void pyrDown_gpu<uchar2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void pyrDown_gpu<uchar3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void pyrDown_gpu<uchar4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrDown_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

-        //template void pyrDown_gpu<schar>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        //template void pyrDown_gpu<char2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        //template void pyrDown_gpu<char3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        //template void pyrDown_gpu<char4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrDown_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

-        template void pyrDown_gpu<ushort>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        //template void pyrDown_gpu<ushort2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void pyrDown_gpu<ushort3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void pyrDown_gpu<ushort4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrDown_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

-        template void pyrDown_gpu<short>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        //template void pyrDown_gpu<short2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void pyrDown_gpu<short3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void pyrDown_gpu<short4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrDown_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

-        //template void pyrDown_gpu<int>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        //template void pyrDown_gpu<int2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        //template void pyrDown_gpu<int3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        //template void pyrDown_gpu<int4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrDown_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

-        template void pyrDown_gpu<float>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        //template void pyrDown_gpu<float2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void pyrDown_gpu<float3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void pyrDown_gpu<float4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrDown_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    } // namespace imgproc
 }}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/pyr_up.cu
+++ b/modules/gpu/src/cuda/pyr_up.cu
@ -50,7 +50,7 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        template <typename T> __global__ void pyrUp(const DevMem2D_<T> src, DevMem2D_<T> dst)
+        template <typename T> __global__ void pyrUp(const PtrStepSz<T> src, PtrStepSz<T> dst)
        {
            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;

@ -142,7 +142,7 @@ namespace cv { namespace gpu { namespace device
                dst(y, x) = saturate_cast<T>(4.0f * sum);
        }

-        template <typename T> void pyrUp_caller(DevMem2D_<T> src, DevMem2D_<T> dst, cudaStream_t stream)
+        template <typename T> void pyrUp_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
        {
            const dim3 block(16, 16);
            const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
@ -154,39 +154,39 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        template <typename T> void pyrUp_gpu(DevMem2Db src, DevMem2Db dst, cudaStream_t stream)
+        template <typename T> void pyrUp_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
        {
-            pyrUp_caller<T>(static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<T> >(dst), stream);
+            pyrUp_caller<T>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
        }

-        template void pyrUp_gpu<uchar>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        //template void pyrUp_gpu<uchar2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void pyrUp_gpu<uchar3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void pyrUp_gpu<uchar4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrUp_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

-        //template void pyrUp_gpu<schar>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        //template void pyrUp_gpu<char2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        //template void pyrUp_gpu<char3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        //template void pyrUp_gpu<char4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrUp_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

-        template void pyrUp_gpu<ushort>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        //template void pyrUp_gpu<ushort2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void pyrUp_gpu<ushort3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void pyrUp_gpu<ushort4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrUp_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

-        template void pyrUp_gpu<short>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        //template void pyrUp_gpu<short2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void pyrUp_gpu<short3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void pyrUp_gpu<short4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrUp_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

-        //template void pyrUp_gpu<int>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        //template void pyrUp_gpu<int2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        //template void pyrUp_gpu<int3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        //template void pyrUp_gpu<int4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrUp_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

-        template void pyrUp_gpu<float>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        //template void pyrUp_gpu<float2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void pyrUp_gpu<float3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void pyrUp_gpu<float4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrUp_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    } // namespace imgproc
 }}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/pyrlk.cu
+++ b/modules/gpu/src/cuda/pyrlk.cu
@ -458,7 +458,7 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        void lkSparse1_gpu(DevMem2Df I, DevMem2Df J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+        void lkSparse1_gpu(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
            int level, dim3 block, dim3 patch, cudaStream_t stream)
        {
            typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
@ -480,7 +480,7 @@ namespace cv { namespace gpu { namespace device
                level, block, stream);
        }

-        void lkSparse4_gpu(DevMem2D_<float4> I, DevMem2D_<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+        void lkSparse4_gpu(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
            int level, dim3 block, dim3 patch, cudaStream_t stream)
        {
            typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
@ -648,8 +648,8 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        void lkDense_gpu(DevMem2Db I, DevMem2Df J, DevMem2Df u, DevMem2Df v, DevMem2Df prevU, DevMem2Df prevV,
-                         DevMem2Df err, int2 winSize, cudaStream_t stream)
+        void lkDense_gpu(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
+                         PtrStepSzf err, int2 winSize, cudaStream_t stream)
        {
            dim3 block(16, 16);
            dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));
--- a/modules/gpu/src/cuda/remap.cu
+++ b/modules/gpu/src/cuda/remap.cu
@ -51,7 +51,7 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)
+        template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, PtrStepSz<T> dst)
        {
            const int x = blockDim.x * blockIdx.x + threadIdx.x;
            const int y = blockDim.y * blockIdx.y + threadIdx.y;
@ -67,7 +67,7 @@ namespace cv { namespace gpu { namespace device

        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
        {
-            static void call(DevMem2D_<T> src, DevMem2Df mapx, DevMem2Df mapy, DevMem2D_<T> dst, const float* borderValue, cudaStream_t stream, int)
+            static void call(PtrStepSz<T> src, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int)
            {
                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;

@ -85,7 +85,7 @@ namespace cv { namespace gpu { namespace device

        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
        {
-            static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2Df mapx, DevMem2Df mapy, DevMem2D_<T> dst, const float* borderValue, int)
+            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, int)
            {
                (void)srcWhole;
                (void)xoff;
@ -121,8 +121,8 @@ namespace cv { namespace gpu { namespace device
            }; \
            template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
            { \
-                static void call(DevMem2D_< type > src, DevMem2D_< type > srcWhole, int xoff, int yoff, DevMem2Df mapx, DevMem2Df mapy, \
-                    DevMem2D_< type > dst, const float* borderValue, int cc) \
+                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
+                    PtrStepSz< type > dst, const float* borderValue, int cc) \
                { \
                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
                    dim3 block(32, cc >= 20 ? 8 : 4); \
@ -139,8 +139,8 @@ namespace cv { namespace gpu { namespace device
            }; \
            template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
            { \
-                static void call(DevMem2D_< type > src, DevMem2D_< type > srcWhole, int xoff, int yoff, DevMem2Df mapx, DevMem2Df mapy, \
-                    DevMem2D_< type > dst, const float*, int) \
+                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
+                    PtrStepSz< type > dst, const float*, int) \
                { \
                    dim3 block(32, 8); \
                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
@ -191,8 +191,8 @@ namespace cv { namespace gpu { namespace device

        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
        {
-            static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2Df mapx, DevMem2Df mapy,
-                DevMem2D_<T> dst, const float* borderValue, cudaStream_t stream, int cc)
+            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
+                PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc)
            {
                if (stream == 0)
                    RemapDispatcherNonStream<Filter, B, T>::call(src, srcWhole, xoff, yoff, mapx, mapy, dst, borderValue, cc);
@ -201,11 +201,11 @@ namespace cv { namespace gpu { namespace device
            }
        };

-        template <typename T> void remap_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap,
-            DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc)
+        template <typename T> void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
+            PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc)
        {
-            typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap,
-                DevMem2D_<T> dst, const float* borderValue, cudaStream_t stream, int cc);
+            typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
+                PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc);

            static const caller_t callers[3][5] =
            {
@ -232,38 +232,38 @@ namespace cv { namespace gpu { namespace device
                }
            };

-            callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<T> >(srcWhole), xoff, yoff, xmap, ymap,
-                static_cast< DevMem2D_<T> >(dst), borderValue, stream, cc);
+            callers[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
+                static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc);
        }

-        template void remap_gpu<uchar >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void remap_gpu<uchar2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void remap_gpu<uchar3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void remap_gpu<uchar4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

-        //template void remap_gpu<schar>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void remap_gpu<char2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void remap_gpu<char3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void remap_gpu<char4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

-        template void remap_gpu<ushort >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void remap_gpu<ushort2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void remap_gpu<ushort3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void remap_gpu<ushort4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

-        template void remap_gpu<short >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void remap_gpu<short2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void remap_gpu<short3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void remap_gpu<short4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

-        //template void remap_gpu<int >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void remap_gpu<int2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void remap_gpu<int3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void remap_gpu<int4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

-        template void remap_gpu<float >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void remap_gpu<float2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void remap_gpu<float3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void remap_gpu<float4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    } // namespace imgproc
 }}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/resize.cu
+++ b/modules/gpu/src/cuda/resize.cu
@ -53,7 +53,7 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
+        template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, PtrStepSz<T> dst)
        {
            const int x = blockDim.x * blockIdx.x + threadIdx.x;
            const int y = blockDim.y * blockIdx.y + threadIdx.y;
@ -67,7 +67,7 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        template <typename Ptr2D, typename T> __global__ void resize_area(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
+        template <typename Ptr2D, typename T> __global__ void resize_area(const Ptr2D src, float fx, float fy, PtrStepSz<T> dst)
        {
            const int x = blockDim.x * blockIdx.x + threadIdx.x;
            const int y = blockDim.y * blockIdx.y + threadIdx.y;
@ -80,7 +80,7 @@ namespace cv { namespace gpu { namespace device

        template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
        {
-            static void call(DevMem2D_<T> src, float fx, float fy, DevMem2D_<T> dst, cudaStream_t stream)
+            static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
            {
                dim3 block(32, 8);
                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
@ -96,7 +96,7 @@ namespace cv { namespace gpu { namespace device

        template <typename T> struct ResizeDispatcherStream<AreaFilter, T>
        {
-            static void call(DevMem2D_<T> src, float fx, float fy, DevMem2D_<T> dst, cudaStream_t stream)
+            static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
            {
                dim3 block(32, 8);
                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device

        template <typename T> struct ResizeDispatcherStream<IntegerAreaFilter, T>
        {
-            static void call(DevMem2D_<T> src, float fx, float fy, DevMem2D_<T> dst, cudaStream_t stream)
+            static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
            {
                dim3 block(32, 8);
                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
@ -129,7 +129,7 @@ namespace cv { namespace gpu { namespace device

        template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
        {
-            static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, float fx, float fy, DevMem2D_<T> dst)
+            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst)
            {
                (void)srcWhole;
                (void)xoff;
@ -165,7 +165,7 @@ namespace cv { namespace gpu { namespace device
            }; \
            template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type > \
            { \
-                static void call(DevMem2D_< type > src, DevMem2D_< type > srcWhole, int xoff, int yoff, float fx, float fy, DevMem2D_< type > dst) \
+                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz< type > dst) \
                { \
                    dim3 block(32, 8); \
                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
@ -210,7 +210,7 @@ namespace cv { namespace gpu { namespace device

        template <template <typename> class Filter, typename T> struct ResizeDispatcher
        {
-            static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, float fx, float fy, DevMem2D_<T> dst, cudaStream_t stream)
+            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
            {
                if (stream == 0)
                    ResizeDispatcherNonStream<Filter, T>::call(src, srcWhole, xoff, yoff, fx, fy, dst);
@ -221,7 +221,7 @@ namespace cv { namespace gpu { namespace device

        template <typename T> struct ResizeDispatcher<AreaFilter, T>
        {
-            static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, float fx, float fy, DevMem2D_<T> dst, cudaStream_t stream)
+            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
            {
                (void)srcWhole;
                (void)xoff;
@ -236,10 +236,10 @@ namespace cv { namespace gpu { namespace device
            }
        };

-        template <typename T> void resize_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy,
-            DevMem2Db dst, int interpolation, cudaStream_t stream)
+        template <typename T> void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy,
+            PtrStepSzb dst, int interpolation, cudaStream_t stream)
        {
-            typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, float fx, float fy, DevMem2D_<T> dst, cudaStream_t stream);
+            typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream);

            static const caller_t callers[4] =
            {
@ -252,39 +252,39 @@ namespace cv { namespace gpu { namespace device
            if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))
                interpolation = 1;

-            callers[interpolation](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<T> >(srcWhole), xoff, yoff, fx, fy,
-                static_cast< DevMem2D_<T> >(dst), stream);
+            callers[interpolation](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, fx, fy,
+                static_cast< PtrStepSz<T> >(dst), stream);
        }

-        template void resize_gpu<uchar >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<uchar2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<uchar3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<uchar4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);

-        //template void resize_gpu<schar>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<char2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<char3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<char4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);

-        template void resize_gpu<ushort >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<ushort2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<ushort3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<ushort4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);

-        template void resize_gpu<short >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<short2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<short3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<short4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);

-        //template void resize_gpu<int >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<int2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<int3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<int4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);

-        template void resize_gpu<float >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<float2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<float3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<float4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);

        template<typename T> struct scan_traits{};

--- a/modules/gpu/src/cuda/rgb_to_yv12.cu
+++ b/modules/gpu/src/cuda/rgb_to_yv12.cu
@ -59,7 +59,7 @@ namespace cv { namespace gpu { namespace device
            v = static_cast<uchar>(((int)(50 * r) - (int)(42 * g) - (int)(8 * b) + 12800) / 100);
        }

-        __global__ void Gray_to_YV12(const DevMem2Db src, PtrStepb dst)
+        __global__ void Gray_to_YV12(const PtrStepSzb src, PtrStepb dst)
        {
            const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
            const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
@ -96,7 +96,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T>
-        __global__ void BGR_to_YV12(const DevMem2D_<T> src, PtrStepb dst)
+        __global__ void BGR_to_YV12(const PtrStepSz<T> src, PtrStepb dst)
        {
            const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
            const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
@ -132,7 +132,7 @@ namespace cv { namespace gpu { namespace device
            v_plane(y / 2, x / 2) = v_val;
        }

-        void Gray_to_YV12_caller(const DevMem2Db src, PtrStepb dst)
+        void Gray_to_YV12_caller(const PtrStepSzb src, PtrStepb dst)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2));
@ -143,22 +143,22 @@ namespace cv { namespace gpu { namespace device
            cudaSafeCall( cudaDeviceSynchronize() );
        }
        template <int cn>
-        void BGR_to_YV12_caller(const DevMem2Db src, PtrStepb dst)
+        void BGR_to_YV12_caller(const PtrStepSzb src, PtrStepb dst)
        {
            typedef typename TypeVec<uchar, cn>::vec_type src_t;

            dim3 block(32, 8);
            dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2));

-            BGR_to_YV12<<<grid, block>>>(static_cast< DevMem2D_<src_t> >(src), dst);
+            BGR_to_YV12<<<grid, block>>>(static_cast< PtrStepSz<src_t> >(src), dst);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

-        void YV12_gpu(const DevMem2Db src, int cn, DevMem2Db dst)
+        void YV12_gpu(const PtrStepSzb src, int cn, PtrStepSzb dst)
        {
-            typedef void (*func_t)(const DevMem2Db src, PtrStepb dst);
+            typedef void (*func_t)(const PtrStepSzb src, PtrStepb dst);

            static const func_t funcs[] =
            {
--- a/modules/gpu/src/cuda/row_filter.cu
+++ b/modules/gpu/src/cuda/row_filter.cu
@ -62,7 +62,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int KSIZE, typename T, typename D, typename B>
-        __global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, const int anchor, const B brd)
+        __global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
        {
            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
                const int BLOCK_DIM_X = 32;
@ -125,7 +125,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int KSIZE, typename T, typename D, template<typename> class B>
-        void linearRowFilter_caller(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream)
+        void linearRowFilter_caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
        {
            int BLOCK_DIM_X;
            int BLOCK_DIM_Y;
@ -157,9 +157,9 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T, typename D>
-        void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
+        void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
        {
-            typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream);
+            typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);

            static const caller_t callers[5][33] =
            {
@ -342,13 +342,13 @@ namespace cv { namespace gpu { namespace device

            loadKernel(kernel, ksize);

-            callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, cc, stream);
+            callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
        }

-        template void linearRowFilter_gpu<uchar , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-        template void linearRowFilter_gpu<uchar4, float4>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-        template void linearRowFilter_gpu<short3, float3>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-        template void linearRowFilter_gpu<int   , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-        template void linearRowFilter_gpu<float , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+        template void linearRowFilter_gpu<uchar , float >(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+        template void linearRowFilter_gpu<uchar4, float4>(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+        template void linearRowFilter_gpu<short3, float3>(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+        template void linearRowFilter_gpu<int   , float >(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+        template void linearRowFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
    } // namespace row_filter
 }}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/split_merge.cu
+++ b/modules/gpu/src/cuda/split_merge.cu
@ -91,8 +91,8 @@ namespace cv { namespace gpu { namespace device
            //typedef double4 type3;
        };

-        typedef void (*MergeFunction)(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream);
-        typedef void (*SplitFunction)(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream);
+        typedef void (*MergeFunction)(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream);
+        typedef void (*SplitFunction)(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream);

        //------------------------------------------------------------
        // Merge
@ -226,7 +226,7 @@ namespace cv { namespace gpu { namespace device


        template <typename T>
-        static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
+        static void mergeC2_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
@ -242,7 +242,7 @@ namespace cv { namespace gpu { namespace device


        template <typename T>
-        static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
+        static void mergeC3_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
@ -259,7 +259,7 @@ namespace cv { namespace gpu { namespace device


        template <typename T>
-        static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
+        static void mergeC4_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
@ -276,7 +276,7 @@ namespace cv { namespace gpu { namespace device
        }


-        void merge_caller(const DevMem2Db* src, DevMem2Db& dst,
+        void merge_caller(const PtrStepSzb* src, PtrStepSzb& dst,
                                     int total_channels, size_t elem_size,
                                     const cudaStream_t& stream)
        {
@ -435,7 +435,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T>
-        static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
+        static void splitC2_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
@ -451,7 +451,7 @@ namespace cv { namespace gpu { namespace device


        template <typename T>
-        static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
+        static void splitC3_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
@ -468,7 +468,7 @@ namespace cv { namespace gpu { namespace device


        template <typename T>
-        static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
+        static void splitC4_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
@ -485,7 +485,7 @@ namespace cv { namespace gpu { namespace device
        }


-        void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
+        void split_caller(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
        {
            static SplitFunction split_func_tbl[] =
            {
--- a/modules/gpu/src/cuda/stereobm.cu
+++ b/modules/gpu/src/cuda/stereobm.cu
@ -308,7 +308,7 @@ namespace cv { namespace gpu { namespace device
        }


-        template<int RADIUS> void kernel_caller(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, cudaStream_t & stream)
+        template<int RADIUS> void kernel_caller(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream)
        {
            dim3 grid(1,1,1);
            dim3 threads(BLOCK_W, 1, 1);
@ -326,7 +326,7 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
        };

-        typedef void (*kernel_caller_t)(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, cudaStream_t & stream);
+        typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream);

        const static kernel_caller_t callers[] =
        {
@ -341,7 +341,7 @@ namespace cv { namespace gpu { namespace device
        };
        const int calles_num = sizeof(callers)/sizeof(callers[0]);

-        void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t& stream)
+        void stereoBM_GPU(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, int winsz, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t& stream)
        {
            int winsz2 = winsz >> 1;

@ -370,7 +370,7 @@ namespace cv { namespace gpu { namespace device

        texture<unsigned char, 2, cudaReadModeElementType> texForSobel;

-        __global__ void prefilter_kernel(DevMem2Db output, int prefilterCap)
+        __global__ void prefilter_kernel(PtrStepSzb output, int prefilterCap)
        {
            int x = blockDim.x * blockIdx.x + threadIdx.x;
            int y = blockDim.y * blockIdx.y + threadIdx.y;
@ -387,7 +387,7 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap, cudaStream_t & stream)
+        void prefilter_xsobel(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap, cudaStream_t & stream)
        {
            cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
            cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, desc, input.cols, input.rows, input.step ) );
@ -446,7 +446,7 @@ namespace cv { namespace gpu { namespace device

        #define RpT (2 * ROWSperTHREAD)  // got experimentally

-        __global__ void textureness_kernel(DevMem2Db disp, int winsz, float threshold)
+        __global__ void textureness_kernel(PtrStepSzb disp, int winsz, float threshold)
        {
            int winsz2 = winsz/2;
            int n_dirty_pixels = (winsz2) * 2;
@ -505,7 +505,7 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream)
+        void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream)
        {
            avgTexturenessThreshold *= winsz * winsz;

--- a/modules/gpu/src/cuda/stereobp.cu
+++ b/modules/gpu/src/cuda/stereobp.cu
@ -128,7 +128,7 @@ namespace cv { namespace gpu { namespace device
        };

        template <int cn, typename D>
-        __global__ void comp_data(const DevMem2Db left, const PtrStepb right, PtrElemStep_<D> data)
+        __global__ void comp_data(const PtrStepSzb left, const PtrStepb right, PtrStep<D> data)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -140,7 +140,7 @@ namespace cv { namespace gpu { namespace device
                const uchar* rs = right.ptr(y) + x * cn;

                D* ds = data.ptr(y) + x;
-                const size_t disp_step = data.step * left.rows;
+                const size_t disp_step = data.step * left.rows / PtrStep<D>::elem_size;

                for (int disp = 0; disp < cndisp; disp++)
                {
@ -159,9 +159,9 @@ namespace cv { namespace gpu { namespace device
        }

        template<typename T, typename D>
-        void comp_data_gpu(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);
+        void comp_data_gpu(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream);

-        template <> void comp_data_gpu<uchar, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
+        template <> void comp_data_gpu<uchar, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);
@ -169,13 +169,13 @@ namespace cv { namespace gpu { namespace device
            grid.x = divUp(left.cols, threads.x);
            grid.y = divUp(left.rows, threads.y);

-            comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
+            comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<short>)data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
-        template <> void comp_data_gpu<uchar, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
+        template <> void comp_data_gpu<uchar, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);
@ -183,14 +183,14 @@ namespace cv { namespace gpu { namespace device
            grid.x = divUp(left.cols, threads.x);
            grid.y = divUp(left.rows, threads.y);

-            comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
+            comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<float>)data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        template <> void comp_data_gpu<uchar3, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
+        template <> void comp_data_gpu<uchar3, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);
@ -198,13 +198,13 @@ namespace cv { namespace gpu { namespace device
            grid.x = divUp(left.cols, threads.x);
            grid.y = divUp(left.rows, threads.y);

-            comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
+            comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<short>)data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
-        template <> void comp_data_gpu<uchar3, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
+        template <> void comp_data_gpu<uchar3, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);
@ -212,14 +212,14 @@ namespace cv { namespace gpu { namespace device
            grid.x = divUp(left.cols, threads.x);
            grid.y = divUp(left.rows, threads.y);

-            comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
+            comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<float>)data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        template <> void comp_data_gpu<uchar4, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
+        template <> void comp_data_gpu<uchar4, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);
@ -227,13 +227,13 @@ namespace cv { namespace gpu { namespace device
            grid.x = divUp(left.cols, threads.x);
            grid.y = divUp(left.rows, threads.y);

-            comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
+            comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<short>)data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
-        template <> void comp_data_gpu<uchar4, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
+        template <> void comp_data_gpu<uchar4, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);
@ -241,7 +241,7 @@ namespace cv { namespace gpu { namespace device
            grid.x = divUp(left.cols, threads.x);
            grid.y = divUp(left.rows, threads.y);

-            comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
+            comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<float>)data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
@ -273,7 +273,7 @@ namespace cv { namespace gpu { namespace device
        }

        template<typename T>
-        void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream)
+        void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);
@ -281,30 +281,30 @@ namespace cv { namespace gpu { namespace device
            grid.x = divUp(dst_cols, threads.x);
            grid.y = divUp(dst_rows, threads.y);

-            data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)src, (DevMem2D_<T>)dst);
+            data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)src, (PtrStepSz<T>)dst);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
-        template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
+        template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
+        template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);

        ///////////////////////////////////////////////////////////////
        /////////////////// level up messages  ////////////////////////
        ///////////////////////////////////////////////////////////////

        template <typename T>
-        __global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrElemStep_<T> src, PtrElemStep_<T> dst)
+        __global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < dst_cols && y < dst_rows)
            {
-                const size_t dst_disp_step = dst.step * dst_rows;
-                const size_t src_disp_step = src.step * src_rows;
+                const size_t dst_disp_step = dst.step * dst_rows / PtrStep<T>::elem_size;
+                const size_t src_disp_step = src.step * src_rows / PtrStep<T>::elem_size;

                T*       dstr = dst.ptr(y  ) + x;
                const T* srcr = src.ptr(y/2) + x/2;
@ -315,7 +315,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T>
-        void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream)
+        void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);
@ -325,24 +325,24 @@ namespace cv { namespace gpu { namespace device

            int src_idx = (dst_idx + 1) & 1;

-            level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mus[src_idx], (DevMem2D_<T>)mus[dst_idx]);
+            level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)mus[src_idx], (PtrStepSz<T>)mus[dst_idx]);
            cudaSafeCall( cudaGetLastError() );

-            level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mds[src_idx], (DevMem2D_<T>)mds[dst_idx]);
+            level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)mds[src_idx], (PtrStepSz<T>)mds[dst_idx]);
            cudaSafeCall( cudaGetLastError() );

-            level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mls[src_idx], (DevMem2D_<T>)mls[dst_idx]);
+            level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)mls[src_idx], (PtrStepSz<T>)mls[dst_idx]);
            cudaSafeCall( cudaGetLastError() );

-            level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mrs[src_idx], (DevMem2D_<T>)mrs[dst_idx]);
+            level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)mrs[src_idx], (PtrStepSz<T>)mrs[dst_idx]);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
-        template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
+        template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
+        template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);

        ///////////////////////////////////////////////////////////////
        ////////////////////  calc all iterations /////////////////////
@ -419,7 +419,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T>
-        __global__ void one_iteration(int t, PtrElemStep_<T> u, T* d, T* l, T* r, const PtrElemStep_<T> data, int cols, int rows)
+        __global__ void one_iteration(int t, PtrStep<T> u, T* d, T* l, T* r, const PtrStep<T> data, int cols, int rows)
        {
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
            const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
@ -427,13 +427,13 @@ namespace cv { namespace gpu { namespace device
            if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
            {
                T* us = u.ptr(y) + x;
-                T* ds = d + y * u.step + x;
-                T* ls = l + y * u.step + x;
-                T* rs = r + y * u.step + x;
+                T* ds = d + y * u.step/PtrStep<T>::elem_size + x;
+                T* ls = l + y * u.step/PtrStep<T>::elem_size + x;
+                T* rs = r + y * u.step/PtrStep<T>::elem_size + x;
                const T* dt = data.ptr(y) + x;

                size_t msg_disp_step = u.step * rows;
-                size_t data_disp_step = data.step * rows;
+                size_t data_disp_step = data.step * rows / PtrStep<T>::elem_size;

                message(us + u.step, ls      + 1, rs - 1, dt, us, msg_disp_step, data_disp_step);
                message(ds - u.step, ls      + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step);
@ -443,8 +443,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T>
-        void calc_all_iterations_gpu(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d,
-            const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream)
+        void calc_all_iterations_gpu(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d,
+            const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);
@ -454,7 +454,7 @@ namespace cv { namespace gpu { namespace device

            for(int t = 0; t < iters; ++t)
            {
-                one_iteration<T><<<grid, threads, 0, stream>>>(t, (DevMem2D_<T>)u, (T*)d.data, (T*)l.data, (T*)r.data, (DevMem2D_<T>)data, cols, rows);
+                one_iteration<T><<<grid, threads, 0, stream>>>(t, (PtrStepSz<T>)u, (T*)d.data, (T*)l.data, (T*)r.data, (PtrStepSz<T>)data, cols, rows);
                cudaSafeCall( cudaGetLastError() );

                if (stream == 0)
@ -462,16 +462,16 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
-        template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
+        template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
+        template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);

        ///////////////////////////////////////////////////////////////
        /////////////////////////// output ////////////////////////////
        ///////////////////////////////////////////////////////////////

        template <typename T>
-        __global__ void output(const PtrElemStep_<T> u, const T* d, const T* l, const T* r, const T* data,
-            DevMem2D_<short> disp)
+        __global__ void output(const PtrStep<T> u, const T* d, const T* l, const T* r, const T* data,
+            PtrStepSz<short> disp)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -479,12 +479,12 @@ namespace cv { namespace gpu { namespace device
            if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)
            {
                const T* us = u.ptr(y + 1) + x;
-                const T* ds = d + (y - 1) * u.step + x;
-                const T* ls = l + y * u.step + (x + 1);
-                const T* rs = r + y * u.step + (x - 1);
-                const T* dt = data + y * u.step + x;
+                const T* ds = d + (y - 1) * u.step/PtrStep<T>::elem_size + x;
+                const T* ls = l + y * u.step/PtrStep<T>::elem_size + (x + 1);
+                const T* rs = r + y * u.step/PtrStep<T>::elem_size + (x - 1);
+                const T* dt = data + y * u.step/PtrStep<T>::elem_size + x;

-                size_t disp_step = disp.rows * u.step;
+                size_t disp_step = disp.rows * u.step/PtrStep<T>::elem_size;

                int best = 0;
                float best_val = numeric_limits<float>::max();
@ -508,8 +508,8 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T>
-        void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data,
-            const DevMem2D_<short>& disp, cudaStream_t stream)
+        void output_gpu(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data,
+            const PtrStepSz<short>& disp, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);
@ -517,14 +517,14 @@ namespace cv { namespace gpu { namespace device
            grid.x = divUp(disp.cols, threads.x);
            grid.y = divUp(disp.rows, threads.y);

-            output<T><<<grid, threads, 0, stream>>>((DevMem2D_<T>)u, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);
+            output<T><<<grid, threads, 0, stream>>>((PtrStepSz<T>)u, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        template void output_gpu<short>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);
-        template void output_gpu<float>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);
+        template void output_gpu<short>(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
+        template void output_gpu<float>(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
    } // namespace stereobp
 }}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/stereocsbp.cu
+++ b/modules/gpu/src/cuda/stereocsbp.cu
@ -72,7 +72,7 @@ namespace cv { namespace gpu { namespace device


        void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
-                            const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& temp)
+                            const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& temp)
        {
            cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );

@ -860,7 +860,7 @@ namespace cv { namespace gpu { namespace device

        template<class T>
        void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
-            const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream)
+            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream)
        {
            size_t disp_step = disp.rows * msg_step;
            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
@ -880,9 +880,9 @@ namespace cv { namespace gpu { namespace device
        }

        template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step,
-            const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);
+            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream);

        template void compute_disp(const float* u, const float* d, const float* l, const float* r, const float* data_cost_selected, const float* disp_selected, size_t msg_step,
-            const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);
+            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream);
    } // namespace stereocsbp
 }}} // namespace cv { namespace gpu { namespace device {
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@ -102,15 +102,15 @@ namespace cv { namespace gpu { namespace device
        texture<unsigned int, 2, cudaReadModeElementType> sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
        texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);

-        void bindImgTex(DevMem2Db img)
+        void bindImgTex(PtrStepSzb img)
        {
            bindTexture(&imgTex, img);
        }
-        void bindSumTex(DevMem2D_<uint> sum)
+        void bindSumTex(PtrStepSz<uint> sum)
        {
            bindTexture(&sumTex, sum);
        }
-        void bindMaskSumTex(DevMem2D_<uint> maskSum)
+        void bindMaskSumTex(PtrStepSz<uint> maskSum)
        {
            bindTexture(&maskSumTex, maskSum);
        }
@ -958,7 +958,7 @@ namespace cv { namespace gpu { namespace device
            descriptor_base[threadIdx.x] = lookup / len;
        }

-        void compute_descriptors_gpu(const DevMem2Df& descriptors,
+        void compute_descriptors_gpu(const PtrStepSzf& descriptors,
            const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
        {
            // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
--- a/modules/gpu/src/cuda/warp.cu
+++ b/modules/gpu/src/cuda/warp.cu
@ -80,7 +80,7 @@ namespace cv { namespace gpu { namespace device
        ///////////////////////////////////////////////////////////////////
        // Build Maps

-        template <class Transform> __global__ void buildWarpMaps(DevMem2Df xmap, PtrStepf ymap)
+        template <class Transform> __global__ void buildWarpMaps(PtrStepSzf xmap, PtrStepf ymap)
        {
            const int x = blockDim.x * blockIdx.x + threadIdx.x;
            const int y = blockDim.y * blockIdx.y + threadIdx.y;
@ -94,7 +94,7 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        template <class Transform> void buildWarpMaps_caller(DevMem2Df xmap, DevMem2Df ymap, cudaStream_t stream)
+        template <class Transform> void buildWarpMaps_caller(PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(xmap.cols, block.x), divUp(xmap.rows, block.y));
@ -106,14 +106,14 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        void buildWarpAffineMaps_gpu(float coeffs[2 * 3], DevMem2Df xmap, DevMem2Df ymap, cudaStream_t stream)
+        void buildWarpAffineMaps_gpu(float coeffs[2 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
        {
            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );

            buildWarpMaps_caller<AffineTransform>(xmap, ymap, stream);
        }

-        void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], DevMem2Df xmap, DevMem2Df ymap, cudaStream_t stream)
+        void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
        {
            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );

@ -123,7 +123,7 @@ namespace cv { namespace gpu { namespace device
        ///////////////////////////////////////////////////////////////////
        // Warp

-        template <class Transform, class Ptr2D, typename T> __global__ void warp(const Ptr2D src, DevMem2D_<T> dst)
+        template <class Transform, class Ptr2D, typename T> __global__ void warp(const Ptr2D src, PtrStepSz<T> dst)
        {
            const int x = blockDim.x * blockIdx.x + threadIdx.x;
            const int y = blockDim.y * blockIdx.y + threadIdx.y;
@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device

        template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherStream
        {
-            static void call(DevMem2D_<T> src, DevMem2D_<T> dst, const float* borderValue, cudaStream_t stream, int)
+            static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int)
            {
                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;

@ -156,7 +156,7 @@ namespace cv { namespace gpu { namespace device

        template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStream
        {
-            static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2D_<T> dst, const float* borderValue, int)
+            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, int)
            {
                (void)xoff;
                (void)yoff;
@ -193,7 +193,7 @@ namespace cv { namespace gpu { namespace device
            }; \
            template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \
            { \
-                static void call(DevMem2D_< type > src, DevMem2D_< type > srcWhole, int xoff, int yoff, DevMem2D_< type > dst, const float* borderValue, int cc) \
+                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, int cc) \
                { \
                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
                    dim3 block(32, cc >= 20 ? 8 : 4); \
@ -210,7 +210,7 @@ namespace cv { namespace gpu { namespace device
            }; \
            template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
            { \
-                static void call(DevMem2D_< type > src, DevMem2D_< type > srcWhole, int xoff, int yoff, DevMem2D_< type > dst, const float*, int) \
+                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, int) \
                { \
                    dim3 block(32, 8); \
                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
@ -261,7 +261,7 @@ namespace cv { namespace gpu { namespace device

        template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
        {
-            static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2D_<T> dst, const float* borderValue, cudaStream_t stream, int cc)
+            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc)
            {
                if (stream == 0)
                    WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc);
@ -271,10 +271,10 @@ namespace cv { namespace gpu { namespace device
        };

        template <class Transform, typename T>
-        void warp_caller(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Db dst, int interpolation,
+        void warp_caller(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzb dst, int interpolation,
                         int borderMode, const float* borderValue, cudaStream_t stream, int cc)
        {
-            typedef void (*func_t)(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2D_<T> dst, const float* borderValue, cudaStream_t stream, int cc);
+            typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc);

            static const func_t funcs[3][5] =
            {
@ -301,11 +301,11 @@ namespace cv { namespace gpu { namespace device
                }
            };

-            funcs[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<T> >(srcWhole), xoff, yoff,
-                static_cast< DevMem2D_<T> >(dst), borderValue, stream, cc);
+            funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
+                static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc);
        }

-        template <typename T> void warpAffine_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation,
+        template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
                                                  int borderMode, const float* borderValue, cudaStream_t stream, int cc)
        {
            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
@ -313,37 +313,37 @@ namespace cv { namespace gpu { namespace device
            warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc);
        }

-        template void warpAffine_gpu<uchar >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void warpAffine_gpu<uchar2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void warpAffine_gpu<uchar3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void warpAffine_gpu<uchar4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

-        //template void warpAffine_gpu<schar>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void warpAffine_gpu<char2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void warpAffine_gpu<char3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void warpAffine_gpu<char4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

-        template void warpAffine_gpu<ushort >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void warpAffine_gpu<ushort2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void warpAffine_gpu<ushort3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void warpAffine_gpu<ushort4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

-        template void warpAffine_gpu<short >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void warpAffine_gpu<short2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void warpAffine_gpu<short3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void warpAffine_gpu<short4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

-        //template void warpAffine_gpu<int >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void warpAffine_gpu<int2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void warpAffine_gpu<int3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void warpAffine_gpu<int4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

-        template void warpAffine_gpu<float >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void warpAffine_gpu<float2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void warpAffine_gpu<float3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void warpAffine_gpu<float4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

-        template <typename T> void warpPerspective_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation,
+        template <typename T> void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
                                                  int borderMode, const float* borderValue, cudaStream_t stream, int cc)
        {
            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
@ -351,34 +351,34 @@ namespace cv { namespace gpu { namespace device
            warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc);
        }

-        template void warpPerspective_gpu<uchar >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void warpPerspective_gpu<uchar2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void warpPerspective_gpu<uchar3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void warpPerspective_gpu<uchar4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

-        //template void warpPerspective_gpu<schar>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void warpPerspective_gpu<char2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void warpPerspective_gpu<char3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void warpPerspective_gpu<char4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

-        template void warpPerspective_gpu<ushort >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void warpPerspective_gpu<ushort2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void warpPerspective_gpu<ushort3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void warpPerspective_gpu<ushort4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

-        template void warpPerspective_gpu<short >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void warpPerspective_gpu<short2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void warpPerspective_gpu<short3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void warpPerspective_gpu<short4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

-        //template void warpPerspective_gpu<int >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void warpPerspective_gpu<int2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void warpPerspective_gpu<int3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void warpPerspective_gpu<int4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

-        template void warpPerspective_gpu<float >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        //template void warpPerspective_gpu<float2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void warpPerspective_gpu<float3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-        template void warpPerspective_gpu<float4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    } // namespace imgproc
 }}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cvt_colot_internal.h
+++ b/modules/gpu/src/cvt_colot_internal.h
@ -46,7 +46,7 @@
 namespace cv { namespace gpu { namespace device
 {
 #define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name)                                   \
-    void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
+    void name(const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);

 #define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name)       \
    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u)    \
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@ -116,7 +116,7 @@ namespace
    {
        typedef typename NppArithmFunc<DEPTH>::npp_t npp_t;

-        static void call(const DevMem2Db src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream)
+        static void call(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream)
        {
            NppStreamHandler h(stream);

@ -135,7 +135,7 @@ namespace
    {
        typedef typename NppArithmFunc<CV_32F>::npp_t npp_t;

-        static void call(const DevMem2Db src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream)
+        static void call(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream)
        {
            NppStreamHandler h(stream);

@ -189,7 +189,7 @@ namespace
    {
        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;

-        static void call(const DevMem2Db src, Scalar sc, PtrStepb dst, cudaStream_t stream)
+        static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream)
        {
            NppStreamHandler h(stream);

@ -209,7 +209,7 @@ namespace
    {
        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;

-        static void call(const DevMem2Db src, Scalar sc, PtrStepb dst, cudaStream_t stream)
+        static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream)
        {
            NppStreamHandler h(stream);

@ -228,7 +228,7 @@ namespace
        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
        typedef typename NppTypeTraits<DEPTH>::npp_complex_type npp_complex_type;

-        static void call(const DevMem2Db src, Scalar sc, PtrStepb dst, cudaStream_t stream)
+        static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream)
        {
            NppStreamHandler h(stream);

@ -251,7 +251,7 @@ namespace
    {
        typedef typename NppTypeTraits<CV_32F>::npp_t npp_t;

-        static void call(const DevMem2Db src, Scalar sc, PtrStepb dst, cudaStream_t stream)
+        static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream)
        {
            NppStreamHandler h(stream);

@ -271,7 +271,7 @@ namespace
    {
        typedef typename NppTypeTraits<CV_32F>::npp_t npp_t;

-        static void call(const DevMem2Db src, Scalar sc, PtrStepb dst, cudaStream_t stream)
+        static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream)
        {
            NppStreamHandler h(stream);

@ -290,7 +290,7 @@ namespace
        typedef typename NppTypeTraits<CV_32F>::npp_t npp_t;
        typedef typename NppTypeTraits<CV_32F>::npp_complex_type npp_complex_type;

-        static void call(const DevMem2Db src, Scalar sc, PtrStepb dst, cudaStream_t stream)
+        static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream)
        {
            NppStreamHandler h(stream);

@ -316,17 +316,17 @@ namespace
 namespace cv { namespace gpu { namespace device
 {
    template <typename T, typename D>
-    void add_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
+    void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);

    template <typename T, typename D>
-    void add_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
+    void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
 }}}

 void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
 {
    using namespace cv::gpu::device;

-    typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
+    typedef void (*func_t)(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
    static const func_t funcs[7][7] =
    {
        {add_gpu<unsigned char, unsigned char>       , 0 /*add_gpu<unsigned char, signed char>*/ , add_gpu<unsigned char, unsigned short>    , add_gpu<unsigned char, short>       , add_gpu<unsigned char, int>    , add_gpu<unsigned char, float>    , add_gpu<unsigned char, double>    },
@ -338,7 +338,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
        {0 /*add_gpu<double, unsigned char>*/        , 0 /*add_gpu<double, signed char>*/        , 0 /*add_gpu<double, unsigned short>*/     , 0 /*add_gpu<double, short>*/        , 0 /*add_gpu<double, int>*/     , 0 /*add_gpu<double, float>*/     , add_gpu<double, double>           }
    };

-    typedef void (*npp_func_t)(const DevMem2Db src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
+    typedef void (*npp_func_t)(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
    static const npp_func_t npp_funcs[] =
    {
        NppArithm<CV_8U , nppiAdd_8u_C1RSfs >::call,
@ -384,7 +384,7 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
 {
    using namespace cv::gpu::device;

-    typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
+    typedef void (*func_t)(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
    static const func_t funcs[7][7] =
    {
        {add_gpu<unsigned char, unsigned char>       , 0 /*add_gpu<unsigned char, signed char>*/ , add_gpu<unsigned char, unsigned short>    , add_gpu<unsigned char, short>       , add_gpu<unsigned char, int>    , add_gpu<unsigned char, float>    , add_gpu<unsigned char, double>    },
@ -396,7 +396,7 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
        {0 /*add_gpu<double, unsigned char>*/        , 0 /*add_gpu<double, signed char>*/        , 0 /*add_gpu<double, unsigned short>*/     , 0 /*add_gpu<double, short>*/        , 0 /*add_gpu<double, int>*/     , 0 /*add_gpu<double, float>*/     , add_gpu<double, double>           }
    };

-    typedef void (*npp_func_t)(const DevMem2Db src, Scalar sc, PtrStepb dst, cudaStream_t stream);
+    typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream);
    static const npp_func_t npp_funcs[7][4] =
    {
        {NppArithmScalar<CV_8U , 1, nppiAddC_8u_C1RSfs >::call, 0                                                     , NppArithmScalar<CV_8U , 3, nppiAddC_8u_C3RSfs >::call, NppArithmScalar<CV_8U , 4, nppiAddC_8u_C4RSfs >::call},
@ -452,17 +452,17 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
 namespace cv { namespace gpu { namespace device
 {
    template <typename T, typename D>
-    void subtract_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
+    void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);

    template <typename T, typename D>
-    void subtract_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
+    void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
 }}}

 void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
 {
    using namespace cv::gpu::device;

-    typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
+    typedef void (*func_t)(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
    static const func_t funcs[7][7] =
    {
        {subtract_gpu<unsigned char, unsigned char>       , 0 /*subtract_gpu<unsigned char, signed char>*/ , subtract_gpu<unsigned char, unsigned short>    , subtract_gpu<unsigned char, short>       , subtract_gpu<unsigned char, int>    , subtract_gpu<unsigned char, float>    , subtract_gpu<unsigned char, double>    },
@ -474,7 +474,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
        {0 /*subtract_gpu<double, unsigned char>*/        , 0 /*subtract_gpu<double, signed char>*/        , 0 /*subtract_gpu<double, unsigned short>*/     , 0 /*subtract_gpu<double, short>*/        , 0 /*subtract_gpu<double, int>*/     , 0 /*subtract_gpu<double, float>*/     , subtract_gpu<double, double>           }
    };

-    typedef void (*npp_func_t)(const DevMem2Db src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
+    typedef void (*npp_func_t)(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
    static const npp_func_t npp_funcs[6] =
    {
        NppArithm<CV_8U , nppiSub_8u_C1RSfs>::call,
@ -520,7 +520,7 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G
 {
    using namespace cv::gpu::device;

-    typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
+    typedef void (*func_t)(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
    static const func_t funcs[7][7] =
    {
        {subtract_gpu<unsigned char, unsigned char>       , 0 /*subtract_gpu<unsigned char, signed char>*/ , subtract_gpu<unsigned char, unsigned short>    , subtract_gpu<unsigned char, short>       , subtract_gpu<unsigned char, int>    , subtract_gpu<unsigned char, float>    , subtract_gpu<unsigned char, double>    },
@ -532,7 +532,7 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G
        {0 /*subtract_gpu<double, unsigned char>*/        , 0 /*subtract_gpu<double, signed char>*/        , 0 /*subtract_gpu<double, unsigned short>*/     , 0 /*subtract_gpu<double, short>*/        , 0 /*subtract_gpu<double, int>*/     , 0 /*subtract_gpu<double, float>*/     , subtract_gpu<double, double>           }
    };

-    typedef void (*npp_func_t)(const DevMem2Db src, Scalar sc, PtrStepb dst, cudaStream_t stream);
+    typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream);
    static const npp_func_t npp_funcs[7][4] =
    {
        {NppArithmScalar<CV_8U , 1, nppiSubC_8u_C1RSfs >::call, 0                                                     , NppArithmScalar<CV_8U , 3, nppiSubC_8u_C3RSfs >::call, NppArithmScalar<CV_8U , 4, nppiSubC_8u_C4RSfs >::call},
@ -587,14 +587,14 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G

 namespace cv { namespace gpu { namespace device
 {
-    void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);
-    void multiply_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);
+    void multiply_gpu(const PtrStepSz<uchar4>& src1, const PtrStepSzf& src2, const PtrStepSz<uchar4>& dst, cudaStream_t stream);
+    void multiply_gpu(const PtrStepSz<short4>& src1, const PtrStepSzf& src2, const PtrStepSz<short4>& dst, cudaStream_t stream);

    template <typename T, typename D>
-    void multiply_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);
+    void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);

    template <typename T, typename D>
-    void multiply_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);
+    void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
 }}}

 void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s)
@ -609,7 +609,7 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub

        dst.create(src1.size(), src1.type());

-        multiply_gpu(static_cast<DevMem2D_<uchar4> >(src1), static_cast<DevMem2Df>(src2), static_cast<DevMem2D_<uchar4> >(dst), stream);
+        multiply_gpu(static_cast<PtrStepSz<uchar4> >(src1), static_cast<PtrStepSzf>(src2), static_cast<PtrStepSz<uchar4> >(dst), stream);
    }
    else if (src1.type() == CV_16SC4 && src2.type() == CV_32FC1)
    {
@ -617,11 +617,11 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub

        dst.create(src1.size(), src1.type());

-        multiply_gpu(static_cast<DevMem2D_<short4> >(src1), static_cast<DevMem2Df>(src2), static_cast<DevMem2D_<short4> >(dst), stream);
+        multiply_gpu(static_cast<PtrStepSz<short4> >(src1), static_cast<PtrStepSzf>(src2), static_cast<PtrStepSz<short4> >(dst), stream);
    }
    else
    {
-        typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);
+        typedef void (*func_t)(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
        static const func_t funcs[7][7] =
        {
            {multiply_gpu<unsigned char, unsigned char>       , 0 /*multiply_gpu<unsigned char, signed char>*/ , multiply_gpu<unsigned char, unsigned short>    , multiply_gpu<unsigned char, short>       , multiply_gpu<unsigned char, int>    , multiply_gpu<unsigned char, float>    , multiply_gpu<unsigned char, double>    },
@ -633,7 +633,7 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub
            {0 /*multiply_gpu<double, unsigned char>*/        , 0 /*multiply_gpu<double, signed char>*/        , 0 /*multiply_gpu<double, unsigned short>*/     , 0 /*multiply_gpu<double, short>*/        , 0 /*multiply_gpu<double, int>*/     , 0 /*multiply_gpu<double, float>*/     , multiply_gpu<double, double>           }
        };

-        typedef void (*npp_func_t)(const DevMem2Db src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
+        typedef void (*npp_func_t)(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
        static const npp_func_t npp_funcs[] =
        {
            NppArithm<CV_8U , nppiMul_8u_C1RSfs >::call,
@ -689,7 +689,7 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
 {
    using namespace cv::gpu::device;

-    typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);
+    typedef void (*func_t)(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
    static const func_t funcs[7][7] =
    {
        {multiply_gpu<unsigned char, unsigned char>       , 0 /*multiply_gpu<unsigned char, signed char>*/ , multiply_gpu<unsigned char, unsigned short>    , multiply_gpu<unsigned char, short>       , multiply_gpu<unsigned char, int>    , multiply_gpu<unsigned char, float>    , multiply_gpu<unsigned char, double>    },
@ -701,7 +701,7 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
        {0 /*multiply_gpu<double, unsigned char>*/        , 0 /*multiply_gpu<double, signed char>*/        , 0 /*multiply_gpu<double, unsigned short>*/     , 0 /*multiply_gpu<double, short>*/        , 0 /*multiply_gpu<double, int>*/     , 0 /*multiply_gpu<double, float>*/     , multiply_gpu<double, double>           }
    };

-    typedef void (*npp_func_t)(const DevMem2Db src, Scalar sc, PtrStepb dst, cudaStream_t stream);
+    typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream);
    static const npp_func_t npp_funcs[7][4] =
    {
        {NppArithmScalar<CV_8U , 1, nppiMulC_8u_C1RSfs >::call, 0, NppArithmScalar<CV_8U , 3, nppiMulC_8u_C3RSfs >::call, NppArithmScalar<CV_8U , 4, nppiMulC_8u_C4RSfs >::call},
@ -755,17 +755,17 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double

 namespace cv { namespace gpu { namespace device
 {
-    void divide_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);
-    void divide_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);
+    void divide_gpu(const PtrStepSz<uchar4>& src1, const PtrStepSzf& src2, const PtrStepSz<uchar4>& dst, cudaStream_t stream);
+    void divide_gpu(const PtrStepSz<short4>& src1, const PtrStepSzf& src2, const PtrStepSz<short4>& dst, cudaStream_t stream);

    template <typename T, typename D>
-    void divide_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);
+    void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);

    template <typename T, typename D>
-    void divide_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);
+    void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);

    template <typename T, typename D>
-    void divide_gpu(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
+    void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
 }}}

 void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s)
@ -780,7 +780,7 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double

        dst.create(src1.size(), src1.type());

-        divide_gpu(static_cast<DevMem2D_<uchar4> >(src1), static_cast<DevMem2Df>(src2), static_cast<DevMem2D_<uchar4> >(dst), stream);
+        divide_gpu(static_cast<PtrStepSz<uchar4> >(src1), static_cast<PtrStepSzf>(src2), static_cast<PtrStepSz<uchar4> >(dst), stream);
    }
    else if (src1.type() == CV_16SC4 && src2.type() == CV_32FC1)
    {
@ -788,11 +788,11 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double

        dst.create(src1.size(), src1.type());

-        divide_gpu(static_cast<DevMem2D_<short4> >(src1), static_cast<DevMem2Df>(src2), static_cast<DevMem2D_<short4> >(dst), stream);
+        divide_gpu(static_cast<PtrStepSz<short4> >(src1), static_cast<PtrStepSzf>(src2), static_cast<PtrStepSz<short4> >(dst), stream);
    }
    else
    {
-        typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);
+        typedef void (*func_t)(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
        static const func_t funcs[7][7] =
        {
            {divide_gpu<unsigned char, unsigned char>       , 0 /*divide_gpu<unsigned char, signed char>*/ , divide_gpu<unsigned char, unsigned short>    , divide_gpu<unsigned char, short>       , divide_gpu<unsigned char, int>    , divide_gpu<unsigned char, float>    , divide_gpu<unsigned char, double>    },
@ -804,7 +804,7 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double
            {0 /*divide_gpu<double, unsigned char>*/        , 0 /*divide_gpu<double, signed char>*/        , 0 /*divide_gpu<double, unsigned short>*/     , 0 /*divide_gpu<double, short>*/        , 0 /*divide_gpu<double, int>*/     , 0 /*divide_gpu<double, float>*/     , divide_gpu<double, double>           }
        };

-        typedef void (*npp_func_t)(const DevMem2Db src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
+        typedef void (*npp_func_t)(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
        static const npp_func_t npp_funcs[6] =
        {
            NppArithm<CV_8U , nppiDiv_8u_C1RSfs >::call,
@ -848,7 +848,7 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc
 {
    using namespace cv::gpu::device;

-    typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);
+    typedef void (*func_t)(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
    static const func_t funcs[7][7] =
    {
        {divide_gpu<unsigned char, unsigned char>       , 0 /*divide_gpu<unsigned char, signed char>*/ , divide_gpu<unsigned char, unsigned short>    , divide_gpu<unsigned char, short>       , divide_gpu<unsigned char, int>    , divide_gpu<unsigned char, float>    , divide_gpu<unsigned char, double>    },
@ -860,7 +860,7 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc
        {0 /*divide_gpu<double, unsigned char>*/        , 0 /*divide_gpu<double, signed char>*/        , 0 /*divide_gpu<double, unsigned short>*/     , 0 /*divide_gpu<double, short>*/        , 0 /*divide_gpu<double, int>*/     , 0 /*divide_gpu<double, float>*/     , divide_gpu<double, double>           }
    };

-    typedef void (*npp_func_t)(const DevMem2Db src, Scalar sc, PtrStepb dst, cudaStream_t stream);
+    typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream);
    static const npp_func_t npp_funcs[7][4] =
    {
        {NppArithmScalar<CV_8U , 1, nppiDivC_8u_C1RSfs >::call, 0, NppArithmScalar<CV_8U , 3, nppiDivC_8u_C3RSfs >::call, NppArithmScalar<CV_8U , 4, nppiDivC_8u_C4RSfs >::call},
@ -913,7 +913,7 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St
 {
    using namespace cv::gpu::device;

-    typedef void (*func_t)(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
+    typedef void (*func_t)(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
    static const func_t funcs[7][7] =
    {
        {divide_gpu<unsigned char, unsigned char>       , 0 /*divide_gpu<unsigned char, signed char>*/ , divide_gpu<unsigned char, unsigned short>    , divide_gpu<unsigned char, short>       , divide_gpu<unsigned char, int>    , divide_gpu<unsigned char, float>    , divide_gpu<unsigned char, double>    },
@ -955,10 +955,10 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St
 namespace cv { namespace gpu { namespace device
 {
    template <typename T>
-    void absdiff_gpu(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);

    template <typename T>
-    void absdiff_gpu(const DevMem2Db src1, double val, DevMem2Db dst, cudaStream_t stream);
+    void absdiff_gpu(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 }}}

 namespace
@ -974,7 +974,7 @@ namespace
    {
        typedef typename NppAbsDiffFunc<DEPTH>::npp_t npp_t;

-        static void call(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream)
+        static void call(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
        {
            NppStreamHandler h(stream);

@ -1014,7 +1014,7 @@ namespace
        typedef typename NppAbsDiffCFunc<DEPTH>::npp_t npp_t;
        typedef typename NppAbsDiffCFunc<DEPTH>::scalar_t scalar_t;

-        static void call(const DevMem2Db src1, double val, DevMem2Db dst, cudaStream_t stream)
+        static void call(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
        {
            NppStreamHandler h(stream);

@ -1035,7 +1035,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
 {
    using namespace cv::gpu::device;

-    typedef void (*func_t)(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    typedef void (*func_t)(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        NppAbsDiff<CV_8U, nppiAbsDiff_8u_C1R>::call,
@ -1065,7 +1065,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Strea
 {
    using namespace cv::gpu::device;

-    typedef void (*func_t)(const DevMem2Db src1, double val, DevMem2Db dst, cudaStream_t stream);
+    typedef void (*func_t)(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        NppAbsDiffC<CV_8U, nppiAbsDiffC_8u_C1R>::call,
@ -1366,24 +1366,24 @@ void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream)

 namespace cv { namespace gpu { namespace device
 {
-    template <typename T> void compare_eq(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
-    template <typename T> void compare_ne(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
-    template <typename T> void compare_lt(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
-    template <typename T> void compare_le(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template <typename T> void compare_eq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void compare_ne(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void compare_lt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void compare_le(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);

-    template <typename T> void compare_eq(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
-    template <typename T> void compare_ne(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
-    template <typename T> void compare_lt(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
-    template <typename T> void compare_le(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
-    template <typename T> void compare_gt(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
-    template <typename T> void compare_ge(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template <typename T> void compare_eq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void compare_ne(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void compare_lt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void compare_le(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void compare_gt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void compare_ge(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 }}}

 void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop, Stream& stream)
 {
    using namespace cv::gpu::device;

-    typedef void (*func_t)(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    static const func_t funcs[7][4] =
    {
        {compare_eq<unsigned char> , compare_ne<unsigned char> , compare_lt<unsigned char> , compare_le<unsigned char> },
@ -1439,7 +1439,7 @@ void cv::gpu::compare(const GpuMat& src, Scalar sc, GpuMat& dst, int cmpop, Stre
 {
    using namespace cv::gpu::device;

-    typedef void (*func_t)(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    typedef void (*func_t)(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
    static const func_t funcs[7][6] =
    {
        {compare_eq<unsigned char> , compare_gt<unsigned char> , compare_ge<unsigned char> , compare_lt<unsigned char> , compare_le<unsigned char> , compare_ne<unsigned char> },
@ -1900,18 +1900,18 @@ void cv::gpu::lshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& st

 namespace cv { namespace gpu { namespace device
 {
-    template <typename T> void min_gpu(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
-    template <typename T> void max_gpu(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template <typename T> void min_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void max_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);

-    template <typename T> void min_gpu(const DevMem2Db src, T val, DevMem2Db dst, cudaStream_t stream);
-    template <typename T> void max_gpu(const DevMem2Db src, T val, DevMem2Db dst, cudaStream_t stream);
+    template <typename T> void min_gpu(const PtrStepSzb src, T val, PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void max_gpu(const PtrStepSzb src, T val, PtrStepSzb dst, cudaStream_t stream);
 }}}

 void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
 {
    using namespace cv::gpu::device;

-    typedef void (*func_t)(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    typedef void (*func_t)(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        min_gpu<unsigned char>,
@ -1941,7 +1941,7 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
 {
    using namespace cv::gpu::device;

-    typedef void (*func_t)(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    typedef void (*func_t)(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        max_gpu<unsigned char>,
@ -1969,12 +1969,12 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s

 namespace
 {
-    template <typename T> void minScalar(const DevMem2Db src, double val, DevMem2Db dst, cudaStream_t stream)
+    template <typename T> void minScalar(const PtrStepSzb src, double val, PtrStepSzb dst, cudaStream_t stream)
    {
        cv::gpu::device::min_gpu(src, saturate_cast<T>(val), dst, stream);
    }

-    template <typename T> void maxScalar(const DevMem2Db src, double val, DevMem2Db dst, cudaStream_t stream)
+    template <typename T> void maxScalar(const PtrStepSzb src, double val, PtrStepSzb dst, cudaStream_t stream)
    {
        cv::gpu::device::max_gpu(src, saturate_cast<T>(val), dst, stream);
    }
@ -1982,7 +1982,7 @@ namespace

 void cv::gpu::min(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
 {
-    typedef void (*func_t)(const DevMem2Db src1, double src2, DevMem2Db dst, cudaStream_t stream);
+    typedef void (*func_t)(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        minScalar<unsigned char>,
@ -2010,7 +2010,7 @@ void cv::gpu::min(const GpuMat& src, double val, GpuMat& dst, Stream& stream)

 void cv::gpu::max(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
 {
-    typedef void (*func_t)(const DevMem2Db src1, double src2, DevMem2Db dst, cudaStream_t stream);
+    typedef void (*func_t)(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        maxScalar<unsigned char>,
@ -2042,7 +2042,7 @@ void cv::gpu::max(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
 namespace cv { namespace gpu { namespace device
 {
    template <typename T>
-    void threshold_gpu(const DevMem2Db& src, const DevMem2Db& dst, T thresh, T maxVal, int type, cudaStream_t stream);
+    void threshold_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, T thresh, T maxVal, int type, cudaStream_t stream);
 }}}

 namespace
@ -2109,14 +2109,14 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double

 namespace cv { namespace gpu { namespace device
 {
-    template<typename T> void pow_caller(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
+    template<typename T> void pow_caller(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
 }}}

 void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
 {
    using namespace cv::gpu::device;

-    typedef void (*func_t)(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
+    typedef void (*func_t)(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        pow_caller<unsigned char>,  pow_caller<signed char>,
@ -2216,14 +2216,14 @@ void cv::gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int
 namespace cv { namespace gpu { namespace device
 {
    template <typename T1, typename T2, typename D>
-    void addWeighted_gpu(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);
+    void addWeighted_gpu(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
 }}}

 void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int dtype, Stream& stream)
 {
    using namespace cv::gpu::device;

-    typedef void (*func_t)(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);
+    typedef void (*func_t)(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);

    static const func_t funcs[7][7][7] =
    {
--- a/modules/gpu/src/fast.cpp
+++ b/modules/gpu/src/fast.cpp
@ -113,8 +113,8 @@ namespace cv { namespace gpu { namespace device
 {
    namespace fast
    {
-        int calcKeypoints_gpu(DevMem2Db img, DevMem2Db mask, short2* kpLoc, int maxKeypoints, DevMem2Di score, int threshold);
-        int nonmaxSupression_gpu(const short2* kpLoc, int count, DevMem2Di score, short2* loc, float* response);
+        int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold);
+        int nonmaxSupression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response);
    }
 }}}

@ -138,7 +138,7 @@ int cv::gpu::FAST_GPU::calcKeyPointsLocation(const GpuMat& img, const GpuMat& ma
        score_.setTo(Scalar::all(0));
    }

-    count_ = calcKeypoints_gpu(img, mask, kpLoc_.ptr<short2>(), maxKeypoints, nonmaxSupression ? score_ : DevMem2Di(), threshold);
+    count_ = calcKeypoints_gpu(img, mask, kpLoc_.ptr<short2>(), maxKeypoints, nonmaxSupression ? score_ : PtrStepSzi(), threshold);
    count_ = std::min(count_, maxKeypoints);

    return count_;
--- a/modules/gpu/src/fgd_bgfg.cpp
+++ b/modules/gpu/src/fgd_bgfg.cpp
@ -336,7 +336,7 @@ namespace
 {
    void calcDiffHistogram(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame, cv::gpu::GpuMat& hist, cv::gpu::GpuMat& histBuf)
    {
-        typedef void (*func_t)(cv::gpu::DevMem2Db prevFrame, cv::gpu::DevMem2Db curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
+        typedef void (*func_t)(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
        static const func_t funcs[4][4] =
        {
            {0,0,0,0},
@ -401,7 +401,7 @@ namespace

    void calcDiffThreshMask(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame, cv::Vec3d bestThres, cv::gpu::GpuMat& changeMask)
    {
-        typedef void (*func_t)(cv::gpu::DevMem2Db prevFrame, cv::gpu::DevMem2Db curFrame, uchar3 bestThres, cv::gpu::DevMem2Db changeMask, cudaStream_t stream);
+        typedef void (*func_t)(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, uchar3 bestThres, cv::gpu::PtrStepSzb changeMask, cudaStream_t stream);
        static const func_t funcs[4][4] =
        {
            {0,0,0,0},
@ -450,7 +450,7 @@ namespace
                           cv::gpu::GpuMat& foreground, cv::gpu::GpuMat& countBuf,
                           const cv::gpu::FGDStatModel::Params& params, int out_cn)
    {
-        typedef void (*func_t)(cv::gpu::DevMem2Db prevFrame, cv::gpu::DevMem2Db curFrame, cv::gpu::DevMem2Db Ftd, cv::gpu::DevMem2Db Fbd, cv::gpu::DevMem2Db foreground,
+        typedef void (*func_t)(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, cv::gpu::PtrStepSzb Ftd, cv::gpu::PtrStepSzb Fbd, cv::gpu::PtrStepSzb foreground,
                               int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
        static const func_t funcs[4][4][4] =
        {
@ -602,8 +602,8 @@ namespace
                               const cv::gpu::GpuMat& foreground, cv::gpu::GpuMat& background,
                               const cv::gpu::FGDStatModel::Params& params)
    {
-        typedef void (*func_t)(cv::gpu::DevMem2Db prevFrame, cv::gpu::DevMem2Db curFrame, cv::gpu::DevMem2Db Ftd, cv::gpu::DevMem2Db Fbd,
-                               cv::gpu::DevMem2Db foreground, cv::gpu::DevMem2Db background,
+        typedef void (*func_t)(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, cv::gpu::PtrStepSzb Ftd, cv::gpu::PtrStepSzb Fbd,
+                               cv::gpu::PtrStepSzb foreground, cv::gpu::PtrStepSzb background,
                               int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
        static const func_t funcs[4][4][4] =
        {
--- a/modules/gpu/src/filtering.cpp
+++ b/modules/gpu/src/filtering.cpp
@ -664,7 +664,7 @@ namespace cv { namespace gpu { namespace device
    namespace imgproc
    {
        template <typename T, typename D>
-        void filter2D_gpu(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst,
+        void filter2D_gpu(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
                          int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
                          int borderMode, const float* borderValue, cudaStream_t stream);
    }
@ -708,7 +708,7 @@ namespace
        nppFilter2D_t func;
    };

-    typedef void (*gpuFilter2D_t)(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst,
+    typedef void (*gpuFilter2D_t)(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
                                   int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
                                   int borderMode, const float* borderValue, cudaStream_t stream);

@ -833,13 +833,13 @@ namespace cv { namespace gpu { namespace device
    namespace row_filter
    {
        template <typename T, typename D>
-        void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+        void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
    }

    namespace column_filter
    {
        template <typename T, typename D>
-        void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+        void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
    }
 }}}

@ -848,7 +848,7 @@ namespace
    typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI,
        const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor);

-    typedef void (*gpuFilter1D_t)(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+    typedef void (*gpuFilter1D_t)(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);

    struct NppLinearRowFilter : public BaseRowFilter_GPU
    {
--- a/modules/gpu/src/gftt.cpp
+++ b/modules/gpu/src/gftt.cpp
@ -56,8 +56,8 @@ namespace cv { namespace gpu { namespace device
 {
    namespace gfft
    {
-        int findCorners_gpu(DevMem2Df eig, float threshold, DevMem2Db mask, float2* corners, int max_count);
-        void sortCorners_gpu(DevMem2Df eig, float2* corners, int count);
+        int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count);
+        void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count);
    }
 }}}

--- a/modules/gpu/src/global_motion.cpp
+++ b/modules/gpu/src/global_motion.cpp
@ -60,7 +60,7 @@ namespace cv { namespace gpu { namespace device { namespace globmotion {

    void calcWobbleSuppressionMaps(
            int left, int idx, int right, int width, int height,
-            const float *ml, const float *mr, DevMem2Df mapx, DevMem2Df mapy);
+            const float *ml, const float *mr, PtrStepSzf mapx, PtrStepSzf mapy);

 }}}}

--- a/modules/gpu/src/graphcuts.cpp
+++ b/modules/gpu/src/graphcuts.cpp
@ -56,10 +56,10 @@ namespace cv { namespace gpu { namespace device
 {
    namespace ccl
    {
-        void labelComponents(const DevMem2D& edges, DevMem2Di comps, int flags, cudaStream_t stream);
+        void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream);

        template<typename T>
-        void computeEdges(const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        void computeEdges(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
    }
 }}}

@ -77,7 +77,7 @@ void cv::gpu::connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Scal

    int depth = image.depth();

-    typedef void (*func_t)(const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+    typedef void (*func_t)(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);

    static const func_t suppotLookup[8][4] =
    {   //    1,    2,     3,     4
--- a/modules/gpu/src/hog.cpp
+++ b/modules/gpu/src/hog.cpp
@ -70,8 +70,8 @@ namespace cv { namespace gpu { namespace device
                              int nblocks_win_x, int nblocks_win_y);

        void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,
-                           int height, int width, const cv::gpu::DevMem2Df& grad,
-                           const cv::gpu::DevMem2Db& qangle, float sigma, float* block_hists);
+                           int height, int width, const cv::gpu::PtrStepSzf& grad,
+                           const cv::gpu::PtrStepSzb& qangle, float sigma, float* block_hists);

        void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
                             int height, int width, float* block_hists, float threshold);
@ -87,18 +87,18 @@ namespace cv { namespace gpu { namespace device

        void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
-                                    cv::gpu::DevMem2Df descriptors);
+                                    cv::gpu::PtrStepSzf descriptors);
        void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
-                                    cv::gpu::DevMem2Df descriptors);
+                                    cv::gpu::PtrStepSzf descriptors);

-        void compute_gradients_8UC1(int nbins, int height, int width, const cv::gpu::DevMem2Db& img,
-                                    float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma);
-        void compute_gradients_8UC4(int nbins, int height, int width, const cv::gpu::DevMem2Db& img,
-                                    float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma);
+        void compute_gradients_8UC1(int nbins, int height, int width, const cv::gpu::PtrStepSzb& img,
+                                    float angle_scale, cv::gpu::PtrStepSzf grad, cv::gpu::PtrStepSzb qangle, bool correct_gamma);
+        void compute_gradients_8UC4(int nbins, int height, int width, const cv::gpu::PtrStepSzb& img,
+                                    float angle_scale, cv::gpu::PtrStepSzf grad, cv::gpu::PtrStepSzb qangle, bool correct_gamma);

-        void resize_8UC1(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst);
-        void resize_8UC4(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst);
+        void resize_8UC1(const cv::gpu::PtrStepSzb& src, cv::gpu::PtrStepSzb dst);
+        void resize_8UC4(const cv::gpu::PtrStepSzb& src, cv::gpu::PtrStepSzb dst);
    }
 }}}

--- a/modules/gpu/src/hough.cpp
+++ b/modules/gpu/src/hough.cpp
@ -58,13 +58,13 @@ namespace cv { namespace gpu { namespace device
 {
    namespace hough
    {
-        int buildPointList_gpu(DevMem2Db src, unsigned int* list);
+        int buildPointList_gpu(PtrStepSzb src, unsigned int* list);

-        void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock, bool has20);
-        int linesGetResult_gpu(DevMem2Di accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort);
+        void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20);
+        int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort);

-        void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, DevMem2Di accum, int minRadius, int maxRadius, float idp);
-        int buildCentersList_gpu(DevMem2Di accum, unsigned int* centers, int threshold);
+        void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp);
+        int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold);
        int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
                                   float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20);
    }
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@ -104,7 +104,7 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
+        void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
    }
 }}}

@ -140,7 +140,7 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
+        void meanShiftProc_gpu(const PtrStepSzb& src, PtrStepSzb dstr, PtrStepSzb dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
    }
 }}}

@ -177,8 +177,8 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        void drawColorDisp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream);
-        void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream);
+        void drawColorDisp_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream);
+        void drawColorDisp_gpu(const PtrStepSz<short>& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream);
    }
 }}}

@ -191,7 +191,7 @@ namespace

        dst.create(src.size(), CV_8UC4);

-        drawColorDisp_gpu((DevMem2D_<T>)src, dst, ndisp, stream);
+        drawColorDisp_gpu((PtrStepSz<T>)src, dst, ndisp, stream);
    }

    typedef void (*drawColorDisp_caller_t)(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream);
@ -214,7 +214,7 @@ namespace cv { namespace gpu { namespace device
    namespace imgproc
    {
        template <typename T, typename D>
-        void reprojectImageTo3D_gpu(const DevMem2Db disp, DevMem2Db xyz, const float* q, cudaStream_t stream);
+        void reprojectImageTo3D_gpu(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
    }
 }}}

@ -222,7 +222,7 @@ void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyz, const Mat& Q,
 {
    using namespace cv::gpu::device::imgproc;

-    typedef void (*func_t)(const DevMem2Db disp, DevMem2Db xyz, const float* q, cudaStream_t stream);
+    typedef void (*func_t)(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
    static const func_t funcs[2][4] =
    {
        {reprojectImageTo3D_gpu<uchar, float3>, 0, 0, reprojectImageTo3D_gpu<short, float3>},
@ -245,13 +245,13 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const T* borderValue, cudaStream_t stream);
+        template <typename T, int cn> void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const T* borderValue, cudaStream_t stream);
    }
 }}}

 namespace
 {
-    template <typename T, int cn> void copyMakeBorder_caller(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream)
+    template <typename T, int cn> void copyMakeBorder_caller(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream)
    {
        using namespace ::cv::gpu::device::imgproc;

@ -326,7 +326,7 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom
    }
    else
    {
-        typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream);
+        typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream);
        static const caller_t callers[6][4] =
        {
            {   copyMakeBorder_caller<uchar, 1>  , 0/*copyMakeBorder_caller<uchar, 2>*/ ,    copyMakeBorder_caller<uchar, 3>  ,    copyMakeBorder_caller<uchar, 4>},
@ -354,7 +354,7 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
+        void buildWarpPlaneMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
                                const float k_rinv[9], const float r_kinv[9], const float t[3], float scale,
                                cudaStream_t stream);
    }
@ -388,7 +388,7 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        void buildWarpCylindricalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
+        void buildWarpCylindricalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
                                      const float k_rinv[9], const float r_kinv[9], float scale,
                                      cudaStream_t stream);
    }
@ -421,7 +421,7 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
+        void buildWarpSphericalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
                                    const float k_rinv[9], const float r_kinv[9], float scale,
                                    cudaStream_t stream);
    }
@ -537,7 +537,7 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        void shfl_integral_gpu(DevMem2Db img, DevMem2D_<unsigned int> integral, cudaStream_t stream);
+        void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
    }
 }}}

@ -652,7 +652,7 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        void columnSum_32F(const DevMem2Db src, const DevMem2Db dst);
+        void columnSum_32F(const PtrStepSzb src, const PtrStepSzb dst);
    }
 }}}

@ -991,12 +991,12 @@ namespace cv { namespace gpu { namespace device
 {
    namespace hist
    {
-        void histogram256_gpu(DevMem2Db src, int* hist, unsigned int* buf, cudaStream_t stream);
+        void histogram256_gpu(PtrStepSzb src, int* hist, unsigned int* buf, cudaStream_t stream);

        const int PARTIAL_HISTOGRAM256_COUNT = 240;
        const int HISTOGRAM256_BIN_COUNT     = 256;

-        void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream);
+        void equalizeHist_gpu(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream);
    }
 }}}

@ -1072,8 +1072,8 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        void cornerHarris_gpu(int block_size, float k, DevMem2Df Dx, DevMem2Df Dy, DevMem2Df dst, int border_type, cudaStream_t stream);
-        void cornerMinEigenVal_gpu(int block_size, DevMem2Df Dx, DevMem2Df Dy, DevMem2Df dst, int border_type, cudaStream_t stream);
+        void cornerHarris_gpu(int block_size, float k, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream);
+        void cornerMinEigenVal_gpu(int block_size, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream);
    }
 }}}

@ -1195,9 +1195,9 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);
+        void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);

-        void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);
+        void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);
    }
 }}}

@ -1206,7 +1206,7 @@ void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flag
    (void)flags;
    using namespace ::cv::gpu::device::imgproc;

-    typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, DevMem2D_<cufftComplex>, cudaStream_t stream);
+    typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, PtrStepSz<cufftComplex>, cudaStream_t stream);

    static Caller callers[] = { device::imgproc::mulSpectrums, device::imgproc::mulSpectrums_CONJ };

@ -1226,9 +1226,9 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);
+        void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);

-        void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);
+        void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);
    }
 }}}

@ -1237,7 +1237,7 @@ void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c,
    (void)flags;
    using namespace ::cv::gpu::device::imgproc;

-    typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, DevMem2D_<cufftComplex>, cudaStream_t stream);
+    typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, PtrStepSz<cufftComplex>, cudaStream_t stream);
    static Caller callers[] = { device::imgproc::mulAndScaleSpectrums, device::imgproc::mulAndScaleSpectrums_CONJ };

    CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);
--- a/modules/gpu/src/match_template.cpp
+++ b/modules/gpu/src/match_template.cpp
@ -56,85 +56,85 @@ namespace cv { namespace gpu { namespace device
 {
    namespace match_template
    {
-        void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
-        void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
+        void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
+        void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);

-        void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
-        void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
+        void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
+        void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);

-        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned long long templ_sqsum, DevMem2Df result,
+        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result,
            int cn, cudaStream_t stream);

-        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned long long templ_sqsum, DevMem2Df result,
+        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result,
            int cn, cudaStream_t stream);

-        void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_<unsigned int> image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream);
+        void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_8UC2(
            int w, int h,
-            const DevMem2D_<unsigned int> image_sum_r,
-            const DevMem2D_<unsigned int> image_sum_g,
+            const PtrStepSz<unsigned int> image_sum_r,
+            const PtrStepSz<unsigned int> image_sum_g,
            unsigned int templ_sum_r,
            unsigned int templ_sum_g,
-            DevMem2Df result, cudaStream_t stream);
+            PtrStepSzf result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_8UC3(
                int w, int h,
-                const DevMem2D_<unsigned int> image_sum_r,
-                const DevMem2D_<unsigned int> image_sum_g,
-                const DevMem2D_<unsigned int> image_sum_b,
+                const PtrStepSz<unsigned int> image_sum_r,
+                const PtrStepSz<unsigned int> image_sum_g,
+                const PtrStepSz<unsigned int> image_sum_b,
                unsigned int templ_sum_r,
                unsigned int templ_sum_g,
                unsigned int templ_sum_b,
-                DevMem2Df result, cudaStream_t stream);
+                PtrStepSzf result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_8UC4(
                int w, int h,
-                const DevMem2D_<unsigned int> image_sum_r,
-                const DevMem2D_<unsigned int> image_sum_g,
-                const DevMem2D_<unsigned int> image_sum_b,
-                const DevMem2D_<unsigned int> image_sum_a,
+                const PtrStepSz<unsigned int> image_sum_r,
+                const PtrStepSz<unsigned int> image_sum_g,
+                const PtrStepSz<unsigned int> image_sum_b,
+                const PtrStepSz<unsigned int> image_sum_a,
                unsigned int templ_sum_r,
                unsigned int templ_sum_g,
                unsigned int templ_sum_b,
                unsigned int templ_sum_a,
-                DevMem2Df result, cudaStream_t stream);
+                PtrStepSzf result, cudaStream_t stream);


        void matchTemplatePrepared_CCOFF_NORMED_8U(
-                int w, int h, const DevMem2D_<unsigned int> image_sum,
-                const DevMem2D_<unsigned long long> image_sqsum,
+                int w, int h, const PtrStepSz<unsigned int> image_sum,
+                const PtrStepSz<unsigned long long> image_sqsum,
                unsigned int templ_sum, unsigned long long templ_sqsum,
-                DevMem2Df result, cudaStream_t stream);
+                PtrStepSzf result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_NORMED_8UC2(
                int w, int h,
-                const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
-                const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
+                const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
+                const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
                unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
-                DevMem2Df result, cudaStream_t stream);
+                PtrStepSzf result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_NORMED_8UC3(
                int w, int h,
-                const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
-                const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
-                const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,
+                const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
+                const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
+                const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
                unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
-                DevMem2Df result, cudaStream_t stream);
+                PtrStepSzf result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_NORMED_8UC4(
                int w, int h,
-                const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
-                const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
-                const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,
-                const DevMem2D_<unsigned int> image_sum_a, const DevMem2D_<unsigned long long> image_sqsum_a,
+                const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
+                const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
+                const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
+                const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
                unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
                unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
-                DevMem2Df result, cudaStream_t stream);
+                PtrStepSzf result, cudaStream_t stream);

-        void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
-                          unsigned long long templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream);
+        void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
+                          unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream);

-        void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream);
+        void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream);
    }
 }}}

--- a/modules/gpu/src/matrix_reductions.cpp
+++ b/modules/gpu/src/matrix_reductions.cpp
@ -211,22 +211,22 @@ namespace cv { namespace gpu { namespace device
        namespace sum
        {
            template <typename T>
-            void sumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);
+            void sumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);

            template <typename T>
-            void sumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);
+            void sumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);

            template <typename T>
-            void absSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);
+            void absSumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);

            template <typename T>
-            void absSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);
+            void absSumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);

            template <typename T>
-            void sqrSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);
+            void sqrSumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);

            template <typename T>
-            void sqrSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);
+            void sqrSumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);

            void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows);
        }
@ -244,7 +244,7 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
 {
    using namespace cv::gpu::device::matrix_reductions::sum;

-    typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);
+    typedef void (*Caller)(const PtrStepSzb, PtrStepb, double*, int);

    static Caller multipass_callers[] =
    {
@ -288,7 +288,7 @@ Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
 {
    using namespace cv::gpu::device::matrix_reductions::sum;

-    typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);
+    typedef void (*Caller)(const PtrStepSzb, PtrStepb, double*, int);

    static Caller multipass_callers[] =
    {
@ -333,7 +333,7 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
 {
    using namespace cv::gpu::device::matrix_reductions::sum;

-    typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);
+    typedef void (*Caller)(const PtrStepSzb, PtrStepb, double*, int);

    static Caller multipass_callers[] =
    {
@ -378,16 +378,16 @@ namespace cv { namespace gpu { namespace device
            void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows);

            template <typename T>
-            void minMaxCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf);
+            void minMaxCaller(const PtrStepSzb src, double* minval, double* maxval, PtrStepb buf);

            template <typename T>
-            void minMaxMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
+            void minMaxMaskCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);

            template <typename T>
-            void minMaxMultipassCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf);
+            void minMaxMultipassCaller(const PtrStepSzb src, double* minval, double* maxval, PtrStepb buf);

            template <typename T>
-            void minMaxMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
+            void minMaxMaskMultipassCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
        }
    }
 }}}
@ -404,8 +404,8 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
 {
    using namespace ::cv::gpu::device::matrix_reductions::minmax;

-    typedef void (*Caller)(const DevMem2Db, double*, double*, PtrStepb);
-    typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
+    typedef void (*Caller)(const PtrStepSzb, double*, double*, PtrStepb);
+    typedef void (*MaskedCaller)(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);

    static Caller multipass_callers[] =
    {
@ -488,19 +488,19 @@ namespace cv { namespace gpu { namespace device
                                    int& b1rows, int& b2cols, int& b2rows);

            template <typename T>
-            void minMaxLocCaller(const DevMem2Db src, double* minval, double* maxval,
+            void minMaxLocCaller(const PtrStepSzb src, double* minval, double* maxval,
                                 int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);

            template <typename T>
-            void minMaxLocMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,
+            void minMaxLocMaskCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval,
                                     int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);

            template <typename T>
-            void minMaxLocMultipassCaller(const DevMem2Db src, double* minval, double* maxval,
+            void minMaxLocMultipassCaller(const PtrStepSzb src, double* minval, double* maxval,
                                          int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);

            template <typename T>
-            void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,
+            void minMaxLocMaskMultipassCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval,
                                              int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
        }
    }
@ -517,8 +517,8 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
 {
    using namespace ::cv::gpu::device::matrix_reductions::minmaxloc;

-    typedef void (*Caller)(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-    typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+    typedef void (*Caller)(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+    typedef void (*MaskedCaller)(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);

    static Caller multipass_callers[] =
    {
@ -606,10 +606,10 @@ namespace cv { namespace gpu { namespace device
            void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows);

            template <typename T>
-            int countNonZeroCaller(const DevMem2Db src, PtrStepb buf);
+            int countNonZeroCaller(const PtrStepSzb src, PtrStepb buf);

            template <typename T>
-            int countNonZeroMultipassCaller(const DevMem2Db src, PtrStepb buf);
+            int countNonZeroMultipassCaller(const PtrStepSzb src, PtrStepb buf);
        }
    }
 }}}
@ -625,7 +625,7 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
 {
    using namespace ::cv::gpu::device::matrix_reductions::countnonzero;

-    typedef int (*Caller)(const DevMem2Db src, PtrStepb buf);
+    typedef int (*Caller)(const PtrStepSzb src, PtrStepb buf);

    static Caller multipass_callers[7] =
    {
@ -669,8 +669,8 @@ namespace cv { namespace gpu { namespace device
 {
    namespace matrix_reductions
    {
-        template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
-        template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
+        template <typename T, typename S, typename D> void reduceRows_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+        template <typename T, typename S, typename D> void reduceCols_gpu(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
    }
 }}}

@ -689,7 +689,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int

    if (dim == 0)
    {
-        typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
+        typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);

        static const caller_t callers[6][6] =
        {
@ -752,7 +752,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
    }
    else
    {
-        typedef void (*caller_t)(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
+        typedef void (*caller_t)(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);

        static const caller_t callers[6][6] =
        {
--- a/modules/gpu/src/nvidia/core/NCVPyramid.cu
+++ b/modules/gpu/src/nvidia/core/NCVPyramid.cu
@ -206,7 +206,7 @@ namespace cv { namespace gpu { namespace device
 {
    namespace pyramid
    {
-        template <typename T> void kernelDownsampleX2_gpu(DevMem2Db src, DevMem2Db dst, cudaStream_t stream)
+        template <typename T> void kernelDownsampleX2_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
        {
            dim3 bDim(16, 8);
            dim3 gDim(divUp(src.cols, bDim.x), divUp(src.rows, bDim.y));
@ -220,17 +220,17 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        template void kernelDownsampleX2_gpu<uchar1>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void kernelDownsampleX2_gpu<uchar3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void kernelDownsampleX2_gpu<uchar4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void kernelDownsampleX2_gpu<uchar1>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void kernelDownsampleX2_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void kernelDownsampleX2_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

-        template void kernelDownsampleX2_gpu<ushort1>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void kernelDownsampleX2_gpu<ushort3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void kernelDownsampleX2_gpu<ushort4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void kernelDownsampleX2_gpu<ushort1>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void kernelDownsampleX2_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void kernelDownsampleX2_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

-        template void kernelDownsampleX2_gpu<float1>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void kernelDownsampleX2_gpu<float3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void kernelDownsampleX2_gpu<float4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void kernelDownsampleX2_gpu<float1>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void kernelDownsampleX2_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void kernelDownsampleX2_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    }
 }}}

@ -281,7 +281,7 @@ namespace cv { namespace gpu { namespace device
 {
    namespace pyramid
    {
-        template <typename T> void kernelInterpolateFrom1_gpu(DevMem2Db src, DevMem2Db dst, cudaStream_t stream)
+        template <typename T> void kernelInterpolateFrom1_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
        {
            dim3 bDim(16, 8);
            dim3 gDim(divUp(dst.cols, bDim.x), divUp(dst.rows, bDim.y));
@ -295,17 +295,17 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        template void kernelInterpolateFrom1_gpu<uchar1>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void kernelInterpolateFrom1_gpu<uchar3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void kernelInterpolateFrom1_gpu<uchar4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void kernelInterpolateFrom1_gpu<uchar1>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void kernelInterpolateFrom1_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void kernelInterpolateFrom1_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

-        template void kernelInterpolateFrom1_gpu<ushort1>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void kernelInterpolateFrom1_gpu<ushort3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void kernelInterpolateFrom1_gpu<ushort4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void kernelInterpolateFrom1_gpu<ushort1>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void kernelInterpolateFrom1_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void kernelInterpolateFrom1_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

-        template void kernelInterpolateFrom1_gpu<float1>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void kernelInterpolateFrom1_gpu<float3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template void kernelInterpolateFrom1_gpu<float4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void kernelInterpolateFrom1_gpu<float1>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void kernelInterpolateFrom1_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void kernelInterpolateFrom1_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    }
 }}}

--- a/modules/gpu/src/opencv2/gpu/device/common.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/common.hpp
@ -44,7 +44,7 @@
 #define __OPENCV_GPU_COMMON_HPP__

 #include <cuda_runtime.h>
-#include "opencv2/core/devmem2d.hpp"
+#include "opencv2/core/cuda_devptrs.hpp"

 #ifndef CV_PI
    #define CV_PI   3.1415926535897932384626433832795
@ -101,7 +101,7 @@ namespace cv { namespace gpu
        typedef signed char schar;
        typedef unsigned int uint;

-        template<class T> inline void bindTexture(const textureReference* tex, const DevMem2D_<T>& img)
+        template<class T> inline void bindTexture(const textureReference* tex, const PtrStepSz<T>& img)
        {
            cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
            cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
--- a/modules/gpu/src/opencv2/gpu/device/detail/transform_detail.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/detail/transform_detail.hpp
@ -203,7 +203,7 @@ namespace cv { namespace gpu { namespace device
        };

        template <typename T, typename D, typename UnOp, typename Mask>
-        __global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
+        __global__ static void transformSmart(const PtrStepSz<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
        {
            typedef TransformFunctorTraits<UnOp> ft;
            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
@ -239,7 +239,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T, typename D, typename UnOp, typename Mask>
-        static __global__ void transformSimple(const DevMem2D_<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
+        static __global__ void transformSimple(const PtrStepSz<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
        {
 	        const int x = blockDim.x * blockIdx.x + threadIdx.x;
 	        const int y = blockDim.y * blockIdx.y + threadIdx.y;
@ -251,7 +251,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-        __global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_, 
+        __global__ static void transformSmart(const PtrStepSz<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_, 
            const Mask mask, const BinOp op)
        {
            typedef TransformFunctorTraits<BinOp> ft;
@ -291,7 +291,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-        static __global__ void transformSimple(const DevMem2D_<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst, 
+        static __global__ void transformSimple(const PtrStepSz<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst, 
            const Mask mask, const BinOp op)
        {
 	        const int x = blockDim.x * blockIdx.x + threadIdx.x;
@ -309,7 +309,7 @@ namespace cv { namespace gpu { namespace device
        template<> struct TransformDispatcher<false>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
-            static void call(DevMem2D_<T> src, DevMem2D_<D> dst, UnOp op, Mask mask, cudaStream_t stream)
+            static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
            {
                typedef TransformFunctorTraits<UnOp> ft;

@ -324,7 +324,7 @@ namespace cv { namespace gpu { namespace device
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-            static void call(DevMem2D_<T1> src1, DevMem2D_<T2> src2, DevMem2D_<D> dst, BinOp op, Mask mask, cudaStream_t stream)
+            static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
            {
                typedef TransformFunctorTraits<BinOp> ft;

@ -341,7 +341,7 @@ namespace cv { namespace gpu { namespace device
        template<> struct TransformDispatcher<true>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
-            static void call(DevMem2D_<T> src, DevMem2D_<D> dst, UnOp op, Mask mask, cudaStream_t stream)
+            static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
            {
                typedef TransformFunctorTraits<UnOp> ft;

@ -365,7 +365,7 @@ namespace cv { namespace gpu { namespace device
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-            static void call(DevMem2D_<T1> src1, DevMem2D_<T2> src2, DevMem2D_<D> dst, BinOp op, Mask mask, cudaStream_t stream)
+            static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
            {
                typedef TransformFunctorTraits<BinOp> ft;

--- a/modules/gpu/src/opencv2/gpu/device/transform.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp
@ -50,14 +50,14 @@
 namespace cv { namespace gpu { namespace device
 {
    template <typename T, typename D, typename UnOp, typename Mask>
-    static inline void transform(DevMem2D_<T> src, DevMem2D_<D> dst, UnOp op, const Mask& mask, cudaStream_t stream)
+    static inline void transform(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, const Mask& mask, cudaStream_t stream)
    {
        typedef TransformFunctorTraits<UnOp> ft;
        transform_detail::TransformDispatcher<VecTraits<T>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src, dst, op, mask, stream);
    }

    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-    static inline void transform(DevMem2D_<T1> src1, DevMem2D_<T2> src2, DevMem2D_<D> dst, BinOp op, const Mask& mask, cudaStream_t stream)
+    static inline void transform(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, const Mask& mask, cudaStream_t stream)
    {
        typedef TransformFunctorTraits<BinOp> ft;
        transform_detail::TransformDispatcher<VecTraits<T1>::cn == 1 && VecTraits<T2>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream);
--- a/modules/gpu/src/optical_flow.cpp
+++ b/modules/gpu/src/optical_flow.cpp
@ -193,8 +193,8 @@ namespace cv { namespace gpu { namespace device
 {
    namespace optical_flow
    {
-        void NeedleMapAverage_gpu(DevMem2Df u, DevMem2Df v, DevMem2Df u_avg, DevMem2Df v_avg);
-        void CreateOpticalFlowNeedleMap_gpu(DevMem2Df u_avg, DevMem2Df v_avg, float* vertex_buffer, float* color_data, float max_flow, float xscale, float yscale);
+        void NeedleMapAverage_gpu(PtrStepSzf u, PtrStepSzf v, PtrStepSzf u_avg, PtrStepSzf v_avg);
+        void CreateOpticalFlowNeedleMap_gpu(PtrStepSzf u_avg, PtrStepSzf v_avg, float* vertex_buffer, float* color_data, float max_flow, float xscale, float yscale);
    }
 }}}

--- a/modules/gpu/src/optical_flow_farneback.cpp
+++ b/modules/gpu/src/optical_flow_farneback.cpp
@ -66,33 +66,33 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
            int polyN, const float *g, const float *xg, const float *xxg,
            float ig11, float ig03, float ig33, float ig55);

-    void polynomialExpansionGpu(const DevMem2Df &src, int polyN, DevMem2Df dst, cudaStream_t stream);
+    void polynomialExpansionGpu(const PtrStepSzf &src, int polyN, PtrStepSzf dst, cudaStream_t stream);

    void setUpdateMatricesConsts();

    void updateMatricesGpu(
-            const DevMem2Df flowx, const DevMem2Df flowy, const DevMem2Df R0, const DevMem2Df R1,
-            DevMem2Df M, cudaStream_t stream);
+            const PtrStepSzf flowx, const PtrStepSzf flowy, const PtrStepSzf R0, const PtrStepSzf R1,
+            PtrStepSzf M, cudaStream_t stream);

    void updateFlowGpu(
-            const DevMem2Df M, DevMem2Df flowx, DevMem2Df flowy, cudaStream_t stream);
+            const PtrStepSzf M, PtrStepSzf flowx, PtrStepSzf flowy, cudaStream_t stream);

-    /*void boxFilterGpu(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream);*/
+    /*void boxFilterGpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);*/

-    void boxFilter5Gpu(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream);
+    void boxFilter5Gpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);

-    void boxFilter5Gpu_CC11(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream);
+    void boxFilter5Gpu_CC11(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);

    void setGaussianBlurKernel(const float *gKer, int ksizeHalf);

    void gaussianBlurGpu(
-            const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderType, cudaStream_t stream);
+            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderType, cudaStream_t stream);

    void gaussianBlur5Gpu(
-            const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderType, cudaStream_t stream);
+            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderType, cudaStream_t stream);

    void gaussianBlur5Gpu_CC11(
-            const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderType, cudaStream_t stream);
+            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderType, cudaStream_t stream);

 }}}} // namespace cv { namespace gpu { namespace device { namespace optflow_farneback

--- a/modules/gpu/src/orb.cpp
+++ b/modules/gpu/src/orb.cpp
@ -69,11 +69,11 @@ namespace cv { namespace gpu { namespace device
    {
        int cull_gpu(int* loc, float* response, int size, int n_points);

-        void HarrisResponses_gpu(DevMem2Db img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream);
+        void HarrisResponses_gpu(PtrStepSzb img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream);

        void loadUMax(const int* u_max, int count);

-        void IC_Angle_gpu(DevMem2Db image, const short2* loc, float* angle, int npoints, int half_k, cudaStream_t stream);
+        void IC_Angle_gpu(PtrStepSzb image, const short2* loc, float* angle, int npoints, int half_k, cudaStream_t stream);

        void computeOrbDescriptor_gpu(PtrStepb img, const short2* loc, const float* angle, const int npoints,
            const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize, int WTA_K, cudaStream_t stream);
--- a/modules/gpu/src/pyramids.cpp
+++ b/modules/gpu/src/pyramids.cpp
@ -58,7 +58,7 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        template <typename T> void pyrDown_gpu(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template <typename T> void pyrDown_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    }
 }}}

@ -66,7 +66,7 @@ void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
 {
    using namespace cv::gpu::device::imgproc;

-    typedef void (*func_t)(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

    static const func_t funcs[6][4] =
    {
@ -96,7 +96,7 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        template <typename T> void pyrUp_gpu(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template <typename T> void pyrUp_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    }
 }}}

@ -104,7 +104,7 @@ void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream)
 {
    using namespace cv::gpu::device::imgproc;

-    typedef void (*func_t)(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

    static const func_t funcs[6][4] =
    {
@ -134,8 +134,8 @@ namespace cv { namespace gpu { namespace device
 {
    namespace pyramid
    {
-        template <typename T> void kernelDownsampleX2_gpu(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template <typename T> void kernelInterpolateFrom1_gpu(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template <typename T> void kernelDownsampleX2_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template <typename T> void kernelInterpolateFrom1_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    }
 }}}

@ -143,7 +143,7 @@ void cv::gpu::ImagePyramid::build(const GpuMat& img, int numLayers, Stream& stre
 {
    using namespace cv::gpu::device::pyramid;

-    typedef void (*func_t)(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

    static const func_t funcs[6][4] =
    {
@ -191,7 +191,7 @@ void cv::gpu::ImagePyramid::getLayer(GpuMat& outImg, Size outRoi, Stream& stream
 {
    using namespace cv::gpu::device::pyramid;

-    typedef void (*func_t)(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

    static const func_t funcs[6][4] =
    {
--- a/modules/gpu/src/pyrlk.cpp
+++ b/modules/gpu/src/pyrlk.cpp
@ -61,13 +61,13 @@ namespace cv { namespace gpu { namespace device
    {
        void loadConstants(int2 winSize, int iters);

-        void lkSparse1_gpu(DevMem2Df I, DevMem2Df J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+        void lkSparse1_gpu(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
            int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
-        void lkSparse4_gpu(DevMem2D_<float4> I, DevMem2D_<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+        void lkSparse4_gpu(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
            int level, dim3 block, dim3 patch, cudaStream_t stream = 0);

-        void lkDense_gpu(DevMem2Db I, DevMem2Df J, DevMem2Df u, DevMem2Df v, DevMem2Df prevU, DevMem2Df prevV,
-                         DevMem2Df err, int2 winSize, cudaStream_t stream = 0);
+        void lkDense_gpu(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
+                         PtrStepSzf err, int2 winSize, cudaStream_t stream = 0);
    }
 }}}

@ -221,7 +221,7 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI
    int2 winSize2i = make_int2(winSize.width, winSize.height);
    loadConstants(winSize2i, iters);

-    DevMem2Df derr = err ? *err : DevMem2Df();
+    PtrStepSzf derr = err ? *err : PtrStepSzf();

    int idx = 0;

@ -230,7 +230,7 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI
        int idx2 = (idx + 1) & 1;

        lkDense_gpu(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
-            level == 0 ? derr : DevMem2Df(), winSize2i);
+            level == 0 ? derr : PtrStepSzf(), winSize2i);

        if (level > 0)
            idx = idx2;
--- a/modules/gpu/src/remap.cpp
+++ b/modules/gpu/src/remap.cpp
@ -53,7 +53,7 @@ namespace cv { namespace gpu { namespace device
    namespace imgproc 
    {
        template <typename T> 
-        void remap_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, 
+        void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, 
                       int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    }
 }}}
@ -62,7 +62,7 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
 {
    using namespace cv::gpu::device::imgproc;

-    typedef void (*func_t)(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, 
+    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, 
        int borderMode, const float* borderValue, cudaStream_t stream, int cc);

    static const func_t funcs[6][4] = 
@ -98,7 +98,7 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
    Point ofs;
    src.locateROI(wholeSize, ofs);

-    func(src, DevMem2Db(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, xmap, ymap, 
+    func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, xmap, ymap, 
        dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(stream), cc);
 }

--- a/modules/gpu/src/resize.cpp
+++ b/modules/gpu/src/resize.cpp
@ -64,8 +64,8 @@ namespace cv { namespace gpu { namespace device
    namespace imgproc
    {
        template <typename T>
-        void resize_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy,
-                        DevMem2Db dst, int interpolation, cudaStream_t stream);
+        void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy,
+                        PtrStepSzb dst, int interpolation, cudaStream_t stream);
    }
 }}}

@ -139,7 +139,7 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
    {
        using namespace ::cv::gpu::device::imgproc;

-        typedef void (*func_t)(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
+        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);

        static const func_t funcs[6][4] =
        {
@ -154,7 +154,7 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
        const func_t func = funcs[src.depth()][src.channels() - 1];
        CV_Assert(func != 0);

-        func(src, DevMem2Db(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y,
+        func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y,
            static_cast<float>(1.0 / fx), static_cast<float>(1.0 / fy), dst, interpolation, stream);
    }
 }
--- a/modules/gpu/src/split_merge.cpp
+++ b/modules/gpu/src/split_merge.cpp
@ -59,8 +59,8 @@ namespace cv { namespace gpu { namespace device
 {
    namespace split_merge
    {
-        void merge_caller(const DevMem2Db* src, DevMem2Db& dst, int total_channels, size_t elem_size, const cudaStream_t& stream);
-        void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream);
+        void merge_caller(const PtrStepSzb* src, PtrStepSzb& dst, int total_channels, size_t elem_size, const cudaStream_t& stream);
+        void split_caller(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream);
    }
 }}}

@ -102,11 +102,11 @@ namespace
        {
            dst.create(size, CV_MAKETYPE(depth, total_channels));

-            DevMem2Db src_as_devmem[4];
+            PtrStepSzb src_as_devmem[4];
            for(size_t i = 0; i < n; ++i)
                src_as_devmem[i] = src[i];

-            DevMem2Db dst_as_devmem(dst);
+            PtrStepSzb dst_as_devmem(dst);
            merge_caller(src_as_devmem, dst_as_devmem, total_channels, CV_ELEM_SIZE(depth), stream);
        }
    }
@ -137,11 +137,11 @@ namespace

        CV_Assert(num_channels <= 4);

-        DevMem2Db dst_as_devmem[4];
+        PtrStepSzb dst_as_devmem[4];
        for (int i = 0; i < num_channels; ++i)
            dst_as_devmem[i] = dst[i];

-        DevMem2Db src_as_devmem(src);
+        PtrStepSzb src_as_devmem(src);
        split_caller(src_as_devmem, dst_as_devmem, num_channels, src.elemSize1(), stream);
    }
 }
--- a/modules/gpu/src/stereobm.cpp
+++ b/modules/gpu/src/stereobm.cpp
@ -59,9 +59,9 @@ namespace cv { namespace gpu { namespace device
 {
    namespace stereobm
    {
-        void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int ndisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t & stream);
-        void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap /*= 31*/, cudaStream_t & stream);
-        void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream);
+        void stereoBM_GPU(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int ndisp, int winsz, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t & stream);
+        void prefilter_xsobel(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap /*= 31*/, cudaStream_t & stream);
+        void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream);
    }
 }}}

--- a/modules/gpu/src/stereobp.cpp
+++ b/modules/gpu/src/stereobp.cpp
@ -65,17 +65,17 @@ namespace cv { namespace gpu { namespace device
    {
        void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump);
        template<typename T, typename D>
-        void comp_data_gpu(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);
+        void comp_data_gpu(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream);
        template<typename T>
-        void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
+        void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
        template <typename T>
-        void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
+        void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
        template <typename T>
-        void calc_all_iterations_gpu(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, 
-            const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
+        void calc_all_iterations_gpu(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, 
+            const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
        template <typename T>
-        void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, 
-            const DevMem2D_<short>& disp, cudaStream_t stream);
+        void output_gpu(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, 
+            const PtrStepSz<short>& disp, cudaStream_t stream);
    }
 }}}

@ -137,7 +137,7 @@ namespace

        void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
        {
-            typedef void (*comp_data_t)(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);
+            typedef void (*comp_data_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream);
            static const comp_data_t comp_data_callers[2][5] = 
            {
                {0, comp_data_gpu<unsigned char, short>, 0, comp_data_gpu<uchar3, short>, comp_data_gpu<uchar4, short>},
@ -253,25 +253,25 @@ namespace

        void calcBP(GpuMat& disp, Stream& stream)
        {
-            typedef void (*data_step_down_t)(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
+            typedef void (*data_step_down_t)(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
            static const data_step_down_t data_step_down_callers[2] = 
            {
                data_step_down_gpu<short>, data_step_down_gpu<float>
            };
            
-            typedef void (*level_up_messages_t)(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
+            typedef void (*level_up_messages_t)(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
            static const level_up_messages_t level_up_messages_callers[2] = 
            {
                level_up_messages_gpu<short>, level_up_messages_gpu<float>
            };

-            typedef void (*calc_all_iterations_t)(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
+            typedef void (*calc_all_iterations_t)(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
            static const calc_all_iterations_t calc_all_iterations_callers[2] = 
            {
                calc_all_iterations_gpu<short>, calc_all_iterations_gpu<float>
            };

-            typedef void (*output_t)(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);
+            typedef void (*output_t)(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
            static const output_t output_callers[2] = 
            {
                output_gpu<short>, output_gpu<float>
@ -291,10 +291,10 @@ namespace
                data_step_down_callers[funcIdx](cols_all[i], rows_all[i], rows_all[i-1], datas[i-1], datas[i], cudaStream);
            }

-            DevMem2Db mus[] = {u, u2};
-            DevMem2Db mds[] = {d, d2};
-            DevMem2Db mrs[] = {r, r2};
-            DevMem2Db mls[] = {l, l2};
+            PtrStepSzb mus[] = {u, u2};
+            PtrStepSzb mds[] = {d, d2};
+            PtrStepSzb mrs[] = {r, r2};
+            PtrStepSzb mls[] = {l, l2};

            int mem_idx = (rthis.levels & 1) ? 0 : 1;

--- a/modules/gpu/src/stereocsbp.cpp
+++ b/modules/gpu/src/stereocsbp.cpp
@ -62,7 +62,7 @@ namespace cv { namespace gpu { namespace device
    namespace stereocsbp
    {
        void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
-            const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& temp);
+            const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& temp);

        template<class T>
        void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
@ -85,7 +85,7 @@ namespace cv { namespace gpu { namespace device

        template<class T> 
        void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
-            const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);
+            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream);
    }
 }}}

--- a/modules/gpu/src/surf.cpp
+++ b/modules/gpu/src/surf.cpp
@ -70,9 +70,9 @@ namespace cv { namespace gpu { namespace device
        void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
        void loadOctaveConstants(int octave, int layer_rows, int layer_cols);

-        void bindImgTex(DevMem2Db img);
-        void bindSumTex(DevMem2D_<unsigned int> sum);
-        void bindMaskSumTex(DevMem2D_<unsigned int> maskSum);
+        void bindImgTex(PtrStepSzb img);
+        void bindSumTex(PtrStepSz<unsigned int> sum);
+        void bindMaskSumTex(PtrStepSz<unsigned int> maskSum);

        void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers);

@ -85,7 +85,7 @@ namespace cv { namespace gpu { namespace device

        void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);

-        void compute_descriptors_gpu(const DevMem2Df& descriptors,
+        void compute_descriptors_gpu(const PtrStepSzf& descriptors,
            const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
    }
 }}}
--- a/modules/gpu/src/video_reader.cpp
+++ b/modules/gpu/src/video_reader.cpp
@ -132,7 +132,7 @@ namespace cv { namespace gpu { namespace device {
    namespace video_decoding
    {
        void loadHueCSC(float hueCSC[9]);
-        void NV12ToARGB_gpu(const PtrStepb decodedFrame, DevMem2D_<unsigned int> interopFrame, cudaStream_t stream = 0);
+        void NV12ToARGB_gpu(const PtrStepb decodedFrame, PtrStepSz<unsigned int> interopFrame, cudaStream_t stream = 0);
    }
 }}}

--- a/modules/gpu/src/video_writer.cpp
+++ b/modules/gpu/src/video_writer.cpp
@ -505,7 +505,7 @@ namespace cv { namespace gpu { namespace device
 {
    namespace video_encoding
    {
-        void YV12_gpu(const DevMem2Db src, int cn, DevMem2Db dst);
+        void YV12_gpu(const PtrStepSzb src, int cn, PtrStepSzb dst);
    }
 }}}

--- a/modules/gpu/src/warp.cpp
+++ b/modules/gpu/src/warp.cpp
@ -57,16 +57,16 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        void buildWarpAffineMaps_gpu(float coeffs[2 * 3], DevMem2Df xmap, DevMem2Df ymap, cudaStream_t stream);
+        void buildWarpAffineMaps_gpu(float coeffs[2 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream);

        template <typename T>
-        void warpAffine_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation,
+        void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
                            int borderMode, const float* borderValue, cudaStream_t stream, int cc);

-        void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], DevMem2Df xmap, DevMem2Df ymap, cudaStream_t stream);
+        void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream);

        template <typename T>
-        void warpPerspective_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation,
+        void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
                            int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    }
 }}}
@ -275,7 +275,7 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
    {
        using namespace cv::gpu::device::imgproc;

-        typedef void (*func_t)(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation,
+        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
            int borderMode, const float* borderValue, cudaStream_t stream, int cc);

        static const func_t funcs[6][4] =
@ -314,7 +314,7 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
        DeviceInfo info;
        int cc = info.majorVersion() * 10 + info.minorVersion();

-        func(src, DevMem2Db(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
+        func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
            dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), cc);
    }
 }
@ -417,7 +417,7 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
    {
        using namespace cv::gpu::device::imgproc;

-        typedef void (*func_t)(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation,
+        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
            int borderMode, const float* borderValue, cudaStream_t stream, int cc);

        static const func_t funcs[6][4] =
@ -456,7 +456,7 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
        DeviceInfo info;
        int cc = info.majorVersion() * 10 + info.minorVersion();

-        func(src, DevMem2Db(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
+        func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
            dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), cc);
    }
 }
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@ -1201,7 +1201,7 @@ PARAM_TEST_CASE(HoughCircles, cv::gpu::DeviceInfo, cv::Size, UseRoi)
        dst.setTo(cv::Scalar::all(0));

        for (size_t i = 0; i < circles.size(); ++i)
-            cv::circle(dst, cv::Point(circles[i][0], circles[i][1]), circles[i][2], cv::Scalar::all(255), fill ? -1 : 1);
+            cv::circle(dst, cv::Point2f(circles[i][0], circles[i][1]), (int)circles[i][2], cv::Scalar::all(255), fill ? -1 : 1);
    }
 };

@ -1220,10 +1220,10 @@ TEST_P(HoughCircles, Accuracy)
    const int votesThreshold = 20;

    std::vector<cv::Vec3f> circles_gold(4);
-    circles_gold[0] = cv::Vec3f(20, 20, minRadius);
-    circles_gold[1] = cv::Vec3f(90, 87, minRadius + 3);
-    circles_gold[2] = cv::Vec3f(30, 70, minRadius + 8);
-    circles_gold[3] = cv::Vec3f(80, 10, maxRadius);
+    circles_gold[0] = cv::Vec3i(20, 20, minRadius);
+    circles_gold[1] = cv::Vec3i(90, 87, minRadius + 3);
+    circles_gold[2] = cv::Vec3i(30, 70, minRadius + 8);
+    circles_gold[3] = cv::Vec3i(80, 10, maxRadius);

    cv::Mat src(size, CV_8UC1);
    drawCircles(src, circles_gold, true);
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@ -331,6 +331,9 @@ private:
    Mat dst;
    int* x_ofs, pix_size4;
    double ify;
+
+    resizeNNInvoker(const resizeNNInvoker&);
+    resizeNNInvoker& operator=(const resizeNNInvoker&);
 };

 static void
@ -1193,6 +1196,9 @@ private:
    const AT* alpha, *_beta;
    const Size ssize, dsize;
    const int ksize, xmin, xmax;
+
+    resizeGeneric_Invoker(const resizeGeneric_Invoker&);
+    resizeGeneric_Invoker& operator=(const resizeGeneric_Invoker&);
 };

 template<class HResize, class VResize>
@ -1280,6 +1286,9 @@ private:
    const int cn;
    bool fast_mode;
    const int step;
+
+    ResizeAreaFast_2x2_8u(const ResizeAreaFast_2x2_8u&);
+    ResizeAreaFast_2x2_8u& operator=(const ResizeAreaFast_2x2_8u&);
 };

 template <typename T, typename WT, typename VecOp>
@ -1357,7 +1366,7 @@ public:
                    }
                }

-                D[dx] = saturate_cast<WT>((float)sum/count);
+                D[dx] = saturate_cast<T>((float)sum/count);
            }
        }
    }
@ -1367,6 +1376,9 @@ private:
    Mat dst;
    const int scale_x, scale_y;
    const int *ofs, *xofs;
+
+    resizeAreaFast_Invoker(const resizeAreaFast_Invoker&);
+    resizeAreaFast_Invoker& operator=(const resizeAreaFast_Invoker&);
 };

 template<typename T, typename WT, typename VecOp>
@ -1600,6 +1612,8 @@ private:
 #ifdef HAVE_TBB
    const int *yofs, *cur_dy_ofs;
 #endif
+    resizeArea_Invoker(const resizeArea_Invoker&);
+    resizeArea_Invoker& operator=(const resizeArea_Invoker&);
 };

 template<typename T, typename WT>
@ -1617,7 +1631,7 @@ static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, in
        cur_dy_ofs[sy] = cur_dy;
        if( (cur_dy + 1)*scale_y_ <= sy + 1 || sy == ssize.height - 1 )
        {
-            WT beta = std::max(sy + 1 - (cur_dy+1)*scale_y_, 0.);
+            WT beta = (WT)std::max(sy + 1 - (cur_dy+1)*scale_y_, 0.);
            if( fabs(beta) < 1e-3 )
            {
                if(cur_dy >= dsize.height)
@ -2960,6 +2974,9 @@ private:
    RemapNNFunc nnfunc;
    RemapFunc ifunc;
    const void *ctab;
+
+    remapInvoker(const remapInvoker&);
+    remapInvoker& operator=(const remapInvoker&);
 };

 }
@ -3324,6 +3341,9 @@ private:
    const Scalar borderValue;
    int *adelta, *bdelta;
    double *M;
+
+    warpAffineInvoker(const warpAffineInvoker&);
+    warpAffineInvoker& operator=(const warpAffineInvoker&);
 };

 }
@ -3477,6 +3497,8 @@ private:
    double* M;
    int interpolation, borderType;
    const Scalar borderValue;
+    warpPerspectiveInvoker(const warpPerspectiveInvoker&);
+    warpPerspectiveInvoker& operator=(const warpPerspectiveInvoker&);
 };
    
 }
--- a/modules/video/src/bgfg_gmg.cpp
+++ b/modules/video/src/bgfg_gmg.cpp
@ -271,7 +271,7 @@ namespace
                        for (int i = 0; i < nfeatures; ++i)
                            weights[i] *= 1.0f - learningRate_;

-                        bool inserted = insertFeature(newFeatureColor, learningRate_, colors, weights, nfeatures, maxFeatures_);
+                        bool inserted = insertFeature(newFeatureColor, (float)learningRate_, colors, weights, nfeatures, maxFeatures_);

                        if (inserted)
                        {