diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp
index 5b3ef205cc..e43fbbc951 100644
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -705,7 +705,7 @@ namespace ogl
 namespace cuda
 {
     class CV_EXPORTS GpuMat;
-    class CV_EXPORTS CudaMem;
+    class CV_EXPORTS HostMem;
     class CV_EXPORTS Stream;
     class CV_EXPORTS Event;
 }
diff --git a/modules/core/include/opencv2/core/cuda.hpp b/modules/core/include/opencv2/core/cuda.hpp
index b67bf62e34..a9c7a39a8f 100644
--- a/modules/core/include/opencv2/core/cuda.hpp
+++ b/modules/core/include/opencv2/core/cuda.hpp
@@ -67,7 +67,9 @@ namespace cv { namespace cuda {
 //! @addtogroup cudacore_struct
 //! @{
 
-//////////////////////////////// GpuMat ///////////////////////////////
+//===================================================================================
+// GpuMat
+//===================================================================================
 
 /** @brief Base storage class for GPU memory with reference counting.
 
@@ -325,13 +327,13 @@ The function does not reallocate memory if the matrix has proper attributes alre
  */
 CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr);
 
-CV_EXPORTS GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat& mat);
-
 //! BufferPool management (must be called before Stream creation)
 CV_EXPORTS void setBufferPoolUsage(bool on);
 CV_EXPORTS void setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount);
 
-//////////////////////////////// CudaMem ////////////////////////////////
+//===================================================================================
+// HostMem
+//===================================================================================
 
 /** @brief Class with reference counting wrapping special memory type allocation functions from CUDA.
 
@@ -348,43 +350,45 @@ Its interface is also Mat-like but with additional memory type parameters.
 @note Allocation size of such memory types is usually limited. For more details, see *CUDA 2.2
 Pinned Memory APIs* document or *CUDA C Programming Guide*.
  */
-class CV_EXPORTS CudaMem
+class CV_EXPORTS HostMem
 {
 public:
     enum AllocType { PAGE_LOCKED = 1, SHARED = 2, WRITE_COMBINED = 4 };
 
-    explicit CudaMem(AllocType alloc_type = PAGE_LOCKED);
+    static MatAllocator* getAllocator(AllocType alloc_type = PAGE_LOCKED);
 
-    CudaMem(const CudaMem& m);
+    explicit HostMem(AllocType alloc_type = PAGE_LOCKED);
 
-    CudaMem(int rows, int cols, int type, AllocType alloc_type = PAGE_LOCKED);
-    CudaMem(Size size, int type, AllocType alloc_type = PAGE_LOCKED);
+    HostMem(const HostMem& m);
+
+    HostMem(int rows, int cols, int type, AllocType alloc_type = PAGE_LOCKED);
+    HostMem(Size size, int type, AllocType alloc_type = PAGE_LOCKED);
 
     //! creates from host memory with coping data
-    explicit CudaMem(InputArray arr, AllocType alloc_type = PAGE_LOCKED);
+    explicit HostMem(InputArray arr, AllocType alloc_type = PAGE_LOCKED);
 
-    ~CudaMem();
+    ~HostMem();
 
-    CudaMem& operator =(const CudaMem& m);
+    HostMem& operator =(const HostMem& m);
 
     //! swaps with other smart pointer
-    void swap(CudaMem& b);
+    void swap(HostMem& b);
 
     //! returns deep copy of the matrix, i.e. the data is copied
-    CudaMem clone() const;
+    HostMem clone() const;
 
     //! allocates new matrix data unless the matrix already has specified size and type.
     void create(int rows, int cols, int type);
     void create(Size size, int type);
 
-    //! creates alternative CudaMem header for the same data, with different
+    //! creates alternative HostMem header for the same data, with different
     //! number of channels and/or different number of rows
-    CudaMem reshape(int cn, int rows = 0) const;
+    HostMem reshape(int cn, int rows = 0) const;
 
     //! decrements reference counter and released memory if needed.
     void release();
 
-    //! returns matrix header with disabled reference counting for CudaMem data.
+    //! returns matrix header with disabled reference counting for HostMem data.
     Mat createMatHeader() const;
 
     /** @brief Maps CPU memory to GPU address space and creates the cuda::GpuMat header without reference counting
@@ -433,7 +437,9 @@ CV_EXPORTS void registerPageLocked(Mat& m);
  */
 CV_EXPORTS void unregisterPageLocked(Mat& m);
 
-///////////////////////////////// Stream //////////////////////////////////
+//===================================================================================
+// Stream
+//===================================================================================
 
 /** @brief This class encapsulates a queue of asynchronous calls.
 
@@ -528,7 +534,9 @@ private:
 
 //! @} cudacore_struct
 
-//////////////////////////////// Initialization & Info ////////////////////////
+//===================================================================================
+// Initialization & Info
+//===================================================================================
 
 //! @addtogroup cudacore_init
 //! @{
@@ -570,7 +578,9 @@ enum FeatureSet
     FEATURE_SET_COMPUTE_20 = 20,
     FEATURE_SET_COMPUTE_21 = 21,
     FEATURE_SET_COMPUTE_30 = 30,
+    FEATURE_SET_COMPUTE_32 = 32,
     FEATURE_SET_COMPUTE_35 = 35,
+    FEATURE_SET_COMPUTE_50 = 50,
 
     GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11,
     SHARED_ATOMICS = FEATURE_SET_COMPUTE_12,
diff --git a/modules/core/include/opencv2/core/cuda.inl.hpp b/modules/core/include/opencv2/core/cuda.inl.hpp
index 652bcfea29..1285b1a23d 100644
--- a/modules/core/include/opencv2/core/cuda.inl.hpp
+++ b/modules/core/include/opencv2/core/cuda.inl.hpp
@@ -50,7 +50,9 @@
 
 namespace cv { namespace cuda {
 
-//////////////////////////////// GpuMat ///////////////////////////////
+//===================================================================================
+// GpuMat
+//===================================================================================
 
 inline
 GpuMat::GpuMat(Allocator* allocator_)
@@ -145,6 +147,7 @@ void GpuMat::swap(GpuMat& b)
     std::swap(datastart, b.datastart);
     std::swap(dataend, b.dataend);
     std::swap(refcount, b.refcount);
+    std::swap(allocator, b.allocator);
 }
 
 inline
@@ -374,16 +377,18 @@ void swap(GpuMat& a, GpuMat& b)
     a.swap(b);
 }
 
-//////////////////////////////// CudaMem ////////////////////////////////
+//===================================================================================
+// HostMem
+//===================================================================================
 
 inline
-CudaMem::CudaMem(AllocType alloc_type_)
+HostMem::HostMem(AllocType alloc_type_)
     : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
 {
 }
 
 inline
-CudaMem::CudaMem(const CudaMem& m)
+HostMem::HostMem(const HostMem& m)
     : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), alloc_type(m.alloc_type)
 {
     if( refcount )
@@ -391,7 +396,7 @@ CudaMem::CudaMem(const CudaMem& m)
 }
 
 inline
-CudaMem::CudaMem(int rows_, int cols_, int type_, AllocType alloc_type_)
+HostMem::HostMem(int rows_, int cols_, int type_, AllocType alloc_type_)
     : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
 {
     if (rows_ > 0 && cols_ > 0)
@@ -399,7 +404,7 @@ CudaMem::CudaMem(int rows_, int cols_, int type_, AllocType alloc_type_)
 }
 
 inline
-CudaMem::CudaMem(Size size_, int type_, AllocType alloc_type_)
+HostMem::HostMem(Size size_, int type_, AllocType alloc_type_)
     : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
 {
     if (size_.height > 0 && size_.width > 0)
@@ -407,24 +412,24 @@ CudaMem::CudaMem(Size size_, int type_, AllocType alloc_type_)
 }
 
 inline
-CudaMem::CudaMem(InputArray arr, AllocType alloc_type_)
+HostMem::HostMem(InputArray arr, AllocType alloc_type_)
     : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
 {
     arr.getMat().copyTo(*this);
 }
 
 inline
-CudaMem::~CudaMem()
+HostMem::~HostMem()
 {
     release();
 }
 
 inline
-CudaMem& CudaMem::operator =(const CudaMem& m)
+HostMem& HostMem::operator =(const HostMem& m)
 {
     if (this != &m)
     {
-        CudaMem temp(m);
+        HostMem temp(m);
         swap(temp);
     }
 
@@ -432,7 +437,7 @@ CudaMem& CudaMem::operator =(const CudaMem& m)
 }
 
 inline
-void CudaMem::swap(CudaMem& b)
+void HostMem::swap(HostMem& b)
 {
     std::swap(flags, b.flags);
     std::swap(rows, b.rows);
@@ -446,86 +451,88 @@ void CudaMem::swap(CudaMem& b)
 }
 
 inline
-CudaMem CudaMem::clone() const
+HostMem HostMem::clone() const
 {
-    CudaMem m(size(), type(), alloc_type);
+    HostMem m(size(), type(), alloc_type);
     createMatHeader().copyTo(m);
     return m;
 }
 
 inline
-void CudaMem::create(Size size_, int type_)
+void HostMem::create(Size size_, int type_)
 {
     create(size_.height, size_.width, type_);
 }
 
 inline
-Mat CudaMem::createMatHeader() const
+Mat HostMem::createMatHeader() const
 {
     return Mat(size(), type(), data, step);
 }
 
 inline
-bool CudaMem::isContinuous() const
+bool HostMem::isContinuous() const
 {
     return (flags & Mat::CONTINUOUS_FLAG) != 0;
 }
 
 inline
-size_t CudaMem::elemSize() const
+size_t HostMem::elemSize() const
 {
     return CV_ELEM_SIZE(flags);
 }
 
 inline
-size_t CudaMem::elemSize1() const
+size_t HostMem::elemSize1() const
 {
     return CV_ELEM_SIZE1(flags);
 }
 
 inline
-int CudaMem::type() const
+int HostMem::type() const
 {
     return CV_MAT_TYPE(flags);
 }
 
 inline
-int CudaMem::depth() const
+int HostMem::depth() const
 {
     return CV_MAT_DEPTH(flags);
 }
 
 inline
-int CudaMem::channels() const
+int HostMem::channels() const
 {
     return CV_MAT_CN(flags);
 }
 
 inline
-size_t CudaMem::step1() const
+size_t HostMem::step1() const
 {
     return step / elemSize1();
 }
 
 inline
-Size CudaMem::size() const
+Size HostMem::size() const
 {
     return Size(cols, rows);
 }
 
 inline
-bool CudaMem::empty() const
+bool HostMem::empty() const
 {
     return data == 0;
 }
 
 static inline
-void swap(CudaMem& a, CudaMem& b)
+void swap(HostMem& a, HostMem& b)
 {
     a.swap(b);
 }
 
-//////////////////////////////// Stream ///////////////////////////////
+//===================================================================================
+// Stream
+//===================================================================================
 
 inline
 Stream::Stream(const Ptr<Impl>& impl)
@@ -533,7 +540,9 @@ Stream::Stream(const Ptr<Impl>& impl)
 {
 }
 
-//////////////////////////////// Initialization & Info ////////////////////////
+//===================================================================================
+// Initialization & Info
+//===================================================================================
 
 inline
 bool TargetArchs::has(int major, int minor)
@@ -592,7 +601,9 @@ bool DeviceInfo::supports(FeatureSet feature_set) const
 
 }} // namespace cv { namespace cuda {
 
-//////////////////////////////// Mat ////////////////////////////////
+//===================================================================================
+// Mat
+//===================================================================================
 
 namespace cv {
 
diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index 7bddc0b358..c8836274e6 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -160,8 +160,8 @@ public:
         STD_VECTOR_MAT    = 5 << KIND_SHIFT,
         EXPR              = 6 << KIND_SHIFT,
         OPENGL_BUFFER     = 7 << KIND_SHIFT,
-        CUDA_MEM          = 8 << KIND_SHIFT,
-        GPU_MAT           = 9 << KIND_SHIFT,
+        CUDA_HOST_MEM     = 8 << KIND_SHIFT,
+        CUDA_GPU_MAT      = 9 << KIND_SHIFT,
         UMAT              =10 << KIND_SHIFT,
         STD_VECTOR_UMAT   =11 << KIND_SHIFT
     };
@@ -180,7 +180,7 @@ public:
     _InputArray(const double& val);
     _InputArray(const cuda::GpuMat& d_mat);
     _InputArray(const ogl::Buffer& buf);
-    _InputArray(const cuda::CudaMem& cuda_mem);
+    _InputArray(const cuda::HostMem& cuda_mem);
     template<typename _Tp> _InputArray(const cudev::GpuMat_<_Tp>& m);
     _InputArray(const UMat& um);
     _InputArray(const std::vector<UMat>& umv);
@@ -277,7 +277,7 @@ public:
     _OutputArray(std::vector<Mat>& vec);
     _OutputArray(cuda::GpuMat& d_mat);
     _OutputArray(ogl::Buffer& buf);
-    _OutputArray(cuda::CudaMem& cuda_mem);
+    _OutputArray(cuda::HostMem& cuda_mem);
     template<typename _Tp> _OutputArray(cudev::GpuMat_<_Tp>& m);
     template<typename _Tp> _OutputArray(std::vector<_Tp>& vec);
     template<typename _Tp> _OutputArray(std::vector<std::vector<_Tp> >& vec);
@@ -292,7 +292,7 @@ public:
     _OutputArray(const std::vector<Mat>& vec);
     _OutputArray(const cuda::GpuMat& d_mat);
     _OutputArray(const ogl::Buffer& buf);
-    _OutputArray(const cuda::CudaMem& cuda_mem);
+    _OutputArray(const cuda::HostMem& cuda_mem);
     template<typename _Tp> _OutputArray(const cudev::GpuMat_<_Tp>& m);
     template<typename _Tp> _OutputArray(const std::vector<_Tp>& vec);
     template<typename _Tp> _OutputArray(const std::vector<std::vector<_Tp> >& vec);
@@ -310,7 +310,7 @@ public:
     virtual UMat& getUMatRef(int i=-1) const;
     virtual cuda::GpuMat& getGpuMatRef() const;
     virtual ogl::Buffer& getOGlBufferRef() const;
-    virtual cuda::CudaMem& getCudaMemRef() const;
+    virtual cuda::HostMem& getHostMemRef() const;
     virtual void create(Size sz, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
     virtual void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
     virtual void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
@@ -333,7 +333,7 @@ public:
     _InputOutputArray(std::vector<Mat>& vec);
     _InputOutputArray(cuda::GpuMat& d_mat);
     _InputOutputArray(ogl::Buffer& buf);
-    _InputOutputArray(cuda::CudaMem& cuda_mem);
+    _InputOutputArray(cuda::HostMem& cuda_mem);
     template<typename _Tp> _InputOutputArray(cudev::GpuMat_<_Tp>& m);
     template<typename _Tp> _InputOutputArray(std::vector<_Tp>& vec);
     template<typename _Tp> _InputOutputArray(std::vector<std::vector<_Tp> >& vec);
@@ -348,7 +348,7 @@ public:
     _InputOutputArray(const std::vector<Mat>& vec);
     _InputOutputArray(const cuda::GpuMat& d_mat);
     _InputOutputArray(const ogl::Buffer& buf);
-    _InputOutputArray(const cuda::CudaMem& cuda_mem);
+    _InputOutputArray(const cuda::HostMem& cuda_mem);
     template<typename _Tp> _InputOutputArray(const cudev::GpuMat_<_Tp>& m);
     template<typename _Tp> _InputOutputArray(const std::vector<_Tp>& vec);
     template<typename _Tp> _InputOutputArray(const std::vector<std::vector<_Tp> >& vec);
diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp
index 24c6b453c1..9ca85116a0 100644
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@@ -100,13 +100,13 @@ inline _InputArray::_InputArray(const MatExpr& expr)
 { init(FIXED_TYPE + FIXED_SIZE + EXPR + ACCESS_READ, &expr); }
 
 inline _InputArray::_InputArray(const cuda::GpuMat& d_mat)
-{ init(GPU_MAT + ACCESS_READ, &d_mat); }
+{ init(CUDA_GPU_MAT + ACCESS_READ, &d_mat); }
 
 inline _InputArray::_InputArray(const ogl::Buffer& buf)
 { init(OPENGL_BUFFER + ACCESS_READ, &buf); }
 
-inline _InputArray::_InputArray(const cuda::CudaMem& cuda_mem)
-{ init(CUDA_MEM + ACCESS_READ, &cuda_mem); }
+inline _InputArray::_InputArray(const cuda::HostMem& cuda_mem)
+{ init(CUDA_HOST_MEM + ACCESS_READ, &cuda_mem); }
 
 inline _InputArray::~_InputArray() {}
 
@@ -174,13 +174,13 @@ _OutputArray::_OutputArray(const _Tp* vec, int n)
 { init(FIXED_TYPE + FIXED_SIZE + MATX + DataType<_Tp>::type + ACCESS_WRITE, vec, Size(n, 1)); }
 
 inline _OutputArray::_OutputArray(cuda::GpuMat& d_mat)
-{ init(GPU_MAT + ACCESS_WRITE, &d_mat); }
+{ init(CUDA_GPU_MAT + ACCESS_WRITE, &d_mat); }
 
 inline _OutputArray::_OutputArray(ogl::Buffer& buf)
 { init(OPENGL_BUFFER + ACCESS_WRITE, &buf); }
 
-inline _OutputArray::_OutputArray(cuda::CudaMem& cuda_mem)
-{ init(CUDA_MEM + ACCESS_WRITE, &cuda_mem); }
+inline _OutputArray::_OutputArray(cuda::HostMem& cuda_mem)
+{ init(CUDA_HOST_MEM + ACCESS_WRITE, &cuda_mem); }
 
 inline _OutputArray::_OutputArray(const Mat& m)
 { init(FIXED_TYPE + FIXED_SIZE + MAT + ACCESS_WRITE, &m); }
@@ -195,13 +195,13 @@ inline _OutputArray::_OutputArray(const std::vector<UMat>& vec)
 { init(FIXED_SIZE + STD_VECTOR_UMAT + ACCESS_WRITE, &vec); }
 
 inline _OutputArray::_OutputArray(const cuda::GpuMat& d_mat)
-{ init(FIXED_TYPE + FIXED_SIZE + GPU_MAT + ACCESS_WRITE, &d_mat); }
+{ init(FIXED_TYPE + FIXED_SIZE + CUDA_GPU_MAT + ACCESS_WRITE, &d_mat); }
 
 inline _OutputArray::_OutputArray(const ogl::Buffer& buf)
 { init(FIXED_TYPE + FIXED_SIZE + OPENGL_BUFFER + ACCESS_WRITE, &buf); }
 
-inline _OutputArray::_OutputArray(const cuda::CudaMem& cuda_mem)
-{ init(FIXED_TYPE + FIXED_SIZE + CUDA_MEM + ACCESS_WRITE, &cuda_mem); }
+inline _OutputArray::_OutputArray(const cuda::HostMem& cuda_mem)
+{ init(FIXED_TYPE + FIXED_SIZE + CUDA_HOST_MEM + ACCESS_WRITE, &cuda_mem); }
 
 ///////////////////////////////////////////////////////////////////////////////////////////
 
@@ -261,13 +261,13 @@ _InputOutputArray::_InputOutputArray(const _Tp* vec, int n)
 { init(FIXED_TYPE + FIXED_SIZE + MATX + DataType<_Tp>::type + ACCESS_RW, vec, Size(n, 1)); }
 
 inline _InputOutputArray::_InputOutputArray(cuda::GpuMat& d_mat)
-{ init(GPU_MAT + ACCESS_RW, &d_mat); }
+{ init(CUDA_GPU_MAT + ACCESS_RW, &d_mat); }
 
 inline _InputOutputArray::_InputOutputArray(ogl::Buffer& buf)
 { init(OPENGL_BUFFER + ACCESS_RW, &buf); }
 
-inline _InputOutputArray::_InputOutputArray(cuda::CudaMem& cuda_mem)
-{ init(CUDA_MEM + ACCESS_RW, &cuda_mem); }
+inline _InputOutputArray::_InputOutputArray(cuda::HostMem& cuda_mem)
+{ init(CUDA_HOST_MEM + ACCESS_RW, &cuda_mem); }
 
 inline _InputOutputArray::_InputOutputArray(const Mat& m)
 { init(FIXED_TYPE + FIXED_SIZE + MAT + ACCESS_RW, &m); }
@@ -282,13 +282,13 @@ inline _InputOutputArray::_InputOutputArray(const std::vector<UMat>& vec)
 { init(FIXED_SIZE + STD_VECTOR_UMAT + ACCESS_RW, &vec); }
 
 inline _InputOutputArray::_InputOutputArray(const cuda::GpuMat& d_mat)
-{ init(FIXED_TYPE + FIXED_SIZE + GPU_MAT + ACCESS_RW, &d_mat); }
+{ init(FIXED_TYPE + FIXED_SIZE + CUDA_GPU_MAT + ACCESS_RW, &d_mat); }
 
 inline _InputOutputArray::_InputOutputArray(const ogl::Buffer& buf)
 { init(FIXED_TYPE + FIXED_SIZE + OPENGL_BUFFER + ACCESS_RW, &buf); }
 
-inline _InputOutputArray::_InputOutputArray(const cuda::CudaMem& cuda_mem)
-{ init(FIXED_TYPE + FIXED_SIZE + CUDA_MEM + ACCESS_RW, &cuda_mem); }
+inline _InputOutputArray::_InputOutputArray(const cuda::HostMem& cuda_mem)
+{ init(FIXED_TYPE + FIXED_SIZE + CUDA_HOST_MEM + ACCESS_RW, &cuda_mem); }
 
 //////////////////////////////////////////// Mat //////////////////////////////////////////
 
diff --git a/modules/cuda/perf/perf_matop.cpp b/modules/core/perf/cuda/perf_gpumat.cpp
similarity index 91%
rename from modules/cuda/perf/perf_matop.cpp
rename to modules/core/perf/cuda/perf_gpumat.cpp
index 751e6e7148..4ef79c7ad8 100644
--- a/modules/cuda/perf/perf_matop.cpp
+++ b/modules/core/perf/cuda/perf_gpumat.cpp
@@ -40,7 +40,12 @@
 //
 //M*/
 
-#include "perf_precomp.hpp"
+#include "../perf_precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+#include "opencv2/core/cuda.hpp"
+#include "opencv2/ts/cuda_perf.hpp"
 
 using namespace std;
 using namespace testing;
@@ -49,7 +54,7 @@ using namespace perf;
 //////////////////////////////////////////////////////////////////////
 // SetTo
 
-PERF_TEST_P(Sz_Depth_Cn, MatOp_SetTo,
+PERF_TEST_P(Sz_Depth_Cn, CUDA_GpuMat_SetTo,
             Combine(CUDA_TYPICAL_MAT_SIZES,
                     Values(CV_8U, CV_16U, CV_32F, CV_64F),
                     CUDA_CHANNELS_1_3_4))
@@ -67,23 +72,21 @@ PERF_TEST_P(Sz_Depth_Cn, MatOp_SetTo,
         cv::cuda::GpuMat dst(size, type);
 
         TEST_CYCLE() dst.setTo(val);
-
-        CUDA_SANITY_CHECK(dst);
     }
     else
     {
         cv::Mat dst(size, type);
 
         TEST_CYCLE() dst.setTo(val);
-
-        CPU_SANITY_CHECK(dst);
     }
+
+    SANITY_CHECK_NOTHING();
 }
 
 //////////////////////////////////////////////////////////////////////
 // SetToMasked
 
-PERF_TEST_P(Sz_Depth_Cn, MatOp_SetToMasked,
+PERF_TEST_P(Sz_Depth_Cn, CUDA_GpuMat_SetToMasked,
             Combine(CUDA_TYPICAL_MAT_SIZES,
                     Values(CV_8U, CV_16U, CV_32F, CV_64F),
                     CUDA_CHANNELS_1_3_4))
@@ -106,23 +109,21 @@ PERF_TEST_P(Sz_Depth_Cn, MatOp_SetToMasked,
         const cv::cuda::GpuMat d_mask(mask);
 
         TEST_CYCLE() dst.setTo(val, d_mask);
-
-        CUDA_SANITY_CHECK(dst, 1e-10);
     }
     else
     {
         cv::Mat dst = src;
 
         TEST_CYCLE() dst.setTo(val, mask);
-
-        CPU_SANITY_CHECK(dst);
     }
+
+    SANITY_CHECK_NOTHING();
 }
 
 //////////////////////////////////////////////////////////////////////
 // CopyToMasked
 
-PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked,
+PERF_TEST_P(Sz_Depth_Cn, CUDA_GpuMat_CopyToMasked,
             Combine(CUDA_TYPICAL_MAT_SIZES,
                     Values(CV_8U, CV_16U, CV_32F, CV_64F),
                     CUDA_CHANNELS_1_3_4))
@@ -144,17 +145,15 @@ PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked,
         cv::cuda::GpuMat dst(d_src.size(), d_src.type(), cv::Scalar::all(0));
 
         TEST_CYCLE() d_src.copyTo(dst, d_mask);
-
-        CUDA_SANITY_CHECK(dst, 1e-10);
     }
     else
     {
         cv::Mat dst(src.size(), src.type(), cv::Scalar::all(0));
 
         TEST_CYCLE() src.copyTo(dst, mask);
-
-        CPU_SANITY_CHECK(dst);
     }
+
+    SANITY_CHECK_NOTHING();
 }
 
 //////////////////////////////////////////////////////////////////////
@@ -162,7 +161,7 @@ PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked,
 
 DEF_PARAM_TEST(Sz_2Depth, cv::Size, MatDepth, MatDepth);
 
-PERF_TEST_P(Sz_2Depth, MatOp_ConvertTo,
+PERF_TEST_P(Sz_2Depth, CUDA_GpuMat_ConvertTo,
             Combine(CUDA_TYPICAL_MAT_SIZES,
                     Values(CV_8U, CV_16U, CV_32F, CV_64F),
                     Values(CV_8U, CV_16U, CV_32F, CV_64F)))
@@ -183,15 +182,15 @@ PERF_TEST_P(Sz_2Depth, MatOp_ConvertTo,
         cv::cuda::GpuMat dst;
 
         TEST_CYCLE() d_src.convertTo(dst, depth2, a, b);
-
-        CUDA_SANITY_CHECK(dst, 1e-10);
     }
     else
     {
         cv::Mat dst;
 
         TEST_CYCLE() src.convertTo(dst, depth2, a, b);
-
-        CPU_SANITY_CHECK(dst);
     }
+
+    SANITY_CHECK_NOTHING();
 }
+
+#endif
diff --git a/modules/core/src/cuda_gpu_mat.cpp b/modules/core/src/cuda_gpu_mat.cpp
index 803b21069d..4440d58536 100644
--- a/modules/core/src/cuda_gpu_mat.cpp
+++ b/modules/core/src/cuda_gpu_mat.cpp
@@ -275,12 +275,12 @@ void cv::cuda::createContinuous(int rows, int cols, int type, OutputArray arr)
         ::createContinuousImpl(rows, cols, type, arr.getMatRef());
         break;
 
-    case _InputArray::GPU_MAT:
+    case _InputArray::CUDA_GPU_MAT:
         ::createContinuousImpl(rows, cols, type, arr.getGpuMatRef());
         break;
 
-    case _InputArray::CUDA_MEM:
-        ::createContinuousImpl(rows, cols, type, arr.getCudaMemRef());
+    case _InputArray::CUDA_HOST_MEM:
+        ::createContinuousImpl(rows, cols, type, arr.getHostMemRef());
         break;
 
     default:
@@ -329,12 +329,12 @@ void cv::cuda::ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr)
         ::ensureSizeIsEnoughImpl(rows, cols, type, arr.getMatRef());
         break;
 
-    case _InputArray::GPU_MAT:
+    case _InputArray::CUDA_GPU_MAT:
         ::ensureSizeIsEnoughImpl(rows, cols, type, arr.getGpuMatRef());
         break;
 
-    case _InputArray::CUDA_MEM:
-        ::ensureSizeIsEnoughImpl(rows, cols, type, arr.getCudaMemRef());
+    case _InputArray::CUDA_HOST_MEM:
+        ::ensureSizeIsEnoughImpl(rows, cols, type, arr.getHostMemRef());
         break;
 
     default:
@@ -342,14 +342,6 @@ void cv::cuda::ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr)
     }
 }
 
-GpuMat cv::cuda::allocMatFromBuf(int rows, int cols, int type, GpuMat& mat)
-{
-    if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols)
-        return mat(Rect(0, 0, cols, rows));
-
-    return mat = GpuMat(rows, cols, type);
-}
-
 #ifndef HAVE_CUDA
 
 GpuMat::Allocator* cv::cuda::GpuMat::defaultAllocator()
diff --git a/modules/core/src/cuda_host_mem.cpp b/modules/core/src/cuda_host_mem.cpp
index b27d52e329..2ad733b675 100644
--- a/modules/core/src/cuda_host_mem.cpp
+++ b/modules/core/src/cuda_host_mem.cpp
@@ -42,10 +42,124 @@
 //M*/
 
 #include "precomp.hpp"
+#include <map>
 
 using namespace cv;
 using namespace cv::cuda;
 
+#ifdef HAVE_CUDA
+
+namespace {
+
+// MatAllocator that serves buffers from cudaHostAlloc using a fixed set of allocation flags.
+class HostMemAllocator : public MatAllocator
+{
+public:
+    explicit HostMemAllocator(unsigned int flags) : flags_(flags)
+    {
+    }
+
+    // Fills in steps and total byte size, then wraps data0 (flagged USER_ALLOCATED) or a new cudaHostAlloc buffer.
+    UMatData* allocate(int dims, const int* sizes, int type,
+                       void* data0, size_t* step,
+                       int /*flags*/, UMatUsageFlags /*usageFlags*/) const
+    {
+        size_t total = CV_ELEM_SIZE(type);
+        for (int i = dims-1; i >= 0; i--)
+        {
+            if (step)
+            {
+                if (data0 && step[i] != CV_AUTOSTEP)
+                {
+                    CV_Assert(total <= step[i]);
+                    total = step[i];
+                }
+                else
+                {
+                    step[i] = total;
+                }
+            }
+            total *= sizes[i];
+        }
+
+        UMatData* u = new UMatData(this);
+        u->size = total;
+        if (data0)
+        {
+            u->data = u->origdata = static_cast<uchar*>(data0);
+            u->flags |= UMatData::USER_ALLOCATED;
+        }
+        else
+        {
+            void* ptr = 0;
+            cudaSafeCall( cudaHostAlloc(&ptr, total, flags_) ); // NOTE(review): u looks leaked if this throws - confirm
+            u->data = u->origdata = static_cast<uchar*>(ptr);
+        }
+        return u;
+    }
+
+    // Performs no allocation itself; reports success for any non-null UMatData.
+    bool allocate(UMatData* u, int /*accessFlags*/, UMatUsageFlags /*usageFlags*/) const
+    {
+        return (u != NULL);
+    }
+
+    // Releases u once its refcount is zero; a null u is ignored.
+    void deallocate(UMatData* u) const
+    {
+        if (!u)
+            return; // checked first: the asserts below dereference u
+        CV_Assert(u->urefcount >= 0);
+        CV_Assert(u->refcount >= 0);
+        if (u->refcount == 0)
+        {
+            if ( !(u->flags & UMatData::USER_ALLOCATED) ) // buffers supplied by the caller are left alone
+            {
+                cudaFreeHost(u->origdata);
+                u->origdata = 0;
+            }
+            delete u;
+        }
+    }
+
+private:
+    unsigned int flags_; // forwarded to cudaHostAlloc on every allocation
+};
+
+} // namespace
+
+#endif
+
+MatAllocator* cv::cuda::HostMem::getAllocator(AllocType alloc_type)
+{
+    // Returns the shared MatAllocator that allocates Mat data through cudaHostAlloc for alloc_type.
+#ifndef HAVE_CUDA
+    (void) alloc_type;
+    throw_no_cuda();
+    return NULL;
+#else
+    // One allocator per cudaHostAlloc flag, created the first time it is requested.
+    static std::map<unsigned int, Ptr<MatAllocator> > cache;
+
+    // Translate the public AllocType into the matching cudaHostAlloc flag.
+    unsigned int hostAllocFlag = cudaHostAllocDefault;
+    if (alloc_type == PAGE_LOCKED)
+        hostAllocFlag = cudaHostAllocDefault;
+    else if (alloc_type == SHARED)
+        hostAllocFlag = cudaHostAllocMapped;
+    else if (alloc_type == WRITE_COMBINED)
+        hostAllocFlag = cudaHostAllocWriteCombined;
+    else
+        CV_Error(cv::Error::StsBadFlag, "Invalid alloc type");
+
+    Ptr<MatAllocator>& slot = cache[hostAllocFlag];
+    if (slot.empty())
+        slot = makePtr<HostMemAllocator>(hostAllocFlag);
+
+    return slot.get(); // the static map keeps ownership
+#endif
+}
+
 #ifdef HAVE_CUDA
 namespace
 {
@@ -59,7 +173,7 @@ namespace
 }
 #endif
 
-void cv::cuda::CudaMem::create(int rows_, int cols_, int type_)
+void cv::cuda::HostMem::create(int rows_, int cols_, int type_)
 {
 #ifndef HAVE_CUDA
     (void) rows_;
@@ -123,9 +237,9 @@ void cv::cuda::CudaMem::create(int rows_, int cols_, int type_)
 #endif
 }
 
-CudaMem cv::cuda::CudaMem::reshape(int new_cn, int new_rows) const
+HostMem cv::cuda::HostMem::reshape(int new_cn, int new_rows) const
 {
-    CudaMem hdr = *this;
+    HostMem hdr = *this;
 
     int cn = channels();
     if (new_cn == 0)
@@ -166,7 +280,7 @@ CudaMem cv::cuda::CudaMem::reshape(int new_cn, int new_rows) const
     return hdr;
 }
 
-void cv::cuda::CudaMem::release()
+void cv::cuda::HostMem::release()
 {
 #ifdef HAVE_CUDA
     if (refcount && CV_XADD(refcount, -1) == 1)
@@ -181,7 +295,7 @@ void cv::cuda::CudaMem::release()
 #endif
 }
 
-GpuMat cv::cuda::CudaMem::createGpuMatHeader() const
+GpuMat cv::cuda::HostMem::createGpuMatHeader() const
 {
 #ifndef HAVE_CUDA
     throw_no_cuda();
diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp
index 980ade1845..38ff7ed53a 100644
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -1187,18 +1187,18 @@ Mat _InputArray::getMat(int i) const
         return Mat();
     }
 
-    if( k == GPU_MAT )
+    if( k == CUDA_GPU_MAT )
     {
         CV_Assert( i < 0 );
         CV_Error(cv::Error::StsNotImplemented, "You should explicitly call download method for cuda::GpuMat object");
         return Mat();
     }
 
-    if( k == CUDA_MEM )
+    if( k == CUDA_HOST_MEM )
     {
         CV_Assert( i < 0 );
 
-        const cuda::CudaMem* cuda_mem = (const cuda::CudaMem*)obj;
+        const cuda::HostMem* cuda_mem = (const cuda::HostMem*)obj;
 
         return cuda_mem->createMatHeader();
     }
@@ -1391,15 +1391,15 @@ cuda::GpuMat _InputArray::getGpuMat() const
 {
     int k = kind();
 
-    if (k == GPU_MAT)
+    if (k == CUDA_GPU_MAT)
     {
         const cuda::GpuMat* d_mat = (const cuda::GpuMat*)obj;
         return *d_mat;
     }
 
-    if (k == CUDA_MEM)
+    if (k == CUDA_HOST_MEM)
     {
-        const cuda::CudaMem* cuda_mem = (const cuda::CudaMem*)obj;
+        const cuda::HostMem* cuda_mem = (const cuda::HostMem*)obj;
         return cuda_mem->createGpuMatHeader();
     }
 
@@ -1412,7 +1412,7 @@ cuda::GpuMat _InputArray::getGpuMat() const
     if (k == NONE)
         return cuda::GpuMat();
 
-    CV_Error(cv::Error::StsNotImplemented, "getGpuMat is available only for cuda::GpuMat and cuda::CudaMem");
+    CV_Error(cv::Error::StsNotImplemented, "getGpuMat is available only for cuda::GpuMat and cuda::HostMem");
     return cuda::GpuMat();
 }
 
@@ -1520,20 +1520,22 @@ Size _InputArray::size(int i) const
         return buf->size();
     }
 
-    if( k == GPU_MAT )
+    if( k == CUDA_GPU_MAT )
     {
         CV_Assert( i < 0 );
         const cuda::GpuMat* d_mat = (const cuda::GpuMat*)obj;
         return d_mat->size();
     }
 
-    CV_Assert( k == CUDA_MEM );
-    //if( k == CUDA_MEM )
+    if( k == CUDA_HOST_MEM )
     {
         CV_Assert( i < 0 );
-        const cuda::CudaMem* cuda_mem = (const cuda::CudaMem*)obj;
+        const cuda::HostMem* cuda_mem = (const cuda::HostMem*)obj;
         return cuda_mem->size();
     }
+
+    CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
+    return Size();
 }
 
 int _InputArray::sizend(int* arrsz, int i) const
@@ -1700,18 +1702,20 @@ int _InputArray::dims(int i) const
         return 2;
     }
 
-    if( k == GPU_MAT )
+    if( k == CUDA_GPU_MAT )
     {
         CV_Assert( i < 0 );
         return 2;
     }
 
-    CV_Assert( k == CUDA_MEM );
-    //if( k == CUDA_MEM )
+    if( k == CUDA_HOST_MEM )
     {
         CV_Assert( i < 0 );
         return 2;
     }
+
+    CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
+    return 0;
 }
 
 size_t _InputArray::total(int i) const
@@ -1799,12 +1803,14 @@ int _InputArray::type(int i) const
     if( k == OPENGL_BUFFER )
         return ((const ogl::Buffer*)obj)->type();
 
-    if( k == GPU_MAT )
+    if( k == CUDA_GPU_MAT )
         return ((const cuda::GpuMat*)obj)->type();
 
-    CV_Assert( k == CUDA_MEM );
-    //if( k == CUDA_MEM )
-        return ((const cuda::CudaMem*)obj)->type();
+    if( k == CUDA_HOST_MEM )
+        return ((const cuda::HostMem*)obj)->type();
+
+    CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
+    return 0;
 }
 
 int _InputArray::depth(int i) const
@@ -1863,12 +1869,14 @@ bool _InputArray::empty() const
     if( k == OPENGL_BUFFER )
         return ((const ogl::Buffer*)obj)->empty();
 
-    if( k == GPU_MAT )
+    if( k == CUDA_GPU_MAT )
         return ((const cuda::GpuMat*)obj)->empty();
 
-    CV_Assert( k == CUDA_MEM );
-    //if( k == CUDA_MEM )
-        return ((const cuda::CudaMem*)obj)->empty();
+    if( k == CUDA_HOST_MEM )
+        return ((const cuda::HostMem*)obj)->empty();
+
+    CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
+    return true;
 }
 
 bool _InputArray::isContinuous(int i) const
@@ -1970,7 +1978,7 @@ size_t _InputArray::offset(int i) const
         return vv[i].offset;
     }
 
-    if( k == GPU_MAT )
+    if( k == CUDA_GPU_MAT )
     {
         CV_Assert( i < 0 );
         const cuda::GpuMat * const m = ((const cuda::GpuMat*)obj);
@@ -2016,7 +2024,7 @@ size_t _InputArray::step(int i) const
         return vv[i].step;
     }
 
-    if( k == GPU_MAT )
+    if( k == CUDA_GPU_MAT )
     {
         CV_Assert( i < 0 );
         return ((const cuda::GpuMat*)obj)->step;
@@ -2095,7 +2103,7 @@ void _OutputArray::create(Size _sz, int mtype, int i, bool allowTransposed, int
         ((UMat*)obj)->create(_sz, mtype);
         return;
     }
-    if( k == GPU_MAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
+    if( k == CUDA_GPU_MAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
     {
         CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == _sz);
         CV_Assert(!fixedType() || ((cuda::GpuMat*)obj)->type() == mtype);
@@ -2109,11 +2117,11 @@ void _OutputArray::create(Size _sz, int mtype, int i, bool allowTransposed, int
         ((ogl::Buffer*)obj)->create(_sz, mtype);
         return;
     }
-    if( k == CUDA_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
+    if( k == CUDA_HOST_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
     {
-        CV_Assert(!fixedSize() || ((cuda::CudaMem*)obj)->size() == _sz);
-        CV_Assert(!fixedType() || ((cuda::CudaMem*)obj)->type() == mtype);
-        ((cuda::CudaMem*)obj)->create(_sz, mtype);
+        CV_Assert(!fixedSize() || ((cuda::HostMem*)obj)->size() == _sz);
+        CV_Assert(!fixedType() || ((cuda::HostMem*)obj)->type() == mtype);
+        ((cuda::HostMem*)obj)->create(_sz, mtype);
         return;
     }
     int sizes[] = {_sz.height, _sz.width};
@@ -2137,7 +2145,7 @@ void _OutputArray::create(int _rows, int _cols, int mtype, int i, bool allowTran
         ((UMat*)obj)->create(_rows, _cols, mtype);
         return;
     }
-    if( k == GPU_MAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
+    if( k == CUDA_GPU_MAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
     {
         CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == Size(_cols, _rows));
         CV_Assert(!fixedType() || ((cuda::GpuMat*)obj)->type() == mtype);
@@ -2151,11 +2159,11 @@ void _OutputArray::create(int _rows, int _cols, int mtype, int i, bool allowTran
         ((ogl::Buffer*)obj)->create(_rows, _cols, mtype);
         return;
     }
-    if( k == CUDA_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
+    if( k == CUDA_HOST_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
     {
-        CV_Assert(!fixedSize() || ((cuda::CudaMem*)obj)->size() == Size(_cols, _rows));
-        CV_Assert(!fixedType() || ((cuda::CudaMem*)obj)->type() == mtype);
-        ((cuda::CudaMem*)obj)->create(_rows, _cols, mtype);
+        CV_Assert(!fixedSize() || ((cuda::HostMem*)obj)->size() == Size(_cols, _rows));
+        CV_Assert(!fixedType() || ((cuda::HostMem*)obj)->type() == mtype);
+        ((cuda::HostMem*)obj)->create(_rows, _cols, mtype);
         return;
     }
     int sizes[] = {_rows, _cols};
@@ -2479,15 +2487,15 @@ void _OutputArray::release() const
         return;
     }
 
-    if( k == GPU_MAT )
+    if( k == CUDA_GPU_MAT )
     {
         ((cuda::GpuMat*)obj)->release();
         return;
     }
 
-    if( k == CUDA_MEM )
+    if( k == CUDA_HOST_MEM )
     {
-        ((cuda::CudaMem*)obj)->release();
+        ((cuda::HostMem*)obj)->release();
         return;
     }
 
@@ -2583,7 +2591,7 @@ UMat& _OutputArray::getUMatRef(int i) const
 cuda::GpuMat& _OutputArray::getGpuMatRef() const
 {
     int k = kind();
-    CV_Assert( k == GPU_MAT );
+    CV_Assert( k == CUDA_GPU_MAT );
     return *(cuda::GpuMat*)obj;
 }
 
@@ -2594,11 +2602,11 @@ ogl::Buffer& _OutputArray::getOGlBufferRef() const
     return *(ogl::Buffer*)obj;
 }
 
-cuda::CudaMem& _OutputArray::getCudaMemRef() const
+cuda::HostMem& _OutputArray::getHostMemRef() const
 {
     int k = kind();
-    CV_Assert( k == CUDA_MEM );
-    return *(cuda::CudaMem*)obj;
+    CV_Assert( k == CUDA_HOST_MEM );
+    return *(cuda::HostMem*)obj;
 }
 
 void _OutputArray::setTo(const _InputArray& arr, const _InputArray & mask) const
@@ -2614,10 +2622,10 @@ void _OutputArray::setTo(const _InputArray& arr, const _InputArray & mask) const
     }
     else if( k == UMAT )
         ((UMat*)obj)->setTo(arr, mask);
-    else if( k == GPU_MAT )
+    else if( k == CUDA_GPU_MAT )
     {
         Mat value = arr.getMat();
-        CV_Assert( checkScalar(value, type(), arr.kind(), _InputArray::GPU_MAT) );
+        CV_Assert( checkScalar(value, type(), arr.kind(), _InputArray::CUDA_GPU_MAT) );
         ((cuda::GpuMat*)obj)->setTo(Scalar(Vec<double, 4>(value.ptr<double>())), mask);
     }
     else
diff --git a/modules/core/src/opengl.cpp b/modules/core/src/opengl.cpp
index e7b2a7627a..00a7f66662 100644
--- a/modules/core/src/opengl.cpp
+++ b/modules/core/src/opengl.cpp
@@ -509,7 +509,7 @@ cv::ogl::Buffer::Buffer(InputArray arr, Target target, bool autoRelease) : rows_
     switch (kind)
     {
     case _InputArray::OPENGL_BUFFER:
-    case _InputArray::GPU_MAT:
+    case _InputArray::CUDA_GPU_MAT:
         copyFrom(arr, target, autoRelease);
         break;
 
@@ -594,7 +594,7 @@ void cv::ogl::Buffer::copyFrom(InputArray arr, Target target, bool autoRelease)
             break;
         }
 
-    case _InputArray::GPU_MAT:
+    case _InputArray::CUDA_GPU_MAT:
         {
             #ifndef HAVE_CUDA
                 throw_no_cuda();
@@ -657,7 +657,7 @@ void cv::ogl::Buffer::copyTo(OutputArray arr) const
             break;
         }
 
-    case _InputArray::GPU_MAT:
+    case _InputArray::CUDA_GPU_MAT:
         {
             #ifndef HAVE_CUDA
                 throw_no_cuda();
@@ -1018,7 +1018,7 @@ cv::ogl::Texture2D::Texture2D(InputArray arr, bool autoRelease) : rows_(0), cols
             break;
         }
 
-    case _InputArray::GPU_MAT:
+    case _InputArray::CUDA_GPU_MAT:
         {
             #ifndef HAVE_CUDA
                 throw_no_cuda();
@@ -1132,7 +1132,7 @@ void cv::ogl::Texture2D::copyFrom(InputArray arr, bool autoRelease)
             break;
         }
 
-    case _InputArray::GPU_MAT:
+    case _InputArray::CUDA_GPU_MAT:
         {
             #ifndef HAVE_CUDA
                 throw_no_cuda();
@@ -1184,7 +1184,7 @@ void cv::ogl::Texture2D::copyTo(OutputArray arr, int ddepth, bool autoRelease) c
             break;
         }
 
-    case _InputArray::GPU_MAT:
+    case _InputArray::CUDA_GPU_MAT:
         {
             #ifndef HAVE_CUDA
                 throw_no_cuda();
diff --git a/modules/cuda/test/test_buffer_pool.cpp b/modules/core/test/cuda/test_buffer_pool.cpp
similarity index 62%
rename from modules/cuda/test/test_buffer_pool.cpp
rename to modules/core/test/cuda/test_buffer_pool.cpp
index 2526358d95..eec6ed3f64 100644
--- a/modules/cuda/test/test_buffer_pool.cpp
+++ b/modules/core/test/cuda/test_buffer_pool.cpp
@@ -40,13 +40,13 @@
 //
 //M*/
 
-#include "test_precomp.hpp"
+#include "../test_precomp.hpp"
 
 #ifdef HAVE_CUDA
 
-#include "opencv2/cudaarithm.hpp"
-#include "opencv2/cudawarping.hpp"
+#include "opencv2/core/cuda.hpp"
 #include "opencv2/core/private.cuda.hpp"
+#include "opencv2/ts/cuda_test.hpp"
 
 using namespace testing;
 using namespace cv;
@@ -54,65 +54,73 @@ using namespace cv::cuda;
 
 struct BufferPoolTest : TestWithParam<DeviceInfo>
 {
+    // Enqueues two BufferPool workloads on `stream`: a zero-filled 640x480 CV_8UC1
+    // buffer converted with a +1 offset (-> dst_1) and a 1280x1024 CV_32SC1 buffer
+    // filled with 2 (-> dst_2). The downloads are enqueued on `stream` as well.
+    void RunSimpleTest(Stream& stream, HostMem& dst_1, HostMem& dst_2)
+    {
+        BufferPool pool(stream);
+
+        {
+            GpuMat buf0 = pool.getBuffer(Size(640, 480), CV_8UC1);
+            EXPECT_FALSE( buf0.empty() );
+
+            buf0.setTo(Scalar::all(0), stream);
+
+            GpuMat buf1 = pool.getBuffer(Size(640, 480), CV_8UC1);
+            EXPECT_FALSE( buf1.empty() );
+
+            buf0.convertTo(buf1, buf1.type(), 1.0, 1.0, stream);
+
+            buf1.download(dst_1, stream);
+        }
+
+        {
+            GpuMat buf2 = pool.getBuffer(Size(1280, 1024), CV_32SC1);
+            EXPECT_FALSE( buf2.empty() );
+
+            buf2.setTo(Scalar::all(2), stream);
+
+            buf2.download(dst_2, stream);
+        }
+    }
+
+    // Verifies the results RunSimpleTest enqueues: dst_1 must be all ones and
+    // dst_2 all twos. Call only after the stream used by RunSimpleTest has finished.
+    void CheckSimpleTest(HostMem& dst_1, HostMem& dst_2)
+    {
+        EXPECT_MAT_NEAR(Mat(Size(640, 480), CV_8UC1, Scalar::all(1)), dst_1, 0.0);
+        EXPECT_MAT_NEAR(Mat(Size(1280, 1024), CV_32SC1, Scalar::all(2)), dst_2, 0.0);
+    }
 };
 
-namespace
+CUDA_TEST_P(BufferPoolTest, FromNullStream)
 {
-    void func1(const GpuMat& src, GpuMat& dst, Stream& stream)
-    {
-        BufferPool pool(stream);
+    HostMem dst_1, dst_2;
 
-        GpuMat buf = pool.getBuffer(src.size(), CV_32FC(src.channels()));
+    RunSimpleTest(Stream::Null(), dst_1, dst_2);
+
+    // Work enqueued on Stream::Null() may still run asynchronously with respect to
+    // the host, so wait for the downloads to finish before reading dst_1 / dst_2.
+    Stream::Null().waitForCompletion();
 
-        src.convertTo(buf, CV_32F, 1.0 / 255.0, stream);
-
-        cuda::exp(buf, dst, stream);
-    }
-
-    void func2(const GpuMat& src, GpuMat& dst, Stream& stream)
-    {
-        BufferPool pool(stream);
-
-        GpuMat buf1 = pool.getBuffer(saturate_cast<int>(src.rows * 0.5), saturate_cast<int>(src.cols * 0.5), src.type());
-
-        cuda::resize(src, buf1, Size(), 0.5, 0.5, cv::INTER_NEAREST, stream);
-
-        GpuMat buf2 = pool.getBuffer(buf1.size(), CV_32FC(buf1.channels()));
-
-        func1(buf1, buf2, stream);
-
-        GpuMat buf3 = pool.getBuffer(src.size(), buf2.type());
-
-        cuda::resize(buf2, buf3, src.size(), 0, 0, cv::INTER_NEAREST, stream);
-
-        buf3.convertTo(dst, CV_8U, stream);
-    }
+    CheckSimpleTest(dst_1, dst_2);
 }
 
-CUDA_TEST_P(BufferPoolTest, SimpleUsage)
+CUDA_TEST_P(BufferPoolTest, From2Streams)
 {
-    DeviceInfo devInfo = GetParam();
-    setDevice(devInfo.deviceID());
+    HostMem dst1_1, dst1_2;
+    HostMem dst2_1, dst2_2;
 
-    GpuMat src(200, 200, CV_8UC1);
-    GpuMat dst;
+    Stream stream1, stream2;
+    RunSimpleTest(stream1, dst1_1, dst1_2);
+    RunSimpleTest(stream2, dst2_1, dst2_2);
 
-    Stream stream;
+    stream1.waitForCompletion();
+    stream2.waitForCompletion();
 
-    func2(src, dst, stream);
-
-    stream.waitForCompletion();
-
-    GpuMat buf, buf1, buf2, buf3;
-    GpuMat dst_gold;
-
-    cuda::resize(src, buf1, Size(), 0.5, 0.5, cv::INTER_NEAREST);
-    buf1.convertTo(buf, CV_32F, 1.0 / 255.0);
-    cuda::exp(buf, buf2);
-    cuda::resize(buf2, buf3, src.size(), 0, 0, cv::INTER_NEAREST);
-    buf3.convertTo(dst_gold, CV_8U);
-
-    ASSERT_MAT_NEAR(dst_gold, dst, 0);
+    CheckSimpleTest(dst1_1, dst1_2);
+    CheckSimpleTest(dst2_1, dst2_2);
 }
 
 INSTANTIATE_TEST_CASE_P(CUDA_Stream, BufferPoolTest, ALL_DEVICES);
diff --git a/modules/cuda/test/test_gpumat.cpp b/modules/core/test/cuda/test_gpumat.cpp
similarity index 91%
rename from modules/cuda/test/test_gpumat.cpp
rename to modules/core/test/cuda/test_gpumat.cpp
index dcd368c085..b549f03a05 100644
--- a/modules/cuda/test/test_gpumat.cpp
+++ b/modules/core/test/cuda/test_gpumat.cpp
@@ -40,16 +40,19 @@
 //
 //M*/
 
-#include "test_precomp.hpp"
+#include "../test_precomp.hpp"
 
 #ifdef HAVE_CUDA
 
+#include "opencv2/core/cuda.hpp"
+#include "opencv2/ts/cuda_test.hpp"
+
 using namespace cvtest;
 
 ////////////////////////////////////////////////////////////////////////////////
 // SetTo
 
-PARAM_TEST_CASE(SetTo, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
+PARAM_TEST_CASE(GpuMat_SetTo, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
 {
     cv::cuda::DeviceInfo devInfo;
     cv::Size size;
@@ -67,7 +70,7 @@ PARAM_TEST_CASE(SetTo, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
     }
 };
 
-CUDA_TEST_P(SetTo, Zero)
+CUDA_TEST_P(GpuMat_SetTo, Zero)
 {
     cv::Scalar zero = cv::Scalar::all(0);
 
@@ -77,7 +80,7 @@ CUDA_TEST_P(SetTo, Zero)
     EXPECT_MAT_NEAR(cv::Mat::zeros(size, type), mat, 0.0);
 }
 
-CUDA_TEST_P(SetTo, SameVal)
+CUDA_TEST_P(GpuMat_SetTo, SameVal)
 {
     cv::Scalar val = cv::Scalar::all(randomDouble(0.0, 255.0));
 
@@ -102,7 +105,7 @@ CUDA_TEST_P(SetTo, SameVal)
     }
 }
 
-CUDA_TEST_P(SetTo, DifferentVal)
+CUDA_TEST_P(GpuMat_SetTo, DifferentVal)
 {
     cv::Scalar val = randomScalar(0.0, 255.0);
 
@@ -127,7 +130,7 @@ CUDA_TEST_P(SetTo, DifferentVal)
     }
 }
 
-CUDA_TEST_P(SetTo, Masked)
+CUDA_TEST_P(GpuMat_SetTo, Masked)
 {
     cv::Scalar val = randomScalar(0.0, 255.0);
     cv::Mat mat_gold = randomMat(size, type);
@@ -156,7 +159,7 @@ CUDA_TEST_P(SetTo, Masked)
     }
 }
 
-INSTANTIATE_TEST_CASE_P(CUDA_GpuMat, SetTo, testing::Combine(
+INSTANTIATE_TEST_CASE_P(CUDA, GpuMat_SetTo, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     ALL_TYPES,
@@ -165,7 +168,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_GpuMat, SetTo, testing::Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // CopyTo
 
-PARAM_TEST_CASE(CopyTo, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
+PARAM_TEST_CASE(GpuMat_CopyTo, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
 {
     cv::cuda::DeviceInfo devInfo;
     cv::Size size;
@@ -184,7 +187,7 @@ PARAM_TEST_CASE(CopyTo, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
     }
 };
 
-CUDA_TEST_P(CopyTo, WithOutMask)
+CUDA_TEST_P(GpuMat_CopyTo, WithOutMask)
 {
     cv::Mat src = randomMat(size, type);
 
@@ -195,7 +198,7 @@ CUDA_TEST_P(CopyTo, WithOutMask)
     EXPECT_MAT_NEAR(src, dst, 0.0);
 }
 
-CUDA_TEST_P(CopyTo, Masked)
+CUDA_TEST_P(GpuMat_CopyTo, Masked)
 {
     cv::Mat src = randomMat(size, type);
     cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0);
@@ -226,7 +229,7 @@ CUDA_TEST_P(CopyTo, Masked)
     }
 }
 
-INSTANTIATE_TEST_CASE_P(CUDA_GpuMat, CopyTo, testing::Combine(
+INSTANTIATE_TEST_CASE_P(CUDA, GpuMat_CopyTo, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     ALL_TYPES,
@@ -235,7 +238,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_GpuMat, CopyTo, testing::Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // ConvertTo
 
-PARAM_TEST_CASE(ConvertTo, cv::cuda::DeviceInfo, cv::Size, MatDepth, MatDepth, UseRoi)
+PARAM_TEST_CASE(GpuMat_ConvertTo, cv::cuda::DeviceInfo, cv::Size, MatDepth, MatDepth, UseRoi)
 {
     cv::cuda::DeviceInfo devInfo;
     cv::Size size;
@@ -255,7 +258,7 @@ PARAM_TEST_CASE(ConvertTo, cv::cuda::DeviceInfo, cv::Size, MatDepth, MatDepth, U
     }
 };
 
-CUDA_TEST_P(ConvertTo, WithOutScaling)
+CUDA_TEST_P(GpuMat_ConvertTo, WithOutScaling)
 {
     cv::Mat src = randomMat(size, depth1);
 
@@ -285,7 +288,7 @@ CUDA_TEST_P(ConvertTo, WithOutScaling)
     }
 }
 
-CUDA_TEST_P(ConvertTo, WithScaling)
+CUDA_TEST_P(GpuMat_ConvertTo, WithScaling)
 {
     cv::Mat src = randomMat(size, depth1);
     double a = randomDouble(0.0, 1.0);
@@ -317,7 +320,7 @@ CUDA_TEST_P(ConvertTo, WithScaling)
     }
 }
 
-INSTANTIATE_TEST_CASE_P(CUDA_GpuMat, ConvertTo, testing::Combine(
+INSTANTIATE_TEST_CASE_P(CUDA, GpuMat_ConvertTo, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     ALL_DEPTH,
@@ -356,6 +359,6 @@ CUDA_TEST_P(EnsureSizeIsEnough, BufferReuse)
     EXPECT_EQ(reinterpret_cast<intptr_t>(old.data), reinterpret_cast<intptr_t>(buffer.data));
 }
 
-INSTANTIATE_TEST_CASE_P(CUDA_GpuMat, EnsureSizeIsEnough, ALL_DEVICES);
+INSTANTIATE_TEST_CASE_P(CUDA, EnsureSizeIsEnough, ALL_DEVICES);
 
 #endif // HAVE_CUDA
diff --git a/modules/cuda/test/test_opengl.cpp b/modules/core/test/cuda/test_opengl.cpp
similarity index 98%
rename from modules/cuda/test/test_opengl.cpp
rename to modules/core/test/cuda/test_opengl.cpp
index 0b4812c209..f4c733d064 100644
--- a/modules/cuda/test/test_opengl.cpp
+++ b/modules/core/test/cuda/test_opengl.cpp
@@ -40,10 +40,14 @@
 //
 //M*/
 
-#include "test_precomp.hpp"
+#include "../test_precomp.hpp"
 
 #if defined(HAVE_CUDA) && defined(HAVE_OPENGL)
 
+#include "opencv2/core/cuda.hpp"
+#include "opencv2/core/opengl.hpp"
+#include "opencv2/ts/cuda_test.hpp"
+
 using namespace cvtest;
 
 /////////////////////////////////////////////
diff --git a/modules/cuda/test/test_stream.cpp b/modules/core/test/cuda/test_stream.cpp
similarity index 82%
rename from modules/cuda/test/test_stream.cpp
rename to modules/core/test/cuda/test_stream.cpp
index cdeca71aba..a0e451a62a 100644
--- a/modules/cuda/test/test_stream.cpp
+++ b/modules/core/test/cuda/test_stream.cpp
@@ -40,22 +40,23 @@
 //
 //M*/
 
-#include "test_precomp.hpp"
+#include "../test_precomp.hpp"
 
 #ifdef HAVE_CUDA
 
 #include <cuda_runtime.h>
 
-#if CUDART_VERSION >= 5000
+#include "opencv2/core/cuda.hpp"
+#include "opencv2/ts/cuda_test.hpp"
 
 using namespace cvtest;
 
 struct Async : testing::TestWithParam<cv::cuda::DeviceInfo>
 {
-    cv::cuda::CudaMem src;
+    cv::cuda::HostMem src;
     cv::cuda::GpuMat d_src;
 
-    cv::cuda::CudaMem dst;
+    cv::cuda::HostMem dst;
     cv::cuda::GpuMat d_dst;
 
     virtual void SetUp()
@@ -63,7 +64,7 @@ struct Async : testing::TestWithParam<cv::cuda::DeviceInfo>
         cv::cuda::DeviceInfo devInfo = GetParam();
         cv::cuda::setDevice(devInfo.deviceID());
 
-        src = cv::cuda::CudaMem(cv::cuda::CudaMem::PAGE_LOCKED);
+        src = cv::cuda::HostMem(cv::cuda::HostMem::PAGE_LOCKED);
 
         cv::Mat m = randomMat(cv::Size(128, 128), CV_8UC1);
         m.copyTo(src);
@@ -76,8 +77,8 @@ void checkMemSet(int status, void* userData)
 
     Async* test = reinterpret_cast<Async*>(userData);
 
-    cv::cuda::CudaMem src = test->src;
-    cv::cuda::CudaMem dst = test->dst;
+    cv::cuda::HostMem src = test->src;
+    cv::cuda::HostMem dst = test->dst;
 
     cv::Mat dst_gold = cv::Mat::zeros(src.size(), src.type());
 
@@ -105,8 +106,8 @@ void checkConvert(int status, void* userData)
 
     Async* test = reinterpret_cast<Async*>(userData);
 
-    cv::cuda::CudaMem src = test->src;
-    cv::cuda::CudaMem dst = test->dst;
+    cv::cuda::HostMem src = test->src;
+    cv::cuda::HostMem dst = test->dst;
 
     cv::Mat dst_gold;
     src.createMatHeader().convertTo(dst_gold, CV_32S);
@@ -128,8 +129,25 @@ CUDA_TEST_P(Async, Convert)
     stream.waitForCompletion();
 }
 
+CUDA_TEST_P(Async, HostMemAllocator)
+{
+    cv::cuda::Stream stream;
+
+    cv::Mat h_dst;
+    h_dst.allocator = cv::cuda::HostMem::getAllocator();
+
+    d_src.upload(src, stream);
+    d_src.convertTo(d_dst, CV_32S, stream);
+    d_dst.download(h_dst, stream);
+
+    stream.waitForCompletion();
+
+    cv::Mat dst_gold;
+    src.createMatHeader().convertTo(dst_gold, CV_32S);
+
+    ASSERT_MAT_NEAR(dst_gold, h_dst, 0);
+}
+
 INSTANTIATE_TEST_CASE_P(CUDA_Stream, Async, ALL_DEVICES);
 
-#endif // CUDART_VERSION >= 5000
-
 #endif // HAVE_CUDA
diff --git a/modules/core/test/test_main.cpp b/modules/core/test/test_main.cpp
index d5400e20fd..5ddfb72348 100644
--- a/modules/core/test/test_main.cpp
+++ b/modules/core/test/test_main.cpp
@@ -7,4 +7,14 @@
 
 #include "test_precomp.hpp"
 
+#ifndef HAVE_CUDA
+
 CV_TEST_MAIN("cv")
+
+#else
+
+#include "opencv2/ts/cuda_test.hpp"
+
+CV_CUDA_TEST_MAIN("cv")
+
+#endif
diff --git a/modules/cuda/perf/perf_buffer_pool.cpp b/modules/cuda/perf/perf_buffer_pool.cpp
deleted file mode 100644
index 72bd47a070..0000000000
--- a/modules/cuda/perf/perf_buffer_pool.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "perf_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-#include "opencv2/cudaarithm.hpp"
-#include "opencv2/core/private.cuda.hpp"
-
-using namespace testing;
-using namespace perf;
-using namespace cv;
-using namespace cv::cuda;
-
-namespace
-{
-    void func1(const GpuMat& src, GpuMat& dst, Stream& stream)
-    {
-        BufferPool pool(stream);
-
-        GpuMat buf = pool.getBuffer(src.size(), CV_32FC(src.channels()));
-
-        src.convertTo(buf, CV_32F, 1.0 / 255.0, stream);
-
-        cuda::exp(buf, dst, stream);
-    }
-
-    void func2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
-    {
-        BufferPool pool(stream);
-
-        GpuMat buf1 = pool.getBuffer(src1.size(), CV_32FC(src1.channels()));
-
-        func1(src1, buf1, stream);
-
-        GpuMat buf2 = pool.getBuffer(src2.size(), CV_32FC(src2.channels()));
-
-        func1(src2, buf2, stream);
-
-        cuda::add(buf1, buf2, dst, noArray(), -1, stream);
-    }
-}
-
-PERF_TEST_P(Sz, BufferPool, CUDA_TYPICAL_MAT_SIZES)
-{
-    static bool first = true;
-
-    const Size size = GetParam();
-
-    const bool useBufferPool = PERF_RUN_CUDA();
-
-    Mat host_src(size, CV_8UC1);
-    declare.in(host_src, WARMUP_RNG);
-
-    GpuMat src1(host_src), src2(host_src);
-    GpuMat dst;
-
-    setBufferPoolUsage(useBufferPool);
-    if (useBufferPool && first)
-    {
-        setBufferPoolConfig(-1, 25 * 1024 * 1024, 2);
-        first = false;
-    }
-
-    TEST_CYCLE()
-    {
-        func2(src1, src2, dst, Stream::Null());
-    }
-
-    Mat h_dst(dst);
-    SANITY_CHECK(h_dst);
-}
-
-#endif
diff --git a/modules/cudaarithm/src/cuda/lut.cu b/modules/cudaarithm/src/cuda/lut.cu
index a8d5bc5b06..0b1fe8b0d5 100644
--- a/modules/cudaarithm/src/cuda/lut.cu
+++ b/modules/cudaarithm/src/cuda/lut.cu
@@ -74,7 +74,7 @@ namespace
 
     LookUpTableImpl::LookUpTableImpl(InputArray _lut)
     {
-        if (_lut.kind() == _InputArray::GPU_MAT)
+        if (_lut.kind() == _InputArray::CUDA_GPU_MAT)
         {
             d_lut = _lut.getGpuMat();
         }
diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp
index 37edd6e0d1..d63e57de31 100644
--- a/modules/cudaimgproc/src/histogram.cpp
+++ b/modules/cudaimgproc/src/histogram.cpp
@@ -467,14 +467,14 @@ void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int
     _levels.create(1, nLevels, CV_32SC1);
 
     Mat host_levels;
-    if (kind == _InputArray::GPU_MAT)
+    if (kind == _InputArray::CUDA_GPU_MAT)
         host_levels.create(1, nLevels, CV_32SC1);
     else
         host_levels = _levels.getMat();
 
     nppSafeCall( nppiEvenLevelsHost_32s(host_levels.ptr<Npp32s>(), nLevels, lowerLevel, upperLevel) );
 
-    if (kind == _InputArray::GPU_MAT)
+    if (kind == _InputArray::CUDA_GPU_MAT)
         _levels.getGpuMatRef().upload(host_levels);
 }
 
diff --git a/modules/cudaoptflow/src/farneback.cpp b/modules/cudaoptflow/src/farneback.cpp
index dc52035255..6b74432632 100644
--- a/modules/cudaoptflow/src/farneback.cpp
+++ b/modules/cudaoptflow/src/farneback.cpp
@@ -95,6 +95,16 @@ namespace cv { namespace cuda { namespace device { namespace optflow_farneback
 
 }}}} // namespace cv { namespace cuda { namespace cudev { namespace optflow_farneback
 
+namespace
+{
+    GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat& mat)
+    {
+        if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols)
+            return mat(Rect(0, 0, cols, rows));
+
+        return mat = GpuMat(rows, cols, type);
+    }
+}
 
 void cv::cuda::FarnebackOpticalFlow::prepareGaussian(
         int n, double sigma, float *g, float *xg, float *xxg,
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/detail/gpumat.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/detail/gpumat.hpp
index e378c52372..665840ec03 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/detail/gpumat.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/detail/gpumat.hpp
@@ -51,33 +51,33 @@
 namespace cv { namespace cudev {
 
 template <typename T>
-__host__ GpuMat_<T>::GpuMat_()
-    : GpuMat()
+__host__ GpuMat_<T>::GpuMat_(Allocator* allocator)
+    : GpuMat(allocator)
 {
     flags = (flags & ~CV_MAT_TYPE_MASK) | DataType<T>::type;
 }
 
 template <typename T>
-__host__ GpuMat_<T>::GpuMat_(int arows, int acols)
-    : GpuMat(arows, acols, DataType<T>::type)
+__host__ GpuMat_<T>::GpuMat_(int arows, int acols, Allocator* allocator)
+    : GpuMat(arows, acols, DataType<T>::type, allocator)
 {
 }
 
 template <typename T>
-__host__ GpuMat_<T>::GpuMat_(Size asize)
-    : GpuMat(asize.height, asize.width, DataType<T>::type)
+__host__ GpuMat_<T>::GpuMat_(Size asize, Allocator* allocator)
+    : GpuMat(asize.height, asize.width, DataType<T>::type, allocator)
 {
 }
 
 template <typename T>
-__host__ GpuMat_<T>::GpuMat_(int arows, int acols, Scalar val)
-    : GpuMat(arows, acols, DataType<T>::type, val)
+__host__ GpuMat_<T>::GpuMat_(int arows, int acols, Scalar val, Allocator* allocator)
+    : GpuMat(arows, acols, DataType<T>::type, val, allocator)
 {
 }
 
 template <typename T>
-__host__ GpuMat_<T>::GpuMat_(Size asize, Scalar val)
-    : GpuMat(asize.height, asize.width, DataType<T>::type, val)
+__host__ GpuMat_<T>::GpuMat_(Size asize, Scalar val, Allocator* allocator)
+    : GpuMat(asize.height, asize.width, DataType<T>::type, val, allocator)
 {
 }
 
@@ -88,8 +88,8 @@ __host__ GpuMat_<T>::GpuMat_(const GpuMat_& m)
 }
 
 template <typename T>
-__host__ GpuMat_<T>::GpuMat_(const GpuMat& m)
-    : GpuMat()
+__host__ GpuMat_<T>::GpuMat_(const GpuMat& m, Allocator* allocator)
+    : GpuMat(allocator)
 {
     flags = (flags & ~CV_MAT_TYPE_MASK) | DataType<T>::type;
 
@@ -134,8 +134,8 @@ __host__ GpuMat_<T>::GpuMat_(const GpuMat_& m, Rect roi)
 }
 
 template <typename T>
-__host__ GpuMat_<T>::GpuMat_(InputArray arr)
-    : GpuMat()
+__host__ GpuMat_<T>::GpuMat_(InputArray arr, Allocator* allocator)
+    : GpuMat(allocator)
 {
     flags = (flags & ~CV_MAT_TYPE_MASK) | DataType<T>::type;
     upload(arr);
@@ -341,7 +341,7 @@ namespace cv {
 
 template<typename _Tp>
 __host__ _InputArray::_InputArray(const cudev::GpuMat_<_Tp>& m)
-    : flags(FIXED_TYPE + GPU_MAT + DataType<_Tp>::type), obj((void*)&m)
+    : flags(FIXED_TYPE + CUDA_GPU_MAT + DataType<_Tp>::type), obj((void*)&m)
 {}
 
 template<typename _Tp>
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/gpumat.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/gpumat.hpp
index 02d8cb7735..983652c53c 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/gpumat.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/gpumat.hpp
@@ -63,21 +63,21 @@ public:
     typedef T value_type;
 
     //! default constructor
-    __host__ GpuMat_();
+    __host__ GpuMat_(Allocator* allocator = defaultAllocator());
 
     //! constructs GpuMat of the specified size
-    __host__ GpuMat_(int arows, int acols);
-    __host__ explicit GpuMat_(Size asize);
+    __host__ GpuMat_(int arows, int acols, Allocator* allocator = defaultAllocator());
+    __host__ explicit GpuMat_(Size asize, Allocator* allocator = defaultAllocator());
 
     //! constucts GpuMat and fills it with the specified value
-    __host__ GpuMat_(int arows, int acols, Scalar val);
-    __host__ GpuMat_(Size asize, Scalar val);
+    __host__ GpuMat_(int arows, int acols, Scalar val, Allocator* allocator = defaultAllocator());
+    __host__ GpuMat_(Size asize, Scalar val, Allocator* allocator = defaultAllocator());
 
     //! copy constructor
     __host__ GpuMat_(const GpuMat_& m);
 
     //! copy/conversion contructor. If m is of different type, it's converted
-    __host__ explicit GpuMat_(const GpuMat& m);
+    __host__ explicit GpuMat_(const GpuMat& m, Allocator* allocator = defaultAllocator());
 
     //! constructs a matrix on top of user-allocated data. step is in bytes(!!!), regardless of the type
     __host__ GpuMat_(int arows, int acols, T* adata, size_t astep = Mat::AUTO_STEP);
@@ -88,7 +88,7 @@ public:
     __host__ GpuMat_(const GpuMat_& m, Rect roi);
 
     //! builds GpuMat from host memory (Blocking call)
-    __host__ explicit GpuMat_(InputArray arr);
+    __host__ explicit GpuMat_(InputArray arr, Allocator* allocator = defaultAllocator());
 
     //! assignment operators
     __host__ GpuMat_& operator =(const GpuMat_& m);
diff --git a/modules/highgui/src/window.cpp b/modules/highgui/src/window.cpp
index f43f86411b..cda019102c 100644
--- a/modules/highgui/src/window.cpp
+++ b/modules/highgui/src/window.cpp
@@ -297,7 +297,7 @@ void cv::imshow( const String& winname, InputArray _img )
 
         cv::ogl::Texture2D& tex = ownWndTexs[winname];
 
-        if (_img.kind() == _InputArray::GPU_MAT)
+        if (_img.kind() == _InputArray::CUDA_GPU_MAT)
         {
             cv::ogl::Buffer& buf = ownWndBufs[winname];
             buf.copyFrom(_img);
diff --git a/modules/superres/src/btv_l1_cuda.cpp b/modules/superres/src/btv_l1_cuda.cpp
index 1ec71f220c..f72e3846e8 100644
--- a/modules/superres/src/btv_l1_cuda.cpp
+++ b/modules/superres/src/btv_l1_cuda.cpp
@@ -514,7 +514,7 @@ namespace
         ++outPos_;
         const GpuMat& curOutput = at(outPos_, outputs_);
 
-        if (_output.kind() == _InputArray::GPU_MAT)
+        if (_output.kind() == _InputArray::CUDA_GPU_MAT)
             curOutput.convertTo(_output.getGpuMatRef(), CV_8U);
         else
         {
diff --git a/modules/superres/src/frame_source.cpp b/modules/superres/src/frame_source.cpp
index 0f81efd5e1..216e869c14 100644
--- a/modules/superres/src/frame_source.cpp
+++ b/modules/superres/src/frame_source.cpp
@@ -116,7 +116,7 @@ namespace
     {
         if (_frame.kind() == _InputArray::MAT)
             vc_ >> _frame.getMatRef();
-        else if(_frame.kind() == _InputArray::GPU_MAT)
+        else if(_frame.kind() == _InputArray::CUDA_GPU_MAT)
         {
             vc_ >> frame_;
             arrCopy(frame_, _frame);
@@ -226,7 +226,7 @@ namespace
 
     void VideoFrameSource_CUDA::nextFrame(OutputArray _frame)
     {
-        if (_frame.kind() == _InputArray::GPU_MAT)
+        if (_frame.kind() == _InputArray::CUDA_GPU_MAT)
         {
             bool res = reader_->nextFrame(_frame.getGpuMatRef());
             if (!res)
diff --git a/modules/superres/src/input_array_utility.cpp b/modules/superres/src/input_array_utility.cpp
index 9f4f229360..ec20673b47 100644
--- a/modules/superres/src/input_array_utility.cpp
+++ b/modules/superres/src/input_array_utility.cpp
@@ -49,7 +49,7 @@ Mat cv::superres::arrGetMat(InputArray arr, Mat& buf)
 {
     switch (arr.kind())
     {
-    case _InputArray::GPU_MAT:
+    case _InputArray::CUDA_GPU_MAT:
         arr.getGpuMat().download(buf);
         return buf;
 
@@ -66,7 +66,7 @@ UMat cv::superres::arrGetUMat(InputArray arr, UMat& buf)
 {
     switch (arr.kind())
     {
-    case _InputArray::GPU_MAT:
+    case _InputArray::CUDA_GPU_MAT:
         arr.getGpuMat().download(buf);
         return buf;
 
@@ -83,7 +83,7 @@ GpuMat cv::superres::arrGetGpuMat(InputArray arr, GpuMat& buf)
 {
     switch (arr.kind())
     {
-    case _InputArray::GPU_MAT:
+    case _InputArray::CUDA_GPU_MAT:
         return arr.getGpuMat();
 
     case _InputArray::OPENGL_BUFFER:
@@ -184,7 +184,7 @@ namespace
 
         switch (src.kind())
         {
-        case _InputArray::GPU_MAT:
+        case _InputArray::CUDA_GPU_MAT:
             #ifdef HAVE_OPENCV_CUDAIMGPROC
                 cuda::cvtColor(src.getGpuMat(), dst.getGpuMatRef(), code, cn);
             #else
@@ -218,7 +218,7 @@ namespace
 
         switch (src.kind())
         {
-        case _InputArray::GPU_MAT:
+        case _InputArray::CUDA_GPU_MAT:
             src.getGpuMat().convertTo(dst.getGpuMatRef(), depth, scale);
             break;
 
diff --git a/modules/superres/src/optical_flow.cpp b/modules/superres/src/optical_flow.cpp
index 7227b080fc..fcc9bef347 100644
--- a/modules/superres/src/optical_flow.cpp
+++ b/modules/superres/src/optical_flow.cpp
@@ -458,7 +458,7 @@ namespace
         GpuMat input0 = convertToType(frame0, work_type_, buf_[2], buf_[3]);
         GpuMat input1 = convertToType(frame1, work_type_, buf_[4], buf_[5]);
 
-        if (_flow2.needed() && _flow1.kind() == _InputArray::GPU_MAT && _flow2.kind() == _InputArray::GPU_MAT)
+        if (_flow2.needed() && _flow1.kind() == _InputArray::CUDA_GPU_MAT && _flow2.kind() == _InputArray::CUDA_GPU_MAT)
         {
             impl(input0, input1, _flow1.getGpuMatRef(), _flow2.getGpuMatRef());
             return;
diff --git a/modules/ts/include/opencv2/ts.hpp b/modules/ts/include/opencv2/ts.hpp
index 209cb2915a..c1b68a0c0f 100644
--- a/modules/ts/include/opencv2/ts.hpp
+++ b/modules/ts/include/opencv2/ts.hpp
@@ -569,10 +569,10 @@ void parseCustomOptions(int argc, char **argv);
 #define CV_TEST_MAIN(resourcesubdir, ...) \
 int main(int argc, char **argv) \
 { \
+    __CV_TEST_EXEC_ARGS(__VA_ARGS__) \
     cvtest::TS::ptr()->init(resourcesubdir); \
     ::testing::InitGoogleTest(&argc, argv); \
     cvtest::printVersionInfo(); \
-    __CV_TEST_EXEC_ARGS(__VA_ARGS__) \
     TEST_DUMP_OCL_INFO \
     parseCustomOptions(argc, argv); \
     return RUN_ALL_TESTS(); \
diff --git a/modules/ts/include/opencv2/ts/cuda_test.hpp b/modules/ts/include/opencv2/ts/cuda_test.hpp
index 049021b544..b225ab1796 100644
--- a/modules/ts/include/opencv2/ts/cuda_test.hpp
+++ b/modules/ts/include/opencv2/ts/cuda_test.hpp
@@ -340,6 +340,7 @@ namespace cvtest
     CV_EXPORTS void dumpImage(const std::string& fileName, const cv::Mat& image);
     CV_EXPORTS void showDiff(cv::InputArray gold, cv::InputArray actual, double eps);
 
+    CV_EXPORTS void parseCudaDeviceOptions(int argc, char **argv); // reads the cuda_device / help options and loads the requested CUDA device(s) into DeviceManager
     CV_EXPORTS void printCudaInfo();
 }
 
@@ -351,53 +352,7 @@ namespace cv { namespace cuda
 #ifdef HAVE_CUDA
 
 #define CV_CUDA_TEST_MAIN(resourcesubdir) \
-    int main(int argc, char* argv[]) \
-    { \
-        try \
-        { \
-            cv::CommandLineParser cmd(argc, argv, \
-                "{ h help ?            |      | Print help}" \
-                "{ i info              |      | Print information about system and exit }" \
-                "{ device              | -1   | Device on which tests will be executed (-1 means all devices) }" \
-            ); \
-            if (cmd.has("help")) \
-            { \
-                cmd.printMessage(); \
-                return 0; \
-            } \
-            cvtest::printCudaInfo(); \
-            if (cmd.has("info")) \
-            { \
-                return 0; \
-            } \
-            int device = cmd.get<int>("device"); \
-            if (device < 0) \
-            { \
-                cvtest::DeviceManager::instance().loadAll(); \
-                std::cout << "Run tests on all supported devices \n" << std::endl; \
-            } \
-            else \
-            { \
-                cvtest::DeviceManager::instance().load(device); \
-                cv::cuda::DeviceInfo info(device); \
-                std::cout << "Run tests on device " << device << " [" << info.name() << "] \n" << std::endl; \
-            } \
-            cvtest::TS::ptr()->init( resourcesubdir ); \
-            testing::InitGoogleTest(&argc, argv); \
-            return RUN_ALL_TESTS(); \
-        } \
-        catch (const std::exception& e) \
-        { \
-            std::cerr << e.what() << std::endl; \
-            return -1; \
-        } \
-        catch (...) \
-        { \
-            std::cerr << "Unknown error" << std::endl; \
-            return -1; \
-        } \
-        return 0; \
-    }
+    CV_TEST_MAIN(resourcesubdir, cvtest::parseCudaDeviceOptions(argc, argv), cvtest::printCudaInfo())
 
 #else // HAVE_CUDA
 
diff --git a/modules/ts/src/cuda_test.cpp b/modules/ts/src/cuda_test.cpp
index 1086fd111d..a48e0a0871 100644
--- a/modules/ts/src/cuda_test.cpp
+++ b/modules/ts/src/cuda_test.cpp
@@ -190,6 +190,33 @@ namespace cvtest
         }
     }
 
+    void parseCudaDeviceOptions(int argc, char **argv)
+    {   // Reads the cuda_device / help options from argv and calls load()/loadAll() on DeviceManager::instance().
+        cv::CommandLineParser cmd(argc, argv,
+            "{ cuda_device | -1    | CUDA device on which tests will be executed (-1 means all devices) }"
+            "{ h help      | false | Print help info                                                    }"
+        );
+        // Help is informational only: there is no early return, so device selection below still runs.
+        if (cmd.has("help"))
+        {
+            std::cout << "\nAvailable options besides google test option: \n";
+            cmd.printMessage();
+        }
+        // A negative cuda_device (the declared default is -1) requests loadAll(); any other value loads just that index.
+        int device = cmd.get<int>("cuda_device");
+        if (device < 0)
+        {
+            cvtest::DeviceManager::instance().loadAll();
+            std::cout << "Run tests on all supported CUDA devices \n" << std::endl;
+        }
+        else
+        {
+            cvtest::DeviceManager::instance().load(device); // load() runs before DeviceInfo is queried for the banner
+            cv::cuda::DeviceInfo info(device);
+            std::cout << "Run tests on CUDA device " << device << " [" << info.name() << "] \n" << std::endl;
+        }
+    }
+
     //////////////////////////////////////////////////////////////////////
     // Additional assertion
 
@@ -278,7 +305,7 @@ namespace cvtest
 
     Mat getMat(InputArray arr)
     {
-        if (arr.kind() == _InputArray::GPU_MAT)
+        if (arr.kind() == _InputArray::CUDA_GPU_MAT)
         {
             Mat m;
             arr.getGpuMat().download(m);
diff --git a/samples/gpu/stereo_multi.cpp b/samples/gpu/stereo_multi.cpp
index 0997165f1f..bfb3e8a48b 100644
--- a/samples/gpu/stereo_multi.cpp
+++ b/samples/gpu/stereo_multi.cpp
@@ -278,7 +278,7 @@ public:
     StereoMultiGpuStream();
     ~StereoMultiGpuStream();
 
-    void compute(const CudaMem& leftFrame, const CudaMem& rightFrame, CudaMem& disparity);
+    void compute(const HostMem& leftFrame, const HostMem& rightFrame, HostMem& disparity);
 
 private:
     GpuMat d_leftFrames[2];
@@ -316,7 +316,7 @@ StereoMultiGpuStream::~StereoMultiGpuStream()
     streams[1].release();
 }
 
-void StereoMultiGpuStream::compute(const CudaMem& leftFrame, const CudaMem& rightFrame, CudaMem& disparity)
+void StereoMultiGpuStream::compute(const HostMem& leftFrame, const HostMem& rightFrame, HostMem& disparity)
 {
     disparity.create(leftFrame.size(), CV_8UC1);
 
@@ -403,7 +403,7 @@ int main(int argc, char** argv)
     cout << endl;
 
     Mat leftFrame, rightFrame;
-    CudaMem leftGrayFrame, rightGrayFrame;
+    HostMem leftGrayFrame, rightGrayFrame;
 
     StereoSingleGpu gpu0Alg(0);
     StereoSingleGpu gpu1Alg(1);
@@ -413,7 +413,7 @@ int main(int argc, char** argv)
     Mat disparityGpu0;
     Mat disparityGpu1;
     Mat disparityMultiThread;
-    CudaMem disparityMultiStream;
+    HostMem disparityMultiStream;
 
     Mat disparityGpu0Show;
     Mat disparityGpu1Show;