diff --git a/modules/core/include/opencv2/core/cuda.hpp b/modules/core/include/opencv2/core/cuda.hpp
index 82558846d6..9800877a77 100644
--- a/modules/core/include/opencv2/core/cuda.hpp
+++ b/modules/core/include/opencv2/core/cuda.hpp
@@ -340,6 +340,201 @@ public:
     Allocator* allocator;
 };
 
+struct CV_EXPORTS_W GpuData
+{
+    explicit GpuData(size_t _size);
+    ~GpuData();
+
+    GpuData(const GpuData&) = delete;
+    GpuData& operator=(const GpuData&) = delete;
+
+    GpuData(GpuData&&) = delete;
+    GpuData& operator=(GpuData&&) = delete;
+
+    uchar* data;
+    size_t size;
+};
+
+class CV_EXPORTS_W GpuMatND
+{
+public:
+    using SizeArray = std::vector<int>;
+    using StepArray = std::vector<size_t>;
+    using IndexArray = std::vector<int>;
+
+    //! destructor
+    ~GpuMatND();
+
+    //! default constructor
+    GpuMatND();
+
+    /** @overload
+    @param size Array of integers specifying an n-dimensional array shape.
+    @param type Array type. Use CV_8UC1, ..., CV_16FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    */
+    GpuMatND(SizeArray size, int type);
+
+    /** @overload
+    @param size Array of integers specifying an n-dimensional array shape.
+    @param type Array type. Use CV_8UC1, ..., CV_16FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    @param data Pointer to the user data. Matrix constructors that take data and step parameters do not
+    allocate matrix data. Instead, they just initialize the matrix header that points to the specified
+    data, which means that no data is copied. This operation is very efficient and can be used to
+    process external data using OpenCV functions. The external data is not automatically deallocated, so
+    you should take care of it.
+    @param step Array of _size.size()-1 steps in case of a multi-dimensional array (the last step is always
+    set to the element size). If not specified, the matrix is assumed to be continuous.
+    */
+    GpuMatND(SizeArray size, int type, void* data, StepArray step = StepArray());
+
+    /** @brief Allocates GPU memory.
+    If GPU memory is already allocated, this method may reuse it, but only if it has the same size
+    and type, was not externally allocated, is continuous (i.e., isContinuous() is true), and is not
+    a sub-matrix of another GpuMatND (i.e., isSubmatrix() is false). In other words, this method
+    guarantees that the GPU memory allocated by this method is always continuous and is never a
+    sub-region of another GpuMatND.
+    */
+    void create(SizeArray size, int type);
+
+    void release();
+
+    void swap(GpuMatND& m) noexcept;
+
+    /** @brief Creates a full copy of the array and the underlying data.
+    The method creates a full copy of the array. It mimics the behavior of Mat::clone(), i.e.
+    the original step is not taken into account. So, the array copy is a continuous array
+    occupying total()\*elemSize() bytes.
+    */
+    GpuMatND clone() const;
+
+    /** @overload
+    This overload is non-blocking, so it may return even if the copy operation is not finished.
+    */
+    GpuMatND clone(Stream& stream) const;
+
+    /** @brief Extracts a sub-matrix.
+    The operator makes a new header for the specified sub-array of \*this.
+    The operator is an O(1) operation, that is, no matrix data is copied.
+    @param ranges Array of selected ranges along each dimension.
+    */
+    GpuMatND operator()(const std::vector<Range>& ranges) const;
+
+    /** @brief Creates a GpuMat header for a 2D plane part of an n-dim matrix.
+    @note The returned GpuMat is constructed with the constructor for user-allocated data.
+    That is, it does not perform reference counting.
+    @note This function does not increment this GpuMatND's reference counter.
+    */
+    GpuMat createGpuMatHeader(IndexArray idx, Range rowRange, Range colRange) const;
+
+    /** @overload
+    Creates a GpuMat header if this GpuMatND is effectively 2D.
+    @note The returned GpuMat is constructed with the constructor for user-allocated data.
+    That is, it does not perform reference counting.
+    @note This function does not increment this GpuMatND's reference counter.
+    */
+    GpuMat createGpuMatHeader() const;
+
+    /** @brief Extracts a 2D plane part of an n-dim matrix.
+    It differs from createGpuMatHeader(IndexArray, Range, Range) in that it clones a part of this
+    GpuMatND to the returned GpuMat.
+    @note This operator does not increment this GpuMatND's reference counter.
+    */
+    GpuMat operator()(IndexArray idx, Range rowRange, Range colRange) const;
+
+    /** @brief Extracts a 2D plane part of an n-dim matrix if this GpuMatND is effectively 2D.
+    It differs from createGpuMatHeader() in that it clones a part of this GpuMatND.
+    @note This operator does not increment this GpuMatND's reference counter.
+    */
+    operator GpuMat() const;
+
+    GpuMatND(const GpuMatND&) = default;
+    GpuMatND& operator=(const GpuMatND&) = default;
+
+    GpuMatND(GpuMatND&&) noexcept = default;
+    GpuMatND& operator=(GpuMatND&&) noexcept = default;
+
+    void upload(InputArray src);
+    void upload(InputArray src, Stream& stream);
+    void download(OutputArray dst) const;
+    void download(OutputArray dst, Stream& stream) const;
+
+    //! returns true iff the GpuMatND data is continuous
+    //! (i.e. when there are no gaps between successive rows)
+    bool isContinuous() const;
+
+    //! returns true if the matrix is a sub-matrix of another matrix
+    bool isSubmatrix() const;
+
+    //! returns element size in bytes
+    size_t elemSize() const;
+
+    //! returns the size of element channel in bytes
+    size_t elemSize1() const;
+
+    //! returns true if data is null
+    bool empty() const;
+
+    //! returns true if not empty and points to external (user-allocated) GPU memory
+    bool external() const;
+
+    //! returns pointer to the first byte of the GPU memory
+    uchar* getDevicePtr() const;
+
+    //! returns the total number of array elements
+    size_t total() const;
+
+    //! returns the size of underlying memory in bytes
+    size_t totalMemSize() const;
+
+    //! returns element type
+    int type() const;
+
+private:
+    //! internal use
+    void setFields(SizeArray size, int type, StepArray step = StepArray());
+
+public:
+    /*! includes several bit-fields:
+    - the magic signature
+    - continuity flag
+    - depth
+    - number of channels
+    */
+    int flags;
+
+    //! matrix dimensionality
+    int dims;
+
+    //! shape of this array
+    SizeArray size;
+
+    /*! step values
+    Their semantics are identical to the semantics of step for Mat.
+    */
+    StepArray step;
+
+private:
+    /*! internal use
+    If this GpuMatND holds external memory, this is empty.
+    */
+    std::shared_ptr<GpuData> data_;
+
+    /*! internal use
+    If this GpuMatND manages memory with reference counting, this value is
+    always equal to data_->data. If this GpuMatND holds external memory,
+    data_ is empty and data points to the external memory.
+    */
+    uchar* data;
+
+    /*!
+    internal use
+    If this GpuMatND is a sub-matrix of a larger matrix, this value is the
+    offset (in bytes) of the first byte of the sub-matrix relative to the
+    first byte of the whole matrix.
+    */
+    size_t offset;
+};
+
 /** @brief Creates a continuous matrix.
 
 @param rows Row count.
diff --git a/modules/core/include/opencv2/core/cuda.inl.hpp b/modules/core/include/opencv2/core/cuda.inl.hpp
index 30fc0aee22..3f2a0c7240 100644
--- a/modules/core/include/opencv2/core/cuda.inl.hpp
+++ b/modules/core/include/opencv2/core/cuda.inl.hpp
@@ -383,6 +383,92 @@ void swap(GpuMat& a, GpuMat& b)
     a.swap(b);
 }
 
+//===================================================================================
+// GpuMatND
+//===================================================================================
+
+inline
+GpuMatND::GpuMatND() :
+    flags(0), dims(0), data(nullptr), offset(0)
+{
+}
+
+inline
+GpuMatND::GpuMatND(SizeArray _size, int _type) :
+    flags(0), dims(0), data(nullptr), offset(0)
+{
+    create(std::move(_size), _type);
+}
+
+inline
+void GpuMatND::swap(GpuMatND& m) noexcept
+{
+    std::swap(*this, m);
+}
+
+inline
+bool GpuMatND::isContinuous() const
+{
+    return (flags & Mat::CONTINUOUS_FLAG) != 0;
+}
+
+inline
+bool GpuMatND::isSubmatrix() const
+{
+    return (flags & Mat::SUBMATRIX_FLAG) != 0;
+}
+
+inline
+size_t GpuMatND::elemSize() const
+{
+    return CV_ELEM_SIZE(flags);
+}
+
+inline
+size_t GpuMatND::elemSize1() const
+{
+    return CV_ELEM_SIZE1(flags);
+}
+
+inline
+bool GpuMatND::empty() const
+{
+    return data == nullptr;
+}
+
+inline
+bool GpuMatND::external() const
+{
+    return !empty() && data_.use_count() == 0;
+}
+
+inline
+uchar* GpuMatND::getDevicePtr() const
+{
+    return data + offset;
+}
+
+inline
+size_t GpuMatND::total() const
+{
+    size_t p = 1;
+    for (auto s : size)
+        p *= s;
+    return p;
+}
+
+inline
+size_t GpuMatND::totalMemSize() const
+{
+    return size[0] * step[0];
+}
+
+inline
+int GpuMatND::type() const
+{
+    return CV_MAT_TYPE(flags);
+}
+
 //===================================================================================
 // HostMem
 //===================================================================================
diff --git a/modules/core/src/cuda/gpu_mat_nd.cu b/modules/core/src/cuda/gpu_mat_nd.cu
new file mode 100644
index 0000000000..3f51fd8afa
--- /dev/null
+++ b/modules/core/src/cuda/gpu_mat_nd.cu
@@ -0,0 +1,269 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "opencv2/opencv_modules.hpp"
+
+#ifndef HAVE_OPENCV_CUDEV
+
+#error "opencv_cudev is required"
+
+#else
+
+#include "opencv2/core/cuda.hpp"
+#include "opencv2/cudev.hpp"
+
+using namespace cv;
+using namespace cv::cuda;
+
+GpuData::GpuData(const size_t _size)
+    : data(nullptr), size(_size)
+{
+    CV_CUDEV_SAFE_CALL(cudaMalloc(&data, _size));
+}
+
+GpuData::~GpuData()
+{
+    CV_CUDEV_SAFE_CALL(cudaFree(data));
+}
+
+/////////////////////////////////////////////////////
+/// create
+
+void GpuMatND::create(SizeArray _size, int _type)
+{
+    {
+        auto elements_nonzero = [](SizeArray& v)
+        {
+            return std::all_of(v.begin(), v.end(),
+                [](unsigned u){ return u > 0; });
+        };
+        CV_Assert(!_size.empty());
+        CV_Assert(elements_nonzero(_size));
+    }
+
+    _type &= Mat::TYPE_MASK;
+
+    if (size == _size && type() == _type && !empty() && !external() && isContinuous() && !isSubmatrix())
+        return;
+
+    release();
+
+    setFields(std::move(_size), _type);
+
+    data_ = std::make_shared<GpuData>(totalMemSize());
+    data = data_->data;
+    offset = 0;
+}
+
+/////////////////////////////////////////////////////
+/// release
+
+void GpuMatND::release()
+{
+    data = nullptr;
+    data_.reset();
+
+    flags = dims = offset = 0;
+    size.clear();
+    step.clear();
+}
+
+/////////////////////////////////////////////////////
+/// clone
+
+// Advances the (dims-2)-dimensional plane index like an odometer and moves
+// d/s to the next 2D plane of dst/src; returns false after the last plane.
+static bool next(uchar*& d, const uchar*& s, std::vector<int>& idx, const int dims, const GpuMatND& dst, const GpuMatND& src)
+{
+    int inc = dims-3;
+
+    while (true)
+    {
+        if (idx[inc] == src.size[inc] - 1)
+        {
+            if (inc == 0)
+            {
+                return false;
+            }
+
+            idx[inc] = 0;
+            d -= (dst.size[inc] - 1) * dst.step[inc];
+            s -= (src.size[inc] - 1) * src.step[inc];
+            inc--;
+        }
+        else
+        {
+            idx[inc]++;
+            d += dst.step[inc];
+            s += src.step[inc];
+            break;
+        }
+    }
+
+    return true;
+}
+
+GpuMatND GpuMatND::clone() const
+{
+    CV_DbgAssert(!empty());
+
+    GpuMatND ret(size, type());
+
+    if (isContinuous())
+    {
+        CV_CUDEV_SAFE_CALL(cudaMemcpy(ret.getDevicePtr(), getDevicePtr(), ret.totalMemSize(), cudaMemcpyDeviceToDevice));
+    }
+    else
+    {
+        // 1D arrays are always continuous
+
+        if (dims == 2)
+        {
+            CV_CUDEV_SAFE_CALL(
+                cudaMemcpy2D(ret.getDevicePtr(), ret.step[0], getDevicePtr(), step[0],
+                    size[1]*step[1], size[0], cudaMemcpyDeviceToDevice)
+            );
+        }
+        else
+        {
+            std::vector<int> idx(dims-2, 0);
+
+            uchar* d = ret.getDevicePtr();
+            const uchar* s = getDevicePtr();
+
+            // iterate over each 2D plane
+            do
+            {
+                CV_CUDEV_SAFE_CALL(
+                    cudaMemcpy2DAsync(
+                        d, ret.step[dims-2], s, step[dims-2],
+                        size[dims-1]*step[dims-1], size[dims-2], cudaMemcpyDeviceToDevice)
+                );
+            }
+            while (next(d, s, idx, dims, ret, *this));
+
+            CV_CUDEV_SAFE_CALL(cudaStreamSynchronize(0));
+        }
+    }
+
+    return ret;
+}
+
+GpuMatND GpuMatND::clone(Stream& stream) const
+{
+    CV_DbgAssert(!empty());
+
+    GpuMatND ret(size, type());
+
+    cudaStream_t _stream = StreamAccessor::getStream(stream);
+
+    if (isContinuous())
+    {
+        CV_CUDEV_SAFE_CALL(cudaMemcpyAsync(ret.getDevicePtr(), getDevicePtr(), ret.totalMemSize(), cudaMemcpyDeviceToDevice, _stream));
+    }
+    else
+    {
+        // 1D arrays are always continuous
+
+        if (dims == 2)
+        {
+            CV_CUDEV_SAFE_CALL(
+                cudaMemcpy2DAsync(ret.getDevicePtr(), ret.step[0], getDevicePtr(), step[0],
+                    size[1]*step[1], size[0], cudaMemcpyDeviceToDevice, _stream)
+            );
+        }
+        else
+        {
+            std::vector<int> idx(dims-2, 0);
+
+            uchar* d = ret.getDevicePtr();
+            const uchar* s = getDevicePtr();
+
+            // iterate over each 2D plane
+            do
+            {
+                CV_CUDEV_SAFE_CALL(
+                    cudaMemcpy2DAsync(
+                        d, ret.step[dims-2], s, step[dims-2],
+                        size[dims-1]*step[dims-1], size[dims-2],
+                        cudaMemcpyDeviceToDevice, _stream)
+                );
+            }
+            while (next(d, s, idx, dims, ret, *this));
+        }
+    }
+
+    return ret;
+}
+
+/////////////////////////////////////////////////////
+/// upload
+
+void GpuMatND::upload(InputArray src)
+{
+    Mat mat = src.getMat();
+
+    CV_DbgAssert(!mat.empty());
+
+    if (!mat.isContinuous())
+        mat = mat.clone();
+
+    SizeArray _size(mat.dims);
+    std::copy_n(mat.size.p, mat.dims, _size.data());
+
+    create(std::move(_size), mat.type());
+
+    CV_CUDEV_SAFE_CALL(cudaMemcpy(getDevicePtr(), mat.data, totalMemSize(), cudaMemcpyHostToDevice));
+}
+
+void GpuMatND::upload(InputArray src, Stream& stream)
+{
+    Mat mat = src.getMat();
+
+    CV_DbgAssert(!mat.empty());
+
+    if (!mat.isContinuous())
+        mat = mat.clone();
+
+    SizeArray _size(mat.dims);
+    std::copy_n(mat.size.p, mat.dims, _size.data());
+
+    create(std::move(_size), mat.type());
+
+    cudaStream_t _stream = StreamAccessor::getStream(stream);
+    CV_CUDEV_SAFE_CALL(cudaMemcpyAsync(getDevicePtr(), mat.data, totalMemSize(), cudaMemcpyHostToDevice, _stream));
+}
+
+/////////////////////////////////////////////////////
+/// download
+
+void GpuMatND::download(OutputArray dst) const
+{
+    CV_DbgAssert(!empty());
+
+    dst.create(dims, size.data(), type());
+    Mat mat = dst.getMat();
+
+    GpuMatND gmat = *this;
+
+    if (!gmat.isContinuous())
+        gmat = gmat.clone();
+
+    CV_CUDEV_SAFE_CALL(cudaMemcpy(mat.data, gmat.getDevicePtr(), mat.total() * mat.elemSize(), cudaMemcpyDeviceToHost));
+}
+
+void GpuMatND::download(OutputArray dst, Stream& stream) const
+{
+    CV_DbgAssert(!empty());
+
+    dst.create(dims, size.data(), type());
+    Mat mat = dst.getMat();
+
+    GpuMatND gmat = *this;
+
+    if (!gmat.isContinuous())
+        gmat = gmat.clone(stream);
+
+    cudaStream_t _stream = StreamAccessor::getStream(stream);
+    CV_CUDEV_SAFE_CALL(cudaMemcpyAsync(mat.data, gmat.getDevicePtr(), mat.total() * mat.elemSize(), cudaMemcpyDeviceToHost, _stream));
+}
+
+#endif
diff --git a/modules/core/src/cuda_gpu_mat_nd.cpp b/modules/core/src/cuda_gpu_mat_nd.cpp
new file mode 100644
index 0000000000..8440f179ea
--- /dev/null
+++ b/modules/core/src/cuda_gpu_mat_nd.cpp
@@ -0,0 +1,180 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::cuda;
+
+GpuMatND::~GpuMatND() = default;
+
+GpuMatND::GpuMatND(SizeArray _size, int _type, void* _data, StepArray _step) :
+    flags(0), dims(0), data(static_cast<uchar*>(_data)), offset(0)
+{
+    CV_Assert(_step.empty() || _size.size() == _step.size() + 1);
+
+    setFields(std::move(_size), _type, std::move(_step));
+}
+
+GpuMatND GpuMatND::operator()(const std::vector<Range>& ranges) const
+{
+    CV_Assert(dims == (int)ranges.size());
+
+    for (int i = 0; i < dims; ++i)
+    {
+        Range r = ranges[i];
+        CV_Assert(r == Range::all() || (0 <= r.start && r.start < r.end && r.end <= size[i]));
+    }
+
+    GpuMatND ret = *this;
+
+    for (int i = 0; i < dims; ++i)
+    {
+        Range r = ranges[i];
+        if (r != Range::all() && r != Range(0, ret.size[i]))
+        {
+            ret.offset += r.start * ret.step[i];
+            ret.size[i] = r.size();
+            ret.flags |= Mat::SUBMATRIX_FLAG;
+        }
+    }
+
+    ret.flags = cv::updateContinuityFlag(ret.flags, dims, ret.size.data(), ret.step.data());
+
+    return ret;
+}
+
+GpuMat GpuMatND::createGpuMatHeader(IndexArray idx, Range rowRange, Range colRange) const
+{
+    CV_Assert((int)idx.size() == dims - 2);
+
+    std::vector<Range> ranges;
+    for (int i : idx)
+        ranges.emplace_back(i, i+1);
+    ranges.push_back(rowRange);
+    ranges.push_back(colRange);
+
+    return (*this)(ranges).createGpuMatHeader();
+}
+
+GpuMat GpuMatND::createGpuMatHeader() const
+{
+    auto Effectively2D = [](GpuMatND m)
+    {
+        for (int i = 0; i < m.dims - 2; ++i)
+            if (m.size[i] > 1)
+                return false;
+        return true;
+    };
+    CV_Assert(Effectively2D(*this));
+
+    return GpuMat(size[dims-2], size[dims-1], type(), getDevicePtr(), step[dims-2]);
+}
+
+GpuMat GpuMatND::operator()(IndexArray idx, Range rowRange, Range colRange) const
+{
+    return createGpuMatHeader(idx, rowRange, colRange).clone();
+}
+
+GpuMatND::operator GpuMat() const
+{
+    return createGpuMatHeader().clone();
+}
+
+void GpuMatND::setFields(SizeArray _size, int _type, StepArray _step)
+{
+    _type &= Mat::TYPE_MASK;
+
+    flags = Mat::MAGIC_VAL + _type;
+    dims = static_cast<int>(_size.size());
+    size = std::move(_size);
+
+    if (_step.empty())
+    {
+        step = StepArray(dims);
+
+        step.back() = elemSize();
+        for (int _i = dims - 2; _i >= 0; --_i)
+        {
+            const size_t i = _i;
+            step[i] = step[i+1] * size[i+1];
+        }
+
+        flags |= Mat::CONTINUOUS_FLAG;
+    }
+    else
+    {
+        step = std::move(_step);
+        step.push_back(elemSize());
+
+        flags = cv::updateContinuityFlag(flags, dims, size.data(), step.data());
+    }
+
+    CV_Assert(size.size() == step.size());
+    CV_Assert(step.back() == elemSize());
+}
+
+#ifndef HAVE_CUDA
+
+GpuData::GpuData(const size_t _size)
+    : data(nullptr), size(0)
+{
+    CV_UNUSED(_size);
+    throw_no_cuda();
+}
+
+GpuData::~GpuData()
+{
+}
+
+void GpuMatND::create(SizeArray _size, int _type)
+{
+    CV_UNUSED(_size);
+    CV_UNUSED(_type);
+    throw_no_cuda();
+}
+
+void GpuMatND::release()
+{
+    throw_no_cuda();
+}
+
+GpuMatND GpuMatND::clone() const
+{
+    throw_no_cuda();
+}
+
+GpuMatND GpuMatND::clone(Stream& stream) const
+{
+    CV_UNUSED(stream);
+    throw_no_cuda();
+}
+
+void GpuMatND::upload(InputArray src)
+{
+    CV_UNUSED(src);
+    throw_no_cuda();
+}
+
+void GpuMatND::upload(InputArray src, Stream& stream)
+{
+    CV_UNUSED(src);
+    CV_UNUSED(stream);
+    throw_no_cuda();
+}
+
+void GpuMatND::download(OutputArray dst) const
+{
+    CV_UNUSED(dst);
+    throw_no_cuda();
+}
+
+void GpuMatND::download(OutputArray dst, Stream& stream) const
+{
+    CV_UNUSED(dst);
+    CV_UNUSED(stream);
+    throw_no_cuda();
+}
+
+#endif
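
Usage note (illustrative only, not part of the patch): the sketch below shows how the API introduced above fits together, assuming a CUDA-enabled build of OpenCV with this diff applied. All identifiers come from the diff itself; the shapes and values are arbitrary.

```cpp
#include <opencv2/core.hpp>
#include <opencv2/core/cuda.hpp>

using namespace cv;
using namespace cv::cuda;

int main()
{
    // A 3-D host array: 4 planes of 8x16, single-channel float, filled with 1.
    int dims[] = {4, 8, 16};
    Mat host(3, dims, CV_32FC1, Scalar(1));

    // upload() calls create() internally with the host shape and copies to the device.
    GpuMatND gmat;
    gmat.upload(host);

    // O(1) sub-matrix: planes 1..2, rows 2..5, all columns (no data copied).
    GpuMatND sub = gmat(std::vector<Range>{Range(1, 3), Range(2, 6), Range::all()});

    // Zero-copy 2D view of the first plane of `sub`. No reference counting is
    // performed, so the view must not outlive gmat's underlying allocation.
    GpuMat plane = sub.createGpuMatHeader({0}, Range::all(), Range::all());
    plane.setTo(Scalar(0)); // writes through the view into gmat's memory

    // clone() packs the non-continuous sub-matrix into continuous GPU memory.
    GpuMatND packed = sub.clone();

    // Download to the host: `out` becomes a 2x4x16 CV_32FC1 Mat.
    Mat out;
    packed.download(out);

    return 0;
}
```

Note the ownership split the sketch relies on: `createGpuMatHeader()` returns a non-owning view built from GpuMat's user-allocated-data constructor, while `clone()` and the `operator GpuMat()` conversion always produce an independent, continuous copy.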