used new device layer for cv::gpu::copyMakeBorder

2025-08-06 14:36:36 +08:00 · 2013-08-26 10:48:10 +04:00 · 2013-08-26 10:48:10 +04:00 · 3f62e78592
commit 3f62e78592
parent 7b3bbcea71
2 changed files with 95 additions and 180 deletions
--- a/modules/cudaarithm/src/core.cpp
+++ b/modules/cudaarithm/src/core.cpp
@ -343,113 +343,4 @@ Ptr<LookUpTable> cv::cuda::createLookUpTable(InputArray lut)
    return makePtr<LookUpTableImpl>(lut);
 }

-////////////////////////////////////////////////////////////////////////
-// copyMakeBorder
-
-namespace cv { namespace cuda { namespace device
-{
-    namespace imgproc
-    {
-        template <typename T, int cn> void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const T* borderValue, cudaStream_t stream);
-    }
-}}}
-
-namespace
-{
-    template <typename T, int cn> void copyMakeBorder_caller(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream)
-    {
-        using namespace ::cv::cuda::device::imgproc;
-
-        Scalar_<T> val(saturate_cast<T>(value[0]), saturate_cast<T>(value[1]), saturate_cast<T>(value[2]), saturate_cast<T>(value[3]));
-
-        copyMakeBorder_gpu<T, cn>(src, dst, top, left, borderType, val.val, stream);
-    }
-}
-
-#if defined __GNUC__ && __GNUC__ > 2 && __GNUC_MINOR__  > 4
-typedef Npp32s __attribute__((__may_alias__)) Npp32s_a;
-#else
-typedef Npp32s Npp32s_a;
-#endif
-
-void cv::cuda::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bottom, int left, int right, int borderType, Scalar value, Stream& _stream)
-{
-    GpuMat src = _src.getGpuMat();
-
-    CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
-    CV_Assert( borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );
-
-    _dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
-    GpuMat dst = _dst.getGpuMat();
-
-    cudaStream_t stream = StreamAccessor::getStream(_stream);
-
-    if (borderType == BORDER_CONSTANT && (src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1 || src.type() == CV_32FC1))
-    {
-        NppiSize srcsz;
-        srcsz.width  = src.cols;
-        srcsz.height = src.rows;
-
-        NppiSize dstsz;
-        dstsz.width  = dst.cols;
-        dstsz.height = dst.rows;
-
-        NppStreamHandler h(stream);
-
-        switch (src.type())
-        {
-        case CV_8UC1:
-            {
-                Npp8u nVal = saturate_cast<Npp8u>(value[0]);
-                nppSafeCall( nppiCopyConstBorder_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
-                    dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
-                break;
-            }
-        case CV_8UC4:
-            {
-                Npp8u nVal[] = {saturate_cast<Npp8u>(value[0]), saturate_cast<Npp8u>(value[1]), saturate_cast<Npp8u>(value[2]), saturate_cast<Npp8u>(value[3])};
-                nppSafeCall( nppiCopyConstBorder_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
-                    dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
-                break;
-            }
-        case CV_32SC1:
-            {
-                Npp32s nVal = saturate_cast<Npp32s>(value[0]);
-                nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
-                    dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
-                break;
-            }
-        case CV_32FC1:
-            {
-                Npp32f val = saturate_cast<Npp32f>(value[0]);
-                Npp32s nVal = *(reinterpret_cast<Npp32s_a*>(&val));
-                nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
-                    dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
-                break;
-            }
-        }
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-    else
-    {
-        typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream);
-        static const caller_t callers[6][4] =
-        {
-            {   copyMakeBorder_caller<uchar, 1>  ,    copyMakeBorder_caller<uchar, 2>   ,    copyMakeBorder_caller<uchar, 3>  ,    copyMakeBorder_caller<uchar, 4>},
-            {0/*copyMakeBorder_caller<schar, 1>*/, 0/*copyMakeBorder_caller<schar, 2>*/ , 0/*copyMakeBorder_caller<schar, 3>*/, 0/*copyMakeBorder_caller<schar, 4>*/},
-            {   copyMakeBorder_caller<ushort, 1> , 0/*copyMakeBorder_caller<ushort, 2>*/,    copyMakeBorder_caller<ushort, 3> ,    copyMakeBorder_caller<ushort, 4>},
-            {   copyMakeBorder_caller<short, 1>  , 0/*copyMakeBorder_caller<short, 2>*/ ,    copyMakeBorder_caller<short, 3>  ,    copyMakeBorder_caller<short, 4>},
-            {0/*copyMakeBorder_caller<int,   1>*/, 0/*copyMakeBorder_caller<int,   2>*/ , 0/*copyMakeBorder_caller<int,   3>*/, 0/*copyMakeBorder_caller<int  , 4>*/},
-            {   copyMakeBorder_caller<float, 1>  , 0/*copyMakeBorder_caller<float, 2>*/ ,    copyMakeBorder_caller<float, 3>  ,    copyMakeBorder_caller<float ,4>}
-        };
-
-        caller_t func = callers[src.depth()][src.channels() - 1];
-        CV_Assert(func != 0);
-
-        func(src, dst, top, left, borderType, value, stream);
-    }
-}
-
 #endif /* !defined (HAVE_CUDA) */
--- a/modules/cudaarithm/src/cuda/copy_make_border.cu
+++ b/modules/cudaarithm/src/cuda/copy_make_border.cu
@ -40,92 +40,116 @@
 //
 //M*/

-#if !defined CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"

-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
+#ifndef HAVE_OPENCV_CUDEV

-namespace cv { namespace cuda { namespace device
+#error "opencv_cudev is required"
+
+#else
+
+#include "opencv2/cudaarithm.hpp"
+#include "opencv2/cudev.hpp"
+
+using namespace cv::cudev;
+
+namespace
 {
-    namespace imgproc
+    struct ShiftMap
    {
-        template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, PtrStepSz<T> dst, int top, int left)
-        {
-            const int x = blockDim.x * blockIdx.x + threadIdx.x;
-            const int y = blockDim.y * blockIdx.y + threadIdx.y;
+        typedef int2 value_type;
+        typedef int index_type;

-            if (x < dst.cols && y < dst.rows)
-                dst.ptr(y)[x] = src(y - top, x - left);
+        int top;
+        int left;
+
+        __device__ __forceinline__ int2 operator ()(int y, int x) const
+        {
+            return make_int2(x - left, y - top);
        }
+    };

-        template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher
+    struct ShiftMapSz : ShiftMap
+    {
+        int rows, cols;
+    };
+}
+
+namespace cv { namespace cudev {
+
+template <> struct PtrTraits<ShiftMapSz> : PtrTraitsBase<ShiftMapSz, ShiftMap>
+{
+};
+
+}}
+
+namespace
+{
+    template <typename T, int cn>
+    void copyMakeBorderImpl(const GpuMat& src, GpuMat& dst, int top, int left, int borderMode, cv::Scalar borderValue, Stream& stream)
+    {
+        typedef typename MakeVec<T, cn>::type src_type;
+
+        cv::Scalar_<T> borderValue_ = borderValue;
+        const src_type brdVal = VecTraits<src_type>::make(borderValue_.val);
+
+        ShiftMapSz map;
+        map.top = top;
+        map.left = left;
+        map.rows = dst.rows;
+        map.cols = dst.cols;
+
+        switch (borderMode)
        {
-            static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, int top, int left,
-                const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)
-            {
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-                B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue));
-                BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd);
-
-                copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left);
-                cudaSafeCall( cudaGetLastError() );
-
-                if (stream == 0)
-                    cudaSafeCall( cudaDeviceSynchronize() );
-            }
+        case cv::BORDER_CONSTANT:
+            gridCopy(remapPtr(brdConstant(globPtr<src_type>(src), brdVal), map), globPtr<src_type>(dst), stream);
+            break;
+        case cv::BORDER_REPLICATE:
+            gridCopy(remapPtr(brdReplicate(globPtr<src_type>(src)), map), globPtr<src_type>(dst), stream);
+            break;
+        case cv::BORDER_REFLECT:
+            gridCopy(remapPtr(brdReflect(globPtr<src_type>(src)), map), globPtr<src_type>(dst), stream);
+            break;
+        case cv::BORDER_WRAP:
+            gridCopy(remapPtr(brdWrap(globPtr<src_type>(src)), map), globPtr<src_type>(dst), stream);
+            break;
+        case cv::BORDER_REFLECT_101:
+            gridCopy(remapPtr(brdReflect101(globPtr<src_type>(src)), map), globPtr<src_type>(dst), stream);
+            break;
        };
+    }
+}

-        template <typename T, int cn> void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode,
-            const T* borderValue, cudaStream_t stream)
-        {
-            typedef typename TypeVec<T, cn>::vec_type vec_type;
+void cv::cuda::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bottom, int left, int right, int borderType, Scalar value, Stream& stream)
+{
+    typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int top, int left, int borderMode, cv::Scalar borderValue, Stream& stream);
+    static const func_t funcs[6][4] =
+    {
+        {    copyMakeBorderImpl<uchar , 1>  ,     copyMakeBorderImpl<uchar , 2>  ,     copyMakeBorderImpl<uchar , 3>  ,     copyMakeBorderImpl<uchar , 4>  },
+        {0 /*copyMakeBorderImpl<schar , 1>*/, 0 /*copyMakeBorderImpl<schar , 2>*/, 0 /*copyMakeBorderImpl<schar , 3>*/, 0 /*copyMakeBorderImpl<schar , 4>*/},
+        {    copyMakeBorderImpl<ushort, 1>  , 0 /*copyMakeBorderImpl<ushort, 2>*/,     copyMakeBorderImpl<ushort, 3>  ,     copyMakeBorderImpl<ushort, 4>  },
+        {    copyMakeBorderImpl<short , 1>  , 0 /*copyMakeBorderImpl<short , 2>*/,     copyMakeBorderImpl<short , 3>  ,     copyMakeBorderImpl<short , 4>  },
+        {0 /*copyMakeBorderImpl<int   , 1>*/, 0 /*copyMakeBorderImpl<int   , 2>*/, 0 /*copyMakeBorderImpl<int   , 3>*/, 0 /*copyMakeBorderImpl<int   , 4>*/},
+        {    copyMakeBorderImpl<float , 1>  , 0 /*copyMakeBorderImpl<float , 2>*/,     copyMakeBorderImpl<float , 3>  ,     copyMakeBorderImpl<float  ,4>  }
+    };

-            typedef void (*caller_t)(const PtrStepSz<vec_type>& src, const PtrStepSz<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);
+    GpuMat src = _src.getGpuMat();

-            static const caller_t callers[5] =
-            {
-                CopyMakeBorderDispatcher<BrdConstant, vec_type>::call,
-                CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call,
-                CopyMakeBorderDispatcher<BrdReflect, vec_type>::call,
-                CopyMakeBorderDispatcher<BrdWrap, vec_type>::call,
-                CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call
-            };
+    const int depth = src.depth();
+    const int cn = src.channels();

-            callers[borderMode](PtrStepSz<vec_type>(src), PtrStepSz<vec_type>(dst), top, left, borderValue, stream);
-        }
+    CV_Assert( depth <= CV_32F && cn <= 4 );
+    CV_Assert( borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );

-        template void copyMakeBorder_gpu<uchar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<uchar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<uchar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<uchar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
+    _dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
+    GpuMat dst = _dst.getGpuMat();

-        //template void copyMakeBorder_gpu<schar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<schar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<schar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<schar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
+    const func_t func = funcs[depth][cn - 1];

-        template void copyMakeBorder_gpu<ushort, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<ushort, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<ushort, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<ushort, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
+    if (!func)
+        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");

-        template void copyMakeBorder_gpu<short, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<short, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<short, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<short, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
+    func(src, dst, top, left, borderType, value, stream);
+}

-        //template void copyMakeBorder_gpu<int, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<int, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<int, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<int, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
-
-        template void copyMakeBorder_gpu<float, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<float, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<float, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<float, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
-    } // namespace imgproc
-}}} // namespace cv { namespace cuda { namespace cudev
-
-#endif /* CUDA_DISABLER */
+#endif