From 9411cd6c07687539cc4a7d68f54a8fa573215fc0 Mon Sep 17 00:00:00 2001 From: Namgoo Lee Date: Tue, 21 Jul 2020 10:27:43 +0900 Subject: [PATCH] Use in-place npp function for inplace arguments --- modules/cudaarithm/src/core.cpp | 44 ++++++++++++++++++++++++++- modules/cudaarithm/test/test_core.cpp | 13 ++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/modules/cudaarithm/src/core.cpp b/modules/cudaarithm/src/core.cpp index 6d97e15dbb..ac01afc7f0 100644 --- a/modules/cudaarithm/src/core.cpp +++ b/modules/cudaarithm/src/core.cpp @@ -102,6 +102,34 @@ namespace cudaSafeCall( cudaDeviceSynchronize() ); } }; + + template struct NppMirrorIFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_t)(npp_t* pSrcDst, int nSrcDstStep, NppiSize oROI, NppiAxis flip); + }; + + template ::func_t func> struct NppMirrorI + { + typedef typename NppMirrorIFunc::npp_t npp_t; + + static void call(GpuMat& srcDst, int flipCode, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = srcDst.cols; + sz.height = srcDst.rows; + + nppSafeCall( func(srcDst.ptr(), static_cast(srcDst.step), + sz, + (flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; } void cv::cuda::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& stream) @@ -117,6 +145,17 @@ void cv::cuda::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& str {NppMirror::call, 0, NppMirror::call, NppMirror::call} }; + typedef void (*ifunc_t)(GpuMat& srcDst, int flipCode, cudaStream_t stream); + static const ifunc_t ifuncs[6][4] = + { + {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call}, + {0,0,0,0}, + {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call}, + {0,0,0,0}, + {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call}, + {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call} + }; + GpuMat src = getInputMat(_src, stream); CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F); @@ -125,7 +164,10 @@ void cv::cuda::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& str _dst.create(src.size(), src.type()); GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream); - funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream)); + if (src.refcount != dst.refcount) + funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream)); + else // in-place + ifuncs[src.depth()][src.channels() - 1](src, flipCode, StreamAccessor::getStream(stream)); syncOutput(dst, _dst, stream); } diff --git a/modules/cudaarithm/test/test_core.cpp b/modules/cudaarithm/test/test_core.cpp index 7e5762aa3f..bc8f3737e5 100644 --- a/modules/cudaarithm/test/test_core.cpp +++ b/modules/cudaarithm/test/test_core.cpp @@ -279,6 +279,19 @@ CUDA_TEST_P(Flip, Accuracy) EXPECT_MAT_NEAR(dst_gold, dst, 0.0); } +CUDA_TEST_P(Flip, AccuracyInplace) +{ + cv::Mat src = randomMat(size, type); + + cv::cuda::GpuMat srcDst = loadMat(src, useRoi); + cv::cuda::flip(srcDst, srcDst, flip_code); + + cv::Mat dst_gold; + cv::flip(src, dst_gold, flip_code); + + EXPECT_MAT_NEAR(dst_gold, srcDst, 0.0); +} + INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Flip, testing::Combine( ALL_DEVICES, DIFFERENT_SIZES,