From 539f367d0b682d69967dec674d74dfbe362dcebf Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Fri, 26 Apr 2013 12:39:02 +0400 Subject: [PATCH] refactored gpu::LUT function: * converted it to Algorithm, because implementation uses inner buffers and requires preprocessing step * new implementation splits preprocessing and transform, what is more effecient * old API still can be used for source compatibility (marked as deprecated) --- .../gpuarithm/include/opencv2/gpuarithm.hpp | 35 ++- modules/gpuarithm/perf/perf_core.cpp | 8 +- modules/gpuarithm/src/core.cpp | 263 +++++++++++++----- modules/gpuarithm/test/test_core.cpp | 8 +- 4 files changed, 234 insertions(+), 80 deletions(-) diff --git a/modules/gpuarithm/include/opencv2/gpuarithm.hpp b/modules/gpuarithm/include/opencv2/gpuarithm.hpp index 2bf60eff32..4272e15468 100644 --- a/modules/gpuarithm/include/opencv2/gpuarithm.hpp +++ b/modules/gpuarithm/include/opencv2/gpuarithm.hpp @@ -49,6 +49,17 @@ #include "opencv2/core/gpu.hpp" +#if defined __GNUC__ + #define __OPENCV_GPUARITHM_DEPR_BEFORE__ + #define __OPENCV_GPUARITHM_DEPR_AFTER__ __attribute__ ((deprecated)) +#elif (defined WIN32 || defined _WIN32) + #define __OPENCV_GPUARITHM_DEPR_BEFORE__ __declspec(deprecated) + #define __OPENCV_GPUARITHM_DEPR_AFTER__ +#else + #define __OPENCV_GPUARITHM_DEPR_BEFORE__ + #define __OPENCV_GPUARITHM_DEPR_AFTER__ +#endif + namespace cv { namespace gpu { //! adds one matrix to another (dst = src1 + src2) @@ -178,14 +189,25 @@ CV_EXPORTS void transpose(InputArray src1, OutputArray dst, Stream& stream = Str //! supports 1, 3 and 4 channels images with CV_8U, CV_16U, CV_32S or CV_32F depth CV_EXPORTS void flip(InputArray src, OutputArray dst, int flipCode, Stream& stream = Stream::Null()); -//! implements generalized matrix product algorithm GEMM from BLAS -CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha, - const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null()); - //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i)) //! destination array will have the depth type as lut and the same channels number as source //! supports CV_8UC1, CV_8UC3 types -CV_EXPORTS void LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& stream = Stream::Null()); +class CV_EXPORTS LookUpTable : public Algorithm +{ +public: + virtual void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0; +}; +CV_EXPORTS Ptr createLookUpTable(InputArray lut); + +__OPENCV_GPUARITHM_DEPR_BEFORE__ void LUT(InputArray src, InputArray lut, OutputArray dst, Stream& stream = Stream::Null()) __OPENCV_GPUARITHM_DEPR_AFTER__; +inline void LUT(InputArray src, InputArray lut, OutputArray dst, Stream& stream) +{ + createLookUpTable(lut)->transform(src, dst, stream); +} + +//! implements generalized matrix product algorithm GEMM from BLAS +CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha, + const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null()); //! scales and shifts array elements so that either the specified norm (alpha) or the minimum (alpha) and maximum (beta) array values get the specified values CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double alpha = 1, double beta = 0, @@ -311,4 +333,7 @@ CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& resul }} // namespace cv { namespace gpu { +#undef __OPENCV_GPUARITHM_DEPR_BEFORE__ +#undef __OPENCV_GPUARITHM_DEPR_AFTER__ + #endif /* __OPENCV_GPUARITHM_HPP__ */ diff --git a/modules/gpuarithm/perf/perf_core.cpp b/modules/gpuarithm/perf/perf_core.cpp index eab6d87366..0add472ca3 100644 --- a/modules/gpuarithm/perf/perf_core.cpp +++ b/modules/gpuarithm/perf/perf_core.cpp @@ -224,10 +224,12 @@ PERF_TEST_P(Sz_Type, LutOneChannel, if (PERF_RUN_GPU()) { + cv::Ptr lutAlg = cv::gpu::createLookUpTable(lut); + const cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat dst; - TEST_CYCLE() cv::gpu::LUT(d_src, lut, dst); + TEST_CYCLE() lutAlg->transform(d_src, dst); GPU_SANITY_CHECK(dst); } @@ -259,10 +261,12 @@ PERF_TEST_P(Sz_Type, LutMultiChannel, if (PERF_RUN_GPU()) { + cv::Ptr lutAlg = cv::gpu::createLookUpTable(lut); + const cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat dst; - TEST_CYCLE() cv::gpu::LUT(d_src, lut, dst); + TEST_CYCLE() lutAlg->transform(d_src, dst); GPU_SANITY_CHECK(dst); } diff --git a/modules/gpuarithm/src/core.cpp b/modules/gpuarithm/src/core.cpp index c8ef966e51..5dc1d4a5e5 100644 --- a/modules/gpuarithm/src/core.cpp +++ b/modules/gpuarithm/src/core.cpp @@ -57,7 +57,7 @@ void cv::gpu::transpose(InputArray, OutputArray, Stream&) { throw_no_cuda(); } void cv::gpu::flip(InputArray, OutputArray, int, Stream&) { throw_no_cuda(); } -void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); } +Ptr cv::gpu::createLookUpTable(InputArray) { throw_no_cuda(); return Ptr(); } void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, int, const Scalar&, Stream&) { throw_no_cuda(); } @@ -290,93 +290,214 @@ void cv::gpu::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& stre //////////////////////////////////////////////////////////////////////// // LUT -void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s) +#if (CUDA_VERSION >= 5000) + +namespace { - const int cn = src.channels(); - - CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 ); - CV_Assert( lut.depth() == CV_8U ); - CV_Assert( lut.channels() == 1 || lut.channels() == cn ); - CV_Assert( lut.rows * lut.cols == 256 && lut.isContinuous() ); - - dst.create(src.size(), CV_MAKE_TYPE(lut.depth(), cn)); - - NppiSize sz; - sz.height = src.rows; - sz.width = src.cols; - - Mat nppLut; - lut.convertTo(nppLut, CV_32S); - - int nValues3[] = {256, 256, 256}; - - Npp32s pLevels[256]; - for (int i = 0; i < 256; ++i) - pLevels[i] = i; - - const Npp32s* pLevels3[3]; - -#if (CUDA_VERSION <= 4020) - pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels; -#else - GpuMat d_pLevels; - d_pLevels.upload(Mat(1, 256, CV_32S, pLevels)); - pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr(); -#endif - - cudaStream_t stream = StreamAccessor::getStream(s); - NppStreamHandler h(stream); - - if (src.type() == CV_8UC1) - { -#if (CUDA_VERSION <= 4020) - nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, nppLut.ptr(), pLevels, 256) ); -#else - GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data)); - nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, d_nppLut.ptr(), d_pLevels.ptr(), 256) ); -#endif - } - else + class LookUpTableImpl : public LookUpTable { + public: + LookUpTableImpl(InputArray lut); + + void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()); + + private: + int lut_cn; + + int nValues3[3]; const Npp32s* pValues3[3]; + const Npp32s* pLevels3[3]; - Mat nppLut3[3]; - if (nppLut.channels() == 1) + GpuMat d_pLevels; + GpuMat d_nppLut; + GpuMat d_nppLut3[3]; + }; + + LookUpTableImpl::LookUpTableImpl(InputArray _lut) + { + nValues3[0] = nValues3[1] = nValues3[2] = 256; + + Npp32s pLevels[256]; + for (int i = 0; i < 256; ++i) + pLevels[i] = i; + + d_pLevels.upload(Mat(1, 256, CV_32S, pLevels)); + pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr(); + + GpuMat lut; + if (_lut.kind() == _InputArray::GPU_MAT) + { + lut = _lut.getGpuMat(); + } + else + { + Mat hLut = _lut.getMat(); + CV_Assert( hLut.total() == 256 && hLut.isContinuous() ); + lut.upload(Mat(1, 256, hLut.type(), hLut.data)); + } + + lut_cn = lut.channels(); + + CV_Assert( lut.depth() == CV_8U ); + CV_Assert( lut.rows == 1 && lut.cols == 256 ); + + lut.convertTo(d_nppLut, CV_32S); + + if (lut_cn == 1) { -#if (CUDA_VERSION <= 4020) - pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr(); -#else - GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data)); pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr(); -#endif + } + else + { + gpu::split(d_nppLut, d_nppLut3); + + pValues3[0] = d_nppLut3[0].ptr(); + pValues3[1] = d_nppLut3[1].ptr(); + pValues3[2] = d_nppLut3[2].ptr(); + } + } + + void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& _stream) + { + GpuMat src = _src.getGpuMat(); + + const int cn = src.channels(); + + CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 ); + CV_Assert( lut_cn == 1 || lut_cn == cn ); + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + NppStreamHandler h(stream); + + NppiSize sz; + sz.height = src.rows; + sz.width = src.cols; + + if (src.type() == CV_8UC1) + { + nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, d_nppLut.ptr(), d_pLevels.ptr(), 256) ); + } + else + { + nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, pValues3, pLevels3, nValues3) ); + } + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } +} + +#else // (CUDA_VERSION >= 5000) + +namespace +{ + class LookUpTableImpl : public LookUpTable + { + public: + LookUpTableImpl(InputArray lut); + + void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()); + + private: + int lut_cn; + + Npp32s pLevels[256]; + int nValues3[3]; + const Npp32s* pValues3[3]; + const Npp32s* pLevels3[3]; + + Mat nppLut; + Mat nppLut3[3]; + }; + + LookUpTableImpl::LookUpTableImpl(InputArray _lut) + { + nValues3[0] = nValues3[1] = nValues3[2] = 256; + + for (int i = 0; i < 256; ++i) + pLevels[i] = i; + pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels; + + Mat lut; + if (_lut.kind() == _InputArray::GPU_MAT) + { + lut = Mat(_lut.getGpuMat()); + } + else + { + Mat hLut = _lut.getMat(); + CV_Assert( hLut.total() == 256 && hLut.isContinuous() ); + lut = hLut; + } + + lut_cn = lut.channels(); + + CV_Assert( lut.depth() == CV_8U ); + CV_Assert( lut.rows == 1 && lut.cols == 256 ); + + lut.convertTo(nppLut, CV_32S); + + if (lut_cn == 1) + { + pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr(); } else { cv::split(nppLut, nppLut3); -#if (CUDA_VERSION <= 4020) pValues3[0] = nppLut3[0].ptr(); pValues3[1] = nppLut3[1].ptr(); pValues3[2] = nppLut3[2].ptr(); -#else - GpuMat d_nppLut0(Mat(1, 256, CV_32S, nppLut3[0].data)); - GpuMat d_nppLut1(Mat(1, 256, CV_32S, nppLut3[1].data)); - GpuMat d_nppLut2(Mat(1, 256, CV_32S, nppLut3[2].data)); - - pValues3[0] = d_nppLut0.ptr(); - pValues3[1] = d_nppLut1.ptr(); - pValues3[2] = d_nppLut2.ptr(); -#endif } - - nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, pValues3, pLevels3, nValues3) ); } - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); + void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& _stream) + { + GpuMat src = _src.getGpuMat(); + + const int cn = src.channels(); + + CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 ); + CV_Assert( lut_cn == 1 || lut_cn == cn ); + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + NppStreamHandler h(stream); + + NppiSize sz; + sz.height = src.rows; + sz.width = src.cols; + + if (src.type() == CV_8UC1) + { + nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, nppLut.ptr(), pLevels, 256) ); + } + else + { + nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, pValues3, pLevels3, nValues3) ); + } + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } +} + +#endif // (CUDA_VERSION >= 5000) + +Ptr cv::gpu::createLookUpTable(InputArray lut) +{ + return new LookUpTableImpl(lut); } //////////////////////////////////////////////////////////////////////// diff --git a/modules/gpuarithm/test/test_core.cpp b/modules/gpuarithm/test/test_core.cpp index 45f796dc59..d465aa4634 100644 --- a/modules/gpuarithm/test/test_core.cpp +++ b/modules/gpuarithm/test/test_core.cpp @@ -323,8 +323,10 @@ GPU_TEST_P(LUT, OneChannel) cv::Mat src = randomMat(size, type); cv::Mat lut = randomMat(cv::Size(256, 1), CV_8UC1); + cv::Ptr lutAlg = cv::gpu::createLookUpTable(lut); + cv::gpu::GpuMat dst = createMat(size, CV_MAKE_TYPE(lut.depth(), src.channels())); - cv::gpu::LUT(loadMat(src, useRoi), lut, dst); + lutAlg->transform(loadMat(src, useRoi), dst); cv::Mat dst_gold; cv::LUT(src, lut, dst_gold); @@ -337,8 +339,10 @@ GPU_TEST_P(LUT, MultiChannel) cv::Mat src = randomMat(size, type); cv::Mat lut = randomMat(cv::Size(256, 1), CV_MAKE_TYPE(CV_8U, src.channels())); + cv::Ptr lutAlg = cv::gpu::createLookUpTable(lut); + cv::gpu::GpuMat dst = createMat(size, CV_MAKE_TYPE(lut.depth(), src.channels()), useRoi); - cv::gpu::LUT(loadMat(src, useRoi), lut, dst); + lutAlg->transform(loadMat(src, useRoi), dst); cv::Mat dst_gold; cv::LUT(src, lut, dst_gold);