From 539f367d0b682d69967dec674d74dfbe362dcebf Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Fri, 26 Apr 2013 12:39:02 +0400
Subject: [PATCH] refactored gpu::LUT function:

* converted it to Algorithm, because implementation uses inner buffers and
  requires preprocessing step
* new implementation splits preprocessing and transform,
  what is more effecient
* old API still can be used for source compatibility (marked as deprecated)
---
 .../gpuarithm/include/opencv2/gpuarithm.hpp   |  35 ++-
 modules/gpuarithm/perf/perf_core.cpp          |   8 +-
 modules/gpuarithm/src/core.cpp                | 263 +++++++++++++-----
 modules/gpuarithm/test/test_core.cpp          |   8 +-
 4 files changed, 234 insertions(+), 80 deletions(-)
diff --git a/modules/gpuarithm/include/opencv2/gpuarithm.hpp b/modules/gpuarithm/include/opencv2/gpuarithm.hpp
index 2bf60eff32..4272e15468 100644
--- a/modules/gpuarithm/include/opencv2/gpuarithm.hpp
+++ b/modules/gpuarithm/include/opencv2/gpuarithm.hpp
@@ -49,6 +49,17 @@
 
 #include "opencv2/core/gpu.hpp"
 
+#if defined __GNUC__
+    #define __OPENCV_GPUARITHM_DEPR_BEFORE__
+    #define __OPENCV_GPUARITHM_DEPR_AFTER__ __attribute__ ((deprecated))
+#elif (defined WIN32 || defined _WIN32)
+    #define __OPENCV_GPUARITHM_DEPR_BEFORE__ __declspec(deprecated)
+    #define __OPENCV_GPUARITHM_DEPR_AFTER__
+#else
+    #define __OPENCV_GPUARITHM_DEPR_BEFORE__
+    #define __OPENCV_GPUARITHM_DEPR_AFTER__
+#endif
+
 namespace cv { namespace gpu {
 
 //! adds one matrix to another (dst = src1 + src2)
@@ -178,14 +189,25 @@ CV_EXPORTS void transpose(InputArray src1, OutputArray dst, Stream& stream = Str
 //! supports 1, 3 and 4 channels images with CV_8U, CV_16U, CV_32S or CV_32F depth
 CV_EXPORTS void flip(InputArray src, OutputArray dst, int flipCode, Stream& stream = Stream::Null());
 
-//! implements generalized matrix product algorithm GEMM from BLAS
-CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha,
-    const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null());
-
 //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
 //! destination array will have the depth type as lut and the same channels number as source
 //! supports CV_8UC1, CV_8UC3 types
-CV_EXPORTS void LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& stream = Stream::Null());
+class CV_EXPORTS LookUpTable : public Algorithm
+{
+public:
+    virtual void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
+};
+CV_EXPORTS Ptr<LookUpTable> createLookUpTable(InputArray lut);
+
+__OPENCV_GPUARITHM_DEPR_BEFORE__ void LUT(InputArray src, InputArray lut, OutputArray dst, Stream& stream = Stream::Null()) __OPENCV_GPUARITHM_DEPR_AFTER__;
+inline void LUT(InputArray src, InputArray lut, OutputArray dst, Stream& stream)
+{
+    createLookUpTable(lut)->transform(src, dst, stream);
+}
+
+//! implements generalized matrix product algorithm GEMM from BLAS
+CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha,
+    const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null());
 
 //! scales and shifts array elements so that either the specified norm (alpha) or the minimum (alpha) and maximum (beta) array values get the specified values
 CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double alpha = 1, double beta = 0,
@@ -311,4 +333,7 @@ CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& resul
 
 }} // namespace cv { namespace gpu {
 
+#undef __OPENCV_GPUARITHM_DEPR_BEFORE__
+#undef __OPENCV_GPUARITHM_DEPR_AFTER__
+
 #endif /* __OPENCV_GPUARITHM_HPP__ */
diff --git a/modules/gpuarithm/perf/perf_core.cpp b/modules/gpuarithm/perf/perf_core.cpp
index eab6d87366..0add472ca3 100644
--- a/modules/gpuarithm/perf/perf_core.cpp
+++ b/modules/gpuarithm/perf/perf_core.cpp
@@ -224,10 +224,12 @@ PERF_TEST_P(Sz_Type, LutOneChannel,
 
     if (PERF_RUN_GPU())
     {
+        cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
+
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
 
-        TEST_CYCLE() cv::gpu::LUT(d_src, lut, dst);
+        TEST_CYCLE() lutAlg->transform(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -259,10 +261,12 @@ PERF_TEST_P(Sz_Type, LutMultiChannel,
 
     if (PERF_RUN_GPU())
     {
+        cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
+
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
 
-        TEST_CYCLE() cv::gpu::LUT(d_src, lut, dst);
+        TEST_CYCLE() lutAlg->transform(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
diff --git a/modules/gpuarithm/src/core.cpp b/modules/gpuarithm/src/core.cpp
index c8ef966e51..5dc1d4a5e5 100644
--- a/modules/gpuarithm/src/core.cpp
+++ b/modules/gpuarithm/src/core.cpp
@@ -57,7 +57,7 @@ void cv::gpu::transpose(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
 
 void cv::gpu::flip(InputArray, OutputArray, int, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<LookUpTable> cv::gpu::createLookUpTable(InputArray) { throw_no_cuda(); return Ptr<LookUpTable>(); }
 
 void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, int, const Scalar&, Stream&) { throw_no_cuda(); }
 
@@ -290,93 +290,214 @@ void cv::gpu::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& stre
 ////////////////////////////////////////////////////////////////////////
 // LUT
 
-void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
+#if (CUDA_VERSION >= 5000)
+
+namespace
 {
-    const int cn = src.channels();
-
-    CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
-    CV_Assert( lut.depth() == CV_8U );
-    CV_Assert( lut.channels() == 1 || lut.channels() == cn );
-    CV_Assert( lut.rows * lut.cols == 256 && lut.isContinuous() );
-
-    dst.create(src.size(), CV_MAKE_TYPE(lut.depth(), cn));
-
-    NppiSize sz;
-    sz.height = src.rows;
-    sz.width = src.cols;
-
-    Mat nppLut;
-    lut.convertTo(nppLut, CV_32S);
-
-    int nValues3[] = {256, 256, 256};
-
-    Npp32s pLevels[256];
-    for (int i = 0; i < 256; ++i)
-        pLevels[i] = i;
-
-    const Npp32s* pLevels3[3];
-
-#if (CUDA_VERSION <= 4020)
-    pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
-#else
-    GpuMat d_pLevels;
-    d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
-    pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
-#endif
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-    NppStreamHandler h(stream);
-
-    if (src.type() == CV_8UC1)
-    {
-#if (CUDA_VERSION <= 4020)
-        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
-#else
-        GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
-        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
-#endif
-    }
-    else
+    class LookUpTableImpl : public LookUpTable
     {
+    public:
+        LookUpTableImpl(InputArray lut);
+
+        void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        int lut_cn;
+
+        int nValues3[3];
         const Npp32s* pValues3[3];
+        const Npp32s* pLevels3[3];
 
-        Mat nppLut3[3];
-        if (nppLut.channels() == 1)
+        GpuMat d_pLevels;
+        GpuMat d_nppLut;
+        GpuMat d_nppLut3[3];
+    };
+
+    LookUpTableImpl::LookUpTableImpl(InputArray _lut)
+    {
+        nValues3[0] = nValues3[1] = nValues3[2] = 256;
+
+        Npp32s pLevels[256];
+        for (int i = 0; i < 256; ++i)
+            pLevels[i] = i;
+
+        d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
+        pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
+
+        GpuMat lut;
+        if (_lut.kind() == _InputArray::GPU_MAT)
+        {
+            lut = _lut.getGpuMat();
+        }
+        else
+        {
+            Mat hLut = _lut.getMat();
+            CV_Assert( hLut.total() == 256 && hLut.isContinuous() );
+            lut.upload(Mat(1, 256, hLut.type(), hLut.data));
+        }
+
+        lut_cn = lut.channels();
+
+        CV_Assert( lut.depth() == CV_8U );
+        CV_Assert( lut.rows == 1 && lut.cols == 256 );
+
+        lut.convertTo(d_nppLut, CV_32S);
+
+        if (lut_cn == 1)
         {
-#if (CUDA_VERSION <= 4020)
-            pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>();
-#else
-            GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
             pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr<Npp32s>();
-#endif
+        }
+        else
+        {
+            gpu::split(d_nppLut, d_nppLut3);
+
+            pValues3[0] = d_nppLut3[0].ptr<Npp32s>();
+            pValues3[1] = d_nppLut3[1].ptr<Npp32s>();
+            pValues3[2] = d_nppLut3[2].ptr<Npp32s>();
+        }
+    }
+
+    void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+
+        const int cn = src.channels();
+
+        CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
+        CV_Assert( lut_cn == 1 || lut_cn == cn );
+
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        NppStreamHandler h(stream);
+
+        NppiSize sz;
+        sz.height = src.rows;
+        sz.width = src.cols;
+
+        if (src.type() == CV_8UC1)
+        {
+            nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
+        }
+        else
+        {
+            nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
+        }
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+#else //  (CUDA_VERSION >= 5000)
+
+namespace
+{
+    class LookUpTableImpl : public LookUpTable
+    {
+    public:
+        LookUpTableImpl(InputArray lut);
+
+        void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        int lut_cn;
+
+        Npp32s pLevels[256];
+        int nValues3[3];
+        const Npp32s* pValues3[3];
+        const Npp32s* pLevels3[3];
+
+        Mat nppLut;
+        Mat nppLut3[3];
+    };
+
+    LookUpTableImpl::LookUpTableImpl(InputArray _lut)
+    {
+        nValues3[0] = nValues3[1] = nValues3[2] = 256;
+
+        for (int i = 0; i < 256; ++i)
+            pLevels[i] = i;
+        pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
+
+        Mat lut;
+        if (_lut.kind() == _InputArray::GPU_MAT)
+        {
+            lut = Mat(_lut.getGpuMat());
+        }
+        else
+        {
+            Mat hLut = _lut.getMat();
+            CV_Assert( hLut.total() == 256 && hLut.isContinuous() );
+            lut = hLut;
+        }
+
+        lut_cn = lut.channels();
+
+        CV_Assert( lut.depth() == CV_8U );
+        CV_Assert( lut.rows == 1 && lut.cols == 256 );
+
+        lut.convertTo(nppLut, CV_32S);
+
+        if (lut_cn == 1)
+        {
+            pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>();
         }
         else
         {
             cv::split(nppLut, nppLut3);
 
-#if (CUDA_VERSION <= 4020)
             pValues3[0] = nppLut3[0].ptr<Npp32s>();
             pValues3[1] = nppLut3[1].ptr<Npp32s>();
             pValues3[2] = nppLut3[2].ptr<Npp32s>();
-#else
-            GpuMat d_nppLut0(Mat(1, 256, CV_32S, nppLut3[0].data));
-            GpuMat d_nppLut1(Mat(1, 256, CV_32S, nppLut3[1].data));
-            GpuMat d_nppLut2(Mat(1, 256, CV_32S, nppLut3[2].data));
-
-            pValues3[0] = d_nppLut0.ptr<Npp32s>();
-            pValues3[1] = d_nppLut1.ptr<Npp32s>();
-            pValues3[2] = d_nppLut2.ptr<Npp32s>();
-#endif
         }
-
-        nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
     }
 
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
+    void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+
+        const int cn = src.channels();
+
+        CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
+        CV_Assert( lut_cn == 1 || lut_cn == cn );
+
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        NppStreamHandler h(stream);
+
+        NppiSize sz;
+        sz.height = src.rows;
+        sz.width = src.cols;
+
+        if (src.type() == CV_8UC1)
+        {
+            nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
+        }
+        else
+        {
+            nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
+        }
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+#endif //  (CUDA_VERSION >= 5000)
+
+Ptr<LookUpTable> cv::gpu::createLookUpTable(InputArray lut)
+{
+    return new LookUpTableImpl(lut);
 }
 
 ////////////////////////////////////////////////////////////////////////
diff --git a/modules/gpuarithm/test/test_core.cpp b/modules/gpuarithm/test/test_core.cpp
index 45f796dc59..d465aa4634 100644
--- a/modules/gpuarithm/test/test_core.cpp
+++ b/modules/gpuarithm/test/test_core.cpp
@@ -323,8 +323,10 @@ GPU_TEST_P(LUT, OneChannel)
     cv::Mat src = randomMat(size, type);
     cv::Mat lut = randomMat(cv::Size(256, 1), CV_8UC1);
 
+    cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
+
     cv::gpu::GpuMat dst = createMat(size, CV_MAKE_TYPE(lut.depth(), src.channels()));
-    cv::gpu::LUT(loadMat(src, useRoi), lut, dst);
+    lutAlg->transform(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::LUT(src, lut, dst_gold);
@@ -337,8 +339,10 @@ GPU_TEST_P(LUT, MultiChannel)
     cv::Mat src = randomMat(size, type);
     cv::Mat lut = randomMat(cv::Size(256, 1), CV_MAKE_TYPE(CV_8U, src.channels()));
 
+    cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
+
     cv::gpu::GpuMat dst = createMat(size, CV_MAKE_TYPE(lut.depth(), src.channels()), useRoi);
-    cv::gpu::LUT(loadMat(src, useRoi), lut, dst);
+    lutAlg->transform(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::LUT(src, lut, dst_gold);