From fb8e652c3f20d377e9f935faee370ed28fb60122 Mon Sep 17 00:00:00 2001 From: Namgoo Lee Date: Tue, 5 Feb 2019 16:37:33 +0000 Subject: [PATCH] Add CV_16UC1 support for cuda::CLAHE Due to size limit of shared memory, histogram is built on the global memory for CV_16UC1 case. The amount of memory needed for building histogram is: 65536 * 4byte = 256KB and shared memory limit is 48KB typically. Added test cases for CV_16UC1 and various clip limits. Added perf tests for CV_16UC1 on both CPU and CUDA code. There was also a bug in CV_8UC1 case when redistributing "residual" clipped pixels. Adding the test case where clip limit is 5.0 exposes this bug. --- .../core/include/opencv2/core/cuda_types.hpp | 2 + modules/cudaimgproc/perf/perf_histogram.cpp | 8 +- modules/cudaimgproc/src/cuda/clahe.cu | 256 +++++++++++++++++- modules/cudaimgproc/src/histogram.cpp | 29 +- modules/cudaimgproc/test/test_histogram.cpp | 13 +- modules/imgproc/perf/perf_histogram.cpp | 8 +- 6 files changed, 285 insertions(+), 31 deletions(-) diff --git a/modules/core/include/opencv2/core/cuda_types.hpp b/modules/core/include/opencv2/core/cuda_types.hpp index e2647c0455..45dc2cad1c 100644 --- a/modules/core/include/opencv2/core/cuda_types.hpp +++ b/modules/core/include/opencv2/core/cuda_types.hpp @@ -127,10 +127,12 @@ namespace cv }; typedef PtrStepSz PtrStepSzb; + typedef PtrStepSz PtrStepSzus; typedef PtrStepSz PtrStepSzf; typedef PtrStepSz PtrStepSzi; typedef PtrStep PtrStepb; + typedef PtrStep PtrStepus; typedef PtrStep PtrStepf; typedef PtrStep PtrStepi; diff --git a/modules/cudaimgproc/perf/perf_histogram.cpp b/modules/cudaimgproc/perf/perf_histogram.cpp index 105411da9e..585f2b0006 100644 --- a/modules/cudaimgproc/perf/perf_histogram.cpp +++ b/modules/cudaimgproc/perf/perf_histogram.cpp @@ -183,16 +183,18 @@ PERF_TEST_P(Sz, EqualizeHist, ////////////////////////////////////////////////////////////////////// // CLAHE -DEF_PARAM_TEST(Sz_ClipLimit, cv::Size, double); +DEF_PARAM_TEST(Sz_ClipLimit, cv::Size, double, MatType); PERF_TEST_P(Sz_ClipLimit, CLAHE, Combine(CUDA_TYPICAL_MAT_SIZES, - Values(0.0, 40.0))) + Values(0.0, 40.0), + Values(MatType(CV_8UC1), MatType(CV_16UC1)))) { const cv::Size size = GET_PARAM(0); const double clipLimit = GET_PARAM(1); + const int type = GET_PARAM(2); - cv::Mat src(size, CV_8UC1); + cv::Mat src(size, type); declare.in(src, WARMUP_RNG); if (PERF_RUN_CUDA()) diff --git a/modules/cudaimgproc/src/cuda/clahe.cu b/modules/cudaimgproc/src/cuda/clahe.cu index b66a7d8a66..75663c51e2 100644 --- a/modules/cudaimgproc/src/cuda/clahe.cu +++ b/modules/cudaimgproc/src/cuda/clahe.cu @@ -48,11 +48,11 @@ using namespace cv::cudev; namespace clahe { - __global__ void calcLutKernel(const PtrStepb src, PtrStepb lut, - const int2 tileSize, const int tilesX, - const int clipLimit, const float lutScale) + __global__ void calcLutKernel_8U(const PtrStepb src, PtrStepb lut, + const int2 tileSize, const int tilesX, + const int clipLimit, const float lutScale) { - __shared__ int smem[512]; + __shared__ int smem[256]; const int tx = blockIdx.x; const int ty = blockIdx.y; @@ -95,18 +95,28 @@ namespace clahe // broadcast evaluated value __shared__ int totalClipped; + __shared__ int redistBatch; + __shared__ int residual; + __shared__ int rStep; if (tid == 0) + { totalClipped = clipped; + redistBatch = totalClipped / 256; + residual = totalClipped - redistBatch * 256; + + rStep = 1; + if (residual != 0) + rStep = 256 / residual; + } + __syncthreads(); // redistribute clipped samples evenly - int redistBatch = totalClipped / 256; tHistVal += redistBatch; - int residual = totalClipped - redistBatch * 256; - if (tid < residual) + if (residual && tid % rStep == 0 && tid / rStep < residual) ++tHistVal; } @@ -115,12 +125,212 @@ namespace clahe lut(ty * tilesX + tx, tid) = saturate_cast(__float2int_rn(lutScale * lutVal)); } - void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream) + __global__ void calcLutKernel_16U(const PtrStepus src, PtrStepus lut, + const int2 tileSize, const int tilesX, + const int clipLimit, const float lutScale, + PtrStepSzi hist) + { + #define histSize 65536 + #define blockSize 256 + + __shared__ int smem[blockSize]; + + const int tx = blockIdx.x; + const int ty = blockIdx.y; + const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x; + + const int histRow = ty * tilesX + tx; + + // build histogram + + for (int i = tid; i < histSize; i += blockSize) + hist(histRow, i) = 0; + + __syncthreads(); + + for (int i = threadIdx.y; i < tileSize.y; i += blockDim.y) + { + const ushort* srcPtr = src.ptr(ty * tileSize.y + i) + tx * tileSize.x; + for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x) + { + const int data = srcPtr[j]; + ::atomicAdd(&hist(histRow, data), 1); + } + } + + __syncthreads(); + + if (clipLimit > 0) + { + // clip histogram bar && + // find number of overall clipped samples + + __shared__ int partialSum[blockSize]; + + for (int i = tid; i < histSize; i += blockSize) + { + int histVal = hist(histRow, i); + + int clipped = 0; + if (histVal > clipLimit) + { + clipped = histVal - clipLimit; + hist(histRow, i) = clipLimit; + } + + // Following code block is in effect equivalent to: + // + // blockReduce(smem, clipped, tid, plus()); + // + { + for (int j = 16; j >= 1; j /= 2) + { + #if __CUDACC_VER_MAJOR__ >= 9 + int val = __shfl_down_sync(0xFFFFFFFFU, clipped, j); + #else + int val = __shfl_down(clipped, j); + #endif + clipped += val; + } + + if (tid % 32 == 0) + smem[tid / 32] = clipped; + + __syncthreads(); + + if (tid < 8) + { + clipped = smem[tid]; + + for (int j = 4; j >= 1; j /= 2) + { + #if __CUDACC_VER_MAJOR__ >= 9 + int val = __shfl_down_sync(0x000000FFU, clipped, j); + #else + int val = __shfl_down(clipped, j); + #endif + clipped += val; + } + } + } + // end of code block + + if (tid == 0) + partialSum[i / blockSize] = clipped; + + __syncthreads(); + } + + int partialSum_ = partialSum[tid]; + + // Following code block is in effect equivalent to: + // + // blockReduce(smem, partialSum_, tid, plus()); + // + { + for (int j = 16; j >= 1; j /= 2) + { + #if __CUDACC_VER_MAJOR__ >= 9 + int val = __shfl_down_sync(0xFFFFFFFFU, partialSum_, j); + #else + int val = __shfl_down(partialSum_, j); + #endif + partialSum_ += val; + } + + if (tid % 32 == 0) + smem[tid / 32] = partialSum_; + + __syncthreads(); + + if (tid < 8) + { + partialSum_ = smem[tid]; + + for (int j = 4; j >= 1; j /= 2) + { + #if __CUDACC_VER_MAJOR__ >= 9 + int val = __shfl_down_sync(0x000000FFU, partialSum_, j); + #else + int val = __shfl_down(partialSum_, j); + #endif + partialSum_ += val; + } + } + } + // end of code block + + // broadcast evaluated value && + // redistribute clipped samples evenly + + __shared__ int totalClipped; + __shared__ int redistBatch; + __shared__ int residual; + __shared__ int rStep; + + if (tid == 0) + { + totalClipped = partialSum_; + redistBatch = totalClipped / histSize; + residual = totalClipped - redistBatch * histSize; + + rStep = 1; + if (residual != 0) + rStep = histSize / residual; + } + + __syncthreads(); + + for (int i = tid; i < histSize; i += blockSize) + { + int histVal = hist(histRow, i); + + int equalized = histVal + redistBatch; + + if (residual && i % rStep == 0 && i / rStep < residual) + ++equalized; + + hist(histRow, i) = equalized; + } + } + + __shared__ int partialScan[blockSize]; + + for (int i = tid; i < histSize; i += blockSize) + { + int equalized = hist(histRow, i); + equalized = blockScanInclusive(equalized, smem, tid); + + if (tid == blockSize - 1) + partialScan[i / blockSize] = equalized; + + hist(histRow, i) = equalized; + } + + __syncthreads(); + + int partialScan_ = partialScan[tid]; + partialScan[tid] = blockScanExclusive(partialScan_, smem, tid); + + __syncthreads(); + + for (int i = tid; i < histSize; i += blockSize) + { + const int lutVal = hist(histRow, i) + partialScan[i / blockSize]; + + lut(histRow, i) = saturate_cast(__float2int_rn(lutScale * lutVal)); + } + + #undef histSize + #undef blockSize + } + + void calcLut_8U(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream) { const dim3 block(32, 8); const dim3 grid(tilesX, tilesY); - calcLutKernel<<>>(src, lut, tileSize, tilesX, clipLimit, lutScale); + calcLutKernel_8U<<>>(src, lut, tileSize, tilesX, clipLimit, lutScale); CV_CUDEV_SAFE_CALL( cudaGetLastError() ); @@ -128,7 +338,21 @@ namespace clahe CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); } - __global__ void transformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY) + void calcLut_16U(PtrStepSzus src, PtrStepus lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, PtrStepSzi hist, cudaStream_t stream) + { + const dim3 block(32, 8); + const dim3 grid(tilesX, tilesY); + + calcLutKernel_16U<<>>(src, lut, tileSize, tilesX, clipLimit, lutScale, hist); + + CV_CUDEV_SAFE_CALL( cudaGetLastError() ); + + if (stream == 0) + CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); + } + + template + __global__ void transformKernel(const PtrStepSz src, PtrStep dst, const PtrStep lut, const int2 tileSize, const int tilesX, const int tilesY) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -159,22 +383,26 @@ namespace clahe res += lut(ty2 * tilesX + tx1, srcVal) * ((1.0f - xa) * (ya)); res += lut(ty2 * tilesX + tx2, srcVal) * ((xa) * (ya)); - dst(y, x) = saturate_cast(res); + dst(y, x) = saturate_cast(res); } - void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream) + template + void transform(PtrStepSz src, PtrStepSz dst, PtrStep lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream) { const dim3 block(32, 8); const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); - CV_CUDEV_SAFE_CALL( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) ); + CV_CUDEV_SAFE_CALL( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) ); - transformKernel<<>>(src, dst, lut, tileSize, tilesX, tilesY); + transformKernel<<>>(src, dst, lut, tileSize, tilesX, tilesY); CV_CUDEV_SAFE_CALL( cudaGetLastError() ); if (stream == 0) CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); } + + template void transform(PtrStepSz src, PtrStepSz dst, PtrStep lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream); + template void transform(PtrStepSz src, PtrStepSz dst, PtrStep lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream); } #endif // CUDA_DISABLER diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp index 6e219b641b..e616c5a2e9 100644 --- a/modules/cudaimgproc/src/histogram.cpp +++ b/modules/cudaimgproc/src/histogram.cpp @@ -141,8 +141,9 @@ void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, Stream& _stream) namespace clahe { - void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream); - void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream); + void calcLut_8U(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream); + void calcLut_16U(PtrStepSzus src, PtrStepus lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, PtrStepSzi hist, cudaStream_t stream); + template void transform(PtrStepSz src, PtrStepSz dst, PtrStep lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream); } namespace @@ -170,6 +171,7 @@ namespace GpuMat srcExt_; GpuMat lut_; + GpuMat hist_; // histogram on global memory for CV_16UC1 case }; CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) : @@ -186,14 +188,16 @@ namespace { GpuMat src = _src.getGpuMat(); - CV_Assert( src.type() == CV_8UC1 ); + const int type = src.type(); - _dst.create( src.size(), src.type() ); + CV_Assert( type == CV_8UC1 || type == CV_16UC1 ); + + _dst.create( src.size(), type ); GpuMat dst = _dst.getGpuMat(); - const int histSize = 256; + const int histSize = type == CV_8UC1 ? 256 : 65536; - ensureSizeIsEnough(tilesX_ * tilesY_, histSize, CV_8UC1, lut_); + ensureSizeIsEnough(tilesX_ * tilesY_, histSize, type, lut_); cudaStream_t stream = StreamAccessor::getStream(s); @@ -227,9 +231,18 @@ namespace clipLimit = std::max(clipLimit, 1); } - clahe::calcLut(srcForLut, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), clipLimit, lutScale, stream); + if (type == CV_8UC1) + clahe::calcLut_8U(srcForLut, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), clipLimit, lutScale, stream); + else // type == CV_16UC1 + { + ensureSizeIsEnough(tilesX_ * tilesY_, histSize, CV_32SC1, hist_); + clahe::calcLut_16U(srcForLut, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), clipLimit, lutScale, hist_, stream); + } - clahe::transform(src, dst, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), stream); + if (type == CV_8UC1) + clahe::transform(src, dst, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), stream); + else // type == CV_16UC1 + clahe::transform(src, dst, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), stream); } void CLAHE_Impl::setClipLimit(double clipLimit) diff --git a/modules/cudaimgproc/test/test_histogram.cpp b/modules/cudaimgproc/test/test_histogram.cpp index eb084609e9..6af8eb2135 100644 --- a/modules/cudaimgproc/test/test_histogram.cpp +++ b/modules/cudaimgproc/test/test_histogram.cpp @@ -236,17 +236,19 @@ namespace IMPLEMENT_PARAM_CLASS(ClipLimit, double) } -PARAM_TEST_CASE(CLAHE, cv::cuda::DeviceInfo, cv::Size, ClipLimit) +PARAM_TEST_CASE(CLAHE, cv::cuda::DeviceInfo, cv::Size, ClipLimit, MatType) { cv::cuda::DeviceInfo devInfo; cv::Size size; double clipLimit; + int type; virtual void SetUp() { devInfo = GET_PARAM(0); size = GET_PARAM(1); clipLimit = GET_PARAM(2); + type = GET_PARAM(3); cv::cuda::setDevice(devInfo.deviceID()); } @@ -254,7 +256,11 @@ PARAM_TEST_CASE(CLAHE, cv::cuda::DeviceInfo, cv::Size, ClipLimit) CUDA_TEST_P(CLAHE, Accuracy) { - cv::Mat src = randomMat(size, CV_8UC1); + cv::Mat src; + if (type == CV_8UC1) + src = randomMat(size, type); + else if (type == CV_16UC1) + src = randomMat(size, type, 0, 65535); cv::Ptr clahe = cv::cuda::createCLAHE(clipLimit); cv::cuda::GpuMat dst; @@ -270,7 +276,8 @@ CUDA_TEST_P(CLAHE, Accuracy) INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CLAHE, testing::Combine( ALL_DEVICES, DIFFERENT_SIZES, - testing::Values(0.0, 40.0))); + testing::Values(0.0, 5.0, 10.0, 20.0, 40.0), + testing::Values(MatType(CV_8UC1), MatType(CV_16UC1)))); }} // namespace diff --git a/modules/imgproc/perf/perf_histogram.cpp b/modules/imgproc/perf/perf_histogram.cpp index 4f54e948bb..d80d8a6d51 100644 --- a/modules/imgproc/perf/perf_histogram.cpp +++ b/modules/imgproc/perf/perf_histogram.cpp @@ -141,18 +141,20 @@ PERF_TEST_P(Dim_Cmpmethod, compareHist, SANITY_CHECK_NOTHING(); } -typedef tuple Sz_ClipLimit_t; +typedef tuple Sz_ClipLimit_t; typedef TestBaseWithParam Sz_ClipLimit; PERF_TEST_P(Sz_ClipLimit, CLAHE, testing::Combine(testing::Values(::perf::szVGA, ::perf::sz720p, ::perf::sz1080p), - testing::Values(0.0, 40.0)) + testing::Values(0.0, 40.0), + testing::Values(MatType(CV_8UC1), MatType(CV_16UC1))) ) { const Size size = get<0>(GetParam()); const double clipLimit = get<1>(GetParam()); + const int type = get<2>(GetParam()); - Mat src(size, CV_8UC1); + Mat src(size, type); declare.in(src, WARMUP_RNG); Ptr clahe = createCLAHE(clipLimit);