diff --git a/modules/cudaimgproc/src/cuda/clahe.cu b/modules/cudaimgproc/src/cuda/clahe.cu
index 455aa20d98..b66a7d8a66 100644
--- a/modules/cudaimgproc/src/cuda/clahe.cu
+++ b/modules/cudaimgproc/src/cuda/clahe.cu
@@ -42,15 +42,9 @@
 
 #if !defined CUDA_DISABLER
 
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/emulation.hpp"
-#include "opencv2/core/cuda/scan.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
+#include "opencv2/cudev.hpp"
 
-using namespace cv::cuda;
-using namespace cv::cuda::device;
+using namespace cv::cudev;
 
 namespace clahe
 {
@@ -73,7 +67,7 @@
             for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x)
             {
                 const int data = srcPtr[j];
-                Emulation::smem::atomicAdd(&smem[data], 1);
+                ::atomicAdd(&smem[data], 1);
             }
         }
 
@@ -96,7 +90,7 @@
 
             // find number of overall clipped samples
-            reduce<256>(smem, clipped, tid, plus<int>());
+            blockReduce<256>(smem, clipped, tid, plus<int>());
 
             // broadcast evaluated value
 
@@ -128,10 +122,10 @@
 
         calcLutKernel<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale);
 
-        cudaSafeCall( cudaGetLastError() );
+        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
 
         if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
+            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
     }
 
     __global__ void transformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY)
@@ -173,13 +167,13 @@
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
 
-        cudaSafeCall( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) );
+        CV_CUDEV_SAFE_CALL( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) );
 
         transformKernel<<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY);
-        cudaSafeCall( cudaGetLastError() );
+        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
 
         if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
+            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
     }
 }
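
Note on the hunk above: the histogram accumulation now uses the plain `::atomicAdd` overload, which is native for 32-bit shared-memory operands on every architecture still supported by CUDA 9+, so the old `Emulation::smem` wrapper is no longer needed. The following standalone sketch shows the same per-block shared-memory histogram pattern in isolation; the kernel and its names are illustrative only, not part of this patch:

    // Sketch of the shared-memory histogram pattern used by calcLutKernel.
    // hist_kernel and its parameters are hypothetical names.
    __global__ void hist_kernel(const unsigned char* src, int n, unsigned int* hist)
    {
        __shared__ unsigned int smem[256];

        // zero the per-block histogram
        for (int i = threadIdx.x; i < 256; i += blockDim.x)
            smem[i] = 0;
        __syncthreads();

        // accumulate with a plain shared-memory atomic
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
            ::atomicAdd(&smem[src[i]], 1u);
        __syncthreads();

        // fold the per-block histogram into the global one
        for (int i = threadIdx.x; i < 256; i += blockDim.x)
            ::atomicAdd(&hist[i], smem[i]);
    }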
diff --git a/modules/cudev/include/opencv2/cudev/block/scan.hpp b/modules/cudev/include/opencv2/cudev/block/scan.hpp
index cd75a3e197..705f875a6e 100644
--- a/modules/cudev/include/opencv2/cudev/block/scan.hpp
+++ b/modules/cudev/include/opencv2/cudev/block/scan.hpp
@@ -48,12 +48,134 @@
 
 #include "../common.hpp"
 #include "../warp/scan.hpp"
+#include "../warp/warp.hpp"
 
 namespace cv { namespace cudev {
 
 //! @addtogroup cudev
 //! @{
 
+#if __CUDACC_VER_MAJOR__ >= 9
+
+// Usage Note
+// - THREADS_NUM should be equal to the number of threads in this block.
+// - smem must be able to contain at least n elements of type T, where n is equal to the number
+//   of warps in this block. The number can be calculated by divUp(THREADS_NUM, WARP_SIZE).
+//
+// Dev Note
+// - Starting from CUDA 9.0, support for Fermi is dropped. So CV_CUDEV_ARCH >= 300 is implied.
+// - "For Pascal and earlier architectures (CV_CUDEV_ARCH < 700), all threads in mask must execute
+//   the same warp intrinsic instruction in convergence, and the union of all values in mask must
+//   be equal to the warp's active mask."
+//   (https://docs.nvidia.com/cuda/archive/10.0/cuda-c-programming-guide#independent-thread-scheduling-7-x)
+// - The above restriction does not apply starting from Volta (CV_CUDEV_ARCH >= 700). We just need to
+//   take care so that "all non-exited threads named in mask must execute the same intrinsic with
+//   the same mask."
+//   (https://docs.nvidia.com/cuda/archive/10.0/cuda-c-programming-guide#warp-description)
+
+template <int THREADS_NUM, typename T>
+__device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
+{
+    const int residual = THREADS_NUM & (WARP_SIZE - 1);
+
+#if CV_CUDEV_ARCH < 700
+    const uint residual_mask = (1U << residual) - 1;
+#endif
+
+    if (THREADS_NUM > WARP_SIZE)
+    {
+        // bottom-level inclusive warp scan
+    #if CV_CUDEV_ARCH >= 700
+        T warpResult = warpScanInclusive(0xFFFFFFFFU, data);
+    #else
+        T warpResult;
+
+        if (0 == residual)
+            warpResult = warpScanInclusive(0xFFFFFFFFU, data);
+        else
+        {
+            const int n_warps = divUp(THREADS_NUM, WARP_SIZE);
+            const int warp_num = Warp::warpId();
+
+            if (warp_num < n_warps - 1)
+                warpResult = warpScanInclusive(0xFFFFFFFFU, data);
+            else
+            {
+                // We are at the last threads of a block whose number of threads
+                // is not a multiple of the warp size
+                warpResult = warpScanInclusive(residual_mask, data);
+            }
+        }
+    #endif
+
+        __syncthreads();
+
+        // save top elements of each warp for exclusive warp scan
+        // sync to wait for warp scans to complete (because smem is being overwritten)
+        if ((tid & (WARP_SIZE - 1)) == (WARP_SIZE - 1))
+        {
+            smem[tid >> LOG_WARP_SIZE] = warpResult;
+        }
+
+        __syncthreads();
+
+        int quot = THREADS_NUM / WARP_SIZE;
+
+        if (tid < quot)
+        {
+            // grab top warp elements
+            T val = smem[tid];
+
+            uint mask = (1LLU << quot) - 1;
+
+            if (0 == residual)
+            {
+                // calculate exclusive scan and write back to shared memory
+                smem[tid] = warpScanExclusive(mask, val);
+            }
+            else
+            {
+                // calculate inclusive scan and write back to shared memory with offset 1
+                smem[tid + 1] = warpScanInclusive(mask, val);
+
+                if (tid == 0)
+                    smem[0] = 0;
+            }
+        }
+
+        __syncthreads();
+
+        // return updated warp scans
+        return warpResult + smem[tid >> LOG_WARP_SIZE];
+    }
+    else
+    {
+    #if CV_CUDEV_ARCH >= 700
+        return warpScanInclusive(0xFFFFFFFFU, data);
+    #else
+        if (THREADS_NUM == WARP_SIZE)
+            return warpScanInclusive(0xFFFFFFFFU, data);
+        else
+            return warpScanInclusive(residual_mask, data);
+    #endif
+    }
+}
+
+template <int THREADS_NUM, typename T>
+__device__ __forceinline__ T blockScanExclusive(T data, volatile T* smem, uint tid)
+{
+    return blockScanInclusive<THREADS_NUM>(data, smem, tid) - data;
+}
+
+#else // __CUDACC_VER_MAJOR__ >= 9
+
+// Usage Note
+// - THREADS_NUM should be equal to the number of threads in this block.
+// - (>= Kepler) smem must be able to contain at least n elements of type T, where n is equal to the
+//   number of warps in this block. The number can be calculated by divUp(THREADS_NUM, WARP_SIZE).
+// - (Fermi) smem must be able to contain at least n elements of type T, where n is equal to the
+//   number of threads in this block (= THREADS_NUM).
+
 template <int THREADS_NUM, typename T>
 __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
 {
@@ -73,18 +195,31 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
 
     __syncthreads();
 
-    if (tid < (THREADS_NUM / WARP_SIZE))
+    int quot = THREADS_NUM / WARP_SIZE;
+
+    if (tid < quot)
     {
         // grab top warp elements
         T val = smem[tid];
 
-        // calculate exclusive scan and write back to shared memory
-        smem[tid] = warpScanExclusive(val, smem, tid);
+        if (0 == (THREADS_NUM & (WARP_SIZE - 1)))
+        {
+            // calculate exclusive scan and write back to shared memory
+            smem[tid] = warpScanExclusive(val, smem, tid);
+        }
+        else
+        {
+            // calculate inclusive scan and write back to shared memory with offset 1
+            smem[tid + 1] = warpScanInclusive(val, smem, tid);
+
+            if (tid == 0)
+                smem[0] = 0;
+        }
     }
 
     __syncthreads();
 
-    // return updated warp scans with exclusive scan results
+    // return updated warp scans
     return warpResult + smem[tid >> LOG_WARP_SIZE];
 }
 else
@@ -99,6 +234,8 @@ __device__ __forceinline__ T blockScanExclusive(T data, volatile T* smem, uint t
     return blockScanInclusive<THREADS_NUM>(data, smem, tid) - data;
 }
 
+#endif // __CUDACC_VER_MAJOR__ >= 9
+
 //! @}
 
 }}
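
A minimal caller for the new blockScanInclusive interface, following the Usage Note above: THREADS_NUM is the compile-time block size, and smem holds one element per warp on CUDA 9+ builds (a Fermi/pre-CUDA-9 build would need THREADS_NUM elements instead). The kernel below is an illustrative sketch, not part of this patch; it assumes `#include <opencv2/cudev.hpp>` and `using namespace cv::cudev`:

    // Sketch: inclusive scan of one int per thread within a single block.
    // divUp(THREADS_NUM, WARP_SIZE) shared slots suffice on the CUDA 9+ path.
    template <int THREADS_NUM>
    __global__ void scan_demo(int* data)
    {
        __shared__ int smem[(THREADS_NUM + WARP_SIZE - 1) / WARP_SIZE];

        const uint tid = Block::threadLineId();
        data[tid] = blockScanInclusive<THREADS_NUM>(data[tid], smem, tid);
    }

    // host side: scan_demo<160><<<1, 160>>>(d_data);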
diff --git a/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp b/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
index 7672aca71d..d1014b3ceb 100644
--- a/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
@@ -215,7 +215,7 @@
         #pragma unroll
         for (int i = 1; i < 32; i *= 2)
         {
-            const int n = shfl_up(sum, i, 32);
+            const int n = compatible_shfl_up(sum, i, 32);
 
             if (lane_id >= i)
             {
@@ -245,9 +245,9 @@
             int warp_sum = sums[lane_id];
 
             #pragma unroll
-            for (int i = 1; i <= 32; i *= 2)
+            for (int i = 1; i < 32; i *= 2)
             {
-                const int n = shfl_up(warp_sum, i, 32);
+                const int n = compatible_shfl_up(warp_sum, i, 32);
 
                 if (lane_id >= i)
                     warp_sum += n;
@@ -453,7 +453,7 @@
 
             for (int i = 1; i <= 8; i *= 2)
             {
-                T n = shfl_up(partial_sum, i, 32);
+                T n = compatible_shfl_up(partial_sum, i, 32);
 
                 if (lane_id >= i)
                     partial_sum += n;
diff --git a/modules/cudev/include/opencv2/cudev/warp/scan.hpp b/modules/cudev/include/opencv2/cudev/warp/scan.hpp
index a5007f881a..bab462973f 100644
--- a/modules/cudev/include/opencv2/cudev/warp/scan.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/scan.hpp
@@ -55,6 +55,36 @@ namespace cv { namespace cudev {
 //! @addtogroup cudev
 //! @{
 
+#if __CUDACC_VER_MAJOR__ >= 9
+
+// Starting from CUDA 9.0, support for Fermi is dropped.
+// So CV_CUDEV_ARCH >= 300 is implied.
+
+template <typename T>
+__device__ T warpScanInclusive(uint mask, T data)
+{
+    const uint laneId = Warp::laneId();
+
+    // scan using shuffle functions
+    #pragma unroll
+    for (int i = 1; i <= (WARP_SIZE / 2); i *= 2)
+    {
+        const T val = shfl_up_sync(mask, data, i);
+        if (laneId >= i)
+            data += val;
+    }
+
+    return data;
+}
+
+template <typename T>
+__device__ __forceinline__ T warpScanExclusive(uint mask, T data)
+{
+    return warpScanInclusive(mask, data) - data;
+}
+
+#else // __CUDACC_VER_MAJOR__ >= 9
+
 template <typename T>
 __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
 {
@@ -75,19 +105,16 @@ __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
 
     return data;
 #else
-    uint pos = 2 * tid - (tid & (WARP_SIZE - 1));
-    smem[pos] = 0;
+    const uint laneId = Warp::laneId();
 
-    pos += WARP_SIZE;
-    smem[pos] = data;
+    smem[tid] = data;
 
-    smem[pos] += smem[pos - 1];
-    smem[pos] += smem[pos - 2];
-    smem[pos] += smem[pos - 4];
-    smem[pos] += smem[pos - 8];
-    smem[pos] += smem[pos - 16];
+    #pragma unroll
+    for (int i = 1; i <= (WARP_SIZE / 2); i *= 2)
+        if (laneId >= i)
+            smem[tid] += smem[tid - i];
 
-    return smem[pos];
+    return smem[tid];
 #endif
 }
 
@@ -97,6 +124,8 @@ __device__ __forceinline__ T warpScanExclusive(T data, volatile T* smem, uint ti
     return warpScanInclusive(data, smem, tid) - data;
 }
 
+#endif // __CUDACC_VER_MAJOR__ >= 9
+
 //! @}
 
 }}
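
On pre-Volta parts, the mask passed to warpScanInclusive must equal the warp's active mask, so a block whose size is not a multiple of 32 has to hand its trailing partial warp a reduced mask. The sketch below mirrors the selection logic blockScanInclusive uses internally (the helper name is hypothetical, and this is the pre-Volta-safe choice; on Volta+ a full mask of resident lanes also works):

    // Sketch: pick the correct mask for a scan over the calling warp in a
    // block of THREADS_NUM threads, where THREADS_NUM % 32 may be non-zero.
    template <int THREADS_NUM, typename T>
    __device__ T scan_my_warp(T data)
    {
        const int residual = THREADS_NUM & (WARP_SIZE - 1);
        const int n_warps  = (THREADS_NUM + WARP_SIZE - 1) / WARP_SIZE;
        const bool in_partial_warp =
            (residual != 0) && (Warp::warpId() == n_warps - 1);

        // full warps use the full mask; the trailing partial warp names
        // only the lanes that actually exist
        const uint mask = in_partial_warp ? ((1U << residual) - 1) : 0xFFFFFFFFU;
        return warpScanInclusive(mask, data);
    }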
diff --git a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
index e776dd65df..dd142c6719 100644
--- a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
@@ -48,6 +48,8 @@
 
 #include "../common.hpp"
 #include "../util/vec_traits.hpp"
+#include "../block/block.hpp"
+#include "warp.hpp"
 
 namespace cv { namespace cudev {
 
@@ -59,7 +61,7 @@
 #if __CUDACC_VER_MAJOR__ >= 9
 # define __shfl(x, y, z) __shfl_sync(0xFFFFFFFFU, x, y, z)
 # define __shfl_xor(x, y, z) __shfl_xor_sync(0xFFFFFFFFU, x, y, z)
-# define __shfl_up(x, y, z) __shfl_up_sync(0xFFFFFFFFU, x, y, z)
+//# define __shfl_up(x, y, z) __shfl_up_sync(0xFFFFFFFFU, x, y, z)
 # define __shfl_down(x, y, z) __shfl_down_sync(0xFFFFFFFFU, x, y, z)
 #endif
 
@@ -155,6 +157,53 @@ CV_CUDEV_SHFL_VEC_INST(double)
 
 // shfl_up
 
+template <typename T>
+__device__ __forceinline__ T compatible_shfl_up(T val, uint delta, int width = warpSize)
+{
+#if __CUDACC_VER_MAJOR__ < 9
+
+    return shfl_up(val, delta, width);
+
+#else // __CUDACC_VER_MAJOR__ < 9
+
+#if CV_CUDEV_ARCH >= 700
+    return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
+#else
+    const int block_size = Block::blockSize();
+    const int residual = block_size & (warpSize - 1);
+
+    if (0 == residual)
+        return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
+    else
+    {
+        const int n_warps = divUp(block_size, warpSize);
+        const int warp_id = Warp::warpId();
+
+        if (warp_id < n_warps - 1)
+            return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
+        else
+        {
+            // We are at the last threads of a block whose number of threads
+            // is not a multiple of the warp size
+            uint mask = (1LU << residual) - 1;
+            return shfl_up_sync(mask, val, delta, width);
+        }
+    }
+#endif
+
+#endif // __CUDACC_VER_MAJOR__ < 9
+}
+
+#if __CUDACC_VER_MAJOR__ >= 9
+
+template <typename T>
+__device__ __forceinline__ T shfl_up_sync(uint mask, T val, uint delta, int width = warpSize)
+{
+    return (T) __shfl_up_sync(mask, val, delta, width);
+}
+
+#else
+
 __device__ __forceinline__ uchar shfl_up(uchar val, uint delta, int width = warpSize)
 {
     return (uchar) __shfl_up((int) val, delta, width);
@@ -244,6 +293,8 @@ CV_CUDEV_SHFL_UP_VEC_INST(double)
 
 #undef CV_CUDEV_SHFL_UP_VEC_INST
 
+#endif
+
 // shfl_down
 
 __device__ __forceinline__ uchar shfl_down(uchar val, uint delta, int width = warpSize)
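
compatible_shfl_up is what the integral.hpp hunks above switch to: it hides both the CUDA 8/9 API split and the reduced mask for a trailing partial warp. A typical caller is an in-warp prefix sum, e.g. the following illustrative helper (not part of this patch), written the same way the row-scan code in integral.hpp now uses it:

    // Sketch: warp-level inclusive prefix sum of one int per lane.
    __device__ int warp_prefix_sum(int sum, int lane_id)
    {
        #pragma unroll
        for (int i = 1; i < 32; i *= 2)
        {
            const int n = compatible_shfl_up(sum, i, 32);
            if (lane_id >= i)
                sum += n;
        }
        return sum;
    }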
diff --git a/modules/cudev/test/test_scan.cu b/modules/cudev/test/test_scan.cu
new file mode 100644
index 0000000000..e5404d8890
--- /dev/null
+++ b/modules/cudev/test/test_scan.cu
@@ -0,0 +1,140 @@
+
+#include "test_precomp.hpp"
+
+using namespace cv;
+using namespace cv::cudev;
+using namespace cvtest;
+
+// BlockScanInt
+
+template <int THREADS_NUM>
+__global__ void int_kernel(int* data)
+{
+    uint tid = Block::threadLineId();
+
+#if CV_CUDEV_ARCH >= 300
+    const int n_warps = (THREADS_NUM - 1) / WARP_SIZE + 1;
+    __shared__ int smem[n_warps];
+#else
+    __shared__ int smem[THREADS_NUM];
+#endif
+
+    data[tid] = blockScanInclusive<THREADS_NUM>(data[tid], smem, tid);
+}
+
+#define BLOCK_SCAN_INT_TEST(block_size) \
+    TEST(BlockScanInt, BlockSize##block_size) \
+    { \
+        Mat src = randomMat(Size(block_size, 1), CV_32SC1, 0, 1024); \
+        \
+        GpuMat d_src; \
+        d_src.upload(src); \
+        \
+        for (int col = 1; col < block_size; col++) \
+            src.at<int>(0, col) += src.at<int>(0, col - 1); \
+        \
+        int_kernel<block_size><<<1, block_size>>>((int*)d_src.data); \
+        \
+        CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize()); \
+        \
+        EXPECT_MAT_NEAR(d_src, src, 0); \
+    }
+
+BLOCK_SCAN_INT_TEST(29)
+BLOCK_SCAN_INT_TEST(30)
+BLOCK_SCAN_INT_TEST(32)
+BLOCK_SCAN_INT_TEST(40)
+BLOCK_SCAN_INT_TEST(41)
+
+BLOCK_SCAN_INT_TEST(59)
+BLOCK_SCAN_INT_TEST(60)
+BLOCK_SCAN_INT_TEST(64)
+BLOCK_SCAN_INT_TEST(70)
+BLOCK_SCAN_INT_TEST(71)
+
+BLOCK_SCAN_INT_TEST(109)
+BLOCK_SCAN_INT_TEST(110)
+BLOCK_SCAN_INT_TEST(128)
+BLOCK_SCAN_INT_TEST(130)
+BLOCK_SCAN_INT_TEST(131)
+
+BLOCK_SCAN_INT_TEST(189)
+BLOCK_SCAN_INT_TEST(200)
+BLOCK_SCAN_INT_TEST(256)
+BLOCK_SCAN_INT_TEST(300)
+BLOCK_SCAN_INT_TEST(311)
+
+BLOCK_SCAN_INT_TEST(489)
+BLOCK_SCAN_INT_TEST(500)
+BLOCK_SCAN_INT_TEST(512)
+BLOCK_SCAN_INT_TEST(600)
+BLOCK_SCAN_INT_TEST(611)
+
+BLOCK_SCAN_INT_TEST(1024)
+
+// BlockScanDouble
+
+template <int THREADS_NUM>
+__global__ void double_kernel(double* data)
+{
+    uint tid = Block::threadLineId();
+
+#if CV_CUDEV_ARCH >= 300
+    const int n_warps = (THREADS_NUM - 1) / WARP_SIZE + 1;
+    __shared__ double smem[n_warps];
+#else
+    __shared__ double smem[THREADS_NUM];
+#endif
+
+    data[tid] = blockScanInclusive<THREADS_NUM>(data[tid], smem, tid);
+}
+
+#define BLOCK_SCAN_DOUBLE_TEST(block_size) \
+    TEST(BlockScanDouble, BlockSize##block_size) \
+    { \
+        Mat src = randomMat(Size(block_size, 1), CV_64FC1, 0.0, 1.0); \
+        \
+        GpuMat d_src; \
+        d_src.upload(src); \
+        \
+        for (int col = 1; col < block_size; col++) \
+            src.at<double>(0, col) += src.at<double>(0, col - 1); \
+        \
+        double_kernel<block_size><<<1, block_size>>>((double*)d_src.data); \
+        \
+        CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize()); \
+        \
+        EXPECT_MAT_NEAR(d_src, src, 1e-10); \
+    }
+
+BLOCK_SCAN_DOUBLE_TEST(29)
+BLOCK_SCAN_DOUBLE_TEST(30)
+BLOCK_SCAN_DOUBLE_TEST(32)
+BLOCK_SCAN_DOUBLE_TEST(40)
+BLOCK_SCAN_DOUBLE_TEST(41)
+
+BLOCK_SCAN_DOUBLE_TEST(59)
+BLOCK_SCAN_DOUBLE_TEST(60)
+BLOCK_SCAN_DOUBLE_TEST(64)
+BLOCK_SCAN_DOUBLE_TEST(70)
+BLOCK_SCAN_DOUBLE_TEST(71)
+
+BLOCK_SCAN_DOUBLE_TEST(109)
+BLOCK_SCAN_DOUBLE_TEST(110)
+BLOCK_SCAN_DOUBLE_TEST(128)
+BLOCK_SCAN_DOUBLE_TEST(130)
+BLOCK_SCAN_DOUBLE_TEST(131)
+
+BLOCK_SCAN_DOUBLE_TEST(189)
+BLOCK_SCAN_DOUBLE_TEST(200)
+BLOCK_SCAN_DOUBLE_TEST(256)
+BLOCK_SCAN_DOUBLE_TEST(300)
+BLOCK_SCAN_DOUBLE_TEST(311)
+
+BLOCK_SCAN_DOUBLE_TEST(489)
+BLOCK_SCAN_DOUBLE_TEST(500)
+BLOCK_SCAN_DOUBLE_TEST(512)
+BLOCK_SCAN_DOUBLE_TEST(600)
+BLOCK_SCAN_DOUBLE_TEST(611)
+
+BLOCK_SCAN_DOUBLE_TEST(1024)