__shfl_up_sync with mask for CUDA >= 9

* __shfl_up_sync with proper mask value for CUDA >= 9 * BlockScanInclusive for CUDA >= 9 * compatible_shfl_up for use in integral.hpp * Use CLAHE in cudev * Add tests for BlockScan
2025-07-25 22:57:53 +08:00 · 2019-01-21 15:31:05 +00:00 · 2019-01-21 15:31:05 +00:00 · 970293a229
commit 970293a229
parent 0395b2ea9c
6 changed files with 385 additions and 34 deletions
--- a/modules/cudaimgproc/src/cuda/clahe.cu
+++ b/modules/cudaimgproc/src/cuda/clahe.cu
@ -42,15 +42,9 @@
 #if !defined CUDA_DISABLER
-#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/cudev.hpp"
 #include "opencv2/core/cuda/functional.hpp"
 #include "opencv2/core/cuda/emulation.hpp"
 #include "opencv2/core/cuda/scan.hpp"
 #include "opencv2/core/cuda/reduce.hpp"
 #include "opencv2/core/cuda/saturate_cast.hpp"
-using namespace cv::cuda;
+using namespace cv::cudev;
 using namespace cv::cuda::device;
 namespace clahe
 {
@ -73,7 +67,7 @@ namespace clahe
            for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x)
            {
                const int data = srcPtr[j];
-                Emulation::smem::atomicAdd(&smem[data], 1);
+                ::atomicAdd(&smem[data], 1);
            }
        }
@ -96,7 +90,7 @@ namespace clahe
            // find number of overall clipped samples
-            reduce<256>(smem, clipped, tid, plus<int>());
+            blockReduce<256>(smem, clipped, tid, plus<int>());
            // broadcast evaluated value
@ -128,10 +122,10 @@ namespace clahe
        calcLutKernel<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale);
-        cudaSafeCall( cudaGetLastError() );
+        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
+            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }
    __global__ void transformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY)
@ -173,13 +167,13 @@ namespace clahe
        const dim3 block(32, 8);
        const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
-        cudaSafeCall( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) );
+        CV_CUDEV_SAFE_CALL( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) );
        transformKernel<<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY);
-        cudaSafeCall( cudaGetLastError() );
+        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
+            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }
 }
--- a/modules/cudev/include/opencv2/cudev/block/scan.hpp
+++ b/modules/cudev/include/opencv2/cudev/block/scan.hpp
@ -48,12 +48,134 @@
 #include "../common.hpp"
 #include "../warp/scan.hpp"
 #include "../warp/warp.hpp"
 namespace cv { namespace cudev {
 //! @addtogroup cudev
 //! @{
 #if __CUDACC_VER_MAJOR__ >= 9
 // Usage Note
 // - THREADS_NUM should be equal to the number of threads in this block.
 // - smem must be able to contain at least n elements of type T, where n is equal to the number
 //   of warps in this block. The number can be calculated by divUp(THREADS_NUM, WARP_SIZE).
 //
 // Dev Note
 // - Starting from CUDA 9.0, support for Fermi is dropped. So CV_CUDEV_ARCH >= 300 is implied.
 // - "For Pascal and earlier architectures (CV_CUDEV_ARCH < 700), all threads in mask must execute
 //    the same warp intrinsic instruction in convergence, and the union of all values in mask must
 //    be equal to the warp's active mask."
 //   (https://docs.nvidia.com/cuda/archive/10.0/cuda-c-programming-guide#independent-thread-scheduling-7-x)
 // - Above restriction does not apply starting from Volta (CV_CUDEV_ARCH >= 700). We just need to
 //   take care so that "all non-exited threads named in mask must execute the same intrinsic with
 //   the same mask."
 //   (https://docs.nvidia.com/cuda/archive/10.0/cuda-c-programming-guide#warp-description)
 template <int THREADS_NUM, typename T>
 __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
 {
    const int residual = THREADS_NUM & (WARP_SIZE - 1);
 #if CV_CUDEV_ARCH < 700
    const uint residual_mask = (1U << residual) - 1;
 #endif
    if (THREADS_NUM > WARP_SIZE)
    {
        // bottom-level inclusive warp scan
    #if CV_CUDEV_ARCH >= 700
        T warpResult = warpScanInclusive(0xFFFFFFFFU, data);
    #else
        T warpResult;
        if (0 == residual)
            warpResult = warpScanInclusive(0xFFFFFFFFU, data);
        else
        {
            const int n_warps = divUp(THREADS_NUM, WARP_SIZE);
            const int warp_num = Warp::warpId();
            if (warp_num < n_warps - 1)
                warpResult = warpScanInclusive(0xFFFFFFFFU, data);
            else
            {
                // We are at the last threads of a block whose number of threads
                // is not a multiple of the warp size
                warpResult = warpScanInclusive(residual_mask, data);
            }
        }
    #endif
        __syncthreads();
        // save top elements of each warp for exclusive warp scan
        // sync to wait for warp scans to complete (because smem is being overwritten)
        if ((tid & (WARP_SIZE - 1)) == (WARP_SIZE - 1))
        {
            smem[tid >> LOG_WARP_SIZE] = warpResult;
        }
        __syncthreads();
        int quot = THREADS_NUM / WARP_SIZE;
        if (tid < quot)
        {
            // grab top warp elements
            T val = smem[tid];
            uint mask = (1LLU << quot) - 1;
            if (0 == residual)
            {
                // calculate exclusive scan and write back to shared memory
                smem[tid] = warpScanExclusive(mask, val);
            }
            else
            {
                // calculate inclusive scan and write back to shared memory with offset 1
                smem[tid + 1] = warpScanInclusive(mask, val);
                if (tid == 0)
                    smem[0] = 0;
            }
        }
        __syncthreads();
        // return updated warp scans
        return warpResult + smem[tid >> LOG_WARP_SIZE];
    }
    else
    {
    #if CV_CUDEV_ARCH >= 700
        return warpScanInclusive(0xFFFFFFFFU, data);
    #else
        if (THREADS_NUM == WARP_SIZE)
            return warpScanInclusive(0xFFFFFFFFU, data);
        else
            return warpScanInclusive(residual_mask, data);
    #endif
    }
 }
 template <int THREADS_NUM, typename T>
 __device__ __forceinline__ T blockScanExclusive(T data, volatile T* smem, uint tid)
 {
    return blockScanInclusive<THREADS_NUM>(data, smem, tid) - data;
 }
 #else // __CUDACC_VER_MAJOR__ >= 9
 // Usage Note
 // - THREADS_NUM should be equal to the number of threads in this block.
 // - (>= Kepler) smem must be able to contain at least n elements of type T, where n is equal to the number
 //   of warps in this block. The number can be calculated by divUp(THREADS_NUM, WARP_SIZE).
 // - (Fermi) smem must be able to contain at least n elements of type T, where n is equal to the number
 //   of threads in this block (= THREADS_NUM).
 template <int THREADS_NUM, typename T>
 __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
 {
@ -73,18 +195,31 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
        __syncthreads();
-        if (tid < (THREADS_NUM / WARP_SIZE))
+        int quot = THREADS_NUM / WARP_SIZE;
        if (tid < quot)
        {
            // grab top warp elements
            T val = smem[tid];
-            // calculate exclusive scan and write back to shared memory
+            if (0 == (THREADS_NUM & (WARP_SIZE - 1)))
-            smem[tid] = warpScanExclusive(val, smem, tid);
+            {
                // calculate exclusive scan and write back to shared memory
                smem[tid] = warpScanExclusive(val, smem, tid);
            }
            else
            {
                // calculate inclusive scan and write back to shared memory with offset 1
                smem[tid + 1] = warpScanInclusive(val, smem, tid);
                if (tid == 0)
                    smem[0] = 0;
            }
        }
        __syncthreads();
-        // return updated warp scans with exclusive scan results
+        // return updated warp scans
        return warpResult + smem[tid >> LOG_WARP_SIZE];
    }
    else
@ -99,6 +234,8 @@ __device__ __forceinline__ T blockScanExclusive(T data, volatile T* smem, uint t
    return blockScanInclusive<THREADS_NUM>(data, smem, tid) - data;
 }
 #endif // __CUDACC_VER_MAJOR__ >= 9
 //! @}
 }}
--- a/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
@ -215,7 +215,7 @@ namespace integral_detail
        #pragma unroll
        for (int i = 1; i < 32; i *= 2)
        {
-            const int n = shfl_up(sum, i, 32);
+            const int n = compatible_shfl_up(sum, i, 32);
            if (lane_id >= i)
            {
@ -245,9 +245,9 @@ namespace integral_detail
            int warp_sum = sums[lane_id];
            #pragma unroll
-            for (int i = 1; i <= 32; i *= 2)
+            for (int i = 1; i < 32; i *= 2)
            {
-                const int n = shfl_up(warp_sum, i, 32);
+                const int n = compatible_shfl_up(warp_sum, i, 32);
                if (lane_id >= i)
                    warp_sum += n;
@ -453,7 +453,7 @@ namespace integral_detail
            for (int i = 1; i <= 8; i *= 2)
            {
-                T n = shfl_up(partial_sum, i, 32);
+                T n = compatible_shfl_up(partial_sum, i, 32);
                if (lane_id >= i)
                    partial_sum += n;
--- a/modules/cudev/include/opencv2/cudev/warp/scan.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/scan.hpp
@ -55,6 +55,36 @@ namespace cv { namespace cudev {
 //! @addtogroup cudev
 //! @{
 #if __CUDACC_VER_MAJOR__ >= 9
 // Starting from CUDA 9.0, support for Fermi is dropped.
 // So CV_CUDEV_ARCH >= 300 is implied.
 template <typename T>
 __device__ T warpScanInclusive(uint mask, T data)
 {
    const uint laneId = Warp::laneId();
    // scan on shufl functions
    #pragma unroll
    for (int i = 1; i <= (WARP_SIZE / 2); i *= 2)
    {
        const T val = shfl_up_sync(mask, data, i);
        if (laneId >= i)
              data += val;
    }
    return data;
 }
 template <typename T>
 __device__ __forceinline__ T warpScanExclusive(uint mask, T data)
 {
    return warpScanInclusive(mask, data) - data;
 }
 #else // __CUDACC_VER_MAJOR__ >= 9
 template <typename T>
 __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
 {
@ -75,19 +105,16 @@ __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
    return data;
 #else
-    uint pos = 2 * tid - (tid & (WARP_SIZE - 1));
+    const uint laneId = Warp::laneId();
    smem[pos] = 0;
-    pos += WARP_SIZE;
+    smem[tid] = data;
    smem[pos] = data;
-    smem[pos] += smem[pos - 1];
+    #pragma unroll
-    smem[pos] += smem[pos - 2];
+    for (int i = 1; i <= (WARP_SIZE / 2); i *= 2)
-    smem[pos] += smem[pos - 4];
+        if (laneId >= i)
-    smem[pos] += smem[pos - 8];
+            smem[tid] += smem[tid - i];
    smem[pos] += smem[pos - 16];
-    return smem[pos];
+    return smem[tid];
 #endif
 }
@ -97,6 +124,8 @@ __device__ __forceinline__ T warpScanExclusive(T data, volatile T* smem, uint ti
    return warpScanInclusive(data, smem, tid) - data;
 }
 #endif // __CUDACC_VER_MAJOR__ >= 9
 //! @}
 }}
--- a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
@ -48,6 +48,8 @@
 #include "../common.hpp"
 #include "../util/vec_traits.hpp"
 #include "../block/block.hpp"
 #include "warp.hpp"
 namespace cv { namespace cudev {
@ -59,7 +61,7 @@ namespace cv { namespace cudev {
 #if __CUDACC_VER_MAJOR__ >= 9
 #  define __shfl(x, y, z) __shfl_sync(0xFFFFFFFFU, x, y, z)
 #  define __shfl_xor(x, y, z) __shfl_xor_sync(0xFFFFFFFFU, x, y, z)
-#  define __shfl_up(x, y, z) __shfl_up_sync(0xFFFFFFFFU, x, y, z)
+//#  define __shfl_up(x, y, z) __shfl_up_sync(0xFFFFFFFFU, x, y, z)
 #  define __shfl_down(x, y, z) __shfl_down_sync(0xFFFFFFFFU, x, y, z)
 #endif
@ -155,6 +157,53 @@ CV_CUDEV_SHFL_VEC_INST(double)
 // shfl_up
 template <typename T>
 __device__ __forceinline__ T compatible_shfl_up(T val, uint delta, int width = warpSize)
 {
 #if __CUDACC_VER_MAJOR__ < 9
    return shfl_up(val, delta, width);
 #else // __CUDACC_VER_MAJOR__ < 9
 #if CV_CUDEV_ARCH >= 700
    return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
 #else
    const int block_size = Block::blockSize();
    const int residual = block_size & (warpSize - 1);
    if (0 == residual)
        return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
    else
    {
        const int n_warps = divUp(block_size, warpSize);
        const int warp_id = Warp::warpId();
        if (warp_id < n_warps - 1)
            return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
        else
        {
            // We are at the last threads of a block whose number of threads
            // is not a multiple of the warp size
            uint mask = (1LU << residual) - 1;
            return shfl_up_sync(mask, val, delta, width);
        }
    }
 #endif
 #endif // __CUDACC_VER_MAJOR__ < 9
 }
 #if __CUDACC_VER_MAJOR__ >= 9
 template <typename T>
 __device__ __forceinline__ T shfl_up_sync(uint mask, T val, uint delta, int width = warpSize)
 {
    return (T) __shfl_up_sync(mask, val, delta, width);
 }
 #else
 __device__ __forceinline__ uchar shfl_up(uchar val, uint delta, int width = warpSize)
 {
    return (uchar) __shfl_up((int) val, delta, width);
@ -244,6 +293,8 @@ CV_CUDEV_SHFL_UP_VEC_INST(double)
 #undef CV_CUDEV_SHFL_UP_VEC_INST
 #endif
 // shfl_down
 __device__ __forceinline__ uchar shfl_down(uchar val, uint delta, int width = warpSize)
--- a/modules/cudev/test/test_scan.cu
+++ b/modules/cudev/test/test_scan.cu
@ -0,0 +1,140 @@
 #include "test_precomp.hpp"
 using namespace cv;
 using namespace cv::cudev;
 using namespace cvtest;
 // BlockScanInt
 template <int THREADS_NUM>
 __global__ void int_kernel(int* data)
 {
    uint tid = Block::threadLineId();
 #if CV_CUDEV_ARCH >= 300
    const int n_warps = (THREADS_NUM - 1) / WARP_SIZE + 1;
    __shared__ int smem[n_warps];
 #else
    __shared__ int smem[THREADS_NUM];
 #endif
    data[tid] = blockScanInclusive<THREADS_NUM>(data[tid], smem, tid);
 }
 #define BLOCK_SCAN_INT_TEST(block_size)                                 \
    TEST(BlockScanInt, BlockSize##block_size)                           \
    {                                                                   \
        Mat src = randomMat(Size(block_size, 1), CV_32SC1, 0, 1024);    \
                                                                        \
        GpuMat d_src;                                                   \
        d_src.upload(src);                                              \
                                                                        \
        for (int col = 1; col < block_size; col++)                      \
            src.at<int>(0, col) += src.at<int>(0, col - 1);             \
                                                                        \
        int_kernel<block_size><<<1, block_size>>>((int*)d_src.data);    \
                                                                        \
        CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize());                    \
                                                                        \
        EXPECT_MAT_NEAR(d_src, src, 0);                                 \
    }
 BLOCK_SCAN_INT_TEST(29)
 BLOCK_SCAN_INT_TEST(30)
 BLOCK_SCAN_INT_TEST(32)
 BLOCK_SCAN_INT_TEST(40)
 BLOCK_SCAN_INT_TEST(41)
 BLOCK_SCAN_INT_TEST(59)
 BLOCK_SCAN_INT_TEST(60)
 BLOCK_SCAN_INT_TEST(64)
 BLOCK_SCAN_INT_TEST(70)
 BLOCK_SCAN_INT_TEST(71)
 BLOCK_SCAN_INT_TEST(109)
 BLOCK_SCAN_INT_TEST(110)
 BLOCK_SCAN_INT_TEST(128)
 BLOCK_SCAN_INT_TEST(130)
 BLOCK_SCAN_INT_TEST(131)
 BLOCK_SCAN_INT_TEST(189)
 BLOCK_SCAN_INT_TEST(200)
 BLOCK_SCAN_INT_TEST(256)
 BLOCK_SCAN_INT_TEST(300)
 BLOCK_SCAN_INT_TEST(311)
 BLOCK_SCAN_INT_TEST(489)
 BLOCK_SCAN_INT_TEST(500)
 BLOCK_SCAN_INT_TEST(512)
 BLOCK_SCAN_INT_TEST(600)
 BLOCK_SCAN_INT_TEST(611)
 BLOCK_SCAN_INT_TEST(1024)
 // BlockScanDouble
 template <int THREADS_NUM>
 __global__ void double_kernel(double* data)
 {
    uint tid = Block::threadLineId();
 #if CV_CUDEV_ARCH >= 300
    const int n_warps = (THREADS_NUM - 1) / WARP_SIZE + 1;
    __shared__ double smem[n_warps];
 #else
    __shared__ double smem[THREADS_NUM];
 #endif
    data[tid] = blockScanInclusive<THREADS_NUM>(data[tid], smem, tid);
 }
 #define BLOCK_SCAN_DOUBLE_TEST(block_size)                                  \
    TEST(BlockScanDouble, BlockSize##block_size)                            \
    {                                                                       \
        Mat src = randomMat(Size(block_size, 1), CV_64FC1, 0.0, 1.0);       \
                                                                            \
        GpuMat d_src;                                                       \
        d_src.upload(src);                                                  \
                                                                            \
        for (int col = 1; col < block_size; col++)                          \
            src.at<double>(0, col) += src.at<double>(0, col - 1);           \
                                                                            \
        double_kernel<block_size><<<1, block_size>>>((double*)d_src.data);  \
                                                                            \
        CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize());                        \
                                                                            \
        EXPECT_MAT_NEAR(d_src, src, 1e-10);                                 \
    }
 BLOCK_SCAN_DOUBLE_TEST(29)
 BLOCK_SCAN_DOUBLE_TEST(30)
 BLOCK_SCAN_DOUBLE_TEST(32)
 BLOCK_SCAN_DOUBLE_TEST(40)
 BLOCK_SCAN_DOUBLE_TEST(41)
 BLOCK_SCAN_DOUBLE_TEST(59)
 BLOCK_SCAN_DOUBLE_TEST(60)
 BLOCK_SCAN_DOUBLE_TEST(64)
 BLOCK_SCAN_DOUBLE_TEST(70)
 BLOCK_SCAN_DOUBLE_TEST(71)
 BLOCK_SCAN_DOUBLE_TEST(109)
 BLOCK_SCAN_DOUBLE_TEST(110)
 BLOCK_SCAN_DOUBLE_TEST(128)
 BLOCK_SCAN_DOUBLE_TEST(130)
 BLOCK_SCAN_DOUBLE_TEST(131)
 BLOCK_SCAN_DOUBLE_TEST(189)
 BLOCK_SCAN_DOUBLE_TEST(200)
 BLOCK_SCAN_DOUBLE_TEST(256)
 BLOCK_SCAN_DOUBLE_TEST(300)
 BLOCK_SCAN_DOUBLE_TEST(311)
 BLOCK_SCAN_DOUBLE_TEST(489)
 BLOCK_SCAN_DOUBLE_TEST(500)
 BLOCK_SCAN_DOUBLE_TEST(512)
 BLOCK_SCAN_DOUBLE_TEST(600)
 BLOCK_SCAN_DOUBLE_TEST(611)
 BLOCK_SCAN_DOUBLE_TEST(1024)