__shfl_up_sync with mask for CUDA >= 9

* __shfl_up_sync with proper mask value for CUDA >= 9 * BlockScanInclusive for CUDA >= 9 * compatible_shfl_up for use in integral.hpp * Use CLAHE in cudev * Add tests for BlockScan
2025-07-24 14:06:27 +08:00 · 2019-01-21 15:31:05 +00:00 · 2019-01-21 15:31:05 +00:00 · 970293a229
commit 970293a229
parent 0395b2ea9c
6 changed files with 385 additions and 34 deletions
--- a/modules/cudaimgproc/src/cuda/clahe.cu
+++ b/modules/cudaimgproc/src/cuda/clahe.cu
@ -42,15 +42,9 @@

 #if !defined CUDA_DISABLER

-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/emulation.hpp"
-#include "opencv2/core/cuda/scan.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
+#include "opencv2/cudev.hpp"

-using namespace cv::cuda;
-using namespace cv::cuda::device;
+using namespace cv::cudev;

 namespace clahe
 {
@ -73,7 +67,7 @@ namespace clahe
            for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x)
            {
                const int data = srcPtr[j];
-                Emulation::smem::atomicAdd(&smem[data], 1);
+                ::atomicAdd(&smem[data], 1);
            }
        }

@ -96,7 +90,7 @@ namespace clahe

            // find number of overall clipped samples

-            reduce<256>(smem, clipped, tid, plus<int>());
+            blockReduce<256>(smem, clipped, tid, plus<int>());

            // broadcast evaluated value

@ -128,10 +122,10 @@ namespace clahe

        calcLutKernel<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale);

-        cudaSafeCall( cudaGetLastError() );
+        CV_CUDEV_SAFE_CALL( cudaGetLastError() );

        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
+            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }

    __global__ void transformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY)
@ -173,13 +167,13 @@ namespace clahe
        const dim3 block(32, 8);
        const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));

-        cudaSafeCall( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) );
+        CV_CUDEV_SAFE_CALL( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) );

        transformKernel<<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY);
-        cudaSafeCall( cudaGetLastError() );
+        CV_CUDEV_SAFE_CALL( cudaGetLastError() );

        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
+            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }
 }

--- a/modules/cudev/include/opencv2/cudev/block/scan.hpp
+++ b/modules/cudev/include/opencv2/cudev/block/scan.hpp
@ -48,12 +48,134 @@

 #include "../common.hpp"
 #include "../warp/scan.hpp"
+#include "../warp/warp.hpp"

 namespace cv { namespace cudev {

 //! @addtogroup cudev
 //! @{

+#if __CUDACC_VER_MAJOR__ >= 9
+
+// Usage Note
+// - THREADS_NUM should be equal to the number of threads in this block.
+// - smem must be able to contain at least n elements of type T, where n is equal to the number
+//   of warps in this block. The number can be calculated by divUp(THREADS_NUM, WARP_SIZE).
+//
+// Dev Note
+// - Starting from CUDA 9.0, support for Fermi is dropped. So CV_CUDEV_ARCH >= 300 is implied.
+// - "For Pascal and earlier architectures (CV_CUDEV_ARCH < 700), all threads in mask must execute
+//    the same warp intrinsic instruction in convergence, and the union of all values in mask must
+//    be equal to the warp's active mask."
+//   (https://docs.nvidia.com/cuda/archive/10.0/cuda-c-programming-guide#independent-thread-scheduling-7-x)
+// - Above restriction does not apply starting from Volta (CV_CUDEV_ARCH >= 700). We just need to
+//   take care so that "all non-exited threads named in mask must execute the same intrinsic with
+//   the same mask."
+//   (https://docs.nvidia.com/cuda/archive/10.0/cuda-c-programming-guide#warp-description)
+
+template <int THREADS_NUM, typename T>
+__device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
+{
+    const int residual = THREADS_NUM & (WARP_SIZE - 1);
+
+#if CV_CUDEV_ARCH < 700
+    const uint residual_mask = (1U << residual) - 1;
+#endif
+
+    if (THREADS_NUM > WARP_SIZE)
+    {
+        // bottom-level inclusive warp scan
+    #if CV_CUDEV_ARCH >= 700
+        T warpResult = warpScanInclusive(0xFFFFFFFFU, data);
+    #else
+        T warpResult;
+
+        if (0 == residual)
+            warpResult = warpScanInclusive(0xFFFFFFFFU, data);
+        else
+        {
+            const int n_warps = divUp(THREADS_NUM, WARP_SIZE);
+            const int warp_num = Warp::warpId();
+
+            if (warp_num < n_warps - 1)
+                warpResult = warpScanInclusive(0xFFFFFFFFU, data);
+            else
+            {
+                // We are at the last threads of a block whose number of threads
+                // is not a multiple of the warp size
+                warpResult = warpScanInclusive(residual_mask, data);
+            }
+        }
+    #endif
+
+        __syncthreads();
+
+        // save top elements of each warp for exclusive warp scan
+        // sync to wait for warp scans to complete (because smem is being overwritten)
+        if ((tid & (WARP_SIZE - 1)) == (WARP_SIZE - 1))
+        {
+            smem[tid >> LOG_WARP_SIZE] = warpResult;
+        }
+
+        __syncthreads();
+
+        int quot = THREADS_NUM / WARP_SIZE;
+
+        if (tid < quot)
+        {
+            // grab top warp elements
+            T val = smem[tid];
+
+            uint mask = (1LLU << quot) - 1;
+
+            if (0 == residual)
+            {
+                // calculate exclusive scan and write back to shared memory
+                smem[tid] = warpScanExclusive(mask, val);
+            }
+            else
+            {
+                // calculate inclusive scan and write back to shared memory with offset 1
+                smem[tid + 1] = warpScanInclusive(mask, val);
+
+                if (tid == 0)
+                    smem[0] = 0;
+            }
+        }
+
+        __syncthreads();
+
+        // return updated warp scans
+        return warpResult + smem[tid >> LOG_WARP_SIZE];
+    }
+    else
+    {
+    #if CV_CUDEV_ARCH >= 700
+        return warpScanInclusive(0xFFFFFFFFU, data);
+    #else
+        if (THREADS_NUM == WARP_SIZE)
+            return warpScanInclusive(0xFFFFFFFFU, data);
+        else
+            return warpScanInclusive(residual_mask, data);
+    #endif
+    }
+}
+
+template <int THREADS_NUM, typename T>
+__device__ __forceinline__ T blockScanExclusive(T data, volatile T* smem, uint tid)
+{
+    return blockScanInclusive<THREADS_NUM>(data, smem, tid) - data;
+}
+
+#else // __CUDACC_VER_MAJOR__ >= 9
+
+// Usage Note
+// - THREADS_NUM should be equal to the number of threads in this block.
+// - (>= Kepler) smem must be able to contain at least n elements of type T, where n is equal to the number
+//   of warps in this block. The number can be calculated by divUp(THREADS_NUM, WARP_SIZE).
+// - (Fermi) smem must be able to contain at least n elements of type T, where n is equal to the number
+//   of threads in this block (= THREADS_NUM).
+
 template <int THREADS_NUM, typename T>
 __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
 {
@ -73,18 +195,31 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)

        __syncthreads();

-        if (tid < (THREADS_NUM / WARP_SIZE))
+        int quot = THREADS_NUM / WARP_SIZE;
+
+        if (tid < quot)
        {
            // grab top warp elements
            T val = smem[tid];

-            // calculate exclusive scan and write back to shared memory
-            smem[tid] = warpScanExclusive(val, smem, tid);
+            if (0 == (THREADS_NUM & (WARP_SIZE - 1)))
+            {
+                // calculate exclusive scan and write back to shared memory
+                smem[tid] = warpScanExclusive(val, smem, tid);
+            }
+            else
+            {
+                // calculate inclusive scan and write back to shared memory with offset 1
+                smem[tid + 1] = warpScanInclusive(val, smem, tid);
+
+                if (tid == 0)
+                    smem[0] = 0;
+            }
        }

        __syncthreads();

-        // return updated warp scans with exclusive scan results
+        // return updated warp scans
        return warpResult + smem[tid >> LOG_WARP_SIZE];
    }
    else
@ -99,6 +234,8 @@ __device__ __forceinline__ T blockScanExclusive(T data, volatile T* smem, uint t
    return blockScanInclusive<THREADS_NUM>(data, smem, tid) - data;
 }

+#endif // __CUDACC_VER_MAJOR__ >= 9
+
 //! @}

 }}
--- a/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
@ -215,7 +215,7 @@ namespace integral_detail
        #pragma unroll
        for (int i = 1; i < 32; i *= 2)
        {
-            const int n = shfl_up(sum, i, 32);
+            const int n = compatible_shfl_up(sum, i, 32);

            if (lane_id >= i)
            {
@ -245,9 +245,9 @@ namespace integral_detail
            int warp_sum = sums[lane_id];

            #pragma unroll
-            for (int i = 1; i <= 32; i *= 2)
+            for (int i = 1; i < 32; i *= 2)
            {
-                const int n = shfl_up(warp_sum, i, 32);
+                const int n = compatible_shfl_up(warp_sum, i, 32);

                if (lane_id >= i)
                    warp_sum += n;
@ -453,7 +453,7 @@ namespace integral_detail

            for (int i = 1; i <= 8; i *= 2)
            {
-                T n = shfl_up(partial_sum, i, 32);
+                T n = compatible_shfl_up(partial_sum, i, 32);

                if (lane_id >= i)
                    partial_sum += n;
--- a/modules/cudev/include/opencv2/cudev/warp/scan.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/scan.hpp
@ -55,6 +55,36 @@ namespace cv { namespace cudev {
 //! @addtogroup cudev
 //! @{

+#if __CUDACC_VER_MAJOR__ >= 9
+
+// Starting from CUDA 9.0, support for Fermi is dropped.
+// So CV_CUDEV_ARCH >= 300 is implied.
+
+template <typename T>
+__device__ T warpScanInclusive(uint mask, T data)
+{
+    const uint laneId = Warp::laneId();
+
+    // scan on shufl functions
+    #pragma unroll
+    for (int i = 1; i <= (WARP_SIZE / 2); i *= 2)
+    {
+        const T val = shfl_up_sync(mask, data, i);
+        if (laneId >= i)
+              data += val;
+    }
+
+    return data;
+}
+
+template <typename T>
+__device__ __forceinline__ T warpScanExclusive(uint mask, T data)
+{
+    return warpScanInclusive(mask, data) - data;
+}
+
+#else // __CUDACC_VER_MAJOR__ >= 9
+
 template <typename T>
 __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
 {
@ -75,19 +105,16 @@ __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)

    return data;
 #else
-    uint pos = 2 * tid - (tid & (WARP_SIZE - 1));
-    smem[pos] = 0;
+    const uint laneId = Warp::laneId();

-    pos += WARP_SIZE;
-    smem[pos] = data;
+    smem[tid] = data;

-    smem[pos] += smem[pos - 1];
-    smem[pos] += smem[pos - 2];
-    smem[pos] += smem[pos - 4];
-    smem[pos] += smem[pos - 8];
-    smem[pos] += smem[pos - 16];
+    #pragma unroll
+    for (int i = 1; i <= (WARP_SIZE / 2); i *= 2)
+        if (laneId >= i)
+            smem[tid] += smem[tid - i];

-    return smem[pos];
+    return smem[tid];
 #endif
 }

@ -97,6 +124,8 @@ __device__ __forceinline__ T warpScanExclusive(T data, volatile T* smem, uint ti
    return warpScanInclusive(data, smem, tid) - data;
 }

+#endif // __CUDACC_VER_MAJOR__ >= 9
+
 //! @}

 }}
--- a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
@ -48,6 +48,8 @@

 #include "../common.hpp"
 #include "../util/vec_traits.hpp"
+#include "../block/block.hpp"
+#include "warp.hpp"

 namespace cv { namespace cudev {

@ -59,7 +61,7 @@ namespace cv { namespace cudev {
 #if __CUDACC_VER_MAJOR__ >= 9
 #  define __shfl(x, y, z) __shfl_sync(0xFFFFFFFFU, x, y, z)
 #  define __shfl_xor(x, y, z) __shfl_xor_sync(0xFFFFFFFFU, x, y, z)
-#  define __shfl_up(x, y, z) __shfl_up_sync(0xFFFFFFFFU, x, y, z)
+//#  define __shfl_up(x, y, z) __shfl_up_sync(0xFFFFFFFFU, x, y, z)
 #  define __shfl_down(x, y, z) __shfl_down_sync(0xFFFFFFFFU, x, y, z)
 #endif

@ -155,6 +157,53 @@ CV_CUDEV_SHFL_VEC_INST(double)

 // shfl_up

+template <typename T>
+__device__ __forceinline__ T compatible_shfl_up(T val, uint delta, int width = warpSize)
+{
+#if __CUDACC_VER_MAJOR__ < 9
+
+    return shfl_up(val, delta, width);
+
+#else // __CUDACC_VER_MAJOR__ < 9
+
+#if CV_CUDEV_ARCH >= 700
+    return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
+#else
+    const int block_size = Block::blockSize();
+    const int residual = block_size & (warpSize - 1);
+
+    if (0 == residual)
+        return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
+    else
+    {
+        const int n_warps = divUp(block_size, warpSize);
+        const int warp_id = Warp::warpId();
+
+        if (warp_id < n_warps - 1)
+            return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
+        else
+        {
+            // We are at the last threads of a block whose number of threads
+            // is not a multiple of the warp size
+            uint mask = (1LU << residual) - 1;
+            return shfl_up_sync(mask, val, delta, width);
+        }
+    }
+#endif
+
+#endif // __CUDACC_VER_MAJOR__ < 9
+}
+
+#if __CUDACC_VER_MAJOR__ >= 9
+
+template <typename T>
+__device__ __forceinline__ T shfl_up_sync(uint mask, T val, uint delta, int width = warpSize)
+{
+    return (T) __shfl_up_sync(mask, val, delta, width);
+}
+
+#else
+
 __device__ __forceinline__ uchar shfl_up(uchar val, uint delta, int width = warpSize)
 {
    return (uchar) __shfl_up((int) val, delta, width);
@ -244,6 +293,8 @@ CV_CUDEV_SHFL_UP_VEC_INST(double)

 #undef CV_CUDEV_SHFL_UP_VEC_INST

+#endif
+
 // shfl_down

 __device__ __forceinline__ uchar shfl_down(uchar val, uint delta, int width = warpSize)
--- a/modules/cudev/test/test_scan.cu
+++ b/modules/cudev/test/test_scan.cu
@ -0,0 +1,140 @@
+
+#include "test_precomp.hpp"
+
+using namespace cv;
+using namespace cv::cudev;
+using namespace cvtest;
+
+// BlockScanInt
+
+template <int THREADS_NUM>
+__global__ void int_kernel(int* data)
+{
+    uint tid = Block::threadLineId();
+
+#if CV_CUDEV_ARCH >= 300
+    const int n_warps = (THREADS_NUM - 1) / WARP_SIZE + 1;
+    __shared__ int smem[n_warps];
+#else
+    __shared__ int smem[THREADS_NUM];
+#endif
+
+    data[tid] = blockScanInclusive<THREADS_NUM>(data[tid], smem, tid);
+}
+
+#define BLOCK_SCAN_INT_TEST(block_size)                                 \
+    TEST(BlockScanInt, BlockSize##block_size)                           \
+    {                                                                   \
+        Mat src = randomMat(Size(block_size, 1), CV_32SC1, 0, 1024);    \
+                                                                        \
+        GpuMat d_src;                                                   \
+        d_src.upload(src);                                              \
+                                                                        \
+        for (int col = 1; col < block_size; col++)                      \
+            src.at<int>(0, col) += src.at<int>(0, col - 1);             \
+                                                                        \
+        int_kernel<block_size><<<1, block_size>>>((int*)d_src.data);    \
+                                                                        \
+        CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize());                    \
+                                                                        \
+        EXPECT_MAT_NEAR(d_src, src, 0);                                 \
+    }
+
+BLOCK_SCAN_INT_TEST(29)
+BLOCK_SCAN_INT_TEST(30)
+BLOCK_SCAN_INT_TEST(32)
+BLOCK_SCAN_INT_TEST(40)
+BLOCK_SCAN_INT_TEST(41)
+
+BLOCK_SCAN_INT_TEST(59)
+BLOCK_SCAN_INT_TEST(60)
+BLOCK_SCAN_INT_TEST(64)
+BLOCK_SCAN_INT_TEST(70)
+BLOCK_SCAN_INT_TEST(71)
+
+BLOCK_SCAN_INT_TEST(109)
+BLOCK_SCAN_INT_TEST(110)
+BLOCK_SCAN_INT_TEST(128)
+BLOCK_SCAN_INT_TEST(130)
+BLOCK_SCAN_INT_TEST(131)
+
+BLOCK_SCAN_INT_TEST(189)
+BLOCK_SCAN_INT_TEST(200)
+BLOCK_SCAN_INT_TEST(256)
+BLOCK_SCAN_INT_TEST(300)
+BLOCK_SCAN_INT_TEST(311)
+
+BLOCK_SCAN_INT_TEST(489)
+BLOCK_SCAN_INT_TEST(500)
+BLOCK_SCAN_INT_TEST(512)
+BLOCK_SCAN_INT_TEST(600)
+BLOCK_SCAN_INT_TEST(611)
+
+BLOCK_SCAN_INT_TEST(1024)
+
+// BlockScanDouble
+
+template <int THREADS_NUM>
+__global__ void double_kernel(double* data)
+{
+    uint tid = Block::threadLineId();
+
+#if CV_CUDEV_ARCH >= 300
+    const int n_warps = (THREADS_NUM - 1) / WARP_SIZE + 1;
+    __shared__ double smem[n_warps];
+#else
+    __shared__ double smem[THREADS_NUM];
+#endif
+
+    data[tid] = blockScanInclusive<THREADS_NUM>(data[tid], smem, tid);
+}
+
+#define BLOCK_SCAN_DOUBLE_TEST(block_size)                                  \
+    TEST(BlockScanDouble, BlockSize##block_size)                            \
+    {                                                                       \
+        Mat src = randomMat(Size(block_size, 1), CV_64FC1, 0.0, 1.0);       \
+                                                                            \
+        GpuMat d_src;                                                       \
+        d_src.upload(src);                                                  \
+                                                                            \
+        for (int col = 1; col < block_size; col++)                          \
+            src.at<double>(0, col) += src.at<double>(0, col - 1);           \
+                                                                            \
+        double_kernel<block_size><<<1, block_size>>>((double*)d_src.data);  \
+                                                                            \
+        CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize());                        \
+                                                                            \
+        EXPECT_MAT_NEAR(d_src, src, 1e-10);                                 \
+    }
+
+BLOCK_SCAN_DOUBLE_TEST(29)
+BLOCK_SCAN_DOUBLE_TEST(30)
+BLOCK_SCAN_DOUBLE_TEST(32)
+BLOCK_SCAN_DOUBLE_TEST(40)
+BLOCK_SCAN_DOUBLE_TEST(41)
+
+BLOCK_SCAN_DOUBLE_TEST(59)
+BLOCK_SCAN_DOUBLE_TEST(60)
+BLOCK_SCAN_DOUBLE_TEST(64)
+BLOCK_SCAN_DOUBLE_TEST(70)
+BLOCK_SCAN_DOUBLE_TEST(71)
+
+BLOCK_SCAN_DOUBLE_TEST(109)
+BLOCK_SCAN_DOUBLE_TEST(110)
+BLOCK_SCAN_DOUBLE_TEST(128)
+BLOCK_SCAN_DOUBLE_TEST(130)
+BLOCK_SCAN_DOUBLE_TEST(131)
+
+BLOCK_SCAN_DOUBLE_TEST(189)
+BLOCK_SCAN_DOUBLE_TEST(200)
+BLOCK_SCAN_DOUBLE_TEST(256)
+BLOCK_SCAN_DOUBLE_TEST(300)
+BLOCK_SCAN_DOUBLE_TEST(311)
+
+BLOCK_SCAN_DOUBLE_TEST(489)
+BLOCK_SCAN_DOUBLE_TEST(500)
+BLOCK_SCAN_DOUBLE_TEST(512)
+BLOCK_SCAN_DOUBLE_TEST(600)
+BLOCK_SCAN_DOUBLE_TEST(611)
+
+BLOCK_SCAN_DOUBLE_TEST(1024)