cudev - Rework some code

- Use shfl_down, instead of __shfl_down, on warp scan - Remove race conditions
2025-06-08 01:53:19 +08:00 · 2019-02-23 04:54:48 +00:00 · 2019-02-23 04:54:48 +00:00 · 2b6be3cb0f
commit 2b6be3cb0f
parent 4c94804bb0
4 changed files with 25 additions and 4 deletions
--- a/modules/cudev/include/opencv2/cudev/block/scan.hpp
+++ b/modules/cudev/include/opencv2/cudev/block/scan.hpp
@ -135,6 +135,12 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
            }
            else
            {
                // Read from smem[tid]              (T val = smem[tid])
                // and write to smem[tid + 1]       (smem[tid + 1] = warpScanInclusive(mask, val))
                // should be explicitly fenced by "__syncwarp" to get rid of
                // "cuda-memcheck --tool racecheck" warnings.
                __syncwarp(mask);
                // calculate inclusive scan and write back to shared memory with offset 1
                smem[tid + 1] = warpScanInclusive(mask, val);
@ -197,10 +203,18 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
        int quot = THREADS_NUM / WARP_SIZE;
        T val;
        if (tid < quot)
        {
            // grab top warp elements
-            T val = smem[tid];
+            val = smem[tid];
        }
        __syncthreads();
        if (tid < quot)
        {
            if (0 == (THREADS_NUM & (WARP_SIZE - 1)))
            {
--- a/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
@ -63,7 +63,8 @@ namespace integral_detail
        __shared__ D smem[NUM_SCAN_THREADS * 2];
        __shared__ D carryElem;
-        carryElem = 0;
+        if (threadIdx.x == 0)
            carryElem = 0;
        __syncthreads();
@ -105,7 +106,8 @@ namespace integral_detail
        __shared__ D smem[NUM_SCAN_THREADS * 2];
        __shared__ D carryElem;
-        carryElem = 0;
+        if (threadIdx.x == 0)
            carryElem = 0;
        __syncthreads();
--- a/modules/cudev/include/opencv2/cudev/warp/scan.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/scan.hpp
@ -98,7 +98,7 @@ __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
    #pragma unroll
    for (int i = 1; i <= (WARP_SIZE / 2); i *= 2)
    {
-        const T val = __shfl_up(data, i, WARP_SIZE);
+        const T val = shfl_up(data, i);
        if (laneId >= i)
              data += val;
    }
--- a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
@ -250,6 +250,11 @@ __device__ double shfl_up(double val, uint delta, int width = warpSize)
    return __hiloint2double(hi, lo);
 }
 __device__ __forceinline__ unsigned long long shfl_up(unsigned long long val, uint delta, int width = warpSize)
 {
    return __shfl_up(val, delta, width);
 }
 #define CV_CUDEV_SHFL_UP_VEC_INST(input_type) \
    __device__ __forceinline__ input_type ## 1 shfl_up(const input_type ## 1 & val, uint delta, int width = warpSize) \
    { \