From 21eb60f88bce4e9e4bedffcd28b29c75c404e788 Mon Sep 17 00:00:00 2001 From: Namgoo Lee Date: Tue, 12 Feb 2019 08:24:57 +0000 Subject: [PATCH] cudalegacy: Use safe block scan function --- .../src/cuda/NCVHaarObjectDetection.cu | 93 +------------- modules/cudalegacy/src/cuda/NPP_staging.cu | 116 +----------------- .../cudev/include/opencv2/cudev/warp/scan.hpp | 2 +- 3 files changed, 9 insertions(+), 202 deletions(-) diff --git a/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu b/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu index 1a32baa438..00b3fd297f 100644 --- a/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu +++ b/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu @@ -59,8 +59,7 @@ #include #include -#include "opencv2/core/cuda/warp.hpp" -#include "opencv2/core/cuda/warp_shuffle.hpp" +#include "opencv2/cudev.hpp" #include "opencv2/opencv_modules.hpp" @@ -77,92 +76,6 @@ #include "NCVAlg.hpp" -//============================================================================== -// -// BlockScan file -// -//============================================================================== - - -NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive - - -//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads() -//assuming size <= WARP_SIZE and size is power of 2 -__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data) -{ -#if __CUDA_ARCH__ >= 300 - const unsigned int laneId = cv::cuda::device::Warp::laneId(); - - // scan on shuffl functions - #pragma unroll - for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2) - { - const Ncv32u n = cv::cuda::device::shfl_up(idata, i); - if (laneId >= i) - idata += n; - } - - return idata; -#else - Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1)); - s_Data[pos] = 0; - pos += K_WARP_SIZE; - s_Data[pos] = idata; - - s_Data[pos] += s_Data[pos - 1]; - s_Data[pos] += s_Data[pos - 2]; - s_Data[pos] += s_Data[pos - 4]; - s_Data[pos] += s_Data[pos - 8]; - s_Data[pos] += s_Data[pos - 16]; - - return s_Data[pos]; -#endif -} - -__device__ __forceinline__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data) -{ - return warpScanInclusive(idata, s_Data) - idata; -} - -template -__device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data) -{ - if (tiNumScanThreads > K_WARP_SIZE) - { - //Bottom-level inclusive warp scan - Ncv32u warpResult = warpScanInclusive(idata, s_Data); - - //Save top elements of each warp for exclusive warp scan - //sync to wait for warp scans to complete (because s_Data is being overwritten) - __syncthreads(); - if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) ) - { - s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult; - } - - //wait for warp scans to complete - __syncthreads(); - - if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) ) - { - //grab top warp elements - Ncv32u val = s_Data[threadIdx.x]; - //calculate exclusive scan and write back to shared memory - s_Data[threadIdx.x] = warpScanExclusive(val, s_Data); - } - - //return updated warp scans with exclusive scan results - __syncthreads(); - return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE]; - } - else - { - return warpScanInclusive(idata, s_Data); - } -} - - //============================================================================== // // HaarClassifierCascade file @@ -260,11 +173,11 @@ __device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u { #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110 - __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL * 2]; + __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL]; __shared__ Ncv32u numPassed; __shared__ Ncv32u outMaskOffset; - Ncv32u incScan = scan1Inclusive(threadPassFlag, shmem); + Ncv32u incScan = cv::cudev::blockScanInclusive(threadPassFlag, shmem, threadIdx.x); __syncthreads(); if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1) diff --git a/modules/cudalegacy/src/cuda/NPP_staging.cu b/modules/cudalegacy/src/cuda/NPP_staging.cu index a96f44ff99..90880d56cc 100644 --- a/modules/cudalegacy/src/cuda/NPP_staging.cu +++ b/modules/cudalegacy/src/cuda/NPP_staging.cu @@ -45,8 +45,7 @@ #include #include -#include "opencv2/core/cuda/warp.hpp" -#include "opencv2/core/cuda/warp_shuffle.hpp" +#include "opencv2/cudev.hpp" #include "opencv2/cudalegacy/NPP_staging.hpp" @@ -81,111 +80,6 @@ cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream) } -//============================================================================== -// -// BlockScan.cuh -// -//============================================================================== - - -NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive - - -//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads() -//assuming size <= WARP_SIZE and size is power of 2 -template -inline __device__ T warpScanInclusive(T idata, volatile T *s_Data) -{ -#if __CUDA_ARCH__ >= 300 - const unsigned int laneId = cv::cuda::device::Warp::laneId(); - - // scan on shuffl functions - #pragma unroll - for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2) - { - const T n = cv::cuda::device::shfl_up(idata, i); - if (laneId >= i) - idata += n; - } - - return idata; -#else - Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1)); - s_Data[pos] = 0; - pos += K_WARP_SIZE; - s_Data[pos] = idata; - - s_Data[pos] += s_Data[pos - 1]; - s_Data[pos] += s_Data[pos - 2]; - s_Data[pos] += s_Data[pos - 4]; - s_Data[pos] += s_Data[pos - 8]; - s_Data[pos] += s_Data[pos - 16]; - - return s_Data[pos]; -#endif -} -inline __device__ Ncv64u warpScanInclusive(Ncv64u idata, volatile Ncv64u *s_Data) -{ - Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1)); - s_Data[pos] = 0; - pos += K_WARP_SIZE; - s_Data[pos] = idata; - - s_Data[pos] += s_Data[pos - 1]; - s_Data[pos] += s_Data[pos - 2]; - s_Data[pos] += s_Data[pos - 4]; - s_Data[pos] += s_Data[pos - 8]; - s_Data[pos] += s_Data[pos - 16]; - - return s_Data[pos]; -} - - -template -inline __device__ T warpScanExclusive(T idata, volatile T *s_Data) -{ - return warpScanInclusive(idata, s_Data) - idata; -} - - -template -inline __device__ T blockScanInclusive(T idata, volatile T *s_Data) -{ - if (tiNumScanThreads > K_WARP_SIZE) - { - //Bottom-level inclusive warp scan - T warpResult = warpScanInclusive(idata, s_Data); - - //Save top elements of each warp for exclusive warp scan - //sync to wait for warp scans to complete (because s_Data is being overwritten) - __syncthreads(); - if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) ) - { - s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult; - } - - //wait for warp scans to complete - __syncthreads(); - - if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) ) - { - //grab top warp elements - T val = s_Data[threadIdx.x]; - //calculate exclusive scan and write back to shared memory - s_Data[threadIdx.x] = warpScanExclusive(val, s_Data); - } - - //return updated warp scans with exclusive scan results - __syncthreads(); - return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE]; - } - else - { - return warpScanInclusive(idata, s_Data); - } -} - - //============================================================================== // // IntegralImage.cu @@ -280,7 +174,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u sr Ncv32u numBuckets = (srcWidth + NUM_SCAN_THREADS - 1) >> LOG2_NUM_SCAN_THREADS; Ncv32u offsetX = 0; - __shared__ T_out shmem[NUM_SCAN_THREADS * 2]; + __shared__ T_out shmem[NUM_SCAN_THREADS]; __shared__ T_out carryElem; carryElem = 0; __syncthreads(); @@ -301,7 +195,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u sr curElemMod = _scanElemOp::scanElemOp(curElem); //inclusive scan - curScanElem = blockScanInclusive(curElemMod, shmem); + curScanElem = cv::cudev::blockScanInclusive(curElemMod, shmem, threadIdx.x); if (curElemOffs <= srcWidth) { @@ -1290,7 +1184,7 @@ __global__ void removePass1Scan(Ncv32u *d_src, Ncv32u srcLen, return; } - __shared__ Ncv32u shmem[NUM_REMOVE_THREADS * 2]; + __shared__ Ncv32u shmem[NUM_REMOVE_THREADS]; Ncv32u scanElem = 0; if (elemAddrIn < srcLen) @@ -1305,7 +1199,7 @@ __global__ void removePass1Scan(Ncv32u *d_src, Ncv32u srcLen, } } - Ncv32u localScanInc = blockScanInclusive(scanElem, shmem); + Ncv32u localScanInc = cv::cudev::blockScanInclusive(scanElem, shmem, threadIdx.x); __syncthreads(); if (elemAddrIn < srcLen) diff --git a/modules/cudev/include/opencv2/cudev/warp/scan.hpp b/modules/cudev/include/opencv2/cudev/warp/scan.hpp index bab462973f..c0afb552a9 100644 --- a/modules/cudev/include/opencv2/cudev/warp/scan.hpp +++ b/modules/cudev/include/opencv2/cudev/warp/scan.hpp @@ -98,7 +98,7 @@ __device__ T warpScanInclusive(T data, volatile T* smem, uint tid) #pragma unroll for (int i = 1; i <= (WARP_SIZE / 2); i *= 2) { - const T val = shfl_up(data, i); + const T val = __shfl_up(data, i, WARP_SIZE); if (laneId >= i) data += val; }