cudalegacy: Use safe block scan function
commit 21eb60f88b (parent b17c1c6721)
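In short: the commit deletes the hand-rolled "BlockScan" helpers from the cudalegacy CUDA sources and routes every call site through cv::cudev::blockScanInclusive, which also lets each shared-memory scratch buffer shrink from twice the thread count to exactly the thread count. The call-site pattern, taken directly from the hunks below:

// Before: local scan helper, scratch sized 2x the block
__shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL * 2];
Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem);

// After: cudev block scan, scratch sized 1x the block
__shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL];
Ncv32u incScan = cv::cudev::blockScanInclusive<NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem, threadIdx.x);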
@@ -59,8 +59,7 @@
 #include <algorithm>
 #include <cstdio>
 
-#include "opencv2/core/cuda/warp.hpp"
-#include "opencv2/core/cuda/warp_shuffle.hpp"
+#include "opencv2/cudev.hpp"
 
 #include "opencv2/opencv_modules.hpp"
 
@@ -77,92 +76,6 @@
 #include "NCVAlg.hpp"
 
 
-//==============================================================================
-//
-// BlockScan file
-//
-//==============================================================================
-
-
-NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive
-
-
-//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
-//assuming size <= WARP_SIZE and size is power of 2
-__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data)
-{
-#if __CUDA_ARCH__ >= 300
-    const unsigned int laneId = cv::cuda::device::Warp::laneId();
-
-    // scan on shuffl functions
-    #pragma unroll
-    for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
-    {
-        const Ncv32u n = cv::cuda::device::shfl_up(idata, i);
-        if (laneId >= i)
-            idata += n;
-    }
-
-    return idata;
-#else
-    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
-    s_Data[pos] = 0;
-    pos += K_WARP_SIZE;
-    s_Data[pos] = idata;
-
-    s_Data[pos] += s_Data[pos - 1];
-    s_Data[pos] += s_Data[pos - 2];
-    s_Data[pos] += s_Data[pos - 4];
-    s_Data[pos] += s_Data[pos - 8];
-    s_Data[pos] += s_Data[pos - 16];
-
-    return s_Data[pos];
-#endif
-}
-
-__device__ __forceinline__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)
-{
-    return warpScanInclusive(idata, s_Data) - idata;
-}
-
-template <Ncv32u tiNumScanThreads>
-__device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data)
-{
-    if (tiNumScanThreads > K_WARP_SIZE)
-    {
-        //Bottom-level inclusive warp scan
-        Ncv32u warpResult = warpScanInclusive(idata, s_Data);
-
-        //Save top elements of each warp for exclusive warp scan
-        //sync to wait for warp scans to complete (because s_Data is being overwritten)
-        __syncthreads();
-        if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
-        {
-            s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
-        }
-
-        //wait for warp scans to complete
-        __syncthreads();
-
-        if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
-        {
-            //grab top warp elements
-            Ncv32u val = s_Data[threadIdx.x];
-            //calculate exclusive scan and write back to shared memory
-            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
-        }
-
-        //return updated warp scans with exclusive scan results
-        __syncthreads();
-        return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];
-    }
-    else
-    {
-        return warpScanInclusive(idata, s_Data);
-    }
-}
-
-
 //==============================================================================
 //
 // HaarClassifierCascade file
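The deleted helpers are classic warp-synchronous code: both the shuffle loop and the shared-memory fallback assume all 32 lanes of a warp run in lockstep with no explicit synchronization, an assumption that no longer holds on Volta-class GPUs with independent thread scheduling, which is presumably what "safe" in the commit title refers to. For comparison only (this is not code the commit adds), a minimal sketch of the same warp scan written with the explicit-mask CUDA 9+ intrinsic:

// Sketch of a sync-safe inclusive warp scan, assuming a full 32-lane warp.
__device__ unsigned int warpScanInclusiveSync(unsigned int idata)
{
    const unsigned int laneId = threadIdx.x & 31;
    #pragma unroll
    for (int i = 1; i <= 16; i *= 2)
    {
        // __shfl_up_sync fetches idata from the lane `i` positions below;
        // the 0xffffffff mask makes the required lane convergence explicit.
        const unsigned int n = __shfl_up_sync(0xffffffffu, idata, i);
        if (laneId >= i)
            idata += n;
    }
    return idata;
}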
@@ -260,11 +173,11 @@ __device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u
 {
 #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
 
-    __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL * 2];
+    __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL];
     __shared__ Ncv32u numPassed;
     __shared__ Ncv32u outMaskOffset;
 
-    Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem);
+    Ncv32u incScan = cv::cudev::blockScanInclusive<NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem, threadIdx.x);
     __syncthreads();
 
     if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1)
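This call site is a standard block-level stream compaction: each thread contributes a 0/1 pass flag, the inclusive scan over the flags gives each passing thread its output slot, and the last thread's scan value is the survivor count (numPassed above). A self-contained sketch of the idiom, using a hypothetical 64-thread kernel rather than the Haar-specific one:

// Hypothetical compaction kernel illustrating the pattern (launch with 64 threads).
#include <opencv2/cudev.hpp>

__global__ void compactSketch(const int* in, int* out, unsigned int* outCount)
{
    const unsigned int tid = threadIdx.x;
    __shared__ unsigned int shmem[64];                  // one scratch slot per thread
    const unsigned int flag = (in[tid] > 0) ? 1u : 0u;  // does this element survive?
    const unsigned int incScan = cv::cudev::blockScanInclusive<64>(flag, shmem, tid);
    __syncthreads();
    if (flag)
        out[incScan - 1] = in[tid];                     // exclusive offset = inclusive - 1
    if (tid == 63)
        *outCount = incScan;                            // total survivors in the block
}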
@@ -45,8 +45,7 @@
 #include <vector>
 #include <cuda_runtime.h>
 
-#include "opencv2/core/cuda/warp.hpp"
-#include "opencv2/core/cuda/warp_shuffle.hpp"
+#include "opencv2/cudev.hpp"
 
 #include "opencv2/cudalegacy/NPP_staging.hpp"
 
@@ -81,111 +80,6 @@ cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream)
 }
 
 
-//==============================================================================
-//
-// BlockScan.cuh
-//
-//==============================================================================
-
-
-NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive
-
-
-//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
-//assuming size <= WARP_SIZE and size is power of 2
-template <class T>
-inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
-{
-#if __CUDA_ARCH__ >= 300
-    const unsigned int laneId = cv::cuda::device::Warp::laneId();
-
-    // scan on shuffl functions
-    #pragma unroll
-    for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
-    {
-        const T n = cv::cuda::device::shfl_up(idata, i);
-        if (laneId >= i)
-            idata += n;
-    }
-
-    return idata;
-#else
-    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
-    s_Data[pos] = 0;
-    pos += K_WARP_SIZE;
-    s_Data[pos] = idata;
-
-    s_Data[pos] += s_Data[pos - 1];
-    s_Data[pos] += s_Data[pos - 2];
-    s_Data[pos] += s_Data[pos - 4];
-    s_Data[pos] += s_Data[pos - 8];
-    s_Data[pos] += s_Data[pos - 16];
-
-    return s_Data[pos];
-#endif
-}
-inline __device__ Ncv64u warpScanInclusive(Ncv64u idata, volatile Ncv64u *s_Data)
-{
-    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
-    s_Data[pos] = 0;
-    pos += K_WARP_SIZE;
-    s_Data[pos] = idata;
-
-    s_Data[pos] += s_Data[pos - 1];
-    s_Data[pos] += s_Data[pos - 2];
-    s_Data[pos] += s_Data[pos - 4];
-    s_Data[pos] += s_Data[pos - 8];
-    s_Data[pos] += s_Data[pos - 16];
-
-    return s_Data[pos];
-}
-
-
-template <class T>
-inline __device__ T warpScanExclusive(T idata, volatile T *s_Data)
-{
-    return warpScanInclusive(idata, s_Data) - idata;
-}
-
-
-template <class T, Ncv32u tiNumScanThreads>
-inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)
-{
-    if (tiNumScanThreads > K_WARP_SIZE)
-    {
-        //Bottom-level inclusive warp scan
-        T warpResult = warpScanInclusive(idata, s_Data);
-
-        //Save top elements of each warp for exclusive warp scan
-        //sync to wait for warp scans to complete (because s_Data is being overwritten)
-        __syncthreads();
-        if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
-        {
-            s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
-        }
-
-        //wait for warp scans to complete
-        __syncthreads();
-
-        if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
-        {
-            //grab top warp elements
-            T val = s_Data[threadIdx.x];
-            //calculate exclusive scan and write back to shared memory
-            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
-        }
-
-        //return updated warp scans with exclusive scan results
-        __syncthreads();
-        return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];
-    }
-    else
-    {
-        return warpScanInclusive(idata, s_Data);
-    }
-}
-
-
 //==============================================================================
 //
 // IntegralImage.cu
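A note on why the scratch buffers throughout this commit shrink from 2*N to N: the deleted fallback gives thread tid (lane l of warp w) the padding slot pos = 2*tid - l = 64*w + l, zeroes it, then stores its value at pos + K_WARP_SIZE = 64*w + 32 + l; the 32 zeroed slots per warp let pos - 16 be read without underflowing into the previous warp's strip. The highest index touched is 2*N - 1, which is what forced the old `* 2` allocations, while cv::cudev::blockScanInclusive needs only one slot per thread. The index arithmetic can be checked on the host:

// Host-side check of the deleted indexing trick (assumes K_WARP_SIZE == 32).
#include <cassert>

int main()
{
    const unsigned int warpSize = 32, N = 256;  // N = threads per block
    for (unsigned int tid = 0; tid < N; ++tid)
    {
        unsigned int pos = 2 * tid - (tid & (warpSize - 1));  // zeroed padding slot
        pos += warpSize;                                      // this thread's data slot
        assert(pos - 16 >= (tid / warpSize) * 2 * warpSize);  // stays inside own strip
        assert(pos <= 2 * N - 1);                             // hence 2*N elements needed
    }
    return 0;
}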
@@ -280,7 +174,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u sr
     Ncv32u numBuckets = (srcWidth + NUM_SCAN_THREADS - 1) >> LOG2_NUM_SCAN_THREADS;
     Ncv32u offsetX = 0;
 
-    __shared__ T_out shmem[NUM_SCAN_THREADS * 2];
+    __shared__ T_out shmem[NUM_SCAN_THREADS];
     __shared__ T_out carryElem;
     carryElem = 0;
     __syncthreads();
@@ -301,7 +195,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u sr
         curElemMod = _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(curElem);
 
         //inclusive scan
-        curScanElem = blockScanInclusive<T_out, NUM_SCAN_THREADS>(curElemMod, shmem);
+        curScanElem = cv::cudev::blockScanInclusive<NUM_SCAN_THREADS>(curElemMod, shmem, threadIdx.x);
 
         if (curElemOffs <= srcWidth)
         {
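For rows wider than the block, scanRows walks the row in NUM_SCAN_THREADS-wide buckets and keeps the running total of completed buckets in carryElem, adding it to each bucket's local inclusive scan. A simplified, self-contained sketch of that loop shape (hypothetical kernel; the real one also applies the _scanElemOp transform and texture offsets):

// Hedged sketch of the bucket-and-carry row scan (launch with NUM_SCAN_THREADS threads).
#include <opencv2/cudev.hpp>

#define NUM_SCAN_THREADS 256

__global__ void rowScanSketch(const unsigned int* d_src, unsigned int* d_dst, unsigned int width)
{
    __shared__ unsigned int shmem[NUM_SCAN_THREADS];
    __shared__ unsigned int carryElem;
    if (threadIdx.x == 0) carryElem = 0;
    __syncthreads();

    const unsigned int numBuckets = (width + NUM_SCAN_THREADS - 1) / NUM_SCAN_THREADS;
    for (unsigned int b = 0; b < numBuckets; ++b)
    {
        const unsigned int idx = b * NUM_SCAN_THREADS + threadIdx.x;
        const unsigned int elem = (idx < width) ? d_src[idx] : 0;  // pad the last bucket
        const unsigned int scanned =
            cv::cudev::blockScanInclusive<NUM_SCAN_THREADS>(elem, shmem, threadIdx.x)
            + carryElem;                                           // offset by prior buckets
        if (idx < width)
            d_dst[idx] = scanned;
        __syncthreads();
        if (threadIdx.x == NUM_SCAN_THREADS - 1)
            carryElem = scanned;                                   // last lane = bucket total
        __syncthreads();
    }
}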
@@ -1290,7 +1184,7 @@ __global__ void removePass1Scan(Ncv32u *d_src, Ncv32u srcLen,
         return;
     }
 
-    __shared__ Ncv32u shmem[NUM_REMOVE_THREADS * 2];
+    __shared__ Ncv32u shmem[NUM_REMOVE_THREADS];
 
     Ncv32u scanElem = 0;
     if (elemAddrIn < srcLen)
@@ -1305,7 +1199,7 @@ __global__ void removePass1Scan(Ncv32u *d_src, Ncv32u srcLen,
         }
     }
 
-    Ncv32u localScanInc = blockScanInclusive<Ncv32u, NUM_REMOVE_THREADS>(scanElem, shmem);
+    Ncv32u localScanInc = cv::cudev::blockScanInclusive<NUM_REMOVE_THREADS>(scanElem, shmem, threadIdx.x);
     __syncthreads();
 
     if (elemAddrIn < srcLen)
@@ -98,7 +98,7 @@ __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
     #pragma unroll
     for (int i = 1; i <= (WARP_SIZE / 2); i *= 2)
     {
-        const T val = shfl_up(data, i);
+        const T val = __shfl_up(data, i, WARP_SIZE);
         if (laneId >= i)
            data += val;
     }
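One caveat on the last hunk: __shfl_up without a _sync suffix is the legacy intrinsic, deprecated since CUDA 9. On current toolkits the equivalent line, with an explicit full-warp mask (an assumption about the caller's convergence), would be:

// Assumes all 32 lanes of the warp are active at this point.
const T val = __shfl_up_sync(0xffffffffu, data, i, WARP_SIZE);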