mirror of
https://github.com/opencv/opencv.git
synced 2025-07-25 22:57:53 +08:00
__shfl_up_sync with mask for CUDA >= 9
* __shfl_up_sync with proper mask value for CUDA >= 9 * BlockScanInclusive for CUDA >= 9 * compatible_shfl_up for use in integral.hpp * Use CLAHE in cudev * Add tests for BlockScan
This commit is contained in:
parent
0395b2ea9c
commit
970293a229
@ -42,15 +42,9 @@
|
|||||||
|
|
||||||
#if !defined CUDA_DISABLER
|
#if !defined CUDA_DISABLER
|
||||||
|
|
||||||
#include "opencv2/core/cuda/common.hpp"
|
#include "opencv2/cudev.hpp"
|
||||||
#include "opencv2/core/cuda/functional.hpp"
|
|
||||||
#include "opencv2/core/cuda/emulation.hpp"
|
|
||||||
#include "opencv2/core/cuda/scan.hpp"
|
|
||||||
#include "opencv2/core/cuda/reduce.hpp"
|
|
||||||
#include "opencv2/core/cuda/saturate_cast.hpp"
|
|
||||||
|
|
||||||
using namespace cv::cuda;
|
using namespace cv::cudev;
|
||||||
using namespace cv::cuda::device;
|
|
||||||
|
|
||||||
namespace clahe
|
namespace clahe
|
||||||
{
|
{
|
||||||
@ -73,7 +67,7 @@ namespace clahe
|
|||||||
for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x)
|
for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x)
|
||||||
{
|
{
|
||||||
const int data = srcPtr[j];
|
const int data = srcPtr[j];
|
||||||
Emulation::smem::atomicAdd(&smem[data], 1);
|
::atomicAdd(&smem[data], 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -96,7 +90,7 @@ namespace clahe
|
|||||||
|
|
||||||
// find number of overall clipped samples
|
// find number of overall clipped samples
|
||||||
|
|
||||||
reduce<256>(smem, clipped, tid, plus<int>());
|
blockReduce<256>(smem, clipped, tid, plus<int>());
|
||||||
|
|
||||||
// broadcast evaluated value
|
// broadcast evaluated value
|
||||||
|
|
||||||
@ -128,10 +122,10 @@ namespace clahe
|
|||||||
|
|
||||||
calcLutKernel<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale);
|
calcLutKernel<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale);
|
||||||
|
|
||||||
cudaSafeCall( cudaGetLastError() );
|
CV_CUDEV_SAFE_CALL( cudaGetLastError() );
|
||||||
|
|
||||||
if (stream == 0)
|
if (stream == 0)
|
||||||
cudaSafeCall( cudaDeviceSynchronize() );
|
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void transformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY)
|
__global__ void transformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY)
|
||||||
@ -173,13 +167,13 @@ namespace clahe
|
|||||||
const dim3 block(32, 8);
|
const dim3 block(32, 8);
|
||||||
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
|
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
|
||||||
|
|
||||||
cudaSafeCall( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) );
|
CV_CUDEV_SAFE_CALL( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) );
|
||||||
|
|
||||||
transformKernel<<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY);
|
transformKernel<<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY);
|
||||||
cudaSafeCall( cudaGetLastError() );
|
CV_CUDEV_SAFE_CALL( cudaGetLastError() );
|
||||||
|
|
||||||
if (stream == 0)
|
if (stream == 0)
|
||||||
cudaSafeCall( cudaDeviceSynchronize() );
|
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -48,12 +48,134 @@
|
|||||||
|
|
||||||
#include "../common.hpp"
|
#include "../common.hpp"
|
||||||
#include "../warp/scan.hpp"
|
#include "../warp/scan.hpp"
|
||||||
|
#include "../warp/warp.hpp"
|
||||||
|
|
||||||
namespace cv { namespace cudev {
|
namespace cv { namespace cudev {
|
||||||
|
|
||||||
//! @addtogroup cudev
|
//! @addtogroup cudev
|
||||||
//! @{
|
//! @{
|
||||||
|
|
||||||
|
#if __CUDACC_VER_MAJOR__ >= 9
|
||||||
|
|
||||||
|
// Usage Note
|
||||||
|
// - THREADS_NUM should be equal to the number of threads in this block.
|
||||||
|
// - smem must be able to contain at least n elements of type T, where n is equal to the number
|
||||||
|
// of warps in this block. The number can be calculated by divUp(THREADS_NUM, WARP_SIZE).
|
||||||
|
//
|
||||||
|
// Dev Note
|
||||||
|
// - Starting from CUDA 9.0, support for Fermi is dropped. So CV_CUDEV_ARCH >= 300 is implied.
|
||||||
|
// - "For Pascal and earlier architectures (CV_CUDEV_ARCH < 700), all threads in mask must execute
|
||||||
|
// the same warp intrinsic instruction in convergence, and the union of all values in mask must
|
||||||
|
// be equal to the warp's active mask."
|
||||||
|
// (https://docs.nvidia.com/cuda/archive/10.0/cuda-c-programming-guide#independent-thread-scheduling-7-x)
|
||||||
|
// - Above restriction does not apply starting from Volta (CV_CUDEV_ARCH >= 700). We just need to
|
||||||
|
// take care so that "all non-exited threads named in mask must execute the same intrinsic with
|
||||||
|
// the same mask."
|
||||||
|
// (https://docs.nvidia.com/cuda/archive/10.0/cuda-c-programming-guide#warp-description)
|
||||||
|
|
||||||
|
template <int THREADS_NUM, typename T>
|
||||||
|
__device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
|
||||||
|
{
|
||||||
|
const int residual = THREADS_NUM & (WARP_SIZE - 1);
|
||||||
|
|
||||||
|
#if CV_CUDEV_ARCH < 700
|
||||||
|
const uint residual_mask = (1U << residual) - 1;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (THREADS_NUM > WARP_SIZE)
|
||||||
|
{
|
||||||
|
// bottom-level inclusive warp scan
|
||||||
|
#if CV_CUDEV_ARCH >= 700
|
||||||
|
T warpResult = warpScanInclusive(0xFFFFFFFFU, data);
|
||||||
|
#else
|
||||||
|
T warpResult;
|
||||||
|
|
||||||
|
if (0 == residual)
|
||||||
|
warpResult = warpScanInclusive(0xFFFFFFFFU, data);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const int n_warps = divUp(THREADS_NUM, WARP_SIZE);
|
||||||
|
const int warp_num = Warp::warpId();
|
||||||
|
|
||||||
|
if (warp_num < n_warps - 1)
|
||||||
|
warpResult = warpScanInclusive(0xFFFFFFFFU, data);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// We are at the last threads of a block whose number of threads
|
||||||
|
// is not a multiple of the warp size
|
||||||
|
warpResult = warpScanInclusive(residual_mask, data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
// save top elements of each warp for exclusive warp scan
|
||||||
|
// sync to wait for warp scans to complete (because smem is being overwritten)
|
||||||
|
if ((tid & (WARP_SIZE - 1)) == (WARP_SIZE - 1))
|
||||||
|
{
|
||||||
|
smem[tid >> LOG_WARP_SIZE] = warpResult;
|
||||||
|
}
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
int quot = THREADS_NUM / WARP_SIZE;
|
||||||
|
|
||||||
|
if (tid < quot)
|
||||||
|
{
|
||||||
|
// grab top warp elements
|
||||||
|
T val = smem[tid];
|
||||||
|
|
||||||
|
uint mask = (1LLU << quot) - 1;
|
||||||
|
|
||||||
|
if (0 == residual)
|
||||||
|
{
|
||||||
|
// calculate exclusive scan and write back to shared memory
|
||||||
|
smem[tid] = warpScanExclusive(mask, val);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// calculate inclusive scan and write back to shared memory with offset 1
|
||||||
|
smem[tid + 1] = warpScanInclusive(mask, val);
|
||||||
|
|
||||||
|
if (tid == 0)
|
||||||
|
smem[0] = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
// return updated warp scans
|
||||||
|
return warpResult + smem[tid >> LOG_WARP_SIZE];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
#if CV_CUDEV_ARCH >= 700
|
||||||
|
return warpScanInclusive(0xFFFFFFFFU, data);
|
||||||
|
#else
|
||||||
|
if (THREADS_NUM == WARP_SIZE)
|
||||||
|
return warpScanInclusive(0xFFFFFFFFU, data);
|
||||||
|
else
|
||||||
|
return warpScanInclusive(residual_mask, data);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int THREADS_NUM, typename T>
|
||||||
|
__device__ __forceinline__ T blockScanExclusive(T data, volatile T* smem, uint tid)
|
||||||
|
{
|
||||||
|
return blockScanInclusive<THREADS_NUM>(data, smem, tid) - data;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else // __CUDACC_VER_MAJOR__ >= 9
|
||||||
|
|
||||||
|
// Usage Note
|
||||||
|
// - THREADS_NUM should be equal to the number of threads in this block.
|
||||||
|
// - (>= Kepler) smem must be able to contain at least n elements of type T, where n is equal to the number
|
||||||
|
// of warps in this block. The number can be calculated by divUp(THREADS_NUM, WARP_SIZE).
|
||||||
|
// - (Fermi) smem must be able to contain at least n elements of type T, where n is equal to the number
|
||||||
|
// of threads in this block (= THREADS_NUM).
|
||||||
|
|
||||||
template <int THREADS_NUM, typename T>
|
template <int THREADS_NUM, typename T>
|
||||||
__device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
|
__device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
|
||||||
{
|
{
|
||||||
@ -73,18 +195,31 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
|
|||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
if (tid < (THREADS_NUM / WARP_SIZE))
|
int quot = THREADS_NUM / WARP_SIZE;
|
||||||
|
|
||||||
|
if (tid < quot)
|
||||||
{
|
{
|
||||||
// grab top warp elements
|
// grab top warp elements
|
||||||
T val = smem[tid];
|
T val = smem[tid];
|
||||||
|
|
||||||
// calculate exclusive scan and write back to shared memory
|
if (0 == (THREADS_NUM & (WARP_SIZE - 1)))
|
||||||
smem[tid] = warpScanExclusive(val, smem, tid);
|
{
|
||||||
|
// calculate exclusive scan and write back to shared memory
|
||||||
|
smem[tid] = warpScanExclusive(val, smem, tid);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// calculate inclusive scan and write back to shared memory with offset 1
|
||||||
|
smem[tid + 1] = warpScanInclusive(val, smem, tid);
|
||||||
|
|
||||||
|
if (tid == 0)
|
||||||
|
smem[0] = 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
// return updated warp scans with exclusive scan results
|
// return updated warp scans
|
||||||
return warpResult + smem[tid >> LOG_WARP_SIZE];
|
return warpResult + smem[tid >> LOG_WARP_SIZE];
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -99,6 +234,8 @@ __device__ __forceinline__ T blockScanExclusive(T data, volatile T* smem, uint t
|
|||||||
return blockScanInclusive<THREADS_NUM>(data, smem, tid) - data;
|
return blockScanInclusive<THREADS_NUM>(data, smem, tid) - data;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif // __CUDACC_VER_MAJOR__ >= 9
|
||||||
|
|
||||||
//! @}
|
//! @}
|
||||||
|
|
||||||
}}
|
}}
|
||||||
|
@ -215,7 +215,7 @@ namespace integral_detail
|
|||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 1; i < 32; i *= 2)
|
for (int i = 1; i < 32; i *= 2)
|
||||||
{
|
{
|
||||||
const int n = shfl_up(sum, i, 32);
|
const int n = compatible_shfl_up(sum, i, 32);
|
||||||
|
|
||||||
if (lane_id >= i)
|
if (lane_id >= i)
|
||||||
{
|
{
|
||||||
@ -245,9 +245,9 @@ namespace integral_detail
|
|||||||
int warp_sum = sums[lane_id];
|
int warp_sum = sums[lane_id];
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 1; i <= 32; i *= 2)
|
for (int i = 1; i < 32; i *= 2)
|
||||||
{
|
{
|
||||||
const int n = shfl_up(warp_sum, i, 32);
|
const int n = compatible_shfl_up(warp_sum, i, 32);
|
||||||
|
|
||||||
if (lane_id >= i)
|
if (lane_id >= i)
|
||||||
warp_sum += n;
|
warp_sum += n;
|
||||||
@ -453,7 +453,7 @@ namespace integral_detail
|
|||||||
|
|
||||||
for (int i = 1; i <= 8; i *= 2)
|
for (int i = 1; i <= 8; i *= 2)
|
||||||
{
|
{
|
||||||
T n = shfl_up(partial_sum, i, 32);
|
T n = compatible_shfl_up(partial_sum, i, 32);
|
||||||
|
|
||||||
if (lane_id >= i)
|
if (lane_id >= i)
|
||||||
partial_sum += n;
|
partial_sum += n;
|
||||||
|
@ -55,6 +55,36 @@ namespace cv { namespace cudev {
|
|||||||
//! @addtogroup cudev
|
//! @addtogroup cudev
|
||||||
//! @{
|
//! @{
|
||||||
|
|
||||||
|
#if __CUDACC_VER_MAJOR__ >= 9
|
||||||
|
|
||||||
|
// Starting from CUDA 9.0, support for Fermi is dropped.
|
||||||
|
// So CV_CUDEV_ARCH >= 300 is implied.
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
__device__ T warpScanInclusive(uint mask, T data)
|
||||||
|
{
|
||||||
|
const uint laneId = Warp::laneId();
|
||||||
|
|
||||||
|
// scan on shufl functions
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 1; i <= (WARP_SIZE / 2); i *= 2)
|
||||||
|
{
|
||||||
|
const T val = shfl_up_sync(mask, data, i);
|
||||||
|
if (laneId >= i)
|
||||||
|
data += val;
|
||||||
|
}
|
||||||
|
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
__device__ __forceinline__ T warpScanExclusive(uint mask, T data)
|
||||||
|
{
|
||||||
|
return warpScanInclusive(mask, data) - data;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else // __CUDACC_VER_MAJOR__ >= 9
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
__device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
|
__device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
|
||||||
{
|
{
|
||||||
@ -75,19 +105,16 @@ __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
|
|||||||
|
|
||||||
return data;
|
return data;
|
||||||
#else
|
#else
|
||||||
uint pos = 2 * tid - (tid & (WARP_SIZE - 1));
|
const uint laneId = Warp::laneId();
|
||||||
smem[pos] = 0;
|
|
||||||
|
|
||||||
pos += WARP_SIZE;
|
smem[tid] = data;
|
||||||
smem[pos] = data;
|
|
||||||
|
|
||||||
smem[pos] += smem[pos - 1];
|
#pragma unroll
|
||||||
smem[pos] += smem[pos - 2];
|
for (int i = 1; i <= (WARP_SIZE / 2); i *= 2)
|
||||||
smem[pos] += smem[pos - 4];
|
if (laneId >= i)
|
||||||
smem[pos] += smem[pos - 8];
|
smem[tid] += smem[tid - i];
|
||||||
smem[pos] += smem[pos - 16];
|
|
||||||
|
|
||||||
return smem[pos];
|
return smem[tid];
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -97,6 +124,8 @@ __device__ __forceinline__ T warpScanExclusive(T data, volatile T* smem, uint ti
|
|||||||
return warpScanInclusive(data, smem, tid) - data;
|
return warpScanInclusive(data, smem, tid) - data;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif // __CUDACC_VER_MAJOR__ >= 9
|
||||||
|
|
||||||
//! @}
|
//! @}
|
||||||
|
|
||||||
}}
|
}}
|
||||||
|
@ -48,6 +48,8 @@
|
|||||||
|
|
||||||
#include "../common.hpp"
|
#include "../common.hpp"
|
||||||
#include "../util/vec_traits.hpp"
|
#include "../util/vec_traits.hpp"
|
||||||
|
#include "../block/block.hpp"
|
||||||
|
#include "warp.hpp"
|
||||||
|
|
||||||
namespace cv { namespace cudev {
|
namespace cv { namespace cudev {
|
||||||
|
|
||||||
@ -59,7 +61,7 @@ namespace cv { namespace cudev {
|
|||||||
#if __CUDACC_VER_MAJOR__ >= 9
|
#if __CUDACC_VER_MAJOR__ >= 9
|
||||||
# define __shfl(x, y, z) __shfl_sync(0xFFFFFFFFU, x, y, z)
|
# define __shfl(x, y, z) __shfl_sync(0xFFFFFFFFU, x, y, z)
|
||||||
# define __shfl_xor(x, y, z) __shfl_xor_sync(0xFFFFFFFFU, x, y, z)
|
# define __shfl_xor(x, y, z) __shfl_xor_sync(0xFFFFFFFFU, x, y, z)
|
||||||
# define __shfl_up(x, y, z) __shfl_up_sync(0xFFFFFFFFU, x, y, z)
|
//# define __shfl_up(x, y, z) __shfl_up_sync(0xFFFFFFFFU, x, y, z)
|
||||||
# define __shfl_down(x, y, z) __shfl_down_sync(0xFFFFFFFFU, x, y, z)
|
# define __shfl_down(x, y, z) __shfl_down_sync(0xFFFFFFFFU, x, y, z)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -155,6 +157,53 @@ CV_CUDEV_SHFL_VEC_INST(double)
|
|||||||
|
|
||||||
// shfl_up
|
// shfl_up
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
__device__ __forceinline__ T compatible_shfl_up(T val, uint delta, int width = warpSize)
|
||||||
|
{
|
||||||
|
#if __CUDACC_VER_MAJOR__ < 9
|
||||||
|
|
||||||
|
return shfl_up(val, delta, width);
|
||||||
|
|
||||||
|
#else // __CUDACC_VER_MAJOR__ < 9
|
||||||
|
|
||||||
|
#if CV_CUDEV_ARCH >= 700
|
||||||
|
return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
|
||||||
|
#else
|
||||||
|
const int block_size = Block::blockSize();
|
||||||
|
const int residual = block_size & (warpSize - 1);
|
||||||
|
|
||||||
|
if (0 == residual)
|
||||||
|
return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const int n_warps = divUp(block_size, warpSize);
|
||||||
|
const int warp_id = Warp::warpId();
|
||||||
|
|
||||||
|
if (warp_id < n_warps - 1)
|
||||||
|
return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// We are at the last threads of a block whose number of threads
|
||||||
|
// is not a multiple of the warp size
|
||||||
|
uint mask = (1LU << residual) - 1;
|
||||||
|
return shfl_up_sync(mask, val, delta, width);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif // __CUDACC_VER_MAJOR__ < 9
|
||||||
|
}
|
||||||
|
|
||||||
|
#if __CUDACC_VER_MAJOR__ >= 9
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
__device__ __forceinline__ T shfl_up_sync(uint mask, T val, uint delta, int width = warpSize)
|
||||||
|
{
|
||||||
|
return (T) __shfl_up_sync(mask, val, delta, width);
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
__device__ __forceinline__ uchar shfl_up(uchar val, uint delta, int width = warpSize)
|
__device__ __forceinline__ uchar shfl_up(uchar val, uint delta, int width = warpSize)
|
||||||
{
|
{
|
||||||
return (uchar) __shfl_up((int) val, delta, width);
|
return (uchar) __shfl_up((int) val, delta, width);
|
||||||
@ -244,6 +293,8 @@ CV_CUDEV_SHFL_UP_VEC_INST(double)
|
|||||||
|
|
||||||
#undef CV_CUDEV_SHFL_UP_VEC_INST
|
#undef CV_CUDEV_SHFL_UP_VEC_INST
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
// shfl_down
|
// shfl_down
|
||||||
|
|
||||||
__device__ __forceinline__ uchar shfl_down(uchar val, uint delta, int width = warpSize)
|
__device__ __forceinline__ uchar shfl_down(uchar val, uint delta, int width = warpSize)
|
||||||
|
140
modules/cudev/test/test_scan.cu
Normal file
140
modules/cudev/test/test_scan.cu
Normal file
@ -0,0 +1,140 @@
|
|||||||
|
|
||||||
|
#include "test_precomp.hpp"
|
||||||
|
|
||||||
|
using namespace cv;
|
||||||
|
using namespace cv::cudev;
|
||||||
|
using namespace cvtest;
|
||||||
|
|
||||||
|
// BlockScanInt
|
||||||
|
|
||||||
|
template <int THREADS_NUM>
|
||||||
|
__global__ void int_kernel(int* data)
|
||||||
|
{
|
||||||
|
uint tid = Block::threadLineId();
|
||||||
|
|
||||||
|
#if CV_CUDEV_ARCH >= 300
|
||||||
|
const int n_warps = (THREADS_NUM - 1) / WARP_SIZE + 1;
|
||||||
|
__shared__ int smem[n_warps];
|
||||||
|
#else
|
||||||
|
__shared__ int smem[THREADS_NUM];
|
||||||
|
#endif
|
||||||
|
|
||||||
|
data[tid] = blockScanInclusive<THREADS_NUM>(data[tid], smem, tid);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define BLOCK_SCAN_INT_TEST(block_size) \
|
||||||
|
TEST(BlockScanInt, BlockSize##block_size) \
|
||||||
|
{ \
|
||||||
|
Mat src = randomMat(Size(block_size, 1), CV_32SC1, 0, 1024); \
|
||||||
|
\
|
||||||
|
GpuMat d_src; \
|
||||||
|
d_src.upload(src); \
|
||||||
|
\
|
||||||
|
for (int col = 1; col < block_size; col++) \
|
||||||
|
src.at<int>(0, col) += src.at<int>(0, col - 1); \
|
||||||
|
\
|
||||||
|
int_kernel<block_size><<<1, block_size>>>((int*)d_src.data); \
|
||||||
|
\
|
||||||
|
CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize()); \
|
||||||
|
\
|
||||||
|
EXPECT_MAT_NEAR(d_src, src, 0); \
|
||||||
|
}
|
||||||
|
|
||||||
|
BLOCK_SCAN_INT_TEST(29)
|
||||||
|
BLOCK_SCAN_INT_TEST(30)
|
||||||
|
BLOCK_SCAN_INT_TEST(32)
|
||||||
|
BLOCK_SCAN_INT_TEST(40)
|
||||||
|
BLOCK_SCAN_INT_TEST(41)
|
||||||
|
|
||||||
|
BLOCK_SCAN_INT_TEST(59)
|
||||||
|
BLOCK_SCAN_INT_TEST(60)
|
||||||
|
BLOCK_SCAN_INT_TEST(64)
|
||||||
|
BLOCK_SCAN_INT_TEST(70)
|
||||||
|
BLOCK_SCAN_INT_TEST(71)
|
||||||
|
|
||||||
|
BLOCK_SCAN_INT_TEST(109)
|
||||||
|
BLOCK_SCAN_INT_TEST(110)
|
||||||
|
BLOCK_SCAN_INT_TEST(128)
|
||||||
|
BLOCK_SCAN_INT_TEST(130)
|
||||||
|
BLOCK_SCAN_INT_TEST(131)
|
||||||
|
|
||||||
|
BLOCK_SCAN_INT_TEST(189)
|
||||||
|
BLOCK_SCAN_INT_TEST(200)
|
||||||
|
BLOCK_SCAN_INT_TEST(256)
|
||||||
|
BLOCK_SCAN_INT_TEST(300)
|
||||||
|
BLOCK_SCAN_INT_TEST(311)
|
||||||
|
|
||||||
|
BLOCK_SCAN_INT_TEST(489)
|
||||||
|
BLOCK_SCAN_INT_TEST(500)
|
||||||
|
BLOCK_SCAN_INT_TEST(512)
|
||||||
|
BLOCK_SCAN_INT_TEST(600)
|
||||||
|
BLOCK_SCAN_INT_TEST(611)
|
||||||
|
|
||||||
|
BLOCK_SCAN_INT_TEST(1024)
|
||||||
|
|
||||||
|
// BlockScanDouble
|
||||||
|
|
||||||
|
template <int THREADS_NUM>
|
||||||
|
__global__ void double_kernel(double* data)
|
||||||
|
{
|
||||||
|
uint tid = Block::threadLineId();
|
||||||
|
|
||||||
|
#if CV_CUDEV_ARCH >= 300
|
||||||
|
const int n_warps = (THREADS_NUM - 1) / WARP_SIZE + 1;
|
||||||
|
__shared__ double smem[n_warps];
|
||||||
|
#else
|
||||||
|
__shared__ double smem[THREADS_NUM];
|
||||||
|
#endif
|
||||||
|
|
||||||
|
data[tid] = blockScanInclusive<THREADS_NUM>(data[tid], smem, tid);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define BLOCK_SCAN_DOUBLE_TEST(block_size) \
|
||||||
|
TEST(BlockScanDouble, BlockSize##block_size) \
|
||||||
|
{ \
|
||||||
|
Mat src = randomMat(Size(block_size, 1), CV_64FC1, 0.0, 1.0); \
|
||||||
|
\
|
||||||
|
GpuMat d_src; \
|
||||||
|
d_src.upload(src); \
|
||||||
|
\
|
||||||
|
for (int col = 1; col < block_size; col++) \
|
||||||
|
src.at<double>(0, col) += src.at<double>(0, col - 1); \
|
||||||
|
\
|
||||||
|
double_kernel<block_size><<<1, block_size>>>((double*)d_src.data); \
|
||||||
|
\
|
||||||
|
CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize()); \
|
||||||
|
\
|
||||||
|
EXPECT_MAT_NEAR(d_src, src, 1e-10); \
|
||||||
|
}
|
||||||
|
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(29)
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(30)
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(32)
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(40)
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(41)
|
||||||
|
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(59)
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(60)
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(64)
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(70)
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(71)
|
||||||
|
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(109)
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(110)
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(128)
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(130)
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(131)
|
||||||
|
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(189)
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(200)
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(256)
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(300)
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(311)
|
||||||
|
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(489)
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(500)
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(512)
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(600)
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(611)
|
||||||
|
|
||||||
|
BLOCK_SCAN_DOUBLE_TEST(1024)
|
Loading…
Reference in New Issue
Block a user