From ada6ab3778b1391192d84688caad6c388becf46c Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 15 Feb 2012 19:25:29 +0000 Subject: [PATCH] fixed compilation for old compute capabilities --- modules/gpu/src/cuda/column_filter.cu | 71 +++++++++++++++++---------- modules/gpu/src/cuda/row_filter.cu | 71 +++++++++++++++++---------- modules/gpu/src/filtering.cpp | 15 ++++-- 3 files changed, 102 insertions(+), 55 deletions(-) diff --git a/modules/gpu/src/cuda/column_filter.cu b/modules/gpu/src/cuda/column_filter.cu index 36dd7bb320..d00bec84aa 100644 --- a/modules/gpu/src/cuda/column_filter.cu +++ b/modules/gpu/src/cuda/column_filter.cu @@ -61,12 +61,20 @@ namespace cv { namespace gpu { namespace device cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) ); } - template + template __global__ void linearColumnFilter(const DevMem2D_ src, PtrStep dst, const int anchor, const B brd) { - Static::check(); - Static= KSIZE>::check(); - Static::cn == VecTraits::cn>::check(); + #if __CUDA_ARCH__ >= 200 + const int BLOCK_DIM_X = 16; + const int BLOCK_DIM_Y = 16; + const int PATCH_PER_BLOCK = 4; + const int HALO_SIZE = KSIZE <= 16 ? 1 : 2; + #else + const int BLOCK_DIM_X = 16; + const int BLOCK_DIM_Y = 8; + const int PATCH_PER_BLOCK = 2; + const int HALO_SIZE = 2; + #endif typedef typename TypeVec::cn>::vec_type sum_t; @@ -103,32 +111,45 @@ namespace cv { namespace gpu { namespace device { const int y = yStart + j * BLOCK_DIM_Y; - if (y >= src.rows) - return; + if (y < src.rows) + { + sum_t sum = VecTraits::all(0); - sum_t sum = VecTraits::all(0); + #pragma unroll + for (int k = 0; k < KSIZE; ++k) + sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k]; - #pragma unroll - for (int k = 0; k < KSIZE; ++k) - sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k]; - - dst(y, x) = saturate_cast(sum); + dst(y, x) = saturate_cast(sum); + } } } template class B> - void linearColumnFilter_caller(DevMem2D_ src, DevMem2D_ dst, int anchor, cudaStream_t stream) + void linearColumnFilter_caller(DevMem2D_ src, DevMem2D_ dst, int anchor, int cc, cudaStream_t stream) { - const int BLOCK_DIM_X = 16; - const int BLOCK_DIM_Y = 16; - const int PATCH_PER_BLOCK = 4; + int BLOCK_DIM_X; + int BLOCK_DIM_Y; + int PATCH_PER_BLOCK; + + if (cc >= 20) + { + BLOCK_DIM_X = 16; + BLOCK_DIM_Y = 16; + PATCH_PER_BLOCK = 4; + } + else + { + BLOCK_DIM_X = 16; + BLOCK_DIM_Y = 8; + PATCH_PER_BLOCK = 2; + } const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK)); B brd(src.rows); - linearColumnFilter<<>>(src, dst, anchor, brd); + linearColumnFilter<<>>(src, dst, anchor, brd); cudaSafeCall( cudaGetLastError() ); @@ -137,9 +158,9 @@ namespace cv { namespace gpu { namespace device } template - void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream) + void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) { - typedef void (*caller_t)(DevMem2D_ src, DevMem2D_ dst, int anchor, cudaStream_t stream); + typedef void (*caller_t)(DevMem2D_ src, DevMem2D_ dst, int anchor, int cc, cudaStream_t stream); static const caller_t callers[5][33] = { @@ -322,13 +343,13 @@ namespace cv { namespace gpu { namespace device loadKernel(kernel, ksize); - callers[brd_type][ksize]((DevMem2D_)src, (DevMem2D_)dst, anchor, stream); + callers[brd_type][ksize]((DevMem2D_)src, (DevMem2D_)dst, anchor, cc, stream); } - template void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - template void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - template void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - template void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - template void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); + template void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); + template void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); + template void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); + template void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); } // namespace column_filter }}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/cuda/row_filter.cu b/modules/gpu/src/cuda/row_filter.cu index b252b2123b..a5ec8869c8 100644 --- a/modules/gpu/src/cuda/row_filter.cu +++ b/modules/gpu/src/cuda/row_filter.cu @@ -61,12 +61,20 @@ namespace cv { namespace gpu { namespace device cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) ); } - template + template __global__ void linearRowFilter(const DevMem2D_ src, PtrStep dst, const int anchor, const B brd) { - Static::check(); - Static= KSIZE>::check(); - Static::cn == VecTraits::cn>::check(); + #if __CUDA_ARCH__ >= 200 + const int BLOCK_DIM_X = 32; + const int BLOCK_DIM_Y = 8; + const int PATCH_PER_BLOCK = 4; + const int HALO_SIZE = 1; + #else + const int BLOCK_DIM_X = 32; + const int BLOCK_DIM_Y = 4; + const int PATCH_PER_BLOCK = 4; + const int HALO_SIZE = 1; + #endif typedef typename TypeVec::cn>::vec_type sum_t; @@ -103,32 +111,45 @@ namespace cv { namespace gpu { namespace device { const int x = xStart + j * BLOCK_DIM_X; - if (x >= src.cols) - return; + if (x < src.cols) + { + sum_t sum = VecTraits::all(0); - sum_t sum = VecTraits::all(0); + #pragma unroll + for (int k = 0; k < KSIZE; ++k) + sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k]; - #pragma unroll - for (int k = 0; k < KSIZE; ++k) - sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k]; - - dst(y, x) = saturate_cast(sum); + dst(y, x) = saturate_cast(sum); + } } } template class B> - void linearRowFilter_caller(DevMem2D_ src, DevMem2D_ dst, int anchor, cudaStream_t stream) + void linearRowFilter_caller(DevMem2D_ src, DevMem2D_ dst, int anchor, int cc, cudaStream_t stream) { - const int BLOCK_DIM_X = 32; - const int BLOCK_DIM_Y = 8; - const int PATCH_PER_BLOCK = 4; + int BLOCK_DIM_X; + int BLOCK_DIM_Y; + int PATCH_PER_BLOCK; + + if (cc >= 20) + { + BLOCK_DIM_X = 32; + BLOCK_DIM_Y = 8; + PATCH_PER_BLOCK = 4; + } + else + { + BLOCK_DIM_X = 32; + BLOCK_DIM_Y = 4; + PATCH_PER_BLOCK = 4; + } const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y)); B brd(src.cols); - linearRowFilter<<>>(src, dst, anchor, brd); + linearRowFilter<<>>(src, dst, anchor, brd); cudaSafeCall( cudaGetLastError() ); if (stream == 0) @@ -136,9 +157,9 @@ namespace cv { namespace gpu { namespace device } template - void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream) + void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) { - typedef void (*caller_t)(DevMem2D_ src, DevMem2D_ dst, int anchor, cudaStream_t stream); + typedef void (*caller_t)(DevMem2D_ src, DevMem2D_ dst, int anchor, int cc, cudaStream_t stream); static const caller_t callers[5][33] = { @@ -321,13 +342,13 @@ namespace cv { namespace gpu { namespace device loadKernel(kernel, ksize); - callers[brd_type][ksize]((DevMem2D_)src, (DevMem2D_)dst, anchor, stream); + callers[brd_type][ksize]((DevMem2D_)src, (DevMem2D_)dst, anchor, cc, stream); } - template void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - template void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - template void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - template void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - template void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); + template void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); + template void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); + template void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); + template void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); } // namespace row_filter }}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/filtering.cpp b/modules/gpu/src/filtering.cpp index 45e2cd03e4..42a0a39b00 100644 --- a/modules/gpu/src/filtering.cpp +++ b/modules/gpu/src/filtering.cpp @@ -740,13 +740,13 @@ namespace cv { namespace gpu { namespace device namespace row_filter { template - void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); } namespace column_filter { template - void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); } }}} @@ -755,7 +755,7 @@ namespace typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI, const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor); - typedef void (*gpuFilter1D_t)(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + typedef void (*gpuFilter1D_t)(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); struct NppLinearRowFilter : public BaseRowFilter_GPU { @@ -791,7 +791,9 @@ namespace virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) { - func(src, dst, kernel.ptr(), ksize, anchor, brd_type, StreamAccessor::getStream(s)); + DeviceInfo devInfo; + int cc = devInfo.majorVersion() * 10 + devInfo.minorVersion(); + func(src, dst, kernel.ptr(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s)); } Mat kernel; @@ -899,7 +901,10 @@ namespace virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) { - func(src, dst, kernel.ptr(), ksize, anchor, brd_type, StreamAccessor::getStream(s)); + DeviceInfo devInfo; + int cc = devInfo.majorVersion() * 10 + devInfo.minorVersion(); + CV_Assert(cc >= 20 || ksize <= 16); + func(src, dst, kernel.ptr(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s)); } Mat kernel;