From 09269b4cd802b1669a061fc80d36c37c377ebc8a Mon Sep 17 00:00:00 2001
From: Marina Kolpakova
Date: Sat, 7 Jul 2012 21:48:53 +0000
Subject: [PATCH] fixed backward compatibility with less than 1.2 CUDA
 capability

---
 modules/gpu/src/cuda/lbp.cu                | 23 +++++++++--
 modules/gpu/src/opencv2/gpu/device/lbp.hpp | 48 ++++++++++++++++++++++
 2 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu
index cd469453b3..9981fa6f0f 100644
--- a/modules/gpu/src/cuda/lbp.cu
+++ b/modules/gpu/src/cuda/lbp.cu
@@ -86,8 +86,11 @@ namespace cv { namespace gpu { namespace device
         rect.y = roundf(y * scale);
         rect.z = roundf(clWidth);
         rect.w = roundf(clHeight);
-
-        int res = atomicInc(n, 100);
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+        int res = __atomicInc(n, 100U);
+#else
+        int res = atomicInc(n, 100U);
+#endif
         objects(0, res) = rect;
     }
 
@@ -111,14 +114,24 @@
         __syncthreads();
 
         int cls = labels[tid];
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+        __atomicAdd((int*)(rrects + cls * 4 + 0), candidates[tid].x);
+        __atomicAdd((int*)(rrects + cls * 4 + 1), candidates[tid].y);
+        __atomicAdd((int*)(rrects + cls * 4 + 2), candidates[tid].z);
+        __atomicAdd((int*)(rrects + cls * 4 + 3), candidates[tid].w);
+#else
         atomicAdd((int*)(rrects + cls * 4 + 0), candidates[tid].x);
         atomicAdd((int*)(rrects + cls * 4 + 1), candidates[tid].y);
         atomicAdd((int*)(rrects + cls * 4 + 2), candidates[tid].z);
         atomicAdd((int*)(rrects + cls * 4 + 3), candidates[tid].w);
+#endif
         labels[tid] = 0;
         __syncthreads();
-
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+        __atomicInc((unsigned int*)labels + cls, n);
+#else
         atomicInc((unsigned int*)labels + cls, n);
+#endif
         *nclasses = 0;
 
         int active = labels[tid];
@@ -154,7 +167,11 @@
             }
             if( j == n)
             {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                objects[__atomicInc(nclasses, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
+#else
                 objects[atomicInc(nclasses, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
+#endif
             }
         }
     }
diff --git a/modules/gpu/src/opencv2/gpu/device/lbp.hpp b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
index f4ec78b300..8a7624d18d 100644
--- a/modules/gpu/src/opencv2/gpu/device/lbp.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
@@ -48,6 +48,46 @@
 namespace cv { namespace gpu { namespace device {
 
 namespace lbp{
+
+    #define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U )
+
+template<typename T>
+__device__ __forceinline__ T __atomicInc(T* address, T val)
+{
+    T count;
+    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+    do
+    {
+        count = *address & TAG_MASK;
+        count = tag | (count + 1);
+        *address = count;
+    } while (*address != count);
+    return (count & TAG_MASK) - 1;
+}
+
+template<typename T>
+__device__ __forceinline__ void __atomicAdd(T* address, T val)
+{
+    T count;
+    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+    do
+    {
+        count = *address & TAG_MASK;
+        count = tag | (count + val);
+        *address = count;
+    } while (*address != count);
+}
+
+template<typename T>
+__device__ __forceinline__ T __atomicMin(T* address, T val)
+{
+    T count = min(*address, val);
+    do
+    {
+        *address = count;
+    } while (*address > count);
+    return count;
+}
+
 struct Stage
 {
     int    first;
@@ -94,11 +134,19 @@ namespace lbp{
 
             if (p < q)
             {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                __atomicMin(labels + id, p);
+#else
                 atomicMin(labels + id, p);
+#endif
            }
            else if (p > q)
            {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                __atomicMin(labels + tid, q);
+#else
                 atomicMin(labels + tid, q);
+#endif
            }
        }
    }
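
Note on the emulation technique: devices below compute capability 1.2 lack some
of the atomic intrinsics the cascade relies on (notably atomics on shared
memory), so the patch substitutes tagged spin loops. Since
(sizeof(unsigned int) << 3) - 5U = 27, each thread stamps its 5-bit lane index
into the top 5 bits of the word, rewrites the word until its own tagged value
survives the warp's conflicting writes, and TAG_MASK = (1U << 27) - 1U strips
the tag off the 27-bit counter. The sketch below is a minimal standalone
illustration of the same technique, not code from the patch: emulatedInc,
claimSlots, and slots are hypothetical names, the wrap-around 'val' argument of
the real atomicInc is not emulated, 'volatile' is added so re-reads observe
other lanes' writes, and the loop assumes the warp-synchronous execution model
of the pre-Fermi hardware the patch targets (it is not guaranteed atomic on
modern GPUs).

    #include <cstdio>
    #include <cuda_runtime.h>

    // Same mask as the patch: low 27 bits hold the counter, top 5 bits the tag.
    #define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U )

    // Tag-based increment mirroring the patch's __atomicInc.
    __device__ unsigned int emulatedInc(volatile unsigned int* address)
    {
        unsigned int count;
        unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
        do
        {
            count = *address & TAG_MASK;   // drop the previous writer's tag
            count = tag | (count + 1);     // stamp our lane id on the new value
            *address = count;
        } while (*address != count);       // lost the race: another lane wrote last
        return (count & TAG_MASK) - 1;     // value before our increment
    }

    // Each lane of one warp claims a unique slot from a shared counter.
    __global__ void claimSlots(unsigned int* slots)
    {
        __shared__ unsigned int counter;
        if (threadIdx.x == 0) counter = 0;
        __syncthreads();
        slots[threadIdx.x] = emulatedInc(&counter);
    }

    int main()
    {
        unsigned int* d_slots;
        cudaMalloc(&d_slots, 32 * sizeof(unsigned int));
        claimSlots<<<1, 32>>>(d_slots);    // one warp: the 5-bit tag covers at most 32 writers
        unsigned int h_slots[32];
        cudaMemcpy(h_slots, d_slots, sizeof(h_slots), cudaMemcpyDeviceToHost);
        for (int i = 0; i < 32; ++i)
            printf("lane %2d -> slot %u\n", i, h_slots[i]);
        cudaFree(d_slots);
        return 0;
    }

On warp-synchronous hardware every iteration of the do/while retires exactly
one winning lane, so 32 colliding lanes receive the distinct slots 0..31, which
is the behavior the patch needs from atomicInc when compacting detections.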