fixed backward compatibility with less than 1.2 CUDA capability

2025-06-17 07:10:51 +08:00 · 2012-07-07 21:48:53 +00:00 · 2012-07-07 21:48:53 +00:00 · 09269b4cd8
commit 09269b4cd8
parent 3ac6272c3b
2 changed files with 68 additions and 3 deletions
--- a/modules/gpu/src/cuda/lbp.cu
+++ b/modules/gpu/src/cuda/lbp.cu
@ -86,8 +86,11 @@ namespace cv { namespace gpu { namespace device
            rect.y = roundf(y * scale);
            rect.z = roundf(clWidth);
            rect.w = roundf(clHeight);
-
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
-            int res = atomicInc(n, 100);
+            int res = __atomicInc(n, 100U);
 #else
            int res = atomicInc(n, 100U);
 #endif
            objects(0, res) = rect;
        }
@ -111,14 +114,24 @@ namespace cv { namespace gpu { namespace device
            __syncthreads();
            int cls = labels[tid];
 #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
            __atomicAdd((int*)(rrects + cls * 4 + 0), candidates[tid].x);
            __atomicAdd((int*)(rrects + cls * 4 + 1), candidates[tid].y);
            __atomicAdd((int*)(rrects + cls * 4 + 2), candidates[tid].z);
            __atomicAdd((int*)(rrects + cls * 4 + 3), candidates[tid].w);
 #else
            atomicAdd((int*)(rrects + cls * 4 + 0), candidates[tid].x);
            atomicAdd((int*)(rrects + cls * 4 + 1), candidates[tid].y);
            atomicAdd((int*)(rrects + cls * 4 + 2), candidates[tid].z);
            atomicAdd((int*)(rrects + cls * 4 + 3), candidates[tid].w);
 #endif
            labels[tid] = 0;
            __syncthreads();
-
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
            __atomicInc((unsigned int*)labels + cls, n);
 #else
            atomicInc((unsigned int*)labels + cls, n);
 #endif
            *nclasses = 0;
            int active = labels[tid];
@ -154,7 +167,11 @@ namespace cv { namespace gpu { namespace device
                    }
                    if( j == n)
                    {
 #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
                        objects[__atomicInc(nclasses, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
 #else
                        objects[atomicInc(nclasses, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
 #endif
                    }
                }
            }
--- a/modules/gpu/src/opencv2/gpu/device/lbp.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
@ -48,6 +48,46 @@
 namespace cv { namespace gpu { namespace device {
 namespace lbp{
    #define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U )
 template<typename T>
 __device__ __forceinline__ T __atomicInc(T* address, T val)
 {
    T count;
    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
    do
    {
        count = *address & TAG_MASK;
        count = tag | (count + 1);
        *address = count;
    } while (*address != count);
    return (count & TAG_MASK) - 1;
 }
 template<typename T>
 __device__ __forceinline__ void __atomicAdd(T* address, T val)
 {
    T count;
    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
    do
    {
        count = *address & TAG_MASK;
        count = tag | (count + val);
        *address = count;
    } while (*address != count);
 }
 template<typename T>
 __device__ __forceinline__ T __atomicMin(T* address, T val)
 {
    T count = min(*address, val);
    do
    {
        *address = count;
    } while (*address > count);
    return count;
 }
    struct Stage
    {
        int    first;
@ -94,11 +134,19 @@ namespace lbp{
                if (p < q)
                {
 #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
                    __atomicMin(labels + id, p);
 #else
                    atomicMin(labels + id, p);
 #endif
                }
                else if (p > q)
                {
 #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
                    __atomicMin(labels + tid, q);
 #else
                    atomicMin(labels + tid, q);
 #endif
                }
            }
        }