From fdb07a0ac156b3186721e871723356458f347e87 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 11 Feb 2013 16:55:25 +0400 Subject: [PATCH] fix performance issue of gpu reduction --- modules/gpu/src/cuda/matrix_reductions.cu | 319 +++++++++++++--------- 1 file changed, 185 insertions(+), 134 deletions(-) diff --git a/modules/gpu/src/cuda/matrix_reductions.cu b/modules/gpu/src/cuda/matrix_reductions.cu index 6ee56ca5b0..b48c47e6a2 100644 --- a/modules/gpu/src/cuda/matrix_reductions.cu +++ b/modules/gpu/src/cuda/matrix_reductions.cu @@ -55,6 +55,128 @@ using namespace cv::gpu; using namespace cv::gpu::device; +namespace detail +{ + __device__ __forceinline__ int cvAtomicAdd(int* address, int val) + { + return ::atomicAdd(address, val); + } + __device__ __forceinline__ unsigned int cvAtomicAdd(unsigned int* address, unsigned int val) + { + return ::atomicAdd(address, val); + } + __device__ __forceinline__ float cvAtomicAdd(float* address, float val) + { + #if __CUDA_ARCH__ >= 200 + return ::atomicAdd(address, val); + #else + int* address_as_i = (int*) address; + int old = *address_as_i, assumed; + do { + assumed = old; + old = ::atomicCAS(address_as_i, assumed, + __float_as_int(val + __int_as_float(assumed))); + } while (assumed != old); + return __int_as_float(old); + #endif + } + __device__ __forceinline__ double cvAtomicAdd(double* address, double val) + { + #if __CUDA_ARCH__ >= 130 + unsigned long long int* address_as_ull = (unsigned long long int*) address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = ::atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); + #else + (void) address; + (void) val; + return 0.0; + #endif + } + + __device__ __forceinline__ int cvAtomicMin(int* address, int val) + { + return ::atomicMin(address, val); + } + __device__ __forceinline__ float cvAtomicMin(float* address, float val) + { + #if __CUDA_ARCH__ >= 120 + int* address_as_i = (int*) address; + int old = *address_as_i, assumed; + do { + assumed = old; + old = ::atomicCAS(address_as_i, assumed, + __float_as_int(::fminf(val, __int_as_float(assumed)))); + } while (assumed != old); + return __int_as_float(old); + #else + (void) address; + (void) val; + return 0.0f; + #endif + } + __device__ __forceinline__ double cvAtomicMin(double* address, double val) + { + #if __CUDA_ARCH__ >= 130 + unsigned long long int* address_as_ull = (unsigned long long int*) address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = ::atomicCAS(address_as_ull, assumed, + __double_as_longlong(::fmin(val, __longlong_as_double(assumed)))); + } while (assumed != old); + return __longlong_as_double(old); + #else + (void) address; + (void) val; + return 0.0; + #endif + } + + __device__ __forceinline__ int cvAtomicMax(int* address, int val) + { + return ::atomicMax(address, val); + } + __device__ __forceinline__ float cvAtomicMax(float* address, float val) + { + #if __CUDA_ARCH__ >= 120 + int* address_as_i = (int*) address; + int old = *address_as_i, assumed; + do { + assumed = old; + old = ::atomicCAS(address_as_i, assumed, + __float_as_int(::fmaxf(val, __int_as_float(assumed)))); + } while (assumed != old); + return __int_as_float(old); + #else + (void) address; + (void) val; + return 0.0f; + #endif + } + __device__ __forceinline__ double cvAtomicMax(double* address, double val) + { + #if __CUDA_ARCH__ >= 130 + unsigned long long int* address_as_ull = (unsigned long long int*) address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = ::atomicCAS(address_as_ull, assumed, + __double_as_longlong(::fmax(val, __longlong_as_double(assumed)))); + } while (assumed != old); + return __longlong_as_double(old); + #else + (void) address; + (void) val; + return 0.0; + #endif + } +} + namespace detail { template struct Unroll; @@ -152,7 +274,7 @@ namespace sum { static __device__ void run(R* ptr, R val) { - ::atomicAdd(ptr, val); + detail::cvAtomicAdd(ptr, val); } }; template struct AtomicAdd @@ -161,8 +283,8 @@ namespace sum static __device__ void run(R* ptr, val_type val) { - ::atomicAdd(ptr, val.x); - ::atomicAdd(ptr + 1, val.y); + detail::cvAtomicAdd(ptr, val.x); + detail::cvAtomicAdd(ptr + 1, val.y); } }; template struct AtomicAdd @@ -171,9 +293,9 @@ namespace sum static __device__ void run(R* ptr, val_type val) { - ::atomicAdd(ptr, val.x); - ::atomicAdd(ptr + 1, val.y); - ::atomicAdd(ptr + 2, val.z); + detail::cvAtomicAdd(ptr, val.x); + detail::cvAtomicAdd(ptr + 1, val.y); + detail::cvAtomicAdd(ptr + 2, val.z); } }; template struct AtomicAdd @@ -182,10 +304,10 @@ namespace sum static __device__ void run(R* ptr, val_type val) { - ::atomicAdd(ptr, val.x); - ::atomicAdd(ptr + 1, val.y); - ::atomicAdd(ptr + 2, val.z); - ::atomicAdd(ptr + 3, val.w); + detail::cvAtomicAdd(ptr, val.x); + detail::cvAtomicAdd(ptr + 1, val.y); + detail::cvAtomicAdd(ptr + 2, val.z); + detail::cvAtomicAdd(ptr + 3, val.w); } }; @@ -229,41 +351,6 @@ namespace sum #endif } }; - template - struct GlobalReduce - { - typedef typename TypeVec::vec_type result_type; - - static __device__ void run(result_type& sum, result_type* result, int tid, int bid, double* smem) - { - __shared__ bool is_last; - - if (tid == 0) - { - result[bid] = sum; - - __threadfence(); - - unsigned int ticket = ::atomicAdd(&blocks_finished, 1); - is_last = (ticket == gridDim.x * gridDim.y - 1); - } - - __syncthreads(); - - if (is_last) - { - sum = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits::all(0); - - device::reduce(detail::Unroll::template smem_tuple(smem), detail::Unroll::tie(sum), tid, detail::Unroll::op(plus())); - - if (tid == 0) - { - result[0] = sum; - blocks_finished = 0; - } - } - } - }; template __global__ void kernel(const PtrStepSz src, result_type* result, const Op op, const int twidth, const int theight) @@ -518,53 +605,12 @@ namespace minMax struct GlobalReduce { static __device__ void run(R& mymin, R& mymax, R* minval, R* maxval, int tid, int bid, R* sminval, R* smaxval) - { - __shared__ bool is_last; - - if (tid == 0) - { - minval[bid] = mymin; - maxval[bid] = mymax; - - __threadfence(); - - unsigned int ticket = ::atomicAdd(&blocks_finished, 1); - is_last = (ticket == gridDim.x * gridDim.y - 1); - } - - __syncthreads(); - - if (is_last) - { - int idx = ::min(tid, gridDim.x * gridDim.y - 1); - - mymin = minval[idx]; - mymax = maxval[idx]; - - const minimum minOp; - const maximum maxOp; - device::reduce(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp)); - - if (tid == 0) - { - minval[0] = mymin; - maxval[0] = mymax; - - blocks_finished = 0; - } - } - } - }; - template - struct GlobalReduce - { - static __device__ void run(int& mymin, int& mymax, int* minval, int* maxval, int tid, int bid, int* sminval, int* smaxval) { #if __CUDA_ARCH__ >= 200 if (tid == 0) { - ::atomicMin(minval, mymin); - ::atomicMax(maxval, mymax); + detail::cvAtomicMin(minval, mymin); + detail::cvAtomicMax(maxval, mymax); } #else __shared__ bool is_last; @@ -589,8 +635,8 @@ namespace minMax mymin = minval[idx]; mymax = maxval[idx]; - const minimum minOp; - const maximum maxOp; + const minimum minOp; + const maximum maxOp; device::reduce(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp)); if (tid == 0) @@ -672,12 +718,19 @@ namespace minMax *minval_buf = numeric_limits::max(); *maxval_buf = numeric_limits::min(); } + __global__ void setDefaultKernel(float* minval_buf, float* maxval_buf) + { + *minval_buf = numeric_limits::max(); + *maxval_buf = -numeric_limits::max(); + } + __global__ void setDefaultKernel(double* minval_buf, double* maxval_buf) + { + *minval_buf = numeric_limits::max(); + *maxval_buf = -numeric_limits::max(); + } template - void setDefault(R*, R*) - { - } - void setDefault(int* minval_buf, int* maxval_buf) + void setDefault(R* minval_buf, R* maxval_buf) { setDefaultKernel<<<1, 1>>>(minval_buf, maxval_buf); } @@ -728,21 +781,19 @@ namespace minMax namespace minMaxLoc { - __device__ unsigned int blocks_finished = 0; - // To avoid shared bank conflicts we convert each value into value of // appropriate type (32 bits minimum) template struct MinMaxTypeTraits; - template <> struct MinMaxTypeTraits { typedef int best_type; }; - template <> struct MinMaxTypeTraits { typedef int best_type; }; - template <> struct MinMaxTypeTraits { typedef int best_type; }; + template <> struct MinMaxTypeTraits { typedef int best_type; }; + template <> struct MinMaxTypeTraits { typedef int best_type; }; + template <> struct MinMaxTypeTraits { typedef int best_type; }; template <> struct MinMaxTypeTraits { typedef int best_type; }; template <> struct MinMaxTypeTraits { typedef int best_type; }; template <> struct MinMaxTypeTraits { typedef float best_type; }; template <> struct MinMaxTypeTraits { typedef double best_type; }; template - __global__ void kernel(const PtrStepSz src, const Mask mask, T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, const int twidth, const int theight) + __global__ void kernel_pass_1(const PtrStepSz src, const Mask mask, T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, const int twidth, const int theight) { typedef typename MinMaxTypeTraits::best_type work_type; @@ -750,7 +801,6 @@ namespace minMaxLoc __shared__ work_type smaxval[BLOCK_SIZE]; __shared__ unsigned int sminloc[BLOCK_SIZE]; __shared__ unsigned int smaxloc[BLOCK_SIZE]; - __shared__ bool is_last; const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x; const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y; @@ -799,38 +849,36 @@ namespace minMaxLoc maxval[bid] = (T) mymax; minloc[bid] = myminloc; maxloc[bid] = mymaxloc; - - __threadfence(); - - unsigned int ticket = ::atomicInc(&blocks_finished, gridDim.x * gridDim.y); - is_last = (ticket == gridDim.x * gridDim.y - 1); } + } + template + __global__ void kernel_pass_2(T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, int count) + { + typedef typename MinMaxTypeTraits::best_type work_type; - __syncthreads(); + __shared__ work_type sminval[BLOCK_SIZE]; + __shared__ work_type smaxval[BLOCK_SIZE]; + __shared__ unsigned int sminloc[BLOCK_SIZE]; + __shared__ unsigned int smaxloc[BLOCK_SIZE]; - if (is_last) + unsigned int idx = ::min(threadIdx.x, count - 1); + + work_type mymin = minval[idx]; + work_type mymax = maxval[idx]; + unsigned int myminloc = minloc[idx]; + unsigned int mymaxloc = maxloc[idx]; + + reduceKeyVal(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), + smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc), + threadIdx.x, + thrust::make_tuple(less(), greater())); + + if (threadIdx.x == 0) { - unsigned int idx = ::min(tid, gridDim.x * gridDim.y - 1); - - mymin = minval[idx]; - mymax = maxval[idx]; - myminloc = minloc[idx]; - mymaxloc = maxloc[idx]; - - reduceKeyVal(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), - smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc), - tid, - thrust::make_tuple(less(), greater())); - - if (tid == 0) - { - minval[0] = (T) mymin; - maxval[0] = (T) mymax; - minloc[0] = myminloc; - maxloc[0] = mymaxloc; - - blocks_finished = 0; - } + minval[0] = (T) mymin; + maxval[0] = (T) mymax; + minloc[0] = myminloc; + maxloc[0] = mymaxloc; } } @@ -877,12 +925,15 @@ namespace minMaxLoc unsigned int* maxloc_buf = locbuf.ptr(1); if (mask.data) - kernel<<>>((PtrStepSz) src, SingleMask(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight); + kernel_pass_1<<>>((PtrStepSz) src, SingleMask(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight); else - kernel<<>>((PtrStepSz) src, WithOutMask(), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight); + kernel_pass_1<<>>((PtrStepSz) src, WithOutMask(), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight); cudaSafeCall( cudaGetLastError() ); + kernel_pass_2<<<1, threads_x * threads_y>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y); + cudaSafeCall( cudaGetLastError() ); + cudaSafeCall( cudaDeviceSynchronize() ); T minval_, maxval_; @@ -898,9 +949,9 @@ namespace minMaxLoc maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; } - template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep locbuf); - template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep locbuf); - template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep locbuf); + template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep locbuf); + template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep locbuf); + template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep locbuf); template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep locbuf); template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep locbuf); template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep locbuf);