From 63a022dcd74e4d9058c12693dbe2e2f47dd8481f Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Mon, 26 Nov 2012 13:12:50 +0400
Subject: [PATCH] added explicit unroll to reduce implementation

---
 .../opencv2/gpu/device/detail/reduce.hpp      | 65 ++++++++++---------
 .../gpu/device/detail/reduce_key_val.hpp      | 65 ++++++++++---------
 2 files changed, 68 insertions(+), 62 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp b/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp
index 2b2ba6773c..091a160e31 100644
--- a/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp
@@ -243,29 +243,46 @@ namespace cv { namespace gpu { namespace device
             }
         };
 
+        template <unsigned int I, typename Pointer, typename Reference, class Op>
+        struct Unroll
+        {
+            static __device__ void loopShfl(Reference val, Op op, unsigned int N)
+            {
+                mergeShfl(val, I, N, op);
+                Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
+            }
+            static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
+            {
+                merge(smem, val, tid, I, op);
+                Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
+            }
+        };
+        template <typename Pointer, typename Reference, class Op>
+        struct Unroll<0, Pointer, Reference, Op>
+        {
+            static __device__ void loopShfl(Reference, Op, unsigned int)
+            {
+            }
+            static __device__ void loop(Pointer, Reference, unsigned int, Op)
+            {
+            }
+        };
+
         template <unsigned int N> struct WarpOptimized
         {
             template <typename Pointer, typename Reference, class Op>
             static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
             {
-            #if __CUDA_ARCH >= 300
+            #if __CUDA_ARCH__ >= 300
                 (void) smem;
                 (void) tid;
 
-                #pragma unroll
-                for (unsigned int i = N / 2; i >= 1; i /= 2)
-                    mergeShfl(val, i, N, op);
+                Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
             #else
                 loadToSmem(smem, val, tid);
 
                 if (tid < N / 2)
-                {
-                    #if __CUDA_ARCH__ >= 200
-                        #pragma unroll
-                    #endif
-                    for (unsigned int i = N / 2; i >= 1; i /= 2)
-                        merge(smem, val, tid, i, op);
-                }
+                    Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
             #endif
             }
         };
@@ -279,10 +296,8 @@ namespace cv { namespace gpu { namespace device
             {
                 const unsigned int laneId = Warp::laneId();
 
-            #if __CUDA_ARCH >= 300
-                #pragma unroll
-                for (int i = 16; i >= 1; i /= 2)
-                    mergeShfl(val, i, warpSize, op);
+            #if __CUDA_ARCH__ >= 300
+                Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);
 
                 if (laneId == 0)
                     loadToSmem(smem, val, tid / 32);
@@ -290,13 +305,7 @@ namespace cv { namespace gpu { namespace device
                 loadToSmem(smem, val, tid);
 
                 if (laneId < 16)
-                {
-                    #if __CUDA_ARCH__ >= 200
-                        #pragma unroll
-                    #endif
-                    for (int i = 16; i >= 1; i /= 2)
-                        merge(smem, val, tid, i, op);
-                }
+                    Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);
 
                 __syncthreads();
 
@@ -310,16 +319,10 @@ namespace cv { namespace gpu { namespace device
 
                 if (tid < 32)
                 {
-                #if __CUDA_ARCH >= 300
-                    #pragma unroll
-                    for (int i = M / 2; i >= 1; i /= 2)
-                        mergeShfl(val, i, M, op);
+                #if __CUDA_ARCH__ >= 300
+                    Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
                 #else
-                    #if __CUDA_ARCH__ >= 200
-                        #pragma unroll
-                    #endif
-                    for (int i = M / 2; i >= 1; i /= 2)
-                        merge(smem, val, tid, i, op);
+                    Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
                 #endif
                 }
             }
diff --git a/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp b/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp
index f1aa285d3d..ca2c431273 100644
--- a/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp
@@ -369,31 +369,48 @@ namespace cv { namespace gpu { namespace device
             }
         };
 
+        template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp>
+        struct Unroll
+        {
+            static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
+            {
+                mergeShfl(key, val, cmp, I, N);
+                Unroll<I / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
+            }
+            static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
+            {
+                merge(skeys, key, svals, val, cmp, tid, I);
+                Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
+            }
+        };
+        template <class KP, class KR, class VP, class VR, class Cmp>
+        struct Unroll<0, KP, KR, VP, VR, Cmp>
+        {
+            static __device__ void loopShfl(KR, VR, Cmp, unsigned int)
+            {
+            }
+            static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp)
+            {
+            }
+        };
+
         template <unsigned int N> struct WarpOptimized
         {
             template <class KP, class KR, class VP, class VR, class Cmp>
             static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
             {
-            #if __CUDA_ARCH >= 300
+            #if __CUDA_ARCH__ >= 300
                 (void) skeys;
                 (void) svals;
                 (void) tid;
 
-                #pragma unroll
-                for (unsigned int i = N / 2; i >= 1; i /= 2)
-                    mergeShfl(key, val, cml, i, N);
+                Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
             #else
                 loadToSmem(skeys, key, tid);
                 loadToSmem(svals, val, tid);
 
                 if (tid < N / 2)
-                {
-                    #if __CUDA_ARCH__ >= 200
-                        #pragma unroll
-                    #endif
-                    for (unsigned int i = N / 2; i >= 1; i /= 2)
-                        merge(skeys, key, svals, val, cmp, tid, i);
-                }
+                    Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
             #endif
             }
         };
@@ -407,10 +424,8 @@ namespace cv { namespace gpu { namespace device
             {
                 const unsigned int laneId = Warp::laneId();
 
-            #if __CUDA_ARCH >= 300
-                #pragma unroll
-                for (unsigned int i = 16; i >= 1; i /= 2)
-                    mergeShfl(key, val, cml, i, warpSize);
+            #if __CUDA_ARCH__ >= 300
+                Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize);
 
                 if (laneId == 0)
                 {
@@ -422,13 +437,7 @@ namespace cv { namespace gpu { namespace device
                 loadToSmem(svals, val, tid);
 
                 if (laneId < 16)
-                {
-                    #if __CUDA_ARCH__ >= 200
-                        #pragma unroll
-                    #endif
-                    for (int i = 16; i >= 1; i /= 2)
-                        merge(skeys, key, svals, val, cmp, tid, i);
-                }
+                    Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
 
                 __syncthreads();
 
@@ -445,18 +454,12 @@ namespace cv { namespace gpu { namespace device
 
                 if (tid < 32)
                 {
-                #if __CUDA_ARCH >= 300
+                #if __CUDA_ARCH__ >= 300
                     loadFromSmem(svals, val, tid);
 
-                    #pragma unroll
-                    for (unsigned int i = M / 2; i >= 1; i /= 2)
-                        mergeShfl(key, val, cml, i, M);
+                    Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M);
                 #else
-                    #if __CUDA_ARCH__ >= 200
-                        #pragma unroll
-                    #endif
-                    for (unsigned int i = M / 2; i >= 1; i /= 2)
-                        merge(skeys, key, svals, val, cmp, tid, i);
+                    Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
                 #endif
                 }
             }
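
Notes on the pattern (not part of the patch):

The patch replaces "#pragma unroll" hints over halving loops with explicit template
recursion: Unroll<I> merges at offset I and delegates to Unroll<I / 2>, and the
Unroll<0> specialization ends the expansion, so the butterfly reduction is guaranteed
to be fully unrolled at compile time instead of depending on compiler heuristics. It
also corrects the misspelled __CUDA_ARCH macro to __CUDA_ARCH__; the misspelled macro
is never defined and evaluates to 0 in #if, so the shuffle branch previously never
compiled, which is also why the cml-for-cmp typo visible in the removed lines of
reduce_key_val.hpp went unnoticed.

Below is a minimal, self-contained sketch of the same technique, not code from
OpenCV: the names Step and warpSum are invented for illustration, and it assumes an
sm_30+ device and a pre-CUDA-9 toolkit (__shfl_down; on CUDA 9 and later, substitute
__shfl_down_sync(0xffffffff, val, I)).

    #include <cstdio>
    #include <cuda_runtime.h>

    // Each step of the butterfly becomes its own template instantiation,
    // so the reduction "loop" is expanded entirely at compile time.
    template <unsigned int I>
    struct Step
    {
        static __device__ float run(float val)
        {
            // Merge with the lane I positions away, then recurse with I / 2.
            val += __shfl_down(val, I, 32);
            return Step<I / 2>::run(val);
        }
    };

    // 1 / 2 == 0 in integer division, so recursion ends after the offset-1 step.
    template <>
    struct Step<0>
    {
        static __device__ float run(float val) { return val; }
    };

    __global__ void warpSum(const float* in, float* out)
    {
        float val = in[threadIdx.x];
        // Expands to shuffles with offsets 16, 8, 4, 2, 1.
        val = Step<16>::run(val);
        if (threadIdx.x == 0)
            *out = val;   // lane 0 now holds the warp-wide sum
    }

    int main()
    {
        float h_in[32], h_out = 0.f, *d_in, *d_out;
        for (int i = 0; i < 32; ++i)
            h_in[i] = 1.f;                       // expected sum: 32
        cudaMalloc(&d_in, 32 * sizeof(float));
        cudaMalloc(&d_out, sizeof(float));
        cudaMemcpy(d_in, h_in, 32 * sizeof(float), cudaMemcpyHostToDevice);
        warpSum<<<1, 32>>>(d_in, d_out);
        cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
        printf("warp sum = %f\n", h_out);        // prints 32.000000
        cudaFree(d_in);
        cudaFree(d_out);
        return 0;
    }

Built with, e.g., "nvcc -arch=sm_30 warp_sum.cu" (file name hypothetical), each
Step<I>::run call inlines to a single shuffle plus add, producing the same fully
unrolled butterfly that Unroll guarantees in the patch.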