mirror of
https://github.com/opencv/opencv.git
synced 2025-08-06 14:36:36 +08:00
added explicit unroll to reduce implementation
This commit is contained in:
parent
11c6eb6305
commit
63a022dcd7
@ -243,29 +243,46 @@ namespace cv { namespace gpu { namespace device
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <unsigned int I, typename Pointer, typename Reference, class Op>
|
||||||
|
struct Unroll
|
||||||
|
{
|
||||||
|
static __device__ void loopShfl(Reference val, Op op, unsigned int N)
|
||||||
|
{
|
||||||
|
mergeShfl(val, I, N, op);
|
||||||
|
Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
|
||||||
|
}
|
||||||
|
static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
|
||||||
|
{
|
||||||
|
merge(smem, val, tid, I, op);
|
||||||
|
Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template <typename Pointer, typename Reference, class Op>
|
||||||
|
struct Unroll<0, Pointer, Reference, Op>
|
||||||
|
{
|
||||||
|
static __device__ void loopShfl(Reference, Op, unsigned int)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
static __device__ void loop(Pointer, Reference, unsigned int, Op)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
template <unsigned int N> struct WarpOptimized
|
template <unsigned int N> struct WarpOptimized
|
||||||
{
|
{
|
||||||
template <typename Pointer, typename Reference, class Op>
|
template <typename Pointer, typename Reference, class Op>
|
||||||
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
|
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
|
||||||
{
|
{
|
||||||
#if __CUDA_ARCH >= 300
|
#if __CUDA_ARCH__ >= 300
|
||||||
(void) smem;
|
(void) smem;
|
||||||
(void) tid;
|
(void) tid;
|
||||||
|
|
||||||
#pragma unroll
|
Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
|
||||||
for (unsigned int i = N / 2; i >= 1; i /= 2)
|
|
||||||
mergeShfl(val, i, N, op);
|
|
||||||
#else
|
#else
|
||||||
loadToSmem(smem, val, tid);
|
loadToSmem(smem, val, tid);
|
||||||
|
|
||||||
if (tid < N / 2)
|
if (tid < N / 2)
|
||||||
{
|
Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
|
||||||
#if __CUDA_ARCH__ >= 200
|
|
||||||
#pragma unroll
|
|
||||||
#endif
|
|
||||||
for (unsigned int i = N / 2; i >= 1; i /= 2)
|
|
||||||
merge(smem, val, tid, i, op);
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -279,10 +296,8 @@ namespace cv { namespace gpu { namespace device
|
|||||||
{
|
{
|
||||||
const unsigned int laneId = Warp::laneId();
|
const unsigned int laneId = Warp::laneId();
|
||||||
|
|
||||||
#if __CUDA_ARCH >= 300
|
#if __CUDA_ARCH__ >= 300
|
||||||
#pragma unroll
|
Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);
|
||||||
for (int i = 16; i >= 1; i /= 2)
|
|
||||||
mergeShfl(val, i, warpSize, op);
|
|
||||||
|
|
||||||
if (laneId == 0)
|
if (laneId == 0)
|
||||||
loadToSmem(smem, val, tid / 32);
|
loadToSmem(smem, val, tid / 32);
|
||||||
@ -290,13 +305,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
loadToSmem(smem, val, tid);
|
loadToSmem(smem, val, tid);
|
||||||
|
|
||||||
if (laneId < 16)
|
if (laneId < 16)
|
||||||
{
|
Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);
|
||||||
#if __CUDA_ARCH__ >= 200
|
|
||||||
#pragma unroll
|
|
||||||
#endif
|
|
||||||
for (int i = 16; i >= 1; i /= 2)
|
|
||||||
merge(smem, val, tid, i, op);
|
|
||||||
}
|
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
@ -310,16 +319,10 @@ namespace cv { namespace gpu { namespace device
|
|||||||
|
|
||||||
if (tid < 32)
|
if (tid < 32)
|
||||||
{
|
{
|
||||||
#if __CUDA_ARCH >= 300
|
#if __CUDA_ARCH__ >= 300
|
||||||
#pragma unroll
|
Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
|
||||||
for (int i = M / 2; i >= 1; i /= 2)
|
|
||||||
mergeShfl(val, i, M, op);
|
|
||||||
#else
|
#else
|
||||||
#if __CUDA_ARCH__ >= 200
|
Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
|
||||||
#pragma unroll
|
|
||||||
#endif
|
|
||||||
for (int i = M / 2; i >= 1; i /= 2)
|
|
||||||
merge(smem, val, tid, i, op);
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -369,31 +369,48 @@ namespace cv { namespace gpu { namespace device
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp>
|
||||||
|
struct Unroll
|
||||||
|
{
|
||||||
|
static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
|
||||||
|
{
|
||||||
|
mergeShfl(key, val, cmp, I, N);
|
||||||
|
Unroll<I / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
|
||||||
|
}
|
||||||
|
static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
|
||||||
|
{
|
||||||
|
merge(skeys, key, svals, val, cmp, tid, I);
|
||||||
|
Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template <class KP, class KR, class VP, class VR, class Cmp>
|
||||||
|
struct Unroll<0, KP, KR, VP, VR, Cmp>
|
||||||
|
{
|
||||||
|
static __device__ void loopShfl(KR, VR, Cmp, unsigned int)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
template <unsigned int N> struct WarpOptimized
|
template <unsigned int N> struct WarpOptimized
|
||||||
{
|
{
|
||||||
template <class KP, class KR, class VP, class VR, class Cmp>
|
template <class KP, class KR, class VP, class VR, class Cmp>
|
||||||
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
|
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
|
||||||
{
|
{
|
||||||
#if __CUDA_ARCH >= 300
|
#if __CUDA_ARCH__ >= 300
|
||||||
(void) skeys;
|
(void) skeys;
|
||||||
(void) svals;
|
(void) svals;
|
||||||
(void) tid;
|
(void) tid;
|
||||||
|
|
||||||
#pragma unroll
|
Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
|
||||||
for (unsigned int i = N / 2; i >= 1; i /= 2)
|
|
||||||
mergeShfl(key, val, cml, i, N);
|
|
||||||
#else
|
#else
|
||||||
loadToSmem(skeys, key, tid);
|
loadToSmem(skeys, key, tid);
|
||||||
loadToSmem(svals, val, tid);
|
loadToSmem(svals, val, tid);
|
||||||
|
|
||||||
if (tid < N / 2)
|
if (tid < N / 2)
|
||||||
{
|
Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
|
||||||
#if __CUDA_ARCH__ >= 200
|
|
||||||
#pragma unroll
|
|
||||||
#endif
|
|
||||||
for (unsigned int i = N / 2; i >= 1; i /= 2)
|
|
||||||
merge(skeys, key, svals, val, cmp, tid, i);
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -407,10 +424,8 @@ namespace cv { namespace gpu { namespace device
|
|||||||
{
|
{
|
||||||
const unsigned int laneId = Warp::laneId();
|
const unsigned int laneId = Warp::laneId();
|
||||||
|
|
||||||
#if __CUDA_ARCH >= 300
|
#if __CUDA_ARCH__ >= 300
|
||||||
#pragma unroll
|
Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize);
|
||||||
for (unsigned int i = 16; i >= 1; i /= 2)
|
|
||||||
mergeShfl(key, val, cml, i, warpSize);
|
|
||||||
|
|
||||||
if (laneId == 0)
|
if (laneId == 0)
|
||||||
{
|
{
|
||||||
@ -422,13 +437,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
loadToSmem(svals, val, tid);
|
loadToSmem(svals, val, tid);
|
||||||
|
|
||||||
if (laneId < 16)
|
if (laneId < 16)
|
||||||
{
|
Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
|
||||||
#if __CUDA_ARCH__ >= 200
|
|
||||||
#pragma unroll
|
|
||||||
#endif
|
|
||||||
for (int i = 16; i >= 1; i /= 2)
|
|
||||||
merge(skeys, key, svals, val, cmp, tid, i);
|
|
||||||
}
|
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
@ -445,18 +454,12 @@ namespace cv { namespace gpu { namespace device
|
|||||||
|
|
||||||
if (tid < 32)
|
if (tid < 32)
|
||||||
{
|
{
|
||||||
#if __CUDA_ARCH >= 300
|
#if __CUDA_ARCH__ >= 300
|
||||||
loadFromSmem(svals, val, tid);
|
loadFromSmem(svals, val, tid);
|
||||||
|
|
||||||
#pragma unroll
|
Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M);
|
||||||
for (unsigned int i = M / 2; i >= 1; i /= 2)
|
|
||||||
mergeShfl(key, val, cml, i, M);
|
|
||||||
#else
|
#else
|
||||||
#if __CUDA_ARCH__ >= 200
|
Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
|
||||||
#pragma unroll
|
|
||||||
#endif
|
|
||||||
for (unsigned int i = M / 2; i >= 1; i /= 2)
|
|
||||||
merge(skeys, key, svals, val, cmp, tid, i);
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user