mirror of
https://github.com/opencv/opencv.git
synced 2024-11-26 12:10:49 +08:00
new reduce and reduceKeyVal implementation
This commit is contained in:
parent
d47c112434
commit
7a1874b2cc
352
modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp
Normal file
352
modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp
Normal file
@ -0,0 +1,352 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_GPU_REDUCE_DETAIL_HPP__
|
||||
#define __OPENCV_GPU_REDUCE_DETAIL_HPP__
|
||||
|
||||
#include <thrust/tuple.h>
|
||||
#include "../warp.hpp"
|
||||
#include "../warp_shuffle.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace reduce_detail
|
||||
{
|
||||
template <typename T> struct GetType;
|
||||
template <typename T> struct GetType<T*>
|
||||
{
|
||||
typedef T type;
|
||||
};
|
||||
template <typename T> struct GetType<volatile T*>
|
||||
{
|
||||
typedef T type;
|
||||
};
|
||||
template <typename T> struct GetType<T&>
|
||||
{
|
||||
typedef T type;
|
||||
};
|
||||
|
||||
template <unsigned int I, unsigned int N>
|
||||
struct For
|
||||
{
|
||||
template <class PointerTuple, class ValTuple>
|
||||
static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
|
||||
{
|
||||
thrust::get<I>(smem)[tid] = thrust::get<I>(val);
|
||||
|
||||
For<I + 1, N>::loadToSmem(smem, val, tid);
|
||||
}
|
||||
template <class PointerTuple, class ValTuple>
|
||||
static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
|
||||
{
|
||||
thrust::get<I>(val) = thrust::get<I>(smem)[tid];
|
||||
|
||||
For<I + 1, N>::loadFromSmem(smem, val, tid);
|
||||
}
|
||||
|
||||
template <class PointerTuple, class ValTuple, class OpTuple>
|
||||
static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
|
||||
{
|
||||
typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta];
|
||||
thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
|
||||
|
||||
For<I + 1, N>::merge(smem, val, tid, delta, op);
|
||||
}
|
||||
template <class ValTuple, class OpTuple>
|
||||
static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
|
||||
{
|
||||
typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width);
|
||||
thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
|
||||
|
||||
For<I + 1, N>::mergeShfl(val, delta, width, op);
|
||||
}
|
||||
};
|
||||
template <unsigned int N>
|
||||
struct For<N, N>
|
||||
{
|
||||
template <class PointerTuple, class ValTuple>
|
||||
static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
|
||||
{
|
||||
}
|
||||
template <class PointerTuple, class ValTuple>
|
||||
static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
|
||||
{
|
||||
}
|
||||
|
||||
template <class PointerTuple, class ValTuple, class OpTuple>
|
||||
static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
|
||||
{
|
||||
}
|
||||
template <class ValTuple, class OpTuple>
|
||||
static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
|
||||
{
|
||||
smem[tid] = val;
|
||||
}
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
|
||||
{
|
||||
val = smem[tid];
|
||||
}
|
||||
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
|
||||
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
|
||||
__device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
|
||||
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
|
||||
unsigned int tid)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
|
||||
}
|
||||
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
|
||||
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
|
||||
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
|
||||
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
|
||||
unsigned int tid)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
|
||||
}
|
||||
|
||||
template <typename T, class Op>
|
||||
__device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
|
||||
{
|
||||
T reg = smem[tid + delta];
|
||||
smem[tid] = val = op(val, reg);
|
||||
}
|
||||
template <typename T, class Op>
|
||||
__device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
|
||||
{
|
||||
T reg = shfl_down(val, delta, width);
|
||||
val = op(val, reg);
|
||||
}
|
||||
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
|
||||
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
|
||||
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
|
||||
__device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
|
||||
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
|
||||
unsigned int tid,
|
||||
unsigned int delta,
|
||||
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op);
|
||||
}
|
||||
template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
|
||||
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
|
||||
__device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
|
||||
unsigned int delta,
|
||||
unsigned int width,
|
||||
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
|
||||
}
|
||||
|
||||
template <unsigned int N> struct Generic
|
||||
{
|
||||
template <typename Pointer, typename Reference, class Op>
|
||||
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
|
||||
{
|
||||
loadToSmem(smem, val, tid);
|
||||
if (N >= 32)
|
||||
__syncthreads();
|
||||
|
||||
if (N >= 2048)
|
||||
{
|
||||
if (tid < 1024)
|
||||
merge(smem, val, tid, 1024, op);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 1024)
|
||||
{
|
||||
if (tid < 512)
|
||||
merge(smem, val, tid, 512, op);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 512)
|
||||
{
|
||||
if (tid < 256)
|
||||
merge(smem, val, tid, 256, op);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 256)
|
||||
{
|
||||
if (tid < 128)
|
||||
merge(smem, val, tid, 128, op);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 128)
|
||||
{
|
||||
if (tid < 64)
|
||||
merge(smem, val, tid, 64, op);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 64)
|
||||
{
|
||||
if (tid < 32)
|
||||
merge(smem, val, tid, 32, op);
|
||||
}
|
||||
|
||||
if (tid < 16)
|
||||
{
|
||||
merge(smem, val, tid, 16, op);
|
||||
merge(smem, val, tid, 8, op);
|
||||
merge(smem, val, tid, 4, op);
|
||||
merge(smem, val, tid, 2, op);
|
||||
merge(smem, val, tid, 1, op);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned int N> struct WarpOptimized
|
||||
{
|
||||
template <typename Pointer, typename Reference, class Op>
|
||||
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
|
||||
{
|
||||
#if __CUDA_ARCH >= 300
|
||||
(void) smem;
|
||||
(void) tid;
|
||||
|
||||
#pragma unroll
|
||||
for (unsigned int i = N / 2; i >= 1; i /= 2)
|
||||
mergeShfl(val, i, N, op);
|
||||
#else
|
||||
loadToSmem(smem, val, tid);
|
||||
|
||||
if (tid < N / 2)
|
||||
{
|
||||
#pragma unroll
|
||||
for (unsigned int i = N / 2; i >= 1; i /= 2)
|
||||
merge(smem, val, tid, i, op);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned int N> struct GenericOptimized32
|
||||
{
|
||||
enum { M = N / 32 };
|
||||
|
||||
template <typename Pointer, typename Reference, class Op>
|
||||
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
|
||||
{
|
||||
const unsigned int laneId = Warp::laneId();
|
||||
|
||||
#if __CUDA_ARCH >= 300
|
||||
#pragma unroll
|
||||
for (int i = 16; i >= 1; i /= 2)
|
||||
mergeShfl(val, i, warpSize, op);
|
||||
|
||||
if (laneId == 0)
|
||||
loadToSmem(smem, val, tid / 32);
|
||||
#else
|
||||
loadToSmem(smem, val, tid);
|
||||
|
||||
if (laneId < 16)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int i = 16; i >= 1; i /= 2)
|
||||
merge(smem, val, tid, i, op);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (laneId == 0)
|
||||
loadToSmem(smem, val, tid / 32);
|
||||
#endif
|
||||
|
||||
__syncthreads();
|
||||
|
||||
loadFromSmem(smem, val, tid);
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
#if __CUDA_ARCH >= 300
|
||||
#pragma unroll
|
||||
for (int i = M / 2; i >= 1; i /= 2)
|
||||
mergeShfl(val, i, M, op);
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int i = M / 2; i >= 1; i /= 2)
|
||||
merge(smem, val, tid, i, op);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <bool val, class T1, class T2> struct StaticIf;
|
||||
template <class T1, class T2> struct StaticIf<true, T1, T2>
|
||||
{
|
||||
typedef T1 type;
|
||||
};
|
||||
template <class T1, class T2> struct StaticIf<false, T1, T2>
|
||||
{
|
||||
typedef T2 type;
|
||||
};
|
||||
|
||||
template <unsigned int N> struct IsPowerOf2
|
||||
{
|
||||
enum { value = ((N != 0) && !(N & (N - 1))) };
|
||||
};
|
||||
|
||||
template <unsigned int N> struct Dispatcher
|
||||
{
|
||||
typedef typename StaticIf<
|
||||
(N <= 32) && IsPowerOf2<N>::value,
|
||||
WarpOptimized<N>,
|
||||
typename StaticIf<
|
||||
(N <= 1024) && IsPowerOf2<N>::value,
|
||||
GenericOptimized32<N>,
|
||||
Generic<N>
|
||||
>::type
|
||||
>::type reductor;
|
||||
};
|
||||
}
|
||||
}}}
|
||||
|
||||
#endif // __OPENCV_GPU_REDUCE_DETAIL_HPP__
|
489
modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp
Normal file
489
modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp
Normal file
@ -0,0 +1,489 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
|
||||
#define __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
|
||||
|
||||
#include <thrust/tuple.h>
|
||||
#include "../warp.hpp"
|
||||
#include "../warp_shuffle.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace reduce_key_val_detail
|
||||
{
|
||||
template <typename T> struct GetType;
|
||||
template <typename T> struct GetType<T*>
|
||||
{
|
||||
typedef T type;
|
||||
};
|
||||
template <typename T> struct GetType<volatile T*>
|
||||
{
|
||||
typedef T type;
|
||||
};
|
||||
template <typename T> struct GetType<T&>
|
||||
{
|
||||
typedef T type;
|
||||
};
|
||||
|
||||
template <unsigned int I, unsigned int N>
|
||||
struct For
|
||||
{
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
|
||||
{
|
||||
thrust::get<I>(smem)[tid] = thrust::get<I>(data);
|
||||
|
||||
For<I + 1, N>::loadToSmem(smem, data, tid);
|
||||
}
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
|
||||
{
|
||||
thrust::get<I>(data) = thrust::get<I>(smem)[tid];
|
||||
|
||||
For<I + 1, N>::loadFromSmem(smem, data, tid);
|
||||
}
|
||||
|
||||
template <class ReferenceTuple>
|
||||
static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width)
|
||||
{
|
||||
thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
|
||||
|
||||
For<I + 1, N>::copyShfl(val, delta, width);
|
||||
}
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta)
|
||||
{
|
||||
thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
|
||||
|
||||
For<I + 1, N>::copy(svals, val, tid, delta);
|
||||
}
|
||||
|
||||
template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
|
||||
static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width)
|
||||
{
|
||||
typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type reg = shfl_down(thrust::get<I>(key), delta, width);
|
||||
|
||||
if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
|
||||
{
|
||||
thrust::get<I>(key) = reg;
|
||||
thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
|
||||
}
|
||||
|
||||
For<I + 1, N>::mergeShfl(key, val, cmp, delta, width);
|
||||
}
|
||||
template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
|
||||
static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
|
||||
const ValPointerTuple& svals, const ValReferenceTuple& val,
|
||||
const CmpTuple& cmp,
|
||||
unsigned int tid, unsigned int delta)
|
||||
{
|
||||
typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type reg = thrust::get<I>(skeys)[tid + delta];
|
||||
|
||||
if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
|
||||
{
|
||||
thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg;
|
||||
thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
|
||||
}
|
||||
|
||||
For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
|
||||
}
|
||||
};
|
||||
template <unsigned int N>
|
||||
struct For<N, N>
|
||||
{
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
|
||||
{
|
||||
}
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
|
||||
{
|
||||
}
|
||||
|
||||
template <class ReferenceTuple>
|
||||
static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int)
|
||||
{
|
||||
}
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int)
|
||||
{
|
||||
}
|
||||
|
||||
template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
|
||||
static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int)
|
||||
{
|
||||
}
|
||||
template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
|
||||
static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&,
|
||||
const ValPointerTuple&, const ValReferenceTuple&,
|
||||
const CmpTuple&,
|
||||
unsigned int, unsigned int)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// loadToSmem
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid)
|
||||
{
|
||||
smem[tid] = data;
|
||||
}
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid)
|
||||
{
|
||||
data = smem[tid];
|
||||
}
|
||||
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
|
||||
__device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
|
||||
unsigned int tid)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadToSmem(smem, data, tid);
|
||||
}
|
||||
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
|
||||
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
|
||||
unsigned int tid)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// copyVals
|
||||
|
||||
template <typename V>
|
||||
__device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width)
|
||||
{
|
||||
val = shfl_down(val, delta, width);
|
||||
}
|
||||
template <typename V>
|
||||
__device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta)
|
||||
{
|
||||
svals[tid] = val = svals[tid + delta];
|
||||
}
|
||||
template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
|
||||
__device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
unsigned int delta,
|
||||
int width)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width);
|
||||
}
|
||||
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
|
||||
__device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
unsigned int tid, unsigned int delta)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// merge
|
||||
|
||||
template <typename K, typename V, class Cmp>
|
||||
__device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
|
||||
{
|
||||
K reg = shfl_down(key, delta, width);
|
||||
|
||||
if (cmp(reg, key))
|
||||
{
|
||||
key = reg;
|
||||
copyValsShfl(val, delta, width);
|
||||
}
|
||||
}
|
||||
template <typename K, typename V, class Cmp>
|
||||
__device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
|
||||
{
|
||||
K reg = skeys[tid + delta];
|
||||
|
||||
if (cmp(reg, key))
|
||||
{
|
||||
skeys[tid] = key = reg;
|
||||
copyVals(svals, val, tid, delta);
|
||||
}
|
||||
}
|
||||
template <typename K,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp>
|
||||
__device__ __forceinline__ void mergeShfl(K& key,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
const Cmp& cmp,
|
||||
unsigned int delta, int width)
|
||||
{
|
||||
K reg = shfl_down(key, delta, width);
|
||||
|
||||
if (cmp(reg, key))
|
||||
{
|
||||
key = reg;
|
||||
copyValsShfl(val, delta, width);
|
||||
}
|
||||
}
|
||||
template <typename K,
|
||||
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp>
|
||||
__device__ __forceinline__ void merge(volatile K* skeys, K& key,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
const Cmp& cmp, unsigned int tid, unsigned int delta)
|
||||
{
|
||||
K reg = skeys[tid + delta];
|
||||
|
||||
if (cmp(reg, key))
|
||||
{
|
||||
skeys[tid] = key = reg;
|
||||
copyVals(svals, val, tid, delta);
|
||||
}
|
||||
}
|
||||
template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
|
||||
__device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
|
||||
unsigned int delta, int width)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >::value>::mergeShfl(key, val, cmp, delta, width);
|
||||
}
|
||||
template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
|
||||
typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
|
||||
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
|
||||
__device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
|
||||
const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
|
||||
unsigned int tid, unsigned int delta)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Generic
|
||||
|
||||
template <unsigned int N> struct Generic
|
||||
{
|
||||
template <class KP, class KR, class VP, class VR, class Cmp>
|
||||
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
|
||||
{
|
||||
loadToSmem(skeys, key, tid);
|
||||
loadValsToSmem(svals, val, tid);
|
||||
if (N >= 32)
|
||||
__syncthreads();
|
||||
|
||||
if (N >= 2048)
|
||||
{
|
||||
if (tid < 1024)
|
||||
merge(skeys, key, svals, val, cmp, tid, 1024);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 1024)
|
||||
{
|
||||
if (tid < 512)
|
||||
merge(skeys, key, svals, val, cmp, tid, 512);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 512)
|
||||
{
|
||||
if (tid < 256)
|
||||
merge(skeys, key, svals, val, cmp, tid, 256);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 256)
|
||||
{
|
||||
if (tid < 128)
|
||||
merge(skeys, key, svals, val, cmp, tid, 128);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 128)
|
||||
{
|
||||
if (tid < 64)
|
||||
merge(skeys, key, svals, val, cmp, tid, 64);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 64)
|
||||
{
|
||||
if (tid < 32)
|
||||
merge(skeys, key, svals, val, cmp, tid, 32);
|
||||
}
|
||||
|
||||
if (tid < 16)
|
||||
{
|
||||
merge(skeys, key, svals, val, cmp, tid, 16);
|
||||
merge(skeys, key, svals, val, cmp, tid, 8);
|
||||
merge(skeys, key, svals, val, cmp, tid, 4);
|
||||
merge(skeys, key, svals, val, cmp, tid, 2);
|
||||
merge(skeys, key, svals, val, cmp, tid, 1);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned int N> struct WarpOptimized
|
||||
{
|
||||
template <class KP, class KR, class VP, class VR, class Cmp>
|
||||
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
|
||||
{
|
||||
#if __CUDA_ARCH >= 300
|
||||
(void) skeys;
|
||||
(void) svals;
|
||||
(void) tid;
|
||||
|
||||
#pragma unroll
|
||||
for (unsigned int i = N / 2; i >= 1; i /= 2)
|
||||
mergeShfl(key, val, cml, i, N);
|
||||
#else
|
||||
loadToSmem(skeys, key, tid);
|
||||
loadToSmem(svals, val, tid);
|
||||
|
||||
if (tid < N / 2)
|
||||
{
|
||||
#pragma unroll
|
||||
for (unsigned int i = N / 2; i >= 1; i /= 2)
|
||||
merge(skeys, key, svals, val, cmp, tid, i);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned int N> struct GenericOptimized32
|
||||
{
|
||||
enum { M = N / 32 };
|
||||
|
||||
template <class KP, class KR, class VP, class VR, class Cmp>
|
||||
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
|
||||
{
|
||||
const unsigned int laneId = Warp::laneId();
|
||||
|
||||
#if __CUDA_ARCH >= 300
|
||||
#pragma unroll
|
||||
for (unsigned int i = 16; i >= 1; i /= 2)
|
||||
mergeShfl(key, val, cml, i, warpSize);
|
||||
|
||||
if (laneId == 0)
|
||||
{
|
||||
loadToSmem(skeys, key, tid / 32);
|
||||
loadToSmem(svals, val, tid / 32);
|
||||
}
|
||||
#else
|
||||
loadToSmem(skeys, key, tid);
|
||||
loadToSmem(svals, val, tid);
|
||||
|
||||
if (laneId < 16)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int i = 16; i >= 1; i /= 2)
|
||||
merge(skeys, key, svals, val, cmp, tid, i);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (laneId == 0)
|
||||
{
|
||||
loadToSmem(skeys, key, tid / 32);
|
||||
loadToSmem(svals, val, tid / 32);
|
||||
}
|
||||
#endif
|
||||
|
||||
__syncthreads();
|
||||
|
||||
loadFromSmem(skeys, key, tid);
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
#if __CUDA_ARCH >= 300
|
||||
loadFromSmem(svals, val, tid);
|
||||
|
||||
#pragma unroll
|
||||
for (unsigned int i = M / 2; i >= 1; i /= 2)
|
||||
mergeShfl(key, val, cml, i, M);
|
||||
#else
|
||||
#pragma unroll
|
||||
for (unsigned int i = M / 2; i >= 1; i /= 2)
|
||||
merge(skeys, key, svals, val, cmp, tid, i);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <bool val, class T1, class T2> struct StaticIf;
|
||||
template <class T1, class T2> struct StaticIf<true, T1, T2>
|
||||
{
|
||||
typedef T1 type;
|
||||
};
|
||||
template <class T1, class T2> struct StaticIf<false, T1, T2>
|
||||
{
|
||||
typedef T2 type;
|
||||
};
|
||||
|
||||
template <unsigned int N> struct IsPowerOf2
|
||||
{
|
||||
enum { value = ((N != 0) && !(N & (N - 1))) };
|
||||
};
|
||||
|
||||
template <unsigned int N> struct Dispatcher
|
||||
{
|
||||
typedef typename StaticIf<
|
||||
(N <= 32) && IsPowerOf2<N>::value,
|
||||
WarpOptimized<N>,
|
||||
typename StaticIf<
|
||||
(N <= 1024) && IsPowerOf2<N>::value,
|
||||
GenericOptimized32<N>,
|
||||
Generic<N>
|
||||
>::type
|
||||
>::type reductor;
|
||||
};
|
||||
}
|
||||
}}}
|
||||
|
||||
#endif // __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
|
197
modules/gpu/include/opencv2/gpu/device/reduce.hpp
Normal file
197
modules/gpu/include/opencv2/gpu/device/reduce.hpp
Normal file
@ -0,0 +1,197 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_GPU_REDUCE_HPP__
|
||||
#define __OPENCV_GPU_REDUCE_HPP__
|
||||
|
||||
#include <thrust/tuple.h>
|
||||
#include "detail/reduce.hpp"
|
||||
#include "detail/reduce_key_val.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <int N, typename T, class Op>
|
||||
__device__ __forceinline__ void reduce(volatile T* smem, T& val, unsigned int tid, const Op& op)
|
||||
{
|
||||
reduce_detail::Dispatcher<N>::reductor::template reduce<volatile T*, T&, const Op&>(smem, val, tid, op);
|
||||
}
|
||||
template <int N,
|
||||
typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
|
||||
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
|
||||
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
|
||||
__device__ __forceinline__ void reduce(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
|
||||
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
|
||||
unsigned int tid,
|
||||
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
|
||||
{
|
||||
reduce_detail::Dispatcher<N>::reductor::template reduce<
|
||||
const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>&,
|
||||
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>&,
|
||||
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>&>(smem, val, tid, op);
|
||||
}
|
||||
|
||||
template <unsigned int N, typename K, typename V, class Cmp>
|
||||
__device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp)
|
||||
{
|
||||
reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, volatile V*, V&, const Cmp&>(skeys, key, svals, val, tid, cmp);
|
||||
}
|
||||
template <unsigned int N,
|
||||
typename K,
|
||||
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp>
|
||||
__device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
unsigned int tid, const Cmp& cmp)
|
||||
{
|
||||
reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>&,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&,
|
||||
const Cmp&>(skeys, key, svals, val, tid, cmp);
|
||||
}
|
||||
template <unsigned int N,
|
||||
typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
|
||||
typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
|
||||
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
|
||||
__device__ __forceinline__ void reduceKeyVal(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
|
||||
const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
unsigned int tid,
|
||||
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp)
|
||||
{
|
||||
reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<
|
||||
const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>&,
|
||||
const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>&,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>&,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&,
|
||||
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>&
|
||||
>(skeys, key, svals, val, tid, cmp);
|
||||
}
|
||||
|
||||
// smem_tuple
|
||||
|
||||
template <typename T0>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*>
|
||||
smem_tuple(T0* t0)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*>
|
||||
smem_tuple(T0* t0, T1* t1)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*, volatile T9*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8, (volatile T9*) t9);
|
||||
}
|
||||
}}}
|
||||
|
||||
#endif // __OPENCV_GPU_UTILITY_HPP__
|
@ -159,7 +159,7 @@ namespace cv { namespace gpu { namespace device
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Reduction
|
||||
|
||||
template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
template <int n, typename T, typename Op> __device__ __forceinline__ void reduce_old(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
StaticAssert<n >= 8 && n <= 512>::check();
|
||||
utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
|
||||
|
@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
|
||||
reduce_old<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator int() const
|
||||
@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
|
||||
reduce_old<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator float() const
|
||||
@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
|
||||
reduce_old<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator float() const
|
||||
@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
|
||||
reduce_old<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator int() const
|
||||
|
97
modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp
Normal file
97
modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp
Normal file
@ -0,0 +1,97 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__
|
||||
#define __OPENCV_GPU_WARP_SHUFFLE_HPP__
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
return __shfl(val, srcLane, width);
|
||||
#else
|
||||
return T();
|
||||
#endif
|
||||
}
|
||||
__device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
int lo = __double2loint(val);
|
||||
int hi = __double2hiint(val);
|
||||
|
||||
lo = __shfl(lo, srcLane, width);
|
||||
hi = __shfl(hi, srcLane, width);
|
||||
|
||||
return __hiloint2double(hi, lo);
|
||||
#else
|
||||
return 0.0;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
return __shfl_down(val, delta, width);
|
||||
#else
|
||||
return T();
|
||||
#endif
|
||||
}
|
||||
__device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
int lo = __double2loint(val);
|
||||
int hi = __double2hiint(val);
|
||||
|
||||
lo = __shfl_down(lo, delta, width);
|
||||
hi = __shfl_down(hi, delta, width);
|
||||
|
||||
return __hiloint2double(hi, lo);
|
||||
#else
|
||||
return 0.0;
|
||||
#endif
|
||||
}
|
||||
}}}
|
||||
|
||||
#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__
|
@ -109,9 +109,9 @@ namespace cv { namespace gpu { namespace device
|
||||
c += Ix * Iy;
|
||||
}
|
||||
|
||||
reduce<32>(srow, a, threadIdx.x, plus<volatile int>());
|
||||
reduce<32>(srow, b, threadIdx.x, plus<volatile int>());
|
||||
reduce<32>(srow, c, threadIdx.x, plus<volatile int>());
|
||||
reduce_old<32>(srow, a, threadIdx.x, plus<volatile int>());
|
||||
reduce_old<32>(srow, b, threadIdx.x, plus<volatile int>());
|
||||
reduce_old<32>(srow, c, threadIdx.x, plus<volatile int>());
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
@ -167,7 +167,7 @@ namespace cv { namespace gpu { namespace device
|
||||
for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
|
||||
m_10 += u * image(loc.y, loc.x + u);
|
||||
|
||||
reduce<32>(srow, m_10, threadIdx.x, plus<volatile int>());
|
||||
reduce_old<32>(srow, m_10, threadIdx.x, plus<volatile int>());
|
||||
|
||||
for (int v = 1; v <= half_k; ++v)
|
||||
{
|
||||
@ -185,8 +185,8 @@ namespace cv { namespace gpu { namespace device
|
||||
m_sum += u * (val_plus + val_minus);
|
||||
}
|
||||
|
||||
reduce<32>(srow, v_sum, threadIdx.x, plus<volatile int>());
|
||||
reduce<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
|
||||
reduce_old<32>(srow, v_sum, threadIdx.x, plus<volatile int>());
|
||||
reduce_old<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
|
||||
|
||||
m_10 += m_sum;
|
||||
m_01 += v * v_sum;
|
||||
@ -419,4 +419,4 @@ namespace cv { namespace gpu { namespace device
|
||||
}
|
||||
}}}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
||||
#endif /* CUDA_DISABLER */
|
||||
|
@ -599,8 +599,8 @@ namespace cv { namespace gpu { namespace device
|
||||
sumy += s_Y[threadIdx.x + 96];
|
||||
}
|
||||
|
||||
device::reduce<32>(s_sumx + threadIdx.y * 32, sumx, threadIdx.x, plus<volatile float>());
|
||||
device::reduce<32>(s_sumy + threadIdx.y * 32, sumy, threadIdx.x, plus<volatile float>());
|
||||
device::reduce_old<32>(s_sumx + threadIdx.y * 32, sumx, threadIdx.x, plus<volatile float>());
|
||||
device::reduce_old<32>(s_sumy + threadIdx.y * 32, sumy, threadIdx.x, plus<volatile float>());
|
||||
|
||||
const float temp_mod = sumx * sumx + sumy * sumy;
|
||||
if (temp_mod > best_mod)
|
||||
|
Loading…
Reference in New Issue
Block a user