implemented gpu::reduce
parent ce35a6d8be
commit 8b23c79294
@@ -860,6 +860,9 @@ namespace cv
         //! counts non-zero array elements
         CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);
 
+        //! reduces a matrix to a vector
+        CV_EXPORTS void reduce(const GpuMat& mtx, GpuMat& vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null());
+
         ///////////////////////////// Calibration 3D //////////////////////////////////
 
 
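For orientation, a minimal usage sketch of the new entry point (not part of the commit; it assumes a CUDA-capable device and an OpenCV build of this era with the gpu module enabled):

    // Sum each column of an 8-bit image on the GPU.
    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::Mat host = cv::Mat::ones(480, 640, CV_8UC1);
        cv::gpu::GpuMat src(host), vec;

        // dim == 0 collapses every column into a single row; CV_32S is requested
        // explicitly because an 8-bit accumulator would overflow under CV_REDUCE_SUM.
        cv::gpu::reduce(src, vec, 0, CV_REDUCE_SUM, CV_32S);

        cv::Mat result;
        vec.download(result);   // 1 x 640, CV_32SC1, every entry equal to 480
        return 0;
    }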
@@ -1804,4 +1804,278 @@ namespace cv { namespace gpu { namespace mathfunc
     template void sqrSumCaller<short>(const DevMem2D, PtrStep, double*, int);
     template void sqrSumCaller<int>(const DevMem2D, PtrStep, double*, int);
     template void sqrSumCaller<float>(const DevMem2D, PtrStep, double*, int);
+
+    //////////////////////////////////////////////////////////////////////////////
+    // reduce
+
+    template <typename S> struct SumReductor
+    {
+        __device__ __forceinline__ S startValue() const
+        {
+            return 0;
+        }
+
+        __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const
+        {
+            return a + b;
+        }
+
+        __device__ __forceinline__ S result(S r, double) const
+        {
+            return r;
+        }
+    };
+
+    template <typename S> struct AvgReductor
+    {
+        __device__ __forceinline__ S startValue() const
+        {
+            return 0;
+        }
+
+        __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const
+        {
+            return a + b;
+        }
+
+        __device__ __forceinline__ double result(S r, double sz) const
+        {
+            return r / sz;
+        }
+    };
+
+    template <typename S> struct MinReductor
+    {
+        __device__ __forceinline__ S startValue() const
+        {
+            return numeric_limits<S>::max();
+        }
+
+        template <typename T> __device__ __forceinline__ T operator ()(volatile T a, volatile T b) const
+        {
+            return saturate_cast<T>(::min(a, b));
+        }
+        __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const
+        {
+            return ::fmin(a, b);
+        }
+
+        __device__ __forceinline__ S result(S r, double) const
+        {
+            return r;
+        }
+    };
+
+    template <typename S> struct MaxReductor
+    {
+        __device__ __forceinline__ S startValue() const
+        {
+            return numeric_limits<S>::min();
+        }
+
+        template <typename T> __device__ __forceinline__ int operator ()(volatile T a, volatile T b) const
+        {
+            return ::max(a, b);
+        }
+        __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const
+        {
+            return ::fmax(a, b);
+        }
+
+        __device__ __forceinline__ S result(S r, double) const
+        {
+            return r;
+        }
+    };
+
+    template <class Op, typename T, typename S, typename D> __global__ void reduceRows(const DevMem2D_<T> src, D* dst, const Op op)
+    {
+        __shared__ S smem[16 * 16];
+
+        const int x = blockIdx.x * 16 + threadIdx.x;
+
+        if (x < src.cols)
+        {
+            S myVal = op.startValue();
+
+            for (int y = threadIdx.y; y < src.rows; y += 16)
+                myVal = op(myVal, src.ptr(y)[x]);
+
+            smem[threadIdx.y * 16 + threadIdx.x] = myVal;
+            __syncthreads();
+
+            if (threadIdx.y == 0)
+            {
+                myVal = smem[threadIdx.x];
+
+                #pragma unroll
+                for (int i = 1; i < 16; ++i)
+                    myVal = op(myVal, smem[i * 16 + threadIdx.x]);
+
+                dst[x] = saturate_cast<D>(op.result(myVal, src.rows));
+            }
+        }
+    }
+
+    template <template <typename> class Op, typename T, typename S, typename D> void reduceRows_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)
+    {
+        const dim3 block(16, 16);
+        const dim3 grid(divUp(src.cols, block.x));
+
+        Op<S> op;
+        reduceRows<Op<S>, T, S, D><<<grid, block, 0, stream>>>(src, dst.data, op);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream)
+    {
+        typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);
+
+        static const caller_t callers[] =
+        {
+            reduceRows_caller<SumReductor, T, S, D>,
+            reduceRows_caller<AvgReductor, T, S, D>,
+            reduceRows_caller<MaxReductor, T, S, D>,
+            reduceRows_caller<MinReductor, T, S, D>
+        };
+
+        callers[reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);
+    }
+
+    template void reduceRows_gpu<uchar, int, uchar>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceRows_gpu<uchar, int, int>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceRows_gpu<uchar, int, float>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+
+    template void reduceRows_gpu<ushort, int, ushort>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceRows_gpu<ushort, int, int>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceRows_gpu<ushort, int, float>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+
+    template void reduceRows_gpu<short, int, short>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceRows_gpu<short, int, int>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceRows_gpu<short, int, float>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+
+    template void reduceRows_gpu<int, int, int>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceRows_gpu<int, int, float>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+
+    template void reduceRows_gpu<float, float, float>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+
+
+    template <int cn, class Op, typename T, typename S, typename D> __global__ void reduceCols(const DevMem2D_<T> src, D* dst, const Op op)
+    {
+        __shared__ S smem[256 * cn];
+
+        const int y = blockIdx.x;
+
+        const T* src_row = src.ptr(y);
+
+        S myVal[cn];
+
+        #pragma unroll
+        for (int c = 0; c < cn; ++c)
+            myVal[c] = op.startValue();
+
+        for (int x = threadIdx.x; x < src.cols; x += 256)
+        {
+            #pragma unroll
+            for (int c = 0; c < cn; ++c)
+                myVal[c] = op(myVal[c], src_row[x * cn + c]);
+        }
+
+        #pragma unroll
+        for (int c = 0; c < cn; ++c)
+            smem[c * 256 + threadIdx.x] = myVal[c];
+        __syncthreads();
+
+        if (threadIdx.x < 128)
+        {
+            #pragma unroll
+            for (int c = 0; c < cn; ++c)
+                smem[c * 256 + threadIdx.x] = op(smem[c * 256 + threadIdx.x], smem[c * 256 + threadIdx.x + 128]);
+        }
+        __syncthreads();
+
+        if (threadIdx.x < 64)
+        {
+            #pragma unroll
+            for (int c = 0; c < cn; ++c)
+                smem[c * 256 + threadIdx.x] = op(smem[c * 256 + threadIdx.x], smem[c * 256 + threadIdx.x + 64]);
+        }
+        __syncthreads();
+
+        volatile S* sdata = smem;
+
+        if (threadIdx.x < 32)
+        {
+            #pragma unroll
+            for (int c = 0; c < cn; ++c)
+            {
+                sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 32]);
+                sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 16]);
+                sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 8]);
+                sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 4]);
+                sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 2]);
+                sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 1]);
+            }
+        }
+        __syncthreads();
+
+        if (threadIdx.x == 0)
+        {
+            #pragma unroll
+            for (int c = 0; c < cn; ++c)
+                dst[y * cn + c] = saturate_cast<D>(op.result(smem[c * 256], src.cols));
+        }
+    }
+
+    template <int cn, template <typename> class Op, typename T, typename S, typename D> void reduceCols_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)
+    {
+        const dim3 block(256);
+        const dim3 grid(src.rows);
+
+        Op<S> op;
+        reduceCols<cn, Op<S>, T, S, D><<<grid, block, 0, stream>>>(src, dst.data, op);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream)
+    {
+        typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);
+
+        static const caller_t callers[4][4] =
+        {
+            {reduceCols_caller<1, SumReductor, T, S, D>, reduceCols_caller<1, AvgReductor, T, S, D>, reduceCols_caller<1, MaxReductor, T, S, D>, reduceCols_caller<1, MinReductor, T, S, D>},
+            {reduceCols_caller<2, SumReductor, T, S, D>, reduceCols_caller<2, AvgReductor, T, S, D>, reduceCols_caller<2, MaxReductor, T, S, D>, reduceCols_caller<2, MinReductor, T, S, D>},
+            {reduceCols_caller<3, SumReductor, T, S, D>, reduceCols_caller<3, AvgReductor, T, S, D>, reduceCols_caller<3, MaxReductor, T, S, D>, reduceCols_caller<3, MinReductor, T, S, D>},
+            {reduceCols_caller<4, SumReductor, T, S, D>, reduceCols_caller<4, AvgReductor, T, S, D>, reduceCols_caller<4, MaxReductor, T, S, D>, reduceCols_caller<4, MinReductor, T, S, D>},
+        };
+
+        callers[cn - 1][reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);
+    }
+
+    template void reduceCols_gpu<uchar, int, uchar>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceCols_gpu<uchar, int, int>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceCols_gpu<uchar, int, float>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+
+    template void reduceCols_gpu<ushort, int, ushort>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceCols_gpu<ushort, int, int>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceCols_gpu<ushort, int, float>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+
+    template void reduceCols_gpu<short, int, short>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceCols_gpu<short, int, int>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceCols_gpu<short, int, float>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+
+    template void reduceCols_gpu<int, int, int>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceCols_gpu<int, int, float>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+
+    template void reduceCols_gpu<float, float, float>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
 }}}
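As a plain-CPU cross-check on the kernels above (illustrative only, not part of the commit): reduceRows launches one 16x16 block per stripe of 16 columns, each thread accumulates a strided partial down its column, and the 16 partials per column are then combined from shared memory by the first warp row. The sketch below computes the same dim == 0 sum on the host for a single-channel float matrix.

    #include <opencv2/core/core.hpp>

    // Reference row reduction: dst(0, x) = sum over y of src(y, x),
    // i.e. what reduceRows computes with SumReductor for CV_32FC1 input.
    cv::Mat reduceRowsSumRef(const cv::Mat& src)
    {
        CV_Assert(src.type() == CV_32FC1);
        cv::Mat dst = cv::Mat::zeros(1, src.cols, CV_32FC1);
        for (int y = 0; y < src.rows; ++y)
            for (int x = 0; x < src.cols; ++x)
                dst.at<float>(0, x) += src.at<float>(y, x);
        return dst;
    }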
@@ -63,6 +63,7 @@ void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const G
 void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
 int cv::gpu::countNonZero(const GpuMat&) { throw_nogpu(); return 0; }
 int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_nogpu(); return 0; }
+void cv::gpu::reduce(const GpuMat&, GpuMat&, int, int, int, Stream&) { throw_nogpu(); }
 
 #else
 
@@ -598,4 +599,150 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
     return caller(src, buf);
 }
 
+
+//////////////////////////////////////////////////////////////////////////////
+// reduce
+
+namespace cv { namespace gpu { namespace mathfunc {
+    template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+}}}
+
+void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream)
+{
+    using namespace cv::gpu::mathfunc;
+    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4 && dtype <= CV_32F);
+    CV_Assert(dim == 0 || dim == 1);
+    CV_Assert(reduceOp == CV_REDUCE_SUM || reduceOp == CV_REDUCE_AVG || reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN);
+
+    if (dtype < 0)
+        dtype = src.depth();
+
+    dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKETYPE(dtype, src.channels()));
+
+    if (dim == 0)
+    {
+        typedef void (*caller_t)(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+
+        static const caller_t callers[6][6] =
+        {
+            {
+                reduceRows_gpu<unsigned char, int, unsigned char>,
+                0/*reduceRows_gpu<unsigned char, int, signed char>*/,
+                0/*reduceRows_gpu<unsigned char, int, unsigned short>*/,
+                0/*reduceRows_gpu<unsigned char, int, short>*/,
+                reduceRows_gpu<unsigned char, int, int>,
+                reduceRows_gpu<unsigned char, int, float>
+            },
+            {
+                0/*reduceRows_gpu<signed char, int, unsigned char>*/,
+                0/*reduceRows_gpu<signed char, int, signed char>*/,
+                0/*reduceRows_gpu<signed char, int, unsigned short>*/,
+                0/*reduceRows_gpu<signed char, int, short>*/,
+                0/*reduceRows_gpu<signed char, int, int>*/,
+                0/*reduceRows_gpu<signed char, int, float>*/
+            },
+            {
+                0/*reduceRows_gpu<unsigned short, int, unsigned char>*/,
+                0/*reduceRows_gpu<unsigned short, int, signed char>*/,
+                reduceRows_gpu<unsigned short, int, unsigned short>,
+                0/*reduceRows_gpu<unsigned short, int, short>*/,
+                reduceRows_gpu<unsigned short, int, int>,
+                reduceRows_gpu<unsigned short, int, float>
+            },
+            {
+                0/*reduceRows_gpu<short, int, unsigned char>*/,
+                0/*reduceRows_gpu<short, int, signed char>*/,
+                0/*reduceRows_gpu<short, int, unsigned short>*/,
+                reduceRows_gpu<short, int, short>,
+                reduceRows_gpu<short, int, int>,
+                reduceRows_gpu<short, int, float>
+            },
+            {
+                0/*reduceRows_gpu<int, int, unsigned char>*/,
+                0/*reduceRows_gpu<int, int, signed char>*/,
+                0/*reduceRows_gpu<int, int, unsigned short>*/,
+                0/*reduceRows_gpu<int, int, short>*/,
+                reduceRows_gpu<int, int, int>,
+                reduceRows_gpu<int, int, float>
+            },
+            {
+                0/*reduceRows_gpu<float, float, unsigned char>*/,
+                0/*reduceRows_gpu<float, float, signed char>*/,
+                0/*reduceRows_gpu<float, float, unsigned short>*/,
+                0/*reduceRows_gpu<float, float, short>*/,
+                0/*reduceRows_gpu<float, float, int>*/,
+                reduceRows_gpu<float, float, float>
+            }
+        };
+
+        const caller_t func = callers[src.depth()][dst.depth()];
+        if (!func)
+            CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats");
+
+        func(src.reshape(1), dst.reshape(1), reduceOp, StreamAccessor::getStream(stream));
+    }
+    else
+    {
+        typedef void (*caller_t)(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+
+        static const caller_t callers[6][6] =
+        {
+            {
+                reduceCols_gpu<unsigned char, int, unsigned char>,
+                0/*reduceCols_gpu<unsigned char, int, signed char>*/,
+                0/*reduceCols_gpu<unsigned char, int, unsigned short>*/,
+                0/*reduceCols_gpu<unsigned char, int, short>*/,
+                reduceCols_gpu<unsigned char, int, int>,
+                reduceCols_gpu<unsigned char, int, float>
+            },
+            {
+                0/*reduceCols_gpu<signed char, int, unsigned char>*/,
+                0/*reduceCols_gpu<signed char, int, signed char>*/,
+                0/*reduceCols_gpu<signed char, int, unsigned short>*/,
+                0/*reduceCols_gpu<signed char, int, short>*/,
+                0/*reduceCols_gpu<signed char, int, int>*/,
+                0/*reduceCols_gpu<signed char, int, float>*/
+            },
+            {
+                0/*reduceCols_gpu<unsigned short, int, unsigned char>*/,
+                0/*reduceCols_gpu<unsigned short, int, signed char>*/,
+                reduceCols_gpu<unsigned short, int, unsigned short>,
+                0/*reduceCols_gpu<unsigned short, int, short>*/,
+                reduceCols_gpu<unsigned short, int, int>,
+                reduceCols_gpu<unsigned short, int, float>
+            },
+            {
+                0/*reduceCols_gpu<short, int, unsigned char>*/,
+                0/*reduceCols_gpu<short, int, signed char>*/,
+                0/*reduceCols_gpu<short, int, unsigned short>*/,
+                reduceCols_gpu<short, int, short>,
+                reduceCols_gpu<short, int, int>,
+                reduceCols_gpu<short, int, float>
+            },
+            {
+                0/*reduceCols_gpu<int, int, unsigned char>*/,
+                0/*reduceCols_gpu<int, int, signed char>*/,
+                0/*reduceCols_gpu<int, int, unsigned short>*/,
+                0/*reduceCols_gpu<int, int, short>*/,
+                reduceCols_gpu<int, int, int>,
+                reduceCols_gpu<int, int, float>
+            },
+            {
+                0/*reduceCols_gpu<float, float, unsigned char>*/,
+                0/*reduceCols_gpu<float, float, signed char>*/,
+                0/*reduceCols_gpu<float, float, unsigned short>*/,
+                0/*reduceCols_gpu<float, float, short>*/,
+                0/*reduceCols_gpu<float, float, int>*/,
+                reduceCols_gpu<float, float, float>
+            }
+        };
+
+        const caller_t func = callers[src.depth()][dst.depth()];
+        if (!func)
+            CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats");
+
+        func(src, src.channels(), dst, reduceOp, StreamAccessor::getStream(stream));
+    }
+}
+
 #endif
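A hedged sketch of how the dispatch above behaves from the caller's side: leaving dtype at its default of -1 keeps the source depth, while a source/destination depth pair whose table slot is 0 (for example CV_8U reduced into a CV_16S vector) is expected to raise CV_StsUnsupportedFormat. The snippet below is illustrative only and not part of the commit.

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>
    #include <iostream>

    int main()
    {
        cv::gpu::GpuMat src(cv::Mat::ones(64, 64, CV_8UC1)), dst;

        // dtype defaults to -1, so the output keeps the CV_8U source depth.
        cv::gpu::reduce(src, dst, 0, CV_REDUCE_MIN);

        try
        {
            // CV_16S output for a CV_8U source has no instantiation in the table,
            // so this call should throw with CV_StsUnsupportedFormat.
            cv::gpu::reduce(src, dst, 0, CV_REDUCE_MIN, CV_16S);
        }
        catch (const cv::Exception& e)
        {
            std::cout << e.what() << std::endl;
        }
        return 0;
    }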
@@ -1788,4 +1788,76 @@ INSTANTIATE_TEST_CASE_P(Arithm, AddWeighted, testing::Combine(
     testing::ValuesIn(types(CV_8U, CV_64F, 1, 1)),
     testing::ValuesIn(types(CV_8U, CV_64F, 1, 1))));
 
+
+//////////////////////////////////////////////////////////////////////////////
+// reduce
+
+struct Reduce : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceInfo, int, int, int> >
+{
+    cv::gpu::DeviceInfo devInfo;
+    int type;
+    int dim;
+    int reduceOp;
+
+    cv::Size size;
+    cv::Mat src;
+
+    cv::Mat dst_gold;
+
+    virtual void SetUp()
+    {
+        devInfo = std::tr1::get<0>(GetParam());
+        type = std::tr1::get<1>(GetParam());
+        dim = std::tr1::get<2>(GetParam());
+        reduceOp = std::tr1::get<3>(GetParam());
+
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        cv::RNG& rng = cvtest::TS::ptr()->get_rng();
+
+        size = cv::Size(rng.uniform(100, 400), rng.uniform(100, 400));
+
+        src = cvtest::randomMat(rng, size, type, 0.0, 255.0, false);
+
+        cv::reduce(src, dst_gold, dim, reduceOp, reduceOp == CV_REDUCE_SUM || reduceOp == CV_REDUCE_AVG ? CV_32F : CV_MAT_DEPTH(type));
+
+        if (dim == 1)
+        {
+            dst_gold.cols = dst_gold.rows;
+            dst_gold.rows = 1;
+            dst_gold.step = dst_gold.cols * dst_gold.elemSize();
+        }
+    }
+};
+
+TEST_P(Reduce, Accuracy)
+{
+    static const char* reduceOpStrs[] = {"CV_REDUCE_SUM", "CV_REDUCE_AVG", "CV_REDUCE_MAX", "CV_REDUCE_MIN"};
+    const char* reduceOpStr = reduceOpStrs[reduceOp];
+
+    PRINT_PARAM(devInfo);
+    PRINT_TYPE(type);
+    PRINT_PARAM(dim);
+    PRINT_PARAM(reduceOpStr);
+    PRINT_PARAM(size);
+
+    cv::Mat dst;
+
+    ASSERT_NO_THROW(
+        cv::gpu::GpuMat dev_dst;
+
+        cv::gpu::reduce(cv::gpu::GpuMat(src), dev_dst, dim, reduceOp, reduceOp == CV_REDUCE_SUM || reduceOp == CV_REDUCE_AVG ? CV_32F : CV_MAT_DEPTH(type));
+
+        dev_dst.download(dst);
+    );
+
+    double norm = reduceOp == CV_REDUCE_SUM || reduceOp == CV_REDUCE_AVG ? 1e-1 : 0.0;
+    EXPECT_MAT_NEAR(dst_gold, dst, norm);
+}
+
+INSTANTIATE_TEST_CASE_P(Arithm, Reduce, testing::Combine(
+    testing::ValuesIn(devices()),
+    testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4),
+    testing::Values(0, 1),
+    testing::Values((int)CV_REDUCE_SUM, (int)CV_REDUCE_AVG, (int)CV_REDUCE_MAX, (int)CV_REDUCE_MIN)));
+
 #endif // HAVE_CUDA
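One behavioural detail the test above works around, stated explicitly: for dim == 1 the GPU result is laid out as a single row (1 x src.rows), whereas cv::reduce on the CPU returns a column (src.rows x 1), so one side must be transposed before comparing. A small illustrative sketch (not part of the commit; names and tolerance are arbitrary):

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::Mat src = cv::Mat::ones(100, 200, CV_32FC1);

        cv::Mat cpu;                        // 100 x 1 column after the CPU reduce
        cv::reduce(src, cpu, 1, CV_REDUCE_AVG);

        cv::gpu::GpuMat d_src(src), d_dst;  // GPU result comes back as 1 x 100
        cv::gpu::reduce(d_src, d_dst, 1, CV_REDUCE_AVG);

        cv::Mat gpu;
        d_dst.download(gpu);

        double diff = cv::norm(cpu.t(), gpu, cv::NORM_INF); // transpose before comparing
        return diff < 1e-5 ? 0 : 1;
    }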