/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/limits.hpp"

#include "unroll_detail.hpp"

using namespace cv::gpu;
using namespace cv::gpu::cudev;

namespace reduce
{
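    // Reduction operators shared by the row and column kernels below. Each operator
    // provides:
    //   startValue<T>()  - the identity element the accumulator starts from,
    //   operator()(a, b) - the binary combine step,
    //   result(r, sz)    - the final transform (only Avg uses sz, dividing by the
    //                      number of reduced elements).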
    struct Sum
    {
        template <typename T>
        __device__ __forceinline__ T startValue() const
        {
            return VecTraits<T>::all(0);
        }

        template <typename T>
        __device__ __forceinline__ T operator ()(T a, T b) const
        {
            return a + b;
        }

        template <typename T>
        __device__ __forceinline__ T result(T r, double) const
        {
            return r;
        }

        __host__ __device__ __forceinline__ Sum() {}
        __host__ __device__ __forceinline__ Sum(const Sum&) {}
    };

    struct Avg
    {
        template <typename T>
        __device__ __forceinline__ T startValue() const
        {
            return VecTraits<T>::all(0);
        }

        template <typename T>
        __device__ __forceinline__ T operator ()(T a, T b) const
        {
            return a + b;
        }

        template <typename T>
        __device__ __forceinline__ typename TypeVec<double, VecTraits<T>::cn>::vec_type result(T r, double sz) const
        {
            return r / sz;
        }

        __host__ __device__ __forceinline__ Avg() {}
        __host__ __device__ __forceinline__ Avg(const Avg&) {}
    };

    struct Min
    {
        template <typename T>
        __device__ __forceinline__ T startValue() const
        {
            return VecTraits<T>::all(numeric_limits<typename VecTraits<T>::elem_type>::max());
        }

        template <typename T>
        __device__ __forceinline__ T operator ()(T a, T b) const
        {
            minimum<T> minOp;
            return minOp(a, b);
        }

        template <typename T>
        __device__ __forceinline__ T result(T r, double) const
        {
            return r;
        }

        __host__ __device__ __forceinline__ Min() {}
        __host__ __device__ __forceinline__ Min(const Min&) {}
    };

    struct Max
    {
        template <typename T>
        __device__ __forceinline__ T startValue() const
        {
            return VecTraits<T>::all(-numeric_limits<typename VecTraits<T>::elem_type>::max());
        }

        template <typename T>
        __device__ __forceinline__ T operator ()(T a, T b) const
        {
            maximum<T> maxOp;
            return maxOp(a, b);
        }

        template <typename T>
        __device__ __forceinline__ T result(T r, double) const
        {
            return r;
        }

        __host__ __device__ __forceinline__ Max() {}
        __host__ __device__ __forceinline__ Max(const Max&) {}
    };

    ///////////////////////////////////////////////////////////

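    // Reduces the source matrix across its rows: dst[x] is the reduction of column x
    // over all rows. Each 16x16 block covers 16 columns; the 16 threads sharing a
    // column accumulate strided partial results, which are staged in shared memory
    // (stored transposed so one column's partials are contiguous) and folded with
    // cudev::reduce<16>.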
    template <typename T, typename S, typename D, class Op>
    __global__ void rowsKernel(const PtrStepSz<T> src, D* dst, const Op op)
    {
        __shared__ S smem[16 * 16];

        const int x = blockIdx.x * 16 + threadIdx.x;

        S myVal = op.template startValue<S>();

        if (x < src.cols)
        {
            for (int y = threadIdx.y; y < src.rows; y += 16)
            {
                S srcVal = src(y, x);
                myVal = op(myVal, srcVal);
            }
        }

        smem[threadIdx.x * 16 + threadIdx.y] = myVal;

        __syncthreads();

        volatile S* srow = smem + threadIdx.y * 16;

        myVal = srow[threadIdx.x];
        cudev::reduce<16>(srow, myVal, threadIdx.x, op);

        if (threadIdx.x == 0)
            srow[0] = myVal;

        __syncthreads();

        if (threadIdx.y == 0 && x < src.cols)
            dst[x] = (D) op.result(smem[threadIdx.x * 16], src.rows);
    }

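    // Host-side launcher: one 16x16 block per 16 source columns; synchronizes when
    // running on the default stream.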
    template <typename T, typename S, typename D, class Op>
    void rowsCaller(PtrStepSz<T> src, D* dst, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(src.cols, block.x));

        Op op;
        rowsKernel<T, S, D, Op><<<grid, block, 0, stream>>>(src, dst, op);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

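    // Dispatch on the reduction code; op is expected to follow the CV_REDUCE_* codes,
    // so the table order is SUM, AVG, MAX, MIN.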
    template <typename T, typename S, typename D>
    void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream)
    {
        typedef void (*func_t)(PtrStepSz<T> src, D* dst, cudaStream_t stream);
        static const func_t funcs[] =
        {
            rowsCaller<T, S, D, Sum>,
            rowsCaller<T, S, D, Avg>,
            rowsCaller<T, S, D, Max>,
            rowsCaller<T, S, D, Min>
        };

        funcs[op]((PtrStepSz<T>) src, (D*) dst, stream);
    }

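    // Explicit instantiations: T is the source element type, S the accumulator type
    // and D the destination element type.
    //
    // Minimal host-side sketch (illustrative only; the GpuMat names and sizes are
    // assumptions, and callers normally reach this through the cv::gpu::reduce
    // host wrapper rather than directly):
    //
    //   // cv::gpu::GpuMat src(480, 640, CV_8UC1), dst(1, 640, CV_32SC1);
    //   // reduce::rows<unsigned char, int, int>(src, dst.data, 0 /* CV_REDUCE_SUM */, 0);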
    template void rows<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned char, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned char, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned char, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template void rows<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template void rows<short, int, short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template void rows<int, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<int, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<int, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template void rows<float, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<float, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template void rows<double, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    ///////////////////////////////////////////////////////////

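    // Reduces the source matrix across its columns: dst[y] is the reduction of row y.
    // One thread block handles one source row; cn channels are carried in the
    // matching vector types, and detail::Unroll folds all channels in a single
    // cudev::reduce call.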
    template <int BLOCK_SIZE, typename T, typename S, typename D, int cn, class Op>
    __global__ void colsKernel(const PtrStepSz<typename TypeVec<T, cn>::vec_type> src, typename TypeVec<D, cn>::vec_type* dst, const Op op)
    {
        typedef typename TypeVec<T, cn>::vec_type src_type;
        typedef typename TypeVec<S, cn>::vec_type work_type;
        typedef typename TypeVec<D, cn>::vec_type dst_type;

        __shared__ S smem[BLOCK_SIZE * cn];

        const int y = blockIdx.x;

        const src_type* srcRow = src.ptr(y);

        work_type myVal = op.template startValue<work_type>();

        for (int x = threadIdx.x; x < src.cols; x += BLOCK_SIZE)
            myVal = op(myVal, saturate_cast<work_type>(srcRow[x]));

        cudev::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(myVal), threadIdx.x, detail::Unroll<cn>::op(op));

        if (threadIdx.x == 0)
            dst[y] = saturate_cast<dst_type>(op.result(myVal, src.cols));
    }

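    // Host-side launcher: one 256-thread block per source row.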
    template <typename T, typename S, typename D, int cn, class Op> void colsCaller(PtrStepSzb src, void* dst, cudaStream_t stream)
    {
        const int BLOCK_SIZE = 256;

        const dim3 block(BLOCK_SIZE);
        const dim3 grid(src.rows);

        Op op;
        colsKernel<BLOCK_SIZE, T, S, D, cn, Op><<<grid, block, 0, stream>>>((PtrStepSz<typename TypeVec<T, cn>::vec_type>) src, (typename TypeVec<D, cn>::vec_type*) dst, op);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

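    // Dispatch on channel count (1-4) and reduction code; row 0 of the table is
    // unused padding so the array can be indexed directly by cn.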
    template <typename T, typename S, typename D> void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream)
    {
        typedef void (*func_t)(PtrStepSzb src, void* dst, cudaStream_t stream);
        static const func_t funcs[5][4] =
        {
            {0,0,0,0},
            {colsCaller<T, S, D, 1, Sum>, colsCaller<T, S, D, 1, Avg>, colsCaller<T, S, D, 1, Max>, colsCaller<T, S, D, 1, Min>},
            {colsCaller<T, S, D, 2, Sum>, colsCaller<T, S, D, 2, Avg>, colsCaller<T, S, D, 2, Max>, colsCaller<T, S, D, 2, Min>},
            {colsCaller<T, S, D, 3, Sum>, colsCaller<T, S, D, 3, Avg>, colsCaller<T, S, D, 3, Max>, colsCaller<T, S, D, 3, Min>},
            {colsCaller<T, S, D, 4, Sum>, colsCaller<T, S, D, 4, Avg>, colsCaller<T, S, D, 4, Max>, colsCaller<T, S, D, 4, Min>},
        };

        funcs[cn][op](src, dst, stream);
    }

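    // Explicit instantiations, mirroring the rows<> set above.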
    template void cols<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned char, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned char, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned char, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

    template void cols<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

    template void cols<short, int, short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

    template void cols<int, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<int, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<int, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

    template void cols<float, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<float, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

    template void cols<double, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */