mirror of
https://github.com/opencv/opencv.git
synced 2024-11-24 19:20:28 +08:00
Merge pull request #1540 from jet47:gpuarithm-cudev
This commit is contained in:
commit
21233656bd
@ -216,7 +216,7 @@ namespace
|
||||
template <typename T>
|
||||
void copyWithMask(const GpuMat& src, const GpuMat& dst, const GpuMat& mask, Stream& stream)
|
||||
{
|
||||
gridTransform_< CopyToPolicy<sizeof(typename VecTraits<T>::elem_type)> >(globPtr<T>(src), globPtr<T>(dst), identity<T>(), globPtr<uchar>(mask), stream);
|
||||
gridTransformUnary_< CopyToPolicy<sizeof(typename VecTraits<T>::elem_type)> >(globPtr<T>(src), globPtr<T>(dst), identity<T>(), globPtr<uchar>(mask), stream);
|
||||
}
|
||||
}
|
||||
|
||||
@ -268,14 +268,14 @@ namespace
|
||||
void setToWithOutMask(const GpuMat& mat, Scalar _scalar, Stream& stream)
|
||||
{
|
||||
Scalar_<typename VecTraits<T>::elem_type> scalar = _scalar;
|
||||
gridTransform(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), stream);
|
||||
gridTransformUnary(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), stream);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void setToWithMask(const GpuMat& mat, const GpuMat& mask, Scalar _scalar, Stream& stream)
|
||||
{
|
||||
Scalar_<typename VecTraits<T>::elem_type> scalar = _scalar;
|
||||
gridTransform(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), globPtr<uchar>(mask), stream);
|
||||
gridTransformUnary(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), globPtr<uchar>(mask), stream);
|
||||
}
|
||||
}
|
||||
|
||||
@ -382,7 +382,7 @@ namespace
|
||||
typedef typename LargerType<src_elem_type, float>::type larger_elem_type;
|
||||
typedef typename LargerType<float, dst_elem_type>::type scalar_type;
|
||||
|
||||
gridTransform_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), saturate_cast_func<T, D>(), stream);
|
||||
gridTransformUnary_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), saturate_cast_func<T, D>(), stream);
|
||||
}
|
||||
|
||||
template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
|
||||
@ -408,7 +408,7 @@ namespace
|
||||
op.alpha = cv::saturate_cast<scalar_type>(alpha);
|
||||
op.beta = cv::saturate_cast<scalar_type>(beta);
|
||||
|
||||
gridTransform_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), op, stream);
|
||||
gridTransformUnary_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), op, stream);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -6,7 +6,7 @@ set(the_description "CUDA-accelerated Operations on Matrices")
|
||||
|
||||
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)
|
||||
|
||||
ocv_add_module(cudaarithm opencv_core OPTIONAL opencv_cudalegacy)
|
||||
ocv_add_module(cudaarithm opencv_core OPTIONAL opencv_cudev)
|
||||
|
||||
ocv_module_include_directories()
|
||||
ocv_glob_module_sources()
|
||||
|
@ -248,60 +248,3 @@ PERF_TEST_P(Sz_KernelSz_Ccorr, Convolve,
|
||||
CPU_SANITY_CHECK(dst);
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Integral
|
||||
|
||||
PERF_TEST_P(Sz, Integral,
|
||||
CUDA_TYPICAL_MAT_SIZES)
|
||||
{
|
||||
const cv::Size size = GetParam();
|
||||
|
||||
cv::Mat src(size, CV_8UC1);
|
||||
declare.in(src, WARMUP_RNG);
|
||||
|
||||
if (PERF_RUN_CUDA())
|
||||
{
|
||||
const cv::cuda::GpuMat d_src(src);
|
||||
cv::cuda::GpuMat dst;
|
||||
cv::cuda::GpuMat d_buf;
|
||||
|
||||
TEST_CYCLE() cv::cuda::integral(d_src, dst, d_buf);
|
||||
|
||||
CUDA_SANITY_CHECK(dst);
|
||||
}
|
||||
else
|
||||
{
|
||||
cv::Mat dst;
|
||||
|
||||
TEST_CYCLE() cv::integral(src, dst);
|
||||
|
||||
CPU_SANITY_CHECK(dst);
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// IntegralSqr
|
||||
|
||||
PERF_TEST_P(Sz, IntegralSqr,
|
||||
CUDA_TYPICAL_MAT_SIZES)
|
||||
{
|
||||
const cv::Size size = GetParam();
|
||||
|
||||
cv::Mat src(size, CV_8UC1);
|
||||
declare.in(src, WARMUP_RNG);
|
||||
|
||||
if (PERF_RUN_CUDA())
|
||||
{
|
||||
const cv::cuda::GpuMat d_src(src);
|
||||
cv::cuda::GpuMat dst, buf;
|
||||
|
||||
TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst, buf);
|
||||
|
||||
CUDA_SANITY_CHECK(dst);
|
||||
}
|
||||
else
|
||||
{
|
||||
FAIL_NO_CPU();
|
||||
}
|
||||
}
|
||||
|
@ -373,7 +373,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce,
|
||||
const cv::cuda::GpuMat d_src(src);
|
||||
cv::cuda::GpuMat dst;
|
||||
|
||||
TEST_CYCLE() cv::cuda::reduce(d_src, dst, dim, reduceOp);
|
||||
TEST_CYCLE() cv::cuda::reduce(d_src, dst, dim, reduceOp, CV_32F);
|
||||
|
||||
CUDA_SANITY_CHECK(dst);
|
||||
}
|
||||
@ -381,7 +381,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce,
|
||||
{
|
||||
cv::Mat dst;
|
||||
|
||||
TEST_CYCLE() cv::reduce(src, dst, dim, reduceOp);
|
||||
TEST_CYCLE() cv::reduce(src, dst, dim, reduceOp, CV_32F);
|
||||
|
||||
CPU_SANITY_CHECK(dst);
|
||||
}
|
||||
@ -465,3 +465,60 @@ PERF_TEST_P(Sz, MeanStdDev,
|
||||
SANITY_CHECK(cpu_stddev);
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Integral
|
||||
|
||||
PERF_TEST_P(Sz, Integral,
|
||||
CUDA_TYPICAL_MAT_SIZES)
|
||||
{
|
||||
const cv::Size size = GetParam();
|
||||
|
||||
cv::Mat src(size, CV_8UC1);
|
||||
declare.in(src, WARMUP_RNG);
|
||||
|
||||
if (PERF_RUN_CUDA())
|
||||
{
|
||||
const cv::cuda::GpuMat d_src(src);
|
||||
cv::cuda::GpuMat dst;
|
||||
cv::cuda::GpuMat d_buf;
|
||||
|
||||
TEST_CYCLE() cv::cuda::integral(d_src, dst, d_buf);
|
||||
|
||||
CUDA_SANITY_CHECK(dst);
|
||||
}
|
||||
else
|
||||
{
|
||||
cv::Mat dst;
|
||||
|
||||
TEST_CYCLE() cv::integral(src, dst);
|
||||
|
||||
CPU_SANITY_CHECK(dst);
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// IntegralSqr
|
||||
|
||||
PERF_TEST_P(Sz, IntegralSqr,
|
||||
CUDA_TYPICAL_MAT_SIZES)
|
||||
{
|
||||
const cv::Size size = GetParam();
|
||||
|
||||
cv::Mat src(size, CV_8UC1);
|
||||
declare.in(src, WARMUP_RNG);
|
||||
|
||||
if (PERF_RUN_CUDA())
|
||||
{
|
||||
const cv::cuda::GpuMat d_src(src);
|
||||
cv::cuda::GpuMat dst, buf;
|
||||
|
||||
TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst, buf);
|
||||
|
||||
CUDA_SANITY_CHECK(dst);
|
||||
}
|
||||
else
|
||||
{
|
||||
FAIL_NO_CPU();
|
||||
}
|
||||
}
|
||||
|
@ -292,95 +292,6 @@ void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray
|
||||
#endif
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// mulSpectrums
|
||||
|
||||
#ifdef HAVE_CUFFT
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);
|
||||
|
||||
void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);
|
||||
}}}
|
||||
|
||||
#endif
|
||||
|
||||
void cv::cuda::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, bool conjB, Stream& stream)
|
||||
{
|
||||
#ifndef HAVE_CUFFT
|
||||
(void) _src1;
|
||||
(void) _src2;
|
||||
(void) _dst;
|
||||
(void) flags;
|
||||
(void) conjB;
|
||||
(void) stream;
|
||||
throw_no_cuda();
|
||||
#else
|
||||
(void) flags;
|
||||
|
||||
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, PtrStepSz<cufftComplex>, cudaStream_t stream);
|
||||
static Caller callers[] = { device::mulSpectrums, device::mulSpectrums_CONJ };
|
||||
|
||||
GpuMat src1 = _src1.getGpuMat();
|
||||
GpuMat src2 = _src2.getGpuMat();
|
||||
|
||||
CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2 );
|
||||
CV_Assert( src1.size() == src2.size() );
|
||||
|
||||
_dst.create(src1.size(), CV_32FC2);
|
||||
GpuMat dst = _dst.getGpuMat();
|
||||
|
||||
Caller caller = callers[(int)conjB];
|
||||
caller(src1, src2, dst, StreamAccessor::getStream(stream));
|
||||
#endif
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// mulAndScaleSpectrums
|
||||
|
||||
#ifdef HAVE_CUFFT
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);
|
||||
|
||||
void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);
|
||||
}}}
|
||||
|
||||
#endif
|
||||
|
||||
void cv::cuda::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, float scale, bool conjB, Stream& stream)
|
||||
{
|
||||
#ifndef HAVE_CUFFT
|
||||
(void) _src1;
|
||||
(void) _src2;
|
||||
(void) _dst;
|
||||
(void) flags;
|
||||
(void) scale;
|
||||
(void) conjB;
|
||||
(void) stream;
|
||||
throw_no_cuda();
|
||||
#else
|
||||
(void)flags;
|
||||
|
||||
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, PtrStepSz<cufftComplex>, cudaStream_t stream);
|
||||
static Caller callers[] = { device::mulAndScaleSpectrums, device::mulAndScaleSpectrums_CONJ };
|
||||
|
||||
GpuMat src1 = _src1.getGpuMat();
|
||||
GpuMat src2 = _src2.getGpuMat();
|
||||
|
||||
CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2);
|
||||
CV_Assert( src1.size() == src2.size() );
|
||||
|
||||
_dst.create(src1.size(), CV_32FC2);
|
||||
GpuMat dst = _dst.getGpuMat();
|
||||
|
||||
Caller caller = callers[(int)conjB];
|
||||
caller(src1, src2, scale, dst, StreamAccessor::getStream(stream));
|
||||
#endif
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// dft
|
||||
|
||||
|
@ -63,163 +63,6 @@ void cv::cuda::copyMakeBorder(InputArray, OutputArray, int, int, int, int, int,
|
||||
|
||||
#else /* !defined (HAVE_CUDA) */
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// merge/split
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
namespace split_merge
|
||||
{
|
||||
void merge(const PtrStepSzb* src, PtrStepSzb& dst, int total_channels, size_t elem_size, const cudaStream_t& stream);
|
||||
void split(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
namespace
|
||||
{
|
||||
void merge_caller(const GpuMat* src, size_t n, OutputArray _dst, Stream& stream)
|
||||
{
|
||||
CV_Assert( src != 0 );
|
||||
CV_Assert( n > 0 && n <= 4 );
|
||||
|
||||
const int depth = src[0].depth();
|
||||
const Size size = src[0].size();
|
||||
|
||||
for (size_t i = 0; i < n; ++i)
|
||||
{
|
||||
CV_Assert( src[i].size() == size );
|
||||
CV_Assert( src[i].depth() == depth );
|
||||
CV_Assert( src[i].channels() == 1 );
|
||||
}
|
||||
|
||||
if (depth == CV_64F)
|
||||
{
|
||||
if (!deviceSupports(NATIVE_DOUBLE))
|
||||
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
|
||||
}
|
||||
|
||||
if (n == 1)
|
||||
{
|
||||
src[0].copyTo(_dst, stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
_dst.create(size, CV_MAKE_TYPE(depth, (int)n));
|
||||
GpuMat dst = _dst.getGpuMat();
|
||||
|
||||
PtrStepSzb src_as_devmem[4];
|
||||
for(size_t i = 0; i < n; ++i)
|
||||
src_as_devmem[i] = src[i];
|
||||
|
||||
PtrStepSzb dst_as_devmem(dst);
|
||||
cv::cuda::device::split_merge::merge(src_as_devmem, dst_as_devmem, (int)n, CV_ELEM_SIZE(depth), StreamAccessor::getStream(stream));
|
||||
}
|
||||
}
|
||||
|
||||
void split_caller(const GpuMat& src, GpuMat* dst, Stream& stream)
|
||||
{
|
||||
CV_Assert( dst != 0 );
|
||||
|
||||
const int depth = src.depth();
|
||||
const int num_channels = src.channels();
|
||||
|
||||
CV_Assert( num_channels <= 4 );
|
||||
|
||||
if (depth == CV_64F)
|
||||
{
|
||||
if (!deviceSupports(NATIVE_DOUBLE))
|
||||
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
|
||||
}
|
||||
|
||||
if (num_channels == 1)
|
||||
{
|
||||
src.copyTo(dst[0], stream);
|
||||
return;
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_channels; ++i)
|
||||
dst[i].create(src.size(), depth);
|
||||
|
||||
PtrStepSzb dst_as_devmem[4];
|
||||
for (int i = 0; i < num_channels; ++i)
|
||||
dst_as_devmem[i] = dst[i];
|
||||
|
||||
PtrStepSzb src_as_devmem(src);
|
||||
cv::cuda::device::split_merge::split(src_as_devmem, dst_as_devmem, num_channels, src.elemSize1(), StreamAccessor::getStream(stream));
|
||||
}
|
||||
}
|
||||
|
||||
void cv::cuda::merge(const GpuMat* src, size_t n, OutputArray dst, Stream& stream)
|
||||
{
|
||||
merge_caller(src, n, dst, stream);
|
||||
}
|
||||
|
||||
|
||||
void cv::cuda::merge(const std::vector<GpuMat>& src, OutputArray dst, Stream& stream)
|
||||
{
|
||||
merge_caller(&src[0], src.size(), dst, stream);
|
||||
}
|
||||
|
||||
void cv::cuda::split(InputArray _src, GpuMat* dst, Stream& stream)
|
||||
{
|
||||
GpuMat src = _src.getGpuMat();
|
||||
split_caller(src, dst, stream);
|
||||
}
|
||||
|
||||
void cv::cuda::split(InputArray _src, std::vector<GpuMat>& dst, Stream& stream)
|
||||
{
|
||||
GpuMat src = _src.getGpuMat();
|
||||
dst.resize(src.channels());
|
||||
if(src.channels() > 0)
|
||||
split_caller(src, &dst[0], stream);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// transpose
|
||||
|
||||
namespace arithm
|
||||
{
|
||||
template <typename T> void transpose(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream);
|
||||
}
|
||||
|
||||
void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& _stream)
|
||||
{
|
||||
GpuMat src = _src.getGpuMat();
|
||||
|
||||
CV_Assert( src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8 );
|
||||
|
||||
_dst.create( src.cols, src.rows, src.type() );
|
||||
GpuMat dst = _dst.getGpuMat();
|
||||
|
||||
cudaStream_t stream = StreamAccessor::getStream(_stream);
|
||||
|
||||
if (src.elemSize() == 1)
|
||||
{
|
||||
NppStreamHandler h(stream);
|
||||
|
||||
NppiSize sz;
|
||||
sz.width = src.cols;
|
||||
sz.height = src.rows;
|
||||
|
||||
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
|
||||
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
else if (src.elemSize() == 4)
|
||||
{
|
||||
arithm::transpose<int>(src, dst, stream);
|
||||
}
|
||||
else // if (src.elemSize() == 8)
|
||||
{
|
||||
if (!deviceSupports(NATIVE_DOUBLE))
|
||||
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
|
||||
|
||||
arithm::transpose<double>(src, dst, stream);
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// flip
|
||||
|
||||
@ -287,326 +130,4 @@ void cv::cuda::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& str
|
||||
funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// LUT
|
||||
|
||||
#if (CUDA_VERSION >= 5000)
|
||||
|
||||
namespace
|
||||
{
|
||||
class LookUpTableImpl : public LookUpTable
|
||||
{
|
||||
public:
|
||||
LookUpTableImpl(InputArray lut);
|
||||
|
||||
void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
|
||||
|
||||
private:
|
||||
int lut_cn;
|
||||
|
||||
int nValues3[3];
|
||||
const Npp32s* pValues3[3];
|
||||
const Npp32s* pLevels3[3];
|
||||
|
||||
GpuMat d_pLevels;
|
||||
GpuMat d_nppLut;
|
||||
GpuMat d_nppLut3[3];
|
||||
};
|
||||
|
||||
LookUpTableImpl::LookUpTableImpl(InputArray _lut)
|
||||
{
|
||||
nValues3[0] = nValues3[1] = nValues3[2] = 256;
|
||||
|
||||
Npp32s pLevels[256];
|
||||
for (int i = 0; i < 256; ++i)
|
||||
pLevels[i] = i;
|
||||
|
||||
d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
|
||||
pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
|
||||
|
||||
GpuMat lut;
|
||||
if (_lut.kind() == _InputArray::GPU_MAT)
|
||||
{
|
||||
lut = _lut.getGpuMat();
|
||||
}
|
||||
else
|
||||
{
|
||||
Mat hLut = _lut.getMat();
|
||||
CV_Assert( hLut.total() == 256 && hLut.isContinuous() );
|
||||
lut.upload(Mat(1, 256, hLut.type(), hLut.data));
|
||||
}
|
||||
|
||||
lut_cn = lut.channels();
|
||||
|
||||
CV_Assert( lut.depth() == CV_8U );
|
||||
CV_Assert( lut.rows == 1 && lut.cols == 256 );
|
||||
|
||||
lut.convertTo(d_nppLut, CV_32S);
|
||||
|
||||
if (lut_cn == 1)
|
||||
{
|
||||
pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr<Npp32s>();
|
||||
}
|
||||
else
|
||||
{
|
||||
cuda::split(d_nppLut, d_nppLut3);
|
||||
|
||||
pValues3[0] = d_nppLut3[0].ptr<Npp32s>();
|
||||
pValues3[1] = d_nppLut3[1].ptr<Npp32s>();
|
||||
pValues3[2] = d_nppLut3[2].ptr<Npp32s>();
|
||||
}
|
||||
}
|
||||
|
||||
void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& _stream)
|
||||
{
|
||||
GpuMat src = _src.getGpuMat();
|
||||
|
||||
const int cn = src.channels();
|
||||
|
||||
CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
|
||||
CV_Assert( lut_cn == 1 || lut_cn == cn );
|
||||
|
||||
_dst.create(src.size(), src.type());
|
||||
GpuMat dst = _dst.getGpuMat();
|
||||
|
||||
cudaStream_t stream = StreamAccessor::getStream(_stream);
|
||||
|
||||
NppStreamHandler h(stream);
|
||||
|
||||
NppiSize sz;
|
||||
sz.height = src.rows;
|
||||
sz.width = src.cols;
|
||||
|
||||
if (src.type() == CV_8UC1)
|
||||
{
|
||||
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
|
||||
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
|
||||
}
|
||||
else
|
||||
{
|
||||
nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
|
||||
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
|
||||
}
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}
|
||||
|
||||
#else // (CUDA_VERSION >= 5000)
|
||||
|
||||
namespace
|
||||
{
|
||||
class LookUpTableImpl : public LookUpTable
|
||||
{
|
||||
public:
|
||||
LookUpTableImpl(InputArray lut);
|
||||
|
||||
void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
|
||||
|
||||
private:
|
||||
int lut_cn;
|
||||
|
||||
Npp32s pLevels[256];
|
||||
int nValues3[3];
|
||||
const Npp32s* pValues3[3];
|
||||
const Npp32s* pLevels3[3];
|
||||
|
||||
Mat nppLut;
|
||||
Mat nppLut3[3];
|
||||
};
|
||||
|
||||
LookUpTableImpl::LookUpTableImpl(InputArray _lut)
|
||||
{
|
||||
nValues3[0] = nValues3[1] = nValues3[2] = 256;
|
||||
|
||||
for (int i = 0; i < 256; ++i)
|
||||
pLevels[i] = i;
|
||||
pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
|
||||
|
||||
Mat lut;
|
||||
if (_lut.kind() == _InputArray::GPU_MAT)
|
||||
{
|
||||
lut = Mat(_lut.getGpuMat());
|
||||
}
|
||||
else
|
||||
{
|
||||
Mat hLut = _lut.getMat();
|
||||
CV_Assert( hLut.total() == 256 && hLut.isContinuous() );
|
||||
lut = hLut;
|
||||
}
|
||||
|
||||
lut_cn = lut.channels();
|
||||
|
||||
CV_Assert( lut.depth() == CV_8U );
|
||||
CV_Assert( lut.rows == 1 && lut.cols == 256 );
|
||||
|
||||
lut.convertTo(nppLut, CV_32S);
|
||||
|
||||
if (lut_cn == 1)
|
||||
{
|
||||
pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>();
|
||||
}
|
||||
else
|
||||
{
|
||||
cv::split(nppLut, nppLut3);
|
||||
|
||||
pValues3[0] = nppLut3[0].ptr<Npp32s>();
|
||||
pValues3[1] = nppLut3[1].ptr<Npp32s>();
|
||||
pValues3[2] = nppLut3[2].ptr<Npp32s>();
|
||||
}
|
||||
}
|
||||
|
||||
void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& _stream)
|
||||
{
|
||||
GpuMat src = _src.getGpuMat();
|
||||
|
||||
const int cn = src.channels();
|
||||
|
||||
CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
|
||||
CV_Assert( lut_cn == 1 || lut_cn == cn );
|
||||
|
||||
_dst.create(src.size(), src.type());
|
||||
GpuMat dst = _dst.getGpuMat();
|
||||
|
||||
cudaStream_t stream = StreamAccessor::getStream(_stream);
|
||||
|
||||
NppStreamHandler h(stream);
|
||||
|
||||
NppiSize sz;
|
||||
sz.height = src.rows;
|
||||
sz.width = src.cols;
|
||||
|
||||
if (src.type() == CV_8UC1)
|
||||
{
|
||||
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
|
||||
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
|
||||
}
|
||||
else
|
||||
{
|
||||
nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
|
||||
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
|
||||
}
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}
|
||||
|
||||
#endif // (CUDA_VERSION >= 5000)
|
||||
|
||||
Ptr<LookUpTable> cv::cuda::createLookUpTable(InputArray lut)
|
||||
{
|
||||
return makePtr<LookUpTableImpl>(lut);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// copyMakeBorder
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
namespace imgproc
|
||||
{
|
||||
template <typename T, int cn> void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const T* borderValue, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
namespace
|
||||
{
|
||||
template <typename T, int cn> void copyMakeBorder_caller(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream)
|
||||
{
|
||||
using namespace ::cv::cuda::device::imgproc;
|
||||
|
||||
Scalar_<T> val(saturate_cast<T>(value[0]), saturate_cast<T>(value[1]), saturate_cast<T>(value[2]), saturate_cast<T>(value[3]));
|
||||
|
||||
copyMakeBorder_gpu<T, cn>(src, dst, top, left, borderType, val.val, stream);
|
||||
}
|
||||
}
|
||||
|
||||
#if defined __GNUC__ && __GNUC__ > 2 && __GNUC_MINOR__ > 4
|
||||
typedef Npp32s __attribute__((__may_alias__)) Npp32s_a;
|
||||
#else
|
||||
typedef Npp32s Npp32s_a;
|
||||
#endif
|
||||
|
||||
void cv::cuda::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bottom, int left, int right, int borderType, Scalar value, Stream& _stream)
|
||||
{
|
||||
GpuMat src = _src.getGpuMat();
|
||||
|
||||
CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
|
||||
CV_Assert( borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );
|
||||
|
||||
_dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
|
||||
GpuMat dst = _dst.getGpuMat();
|
||||
|
||||
cudaStream_t stream = StreamAccessor::getStream(_stream);
|
||||
|
||||
if (borderType == BORDER_CONSTANT && (src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1 || src.type() == CV_32FC1))
|
||||
{
|
||||
NppiSize srcsz;
|
||||
srcsz.width = src.cols;
|
||||
srcsz.height = src.rows;
|
||||
|
||||
NppiSize dstsz;
|
||||
dstsz.width = dst.cols;
|
||||
dstsz.height = dst.rows;
|
||||
|
||||
NppStreamHandler h(stream);
|
||||
|
||||
switch (src.type())
|
||||
{
|
||||
case CV_8UC1:
|
||||
{
|
||||
Npp8u nVal = saturate_cast<Npp8u>(value[0]);
|
||||
nppSafeCall( nppiCopyConstBorder_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
|
||||
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
|
||||
break;
|
||||
}
|
||||
case CV_8UC4:
|
||||
{
|
||||
Npp8u nVal[] = {saturate_cast<Npp8u>(value[0]), saturate_cast<Npp8u>(value[1]), saturate_cast<Npp8u>(value[2]), saturate_cast<Npp8u>(value[3])};
|
||||
nppSafeCall( nppiCopyConstBorder_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
|
||||
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
|
||||
break;
|
||||
}
|
||||
case CV_32SC1:
|
||||
{
|
||||
Npp32s nVal = saturate_cast<Npp32s>(value[0]);
|
||||
nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
|
||||
dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
|
||||
break;
|
||||
}
|
||||
case CV_32FC1:
|
||||
{
|
||||
Npp32f val = saturate_cast<Npp32f>(value[0]);
|
||||
Npp32s nVal = *(reinterpret_cast<Npp32s_a*>(&val));
|
||||
nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
|
||||
dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
else
|
||||
{
|
||||
typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream);
|
||||
static const caller_t callers[6][4] =
|
||||
{
|
||||
{ copyMakeBorder_caller<uchar, 1> , copyMakeBorder_caller<uchar, 2> , copyMakeBorder_caller<uchar, 3> , copyMakeBorder_caller<uchar, 4>},
|
||||
{0/*copyMakeBorder_caller<schar, 1>*/, 0/*copyMakeBorder_caller<schar, 2>*/ , 0/*copyMakeBorder_caller<schar, 3>*/, 0/*copyMakeBorder_caller<schar, 4>*/},
|
||||
{ copyMakeBorder_caller<ushort, 1> , 0/*copyMakeBorder_caller<ushort, 2>*/, copyMakeBorder_caller<ushort, 3> , copyMakeBorder_caller<ushort, 4>},
|
||||
{ copyMakeBorder_caller<short, 1> , 0/*copyMakeBorder_caller<short, 2>*/ , copyMakeBorder_caller<short, 3> , copyMakeBorder_caller<short, 4>},
|
||||
{0/*copyMakeBorder_caller<int, 1>*/, 0/*copyMakeBorder_caller<int, 2>*/ , 0/*copyMakeBorder_caller<int, 3>*/, 0/*copyMakeBorder_caller<int , 4>*/},
|
||||
{ copyMakeBorder_caller<float, 1> , 0/*copyMakeBorder_caller<float, 2>*/ , copyMakeBorder_caller<float, 3> , copyMakeBorder_caller<float ,4>}
|
||||
};
|
||||
|
||||
caller_t func = callers[src.depth()][src.channels() - 1];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
func(src, dst, top, left, borderType, value, stream);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* !defined (HAVE_CUDA) */
|
||||
|
@ -40,43 +40,22 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
#include "opencv2/opencv_modules.hpp"
|
||||
|
||||
#include "opencv2/core/cuda/common.hpp"
|
||||
#include "opencv2/core/cuda/functional.hpp"
|
||||
#include "opencv2/core/cuda/transform.hpp"
|
||||
#include "opencv2/core/cuda/saturate_cast.hpp"
|
||||
#include "opencv2/core/cuda/simd_functions.hpp"
|
||||
#ifndef HAVE_OPENCV_CUDEV
|
||||
|
||||
#include "arithm_func_traits.hpp"
|
||||
#error "opencv_cudev is required"
|
||||
|
||||
using namespace cv::cuda;
|
||||
using namespace cv::cuda::device;
|
||||
#else
|
||||
|
||||
namespace arithm
|
||||
#include "opencv2/cudev.hpp"
|
||||
|
||||
using namespace cv::cudev;
|
||||
|
||||
void absDiffMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int);
|
||||
|
||||
namespace
|
||||
{
|
||||
struct VAbsDiff4 : binary_function<uint, uint, uint>
|
||||
{
|
||||
__device__ __forceinline__ uint operator ()(uint a, uint b) const
|
||||
{
|
||||
return vabsdiff4(a, b);
|
||||
}
|
||||
|
||||
__host__ __device__ __forceinline__ VAbsDiff4() {}
|
||||
__host__ __device__ __forceinline__ VAbsDiff4(const VAbsDiff4&) {}
|
||||
};
|
||||
|
||||
struct VAbsDiff2 : binary_function<uint, uint, uint>
|
||||
{
|
||||
__device__ __forceinline__ uint operator ()(uint a, uint b) const
|
||||
{
|
||||
return vabsdiff2(a, b);
|
||||
}
|
||||
|
||||
__host__ __device__ __forceinline__ VAbsDiff2() {}
|
||||
__host__ __device__ __forceinline__ VAbsDiff2(const VAbsDiff2&) {}
|
||||
};
|
||||
|
||||
__device__ __forceinline__ int _abs(int a)
|
||||
{
|
||||
return ::abs(a);
|
||||
@ -90,58 +69,120 @@ namespace arithm
|
||||
return ::fabs(a);
|
||||
}
|
||||
|
||||
template <typename T> struct AbsDiffMat : binary_function<T, T, T>
|
||||
template <typename T> struct AbsDiffOp1 : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(T a, T b) const
|
||||
{
|
||||
return saturate_cast<T>(_abs(a - b));
|
||||
}
|
||||
|
||||
__host__ __device__ __forceinline__ AbsDiffMat() {}
|
||||
__host__ __device__ __forceinline__ AbsDiffMat(const AbsDiffMat&) {}
|
||||
};
|
||||
}
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
template <> struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
|
||||
{
|
||||
};
|
||||
|
||||
template <> struct TransformFunctorTraits< arithm::VAbsDiff2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
|
||||
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
|
||||
{
|
||||
};
|
||||
|
||||
template <typename T> struct TransformFunctorTraits< arithm::AbsDiffMat<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
|
||||
template <> struct TransformPolicy<double> : DefaultTransformPolicy
|
||||
{
|
||||
enum {
|
||||
shift = 1
|
||||
};
|
||||
};
|
||||
}}}
|
||||
|
||||
namespace arithm
|
||||
{
|
||||
void absDiffMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
|
||||
{
|
||||
device::transform(src1, src2, dst, VAbsDiff4(), WithOutMask(), stream);
|
||||
}
|
||||
|
||||
void absDiffMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
|
||||
{
|
||||
device::transform(src1, src2, dst, VAbsDiff2(), WithOutMask(), stream);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
|
||||
void absDiffMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
|
||||
{
|
||||
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, AbsDiffMat<T>(), WithOutMask(), stream);
|
||||
gridTransformBinary_< TransformPolicy<T> >(globPtr<T>(src1), globPtr<T>(src2), globPtr<T>(dst), AbsDiffOp1<T>(), stream);
|
||||
}
|
||||
|
||||
template void absDiffMat<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void absDiffMat<schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void absDiffMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void absDiffMat<short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void absDiffMat<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void absDiffMat<float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void absDiffMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
|
||||
struct AbsDiffOp2 : binary_function<uint, uint, uint>
|
||||
{
|
||||
__device__ __forceinline__ uint operator ()(uint a, uint b) const
|
||||
{
|
||||
return vabsdiff2(a, b);
|
||||
}
|
||||
};
|
||||
|
||||
void absDiffMat_v2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
|
||||
{
|
||||
const int vcols = src1.cols >> 1;
|
||||
|
||||
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
|
||||
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
|
||||
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
|
||||
|
||||
gridTransformBinary(src1_, src2_, dst_, AbsDiffOp2(), stream);
|
||||
}
|
||||
|
||||
struct AbsDiffOp4 : binary_function<uint, uint, uint>
|
||||
{
|
||||
__device__ __forceinline__ uint operator ()(uint a, uint b) const
|
||||
{
|
||||
return vabsdiff4(a, b);
|
||||
}
|
||||
};
|
||||
|
||||
void absDiffMat_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
|
||||
{
|
||||
const int vcols = src1.cols >> 2;
|
||||
|
||||
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
|
||||
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
|
||||
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
|
||||
|
||||
gridTransformBinary(src1_, src2_, dst_, AbsDiffOp4(), stream);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // CUDA_DISABLER
|
||||
void absDiffMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int)
|
||||
{
|
||||
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);
|
||||
static const func_t funcs[] =
|
||||
{
|
||||
absDiffMat_v1<uchar>,
|
||||
absDiffMat_v1<schar>,
|
||||
absDiffMat_v1<ushort>,
|
||||
absDiffMat_v1<short>,
|
||||
absDiffMat_v1<int>,
|
||||
absDiffMat_v1<float>,
|
||||
absDiffMat_v1<double>
|
||||
};
|
||||
|
||||
const int depth = src1.depth();
|
||||
|
||||
CV_DbgAssert( depth <= CV_64F );
|
||||
|
||||
GpuMat src1_ = src1.reshape(1);
|
||||
GpuMat src2_ = src2.reshape(1);
|
||||
GpuMat dst_ = dst.reshape(1);
|
||||
|
||||
if (depth == CV_8U || depth == CV_16U)
|
||||
{
|
||||
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
|
||||
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
|
||||
const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
|
||||
|
||||
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
|
||||
|
||||
if (isAllAligned)
|
||||
{
|
||||
if (depth == CV_8U && (src1_.cols & 3) == 0)
|
||||
{
|
||||
absDiffMat_v4(src1_, src2_, dst_, stream);
|
||||
return;
|
||||
}
|
||||
else if (depth == CV_16U && (src1_.cols & 1) == 0)
|
||||
{
|
||||
absDiffMat_v2(src1_, src2_, dst_, stream);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const func_t func = funcs[depth];
|
||||
|
||||
if (!func)
|
||||
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
|
||||
|
||||
func(src1_, src2_, dst_, stream);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -40,59 +40,71 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
#include "opencv2/opencv_modules.hpp"
|
||||
|
||||
#include "opencv2/core/cuda/common.hpp"
|
||||
#include "opencv2/core/cuda/functional.hpp"
|
||||
#include "opencv2/core/cuda/transform.hpp"
|
||||
#include "opencv2/core/cuda/saturate_cast.hpp"
|
||||
#include "opencv2/core/cuda/simd_functions.hpp"
|
||||
#ifndef HAVE_OPENCV_CUDEV
|
||||
|
||||
#include "arithm_func_traits.hpp"
|
||||
#error "opencv_cudev is required"
|
||||
|
||||
using namespace cv::cuda;
|
||||
using namespace cv::cuda::device;
|
||||
#else
|
||||
|
||||
namespace arithm
|
||||
#include "opencv2/cudev.hpp"
|
||||
|
||||
using namespace cv::cudev;
|
||||
|
||||
void absDiffScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int);
|
||||
|
||||
namespace
|
||||
{
|
||||
template <typename T, typename S> struct AbsDiffScalar : unary_function<T, T>
|
||||
template <typename T, typename S> struct AbsDiffScalarOp : unary_function<T, T>
|
||||
{
|
||||
S val;
|
||||
|
||||
__host__ explicit AbsDiffScalar(S val_) : val(val_) {}
|
||||
|
||||
__device__ __forceinline__ T operator ()(T a) const
|
||||
{
|
||||
abs_func<S> f;
|
||||
return saturate_cast<T>(f(a - val));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
template <typename T, typename S> struct TransformFunctorTraits< arithm::AbsDiffScalar<T, S> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
|
||||
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
|
||||
{
|
||||
};
|
||||
}}}
|
||||
|
||||
namespace arithm
|
||||
{
|
||||
template <typename T, typename S>
|
||||
void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
|
||||
template <> struct TransformPolicy<double> : DefaultTransformPolicy
|
||||
{
|
||||
AbsDiffScalar<T, S> op(static_cast<S>(val));
|
||||
enum {
|
||||
shift = 1
|
||||
};
|
||||
};
|
||||
|
||||
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, op, WithOutMask(), stream);
|
||||
template <typename SrcType, typename ScalarDepth>
|
||||
void absDiffScalarImpl(const GpuMat& src, double value, GpuMat& dst, Stream& stream)
|
||||
{
|
||||
AbsDiffScalarOp<SrcType, ScalarDepth> op;
|
||||
op.val = static_cast<ScalarDepth>(value);
|
||||
gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<SrcType>(dst), op, stream);
|
||||
}
|
||||
|
||||
template void absDiffScalar<uchar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void absDiffScalar<schar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void absDiffScalar<ushort, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void absDiffScalar<short, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void absDiffScalar<int, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void absDiffScalar<float, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void absDiffScalar<double, double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif // CUDA_DISABLER
|
||||
void absDiffScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int)
|
||||
{
|
||||
typedef void (*func_t)(const GpuMat& src, double val, GpuMat& dst, Stream& stream);
|
||||
static const func_t funcs[] =
|
||||
{
|
||||
absDiffScalarImpl<uchar, float>,
|
||||
absDiffScalarImpl<schar, float>,
|
||||
absDiffScalarImpl<ushort, float>,
|
||||
absDiffScalarImpl<short, float>,
|
||||
absDiffScalarImpl<int, float>,
|
||||
absDiffScalarImpl<float, float>,
|
||||
absDiffScalarImpl<double, double>
|
||||
};
|
||||
|
||||
const int depth = src.depth();
|
||||
|
||||
CV_DbgAssert( depth <= CV_64F );
|
||||
|
||||
funcs[depth](src, val[0], dst, stream);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -40,146 +40,186 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
#include "opencv2/opencv_modules.hpp"
|
||||
|
||||
#include "opencv2/core/cuda/common.hpp"
|
||||
#include "opencv2/core/cuda/functional.hpp"
|
||||
#include "opencv2/core/cuda/transform.hpp"
|
||||
#include "opencv2/core/cuda/saturate_cast.hpp"
|
||||
#include "opencv2/core/cuda/simd_functions.hpp"
|
||||
#ifndef HAVE_OPENCV_CUDEV
|
||||
|
||||
#include "arithm_func_traits.hpp"
|
||||
#error "opencv_cudev is required"
|
||||
|
||||
using namespace cv::cuda;
|
||||
using namespace cv::cuda::device;
|
||||
#else
|
||||
|
||||
namespace arithm
|
||||
#include "opencv2/cudev.hpp"
|
||||
|
||||
using namespace cv::cudev;
|
||||
|
||||
void addMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int);
|
||||
|
||||
namespace
|
||||
{
|
||||
struct VAdd4 : binary_function<uint, uint, uint>
|
||||
{
|
||||
__device__ __forceinline__ uint operator ()(uint a, uint b) const
|
||||
{
|
||||
return vadd4(a, b);
|
||||
}
|
||||
|
||||
__host__ __device__ __forceinline__ VAdd4() {}
|
||||
__host__ __device__ __forceinline__ VAdd4(const VAdd4&) {}
|
||||
};
|
||||
|
||||
struct VAdd2 : binary_function<uint, uint, uint>
|
||||
{
|
||||
__device__ __forceinline__ uint operator ()(uint a, uint b) const
|
||||
{
|
||||
return vadd2(a, b);
|
||||
}
|
||||
|
||||
__host__ __device__ __forceinline__ VAdd2() {}
|
||||
__host__ __device__ __forceinline__ VAdd2(const VAdd2&) {}
|
||||
};
|
||||
|
||||
template <typename T, typename D> struct AddMat : binary_function<T, T, D>
|
||||
template <typename T, typename D> struct AddOp1 : binary_function<T, T, D>
|
||||
{
|
||||
__device__ __forceinline__ D operator ()(T a, T b) const
|
||||
{
|
||||
return saturate_cast<D>(a + b);
|
||||
}
|
||||
|
||||
__host__ __device__ __forceinline__ AddMat() {}
|
||||
__host__ __device__ __forceinline__ AddMat(const AddMat&) {}
|
||||
};
|
||||
}
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
template <> struct TransformFunctorTraits< arithm::VAdd4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
|
||||
{
|
||||
};
|
||||
|
||||
template <> struct TransformFunctorTraits< arithm::VAdd2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
|
||||
{
|
||||
};
|
||||
|
||||
template <typename T, typename D> struct TransformFunctorTraits< arithm::AddMat<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
|
||||
{
|
||||
};
|
||||
}}}
|
||||
|
||||
namespace arithm
|
||||
{
|
||||
void addMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
|
||||
{
|
||||
device::transform(src1, src2, dst, VAdd4(), WithOutMask(), stream);
|
||||
}
|
||||
|
||||
void addMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
|
||||
{
|
||||
device::transform(src1, src2, dst, VAdd2(), WithOutMask(), stream);
|
||||
}
|
||||
|
||||
template <typename T, typename D>
|
||||
void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
|
||||
void addMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
|
||||
{
|
||||
if (mask.data)
|
||||
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), mask, stream);
|
||||
gridTransformBinary(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), AddOp1<T, D>(), globPtr<uchar>(mask), stream);
|
||||
else
|
||||
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), WithOutMask(), stream);
|
||||
gridTransformBinary(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), AddOp1<T, D>(), stream);
|
||||
}
|
||||
|
||||
template void addMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<uchar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<uchar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<uchar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
struct AddOp2 : binary_function<uint, uint, uint>
|
||||
{
|
||||
__device__ __forceinline__ uint operator ()(uint a, uint b) const
|
||||
{
|
||||
return vadd2(a, b);
|
||||
}
|
||||
};
|
||||
|
||||
template void addMat<schar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<schar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<schar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<schar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<schar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<schar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<schar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
void addMat_v2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
|
||||
{
|
||||
const int vcols = src1.cols >> 1;
|
||||
|
||||
//template void addMat<ushort, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void addMat<ushort, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<ushort, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<ushort, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<ushort, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<ushort, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<ushort, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
|
||||
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
|
||||
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
|
||||
|
||||
//template void addMat<short, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void addMat<short, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<short, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<short, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<short, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<short, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<short, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
gridTransformBinary(src1_, src2_, dst_, AddOp2(), stream);
|
||||
}
|
||||
|
||||
//template void addMat<int, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void addMat<int, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void addMat<int, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void addMat<int, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<int, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<int, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
struct AddOp4 : binary_function<uint, uint, uint>
|
||||
{
|
||||
__device__ __forceinline__ uint operator ()(uint a, uint b) const
|
||||
{
|
||||
return vadd4(a, b);
|
||||
}
|
||||
};
|
||||
|
||||
//template void addMat<float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void addMat<float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void addMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void addMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void addMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
void addMat_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
|
||||
{
|
||||
const int vcols = src1.cols >> 2;
|
||||
|
||||
//template void addMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void addMat<double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void addMat<double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void addMat<double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void addMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void addMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void addMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
|
||||
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
|
||||
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
|
||||
|
||||
gridTransformBinary(src1_, src2_, dst_, AddOp4(), stream);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // CUDA_DISABLER
|
||||
void addMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int)
|
||||
{
|
||||
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream);
|
||||
static const func_t funcs[7][7] =
|
||||
{
|
||||
{
|
||||
addMat_v1<uchar, uchar>,
|
||||
addMat_v1<uchar, schar>,
|
||||
addMat_v1<uchar, ushort>,
|
||||
addMat_v1<uchar, short>,
|
||||
addMat_v1<uchar, int>,
|
||||
addMat_v1<uchar, float>,
|
||||
addMat_v1<uchar, double>
|
||||
},
|
||||
{
|
||||
addMat_v1<schar, uchar>,
|
||||
addMat_v1<schar, schar>,
|
||||
addMat_v1<schar, ushort>,
|
||||
addMat_v1<schar, short>,
|
||||
addMat_v1<schar, int>,
|
||||
addMat_v1<schar, float>,
|
||||
addMat_v1<schar, double>
|
||||
},
|
||||
{
|
||||
0 /*addMat_v1<ushort, uchar>*/,
|
||||
0 /*addMat_v1<ushort, schar>*/,
|
||||
addMat_v1<ushort, ushort>,
|
||||
addMat_v1<ushort, short>,
|
||||
addMat_v1<ushort, int>,
|
||||
addMat_v1<ushort, float>,
|
||||
addMat_v1<ushort, double>
|
||||
},
|
||||
{
|
||||
0 /*addMat_v1<short, uchar>*/,
|
||||
0 /*addMat_v1<short, schar>*/,
|
||||
addMat_v1<short, ushort>,
|
||||
addMat_v1<short, short>,
|
||||
addMat_v1<short, int>,
|
||||
addMat_v1<short, float>,
|
||||
addMat_v1<short, double>
|
||||
},
|
||||
{
|
||||
0 /*addMat_v1<int, uchar>*/,
|
||||
0 /*addMat_v1<int, schar>*/,
|
||||
0 /*addMat_v1<int, ushort>*/,
|
||||
0 /*addMat_v1<int, short>*/,
|
||||
addMat_v1<int, int>,
|
||||
addMat_v1<int, float>,
|
||||
addMat_v1<int, double>
|
||||
},
|
||||
{
|
||||
0 /*addMat_v1<float, uchar>*/,
|
||||
0 /*addMat_v1<float, schar>*/,
|
||||
0 /*addMat_v1<float, ushort>*/,
|
||||
0 /*addMat_v1<float, short>*/,
|
||||
0 /*addMat_v1<float, int>*/,
|
||||
addMat_v1<float, float>,
|
||||
addMat_v1<float, double>
|
||||
},
|
||||
{
|
||||
0 /*addMat_v1<double, uchar>*/,
|
||||
0 /*addMat_v1<double, schar>*/,
|
||||
0 /*addMat_v1<double, ushort>*/,
|
||||
0 /*addMat_v1<double, short>*/,
|
||||
0 /*addMat_v1<double, int>*/,
|
||||
0 /*addMat_v1<double, float>*/,
|
||||
addMat_v1<double, double>
|
||||
}
|
||||
};
|
||||
|
||||
const int sdepth = src1.depth();
|
||||
const int ddepth = dst.depth();
|
||||
|
||||
CV_DbgAssert( sdepth <= CV_64F && ddepth <= CV_64F );
|
||||
|
||||
GpuMat src1_ = src1.reshape(1);
|
||||
GpuMat src2_ = src2.reshape(1);
|
||||
GpuMat dst_ = dst.reshape(1);
|
||||
|
||||
if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
|
||||
{
|
||||
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
|
||||
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
|
||||
const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
|
||||
|
||||
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
|
||||
|
||||
if (isAllAligned)
|
||||
{
|
||||
if (sdepth == CV_8U && (src1_.cols & 3) == 0)
|
||||
{
|
||||
addMat_v4(src1_, src2_, dst_, stream);
|
||||
return;
|
||||
}
|
||||
else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
|
||||
{
|
||||
addMat_v2(src1_, src2_, dst_, stream);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const func_t func = funcs[sdepth][ddepth];
|
||||
|
||||
if (!func)
|
||||
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
|
||||
|
||||
func(src1_, src2_, dst_, mask, stream);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -40,109 +40,141 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
#include "opencv2/opencv_modules.hpp"
|
||||
|
||||
#include "opencv2/core/cuda/common.hpp"
|
||||
#include "opencv2/core/cuda/functional.hpp"
|
||||
#include "opencv2/core/cuda/transform.hpp"
|
||||
#include "opencv2/core/cuda/saturate_cast.hpp"
|
||||
#include "opencv2/core/cuda/simd_functions.hpp"
|
||||
#ifndef HAVE_OPENCV_CUDEV
|
||||
|
||||
#include "arithm_func_traits.hpp"
|
||||
#error "opencv_cudev is required"
|
||||
|
||||
using namespace cv::cuda;
|
||||
using namespace cv::cuda::device;
|
||||
#else
|
||||
|
||||
namespace arithm
|
||||
#include "opencv2/cudev.hpp"
|
||||
|
||||
using namespace cv::cudev;
|
||||
|
||||
void addScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int);
|
||||
|
||||
namespace
|
||||
{
|
||||
template <typename T, typename S, typename D> struct AddScalar : unary_function<T, D>
|
||||
template <typename SrcType, typename ScalarType, typename DstType> struct AddScalarOp : unary_function<SrcType, DstType>
|
||||
{
|
||||
S val;
|
||||
ScalarType val;
|
||||
|
||||
__host__ explicit AddScalar(S val_) : val(val_) {}
|
||||
|
||||
__device__ __forceinline__ D operator ()(T a) const
|
||||
__device__ __forceinline__ DstType operator ()(SrcType a) const
|
||||
{
|
||||
return saturate_cast<D>(a + val);
|
||||
return saturate_cast<DstType>(saturate_cast<ScalarType>(a) + val);
|
||||
}
|
||||
};
|
||||
}

namespace cv { namespace cuda { namespace device
{
    template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::AddScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
    template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
    {
    };
}}}

namespace arithm
{
    template <typename T, typename S, typename D>
    void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
    template <> struct TransformPolicy<double> : DefaultTransformPolicy
    {
        AddScalar<T, S, D> op(static_cast<S>(val));
        enum {
            shift = 1
        };
    };

    template <typename SrcType, typename ScalarDepth, typename DstType>
    void addScalarImpl(const GpuMat& src, cv::Scalar value, GpuMat& dst, const GpuMat& mask, Stream& stream)
    {
        typedef typename MakeVec<ScalarDepth, VecTraits<SrcType>::cn>::type ScalarType;

        cv::Scalar_<ScalarDepth> value_ = value;

        AddScalarOp<SrcType, ScalarType, DstType> op;
        op.val = VecTraits<ScalarType>::make(value_.val);

        if (mask.data)
            device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
            gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, globPtr<uchar>(mask), stream);
        else
            device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
            gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, stream);
    }

    template void addScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);

    template void addScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);

    //template void addScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);

    //template void addScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);

    //template void addScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);

    //template void addScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);

    //template void addScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
}

#endif // CUDA_DISABLER
void addScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int)
{
    typedef void (*func_t)(const GpuMat& src, cv::Scalar val, GpuMat& dst, const GpuMat& mask, Stream& stream);
    static const func_t funcs[7][7][4] =
    {
        {
            {addScalarImpl<uchar, float, uchar>, addScalarImpl<uchar2, float, uchar2>, addScalarImpl<uchar3, float, uchar3>, addScalarImpl<uchar4, float, uchar4>},
            {addScalarImpl<uchar, float, schar>, addScalarImpl<uchar2, float, char2>, addScalarImpl<uchar3, float, char3>, addScalarImpl<uchar4, float, char4>},
            {addScalarImpl<uchar, float, ushort>, addScalarImpl<uchar2, float, ushort2>, addScalarImpl<uchar3, float, ushort3>, addScalarImpl<uchar4, float, ushort4>},
            {addScalarImpl<uchar, float, short>, addScalarImpl<uchar2, float, short2>, addScalarImpl<uchar3, float, short3>, addScalarImpl<uchar4, float, short4>},
            {addScalarImpl<uchar, float, int>, addScalarImpl<uchar2, float, int2>, addScalarImpl<uchar3, float, int3>, addScalarImpl<uchar4, float, int4>},
            {addScalarImpl<uchar, float, float>, addScalarImpl<uchar2, float, float2>, addScalarImpl<uchar3, float, float3>, addScalarImpl<uchar4, float, float4>},
            {addScalarImpl<uchar, double, double>, addScalarImpl<uchar2, double, double2>, addScalarImpl<uchar3, double, double3>, addScalarImpl<uchar4, double, double4>}
        },
        {
            {addScalarImpl<schar, float, uchar>, addScalarImpl<char2, float, uchar2>, addScalarImpl<char3, float, uchar3>, addScalarImpl<char4, float, uchar4>},
            {addScalarImpl<schar, float, schar>, addScalarImpl<char2, float, char2>, addScalarImpl<char3, float, char3>, addScalarImpl<char4, float, char4>},
            {addScalarImpl<schar, float, ushort>, addScalarImpl<char2, float, ushort2>, addScalarImpl<char3, float, ushort3>, addScalarImpl<char4, float, ushort4>},
            {addScalarImpl<schar, float, short>, addScalarImpl<char2, float, short2>, addScalarImpl<char3, float, short3>, addScalarImpl<char4, float, short4>},
            {addScalarImpl<schar, float, int>, addScalarImpl<char2, float, int2>, addScalarImpl<char3, float, int3>, addScalarImpl<char4, float, int4>},
            {addScalarImpl<schar, float, float>, addScalarImpl<char2, float, float2>, addScalarImpl<char3, float, float3>, addScalarImpl<char4, float, float4>},
            {addScalarImpl<schar, double, double>, addScalarImpl<char2, double, double2>, addScalarImpl<char3, double, double3>, addScalarImpl<char4, double, double4>}
        },
        {
            {0 /*addScalarImpl<ushort, float, uchar>*/, 0 /*addScalarImpl<ushort2, float, uchar2>*/, 0 /*addScalarImpl<ushort3, float, uchar3>*/, 0 /*addScalarImpl<ushort4, float, uchar4>*/},
            {0 /*addScalarImpl<ushort, float, schar>*/, 0 /*addScalarImpl<ushort2, float, char2>*/, 0 /*addScalarImpl<ushort3, float, char3>*/, 0 /*addScalarImpl<ushort4, float, char4>*/},
            {addScalarImpl<ushort, float, ushort>, addScalarImpl<ushort2, float, ushort2>, addScalarImpl<ushort3, float, ushort3>, addScalarImpl<ushort4, float, ushort4>},
            {addScalarImpl<ushort, float, short>, addScalarImpl<ushort2, float, short2>, addScalarImpl<ushort3, float, short3>, addScalarImpl<ushort4, float, short4>},
            {addScalarImpl<ushort, float, int>, addScalarImpl<ushort2, float, int2>, addScalarImpl<ushort3, float, int3>, addScalarImpl<ushort4, float, int4>},
            {addScalarImpl<ushort, float, float>, addScalarImpl<ushort2, float, float2>, addScalarImpl<ushort3, float, float3>, addScalarImpl<ushort4, float, float4>},
            {addScalarImpl<ushort, double, double>, addScalarImpl<ushort2, double, double2>, addScalarImpl<ushort3, double, double3>, addScalarImpl<ushort4, double, double4>}
        },
        {
            {0 /*addScalarImpl<short, float, uchar>*/, 0 /*addScalarImpl<short2, float, uchar2>*/, 0 /*addScalarImpl<short3, float, uchar3>*/, 0 /*addScalarImpl<short4, float, uchar4>*/},
            {0 /*addScalarImpl<short, float, schar>*/, 0 /*addScalarImpl<short2, float, char2>*/, 0 /*addScalarImpl<short3, float, char3>*/, 0 /*addScalarImpl<short4, float, char4>*/},
            {addScalarImpl<short, float, ushort>, addScalarImpl<short2, float, ushort2>, addScalarImpl<short3, float, ushort3>, addScalarImpl<short4, float, ushort4>},
            {addScalarImpl<short, float, short>, addScalarImpl<short2, float, short2>, addScalarImpl<short3, float, short3>, addScalarImpl<short4, float, short4>},
            {addScalarImpl<short, float, int>, addScalarImpl<short2, float, int2>, addScalarImpl<short3, float, int3>, addScalarImpl<short4, float, int4>},
            {addScalarImpl<short, float, float>, addScalarImpl<short2, float, float2>, addScalarImpl<short3, float, float3>, addScalarImpl<short4, float, float4>},
            {addScalarImpl<short, double, double>, addScalarImpl<short2, double, double2>, addScalarImpl<short3, double, double3>, addScalarImpl<short4, double, double4>}
        },
        {
            {0 /*addScalarImpl<int, float, uchar>*/, 0 /*addScalarImpl<int2, float, uchar2>*/, 0 /*addScalarImpl<int3, float, uchar3>*/, 0 /*addScalarImpl<int4, float, uchar4>*/},
            {0 /*addScalarImpl<int, float, schar>*/, 0 /*addScalarImpl<int2, float, char2>*/, 0 /*addScalarImpl<int3, float, char3>*/, 0 /*addScalarImpl<int4, float, char4>*/},
            {0 /*addScalarImpl<int, float, ushort>*/, 0 /*addScalarImpl<int2, float, ushort2>*/, 0 /*addScalarImpl<int3, float, ushort3>*/, 0 /*addScalarImpl<int4, float, ushort4>*/},
            {0 /*addScalarImpl<int, float, short>*/, 0 /*addScalarImpl<int2, float, short2>*/, 0 /*addScalarImpl<int3, float, short3>*/, 0 /*addScalarImpl<int4, float, short4>*/},
            {addScalarImpl<int, float, int>, addScalarImpl<int2, float, int2>, addScalarImpl<int3, float, int3>, addScalarImpl<int4, float, int4>},
            {addScalarImpl<int, float, float>, addScalarImpl<int2, float, float2>, addScalarImpl<int3, float, float3>, addScalarImpl<int4, float, float4>},
            {addScalarImpl<int, double, double>, addScalarImpl<int2, double, double2>, addScalarImpl<int3, double, double3>, addScalarImpl<int4, double, double4>}
        },
        {
            {0 /*addScalarImpl<float, float, uchar>*/, 0 /*addScalarImpl<float2, float, uchar2>*/, 0 /*addScalarImpl<float3, float, uchar3>*/, 0 /*addScalarImpl<float4, float, uchar4>*/},
            {0 /*addScalarImpl<float, float, schar>*/, 0 /*addScalarImpl<float2, float, char2>*/, 0 /*addScalarImpl<float3, float, char3>*/, 0 /*addScalarImpl<float4, float, char4>*/},
            {0 /*addScalarImpl<float, float, ushort>*/, 0 /*addScalarImpl<float2, float, ushort2>*/, 0 /*addScalarImpl<float3, float, ushort3>*/, 0 /*addScalarImpl<float4, float, ushort4>*/},
            {0 /*addScalarImpl<float, float, short>*/, 0 /*addScalarImpl<float2, float, short2>*/, 0 /*addScalarImpl<float3, float, short3>*/, 0 /*addScalarImpl<float4, float, short4>*/},
            {0 /*addScalarImpl<float, float, int>*/, 0 /*addScalarImpl<float2, float, int2>*/, 0 /*addScalarImpl<float3, float, int3>*/, 0 /*addScalarImpl<float4, float, int4>*/},
            {addScalarImpl<float, float, float>, addScalarImpl<float2, float, float2>, addScalarImpl<float3, float, float3>, addScalarImpl<float4, float, float4>},
            {addScalarImpl<float, double, double>, addScalarImpl<float2, double, double2>, addScalarImpl<float3, double, double3>, addScalarImpl<float4, double, double4>}
        },
        {
            {0 /*addScalarImpl<double, double, uchar>*/, 0 /*addScalarImpl<double2, double, uchar2>*/, 0 /*addScalarImpl<double3, double, uchar3>*/, 0 /*addScalarImpl<double4, double, uchar4>*/},
            {0 /*addScalarImpl<double, double, schar>*/, 0 /*addScalarImpl<double2, double, char2>*/, 0 /*addScalarImpl<double3, double, char3>*/, 0 /*addScalarImpl<double4, double, char4>*/},
            {0 /*addScalarImpl<double, double, ushort>*/, 0 /*addScalarImpl<double2, double, ushort2>*/, 0 /*addScalarImpl<double3, double, ushort3>*/, 0 /*addScalarImpl<double4, double, ushort4>*/},
            {0 /*addScalarImpl<double, double, short>*/, 0 /*addScalarImpl<double2, double, short2>*/, 0 /*addScalarImpl<double3, double, short3>*/, 0 /*addScalarImpl<double4, double, short4>*/},
            {0 /*addScalarImpl<double, double, int>*/, 0 /*addScalarImpl<double2, double, int2>*/, 0 /*addScalarImpl<double3, double, int3>*/, 0 /*addScalarImpl<double4, double, int4>*/},
            {0 /*addScalarImpl<double, double, float>*/, 0 /*addScalarImpl<double2, double, float2>*/, 0 /*addScalarImpl<double3, double, float3>*/, 0 /*addScalarImpl<double4, double, float4>*/},
            {addScalarImpl<double, double, double>, addScalarImpl<double2, double, double2>, addScalarImpl<double3, double, double3>, addScalarImpl<double4, double, double4>}
        }
    };

    const int sdepth = src.depth();
    const int ddepth = dst.depth();
    const int cn = src.channels();

    CV_DbgAssert( sdepth <= CV_64F && ddepth <= CV_64F && cn <= 4 );

    const func_t func = funcs[sdepth][ddepth][cn - 1];

    if (!func)
        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");

    func(src, val, dst, mask, stream);
}

#endif
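The dispatch table above is reached through the module's public arithmetic wrappers rather than called directly; passing a cv::Scalar operand to cv::cuda::add is the usual route into it. A minimal usage sketch, assuming the wrapper forwards scalar operands to the addScalar hook declared at the top of this file:

#include <opencv2/core.hpp>
#include <opencv2/cudaarithm.hpp>

int main()
{
    cv::Mat host(480, 640, CV_8UC3, cv::Scalar::all(100));
    cv::cuda::GpuMat src(host), dst;

    // src is CV_8UC3 and dtype is left at its default (-1), so the table
    // selects addScalarImpl<uchar3, float, uchar3>: per-channel float
    // arithmetic, saturated back to uchar on store.
    cv::cuda::add(src, cv::Scalar(10, 20, 30), dst);

    return 0;
}
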
@ -40,325 +40,553 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#ifndef HAVE_OPENCV_CUDEV

#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"

using namespace cv::cuda;
using namespace cv::cuda::device;
#else

namespace arithm
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

namespace
{
    template <typename T> struct UseDouble_
    template <typename T1, typename T2, typename D, typename S> struct AddWeightedOp : binary_function<T1, T2, D>
    {
        enum {value = 0};
    };
    template <> struct UseDouble_<double>
    {
        enum {value = 1};
    };
    template <typename T1, typename T2, typename D> struct UseDouble
    {
        enum {value = (UseDouble_<T1>::value || UseDouble_<T2>::value || UseDouble_<D>::value)};
    };

    template <typename T1, typename T2, typename D, bool useDouble> struct AddWeighted_;
    template <typename T1, typename T2, typename D> struct AddWeighted_<T1, T2, D, false> : binary_function<T1, T2, D>
    {
        float alpha;
        float beta;
        float gamma;

        __host__ AddWeighted_(double alpha_, double beta_, double gamma_) : alpha(static_cast<float>(alpha_)), beta(static_cast<float>(beta_)), gamma(static_cast<float>(gamma_)) {}
        S alpha;
        S beta;
        S gamma;

        __device__ __forceinline__ D operator ()(T1 a, T2 b) const
        {
            return saturate_cast<D>(a * alpha + b * beta + gamma);
        }
    };
    template <typename T1, typename T2, typename D> struct AddWeighted_<T1, T2, D, true> : binary_function<T1, T2, D>
    {
        double alpha;
        double beta;
        double gamma;

        __host__ AddWeighted_(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {}

        __device__ __forceinline__ D operator ()(T1 a, T2 b) const
        {
            return saturate_cast<D>(a * alpha + b * beta + gamma);
        }
    };
    template <typename T1, typename T2, typename D> struct AddWeighted : AddWeighted_<T1, T2, D, UseDouble<T1, T2, D>::value>
    template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
    {
        AddWeighted(double alpha_, double beta_, double gamma_) : AddWeighted_<T1, T2, D, UseDouble<T1, T2, D>::value>(alpha_, beta_, gamma_) {}
    };
    template <> struct TransformPolicy<double> : DefaultTransformPolicy
    {
        enum {
            shift = 1
        };
    };

    template <typename T1, typename T2, typename D>
    void addWeightedImpl(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, Stream& stream)
    {
        typedef typename LargerType<T1, T2>::type larger_type1;
        typedef typename LargerType<larger_type1, D>::type larger_type2;
        typedef typename LargerType<larger_type2, float>::type scalar_type;

        AddWeightedOp<T1, T2, D, scalar_type> op;
        op.alpha = static_cast<scalar_type>(alpha);
        op.beta = static_cast<scalar_type>(beta);
        op.gamma = static_cast<scalar_type>(gamma);

        gridTransformBinary_< TransformPolicy<scalar_type> >(globPtr<T1>(src1), globPtr<T2>(src2), globPtr<D>(dst), op, stream);
    }
}
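addWeightedImpl deduces its accumulation type from the operands: scalar_type is the largest of T1, T2, D, and float, so all combinations up to 32-bit types are blended in float, and the TransformPolicy<double> specialization (shift = 1) only applies when a double is involved. A minimal usage sketch of the entry point defined next:

#include <opencv2/core.hpp>
#include <opencv2/cudaarithm.hpp>

int main()
{
    cv::cuda::GpuMat a(480, 640, CV_8UC1, cv::Scalar(100));
    cv::cuda::GpuMat b(480, 640, CV_8UC1, cv::Scalar(200));
    cv::cuda::GpuMat dst;

    // dst = saturate_cast<uchar>(a*0.7 + b*0.3 + 5.0), computed in float
    // since every depth here is below CV_64F; ddepth = -1 keeps the wider
    // of the two source depths (CV_8U in this case).
    cv::cuda::addWeighted(a, 0.7, b, 0.3, 5.0, dst, -1);

    return 0;
}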

namespace cv { namespace cuda { namespace device
void cv::cuda::addWeighted(InputArray _src1, double alpha, InputArray _src2, double beta, double gamma, OutputArray _dst, int ddepth, Stream& stream)
{
    template <typename T1, typename T2, typename D, size_t src1_size, size_t src2_size, size_t dst_size> struct AddWeightedTraits : DefaultTransformFunctorTraits< arithm::AddWeighted<T1, T2, D> >
    {
    };
    template <typename T1, typename T2, typename D, size_t src_size, size_t dst_size> struct AddWeightedTraits<T1, T2, D, src_size, src_size, dst_size> : arithm::ArithmFuncTraits<src_size, dst_size>
    typedef void (*func_t)(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, Stream& stream);
    static const func_t funcs[7][7][7] =
    {
        {
            {
                addWeightedImpl<uchar, uchar, uchar >,
                addWeightedImpl<uchar, uchar, schar >,
                addWeightedImpl<uchar, uchar, ushort>,
                addWeightedImpl<uchar, uchar, short >,
                addWeightedImpl<uchar, uchar, int   >,
                addWeightedImpl<uchar, uchar, float >,
                addWeightedImpl<uchar, uchar, double>
            },
            {
                addWeightedImpl<uchar, schar, uchar >,
                addWeightedImpl<uchar, schar, schar >,
                addWeightedImpl<uchar, schar, ushort>,
                addWeightedImpl<uchar, schar, short >,
                addWeightedImpl<uchar, schar, int   >,
                addWeightedImpl<uchar, schar, float >,
                addWeightedImpl<uchar, schar, double>
            },
            {
                addWeightedImpl<uchar, ushort, uchar >,
                addWeightedImpl<uchar, ushort, schar >,
                addWeightedImpl<uchar, ushort, ushort>,
                addWeightedImpl<uchar, ushort, short >,
                addWeightedImpl<uchar, ushort, int   >,
                addWeightedImpl<uchar, ushort, float >,
                addWeightedImpl<uchar, ushort, double>
            },
            {
                addWeightedImpl<uchar, short, uchar >,
                addWeightedImpl<uchar, short, schar >,
                addWeightedImpl<uchar, short, ushort>,
                addWeightedImpl<uchar, short, short >,
                addWeightedImpl<uchar, short, int   >,
                addWeightedImpl<uchar, short, float >,
                addWeightedImpl<uchar, short, double>
            },
            {
                addWeightedImpl<uchar, int, uchar >,
                addWeightedImpl<uchar, int, schar >,
                addWeightedImpl<uchar, int, ushort>,
                addWeightedImpl<uchar, int, short >,
                addWeightedImpl<uchar, int, int   >,
                addWeightedImpl<uchar, int, float >,
                addWeightedImpl<uchar, int, double>
            },
            {
                addWeightedImpl<uchar, float, uchar >,
                addWeightedImpl<uchar, float, schar >,
                addWeightedImpl<uchar, float, ushort>,
                addWeightedImpl<uchar, float, short >,
                addWeightedImpl<uchar, float, int   >,
                addWeightedImpl<uchar, float, float >,
                addWeightedImpl<uchar, float, double>
            },
            {
                addWeightedImpl<uchar, double, uchar >,
                addWeightedImpl<uchar, double, schar >,
                addWeightedImpl<uchar, double, ushort>,
                addWeightedImpl<uchar, double, short >,
                addWeightedImpl<uchar, double, int   >,
                addWeightedImpl<uchar, double, float >,
                addWeightedImpl<uchar, double, double>
            }
        },
        {
            {
                0/*addWeightedImpl<schar, uchar, uchar >*/,
                0/*addWeightedImpl<schar, uchar, schar >*/,
                0/*addWeightedImpl<schar, uchar, ushort>*/,
                0/*addWeightedImpl<schar, uchar, short >*/,
                0/*addWeightedImpl<schar, uchar, int   >*/,
                0/*addWeightedImpl<schar, uchar, float >*/,
                0/*addWeightedImpl<schar, uchar, double>*/
            },
            {
                addWeightedImpl<schar, schar, uchar >,
                addWeightedImpl<schar, schar, schar >,
                addWeightedImpl<schar, schar, ushort>,
                addWeightedImpl<schar, schar, short >,
                addWeightedImpl<schar, schar, int   >,
                addWeightedImpl<schar, schar, float >,
                addWeightedImpl<schar, schar, double>
            },
            {
                addWeightedImpl<schar, ushort, uchar >,
                addWeightedImpl<schar, ushort, schar >,
                addWeightedImpl<schar, ushort, ushort>,
                addWeightedImpl<schar, ushort, short >,
                addWeightedImpl<schar, ushort, int   >,
                addWeightedImpl<schar, ushort, float >,
                addWeightedImpl<schar, ushort, double>
            },
            {
                addWeightedImpl<schar, short, uchar >,
                addWeightedImpl<schar, short, schar >,
                addWeightedImpl<schar, short, ushort>,
                addWeightedImpl<schar, short, short >,
                addWeightedImpl<schar, short, int   >,
                addWeightedImpl<schar, short, float >,
                addWeightedImpl<schar, short, double>
            },
            {
                addWeightedImpl<schar, int, uchar >,
                addWeightedImpl<schar, int, schar >,
                addWeightedImpl<schar, int, ushort>,
                addWeightedImpl<schar, int, short >,
                addWeightedImpl<schar, int, int   >,
                addWeightedImpl<schar, int, float >,
                addWeightedImpl<schar, int, double>
            },
            {
                addWeightedImpl<schar, float, uchar >,
                addWeightedImpl<schar, float, schar >,
                addWeightedImpl<schar, float, ushort>,
                addWeightedImpl<schar, float, short >,
                addWeightedImpl<schar, float, int   >,
                addWeightedImpl<schar, float, float >,
                addWeightedImpl<schar, float, double>
            },
            {
                addWeightedImpl<schar, double, uchar >,
                addWeightedImpl<schar, double, schar >,
                addWeightedImpl<schar, double, ushort>,
                addWeightedImpl<schar, double, short >,
                addWeightedImpl<schar, double, int   >,
                addWeightedImpl<schar, double, float >,
                addWeightedImpl<schar, double, double>
            }
        },
        {
            {
                0/*addWeightedImpl<ushort, uchar, uchar >*/,
                0/*addWeightedImpl<ushort, uchar, schar >*/,
                0/*addWeightedImpl<ushort, uchar, ushort>*/,
                0/*addWeightedImpl<ushort, uchar, short >*/,
                0/*addWeightedImpl<ushort, uchar, int   >*/,
                0/*addWeightedImpl<ushort, uchar, float >*/,
                0/*addWeightedImpl<ushort, uchar, double>*/
            },
            {
                0/*addWeightedImpl<ushort, schar, uchar >*/,
                0/*addWeightedImpl<ushort, schar, schar >*/,
                0/*addWeightedImpl<ushort, schar, ushort>*/,
                0/*addWeightedImpl<ushort, schar, short >*/,
                0/*addWeightedImpl<ushort, schar, int   >*/,
                0/*addWeightedImpl<ushort, schar, float >*/,
                0/*addWeightedImpl<ushort, schar, double>*/
            },
            {
                addWeightedImpl<ushort, ushort, uchar >,
                addWeightedImpl<ushort, ushort, schar >,
                addWeightedImpl<ushort, ushort, ushort>,
                addWeightedImpl<ushort, ushort, short >,
                addWeightedImpl<ushort, ushort, int   >,
                addWeightedImpl<ushort, ushort, float >,
                addWeightedImpl<ushort, ushort, double>
            },
            {
                addWeightedImpl<ushort, short, uchar >,
                addWeightedImpl<ushort, short, schar >,
                addWeightedImpl<ushort, short, ushort>,
                addWeightedImpl<ushort, short, short >,
                addWeightedImpl<ushort, short, int   >,
                addWeightedImpl<ushort, short, float >,
                addWeightedImpl<ushort, short, double>
            },
            {
                addWeightedImpl<ushort, int, uchar >,
                addWeightedImpl<ushort, int, schar >,
                addWeightedImpl<ushort, int, ushort>,
                addWeightedImpl<ushort, int, short >,
                addWeightedImpl<ushort, int, int   >,
                addWeightedImpl<ushort, int, float >,
                addWeightedImpl<ushort, int, double>
            },
            {
                addWeightedImpl<ushort, float, uchar >,
                addWeightedImpl<ushort, float, schar >,
                addWeightedImpl<ushort, float, ushort>,
                addWeightedImpl<ushort, float, short >,
                addWeightedImpl<ushort, float, int   >,
                addWeightedImpl<ushort, float, float >,
                addWeightedImpl<ushort, float, double>
            },
            {
                addWeightedImpl<ushort, double, uchar >,
                addWeightedImpl<ushort, double, schar >,
                addWeightedImpl<ushort, double, ushort>,
                addWeightedImpl<ushort, double, short >,
                addWeightedImpl<ushort, double, int   >,
                addWeightedImpl<ushort, double, float >,
                addWeightedImpl<ushort, double, double>
            }
        },
        {
            {
                0/*addWeightedImpl<short, uchar, uchar >*/,
                0/*addWeightedImpl<short, uchar, schar >*/,
                0/*addWeightedImpl<short, uchar, ushort>*/,
                0/*addWeightedImpl<short, uchar, short >*/,
                0/*addWeightedImpl<short, uchar, int   >*/,
                0/*addWeightedImpl<short, uchar, float >*/,
                0/*addWeightedImpl<short, uchar, double>*/
            },
            {
                0/*addWeightedImpl<short, schar, uchar >*/,
                0/*addWeightedImpl<short, schar, schar >*/,
                0/*addWeightedImpl<short, schar, ushort>*/,
                0/*addWeightedImpl<short, schar, short >*/,
                0/*addWeightedImpl<short, schar, int   >*/,
                0/*addWeightedImpl<short, schar, float >*/,
                0/*addWeightedImpl<short, schar, double>*/
            },
            {
                0/*addWeightedImpl<short, ushort, uchar >*/,
                0/*addWeightedImpl<short, ushort, schar >*/,
                0/*addWeightedImpl<short, ushort, ushort>*/,
                0/*addWeightedImpl<short, ushort, short >*/,
                0/*addWeightedImpl<short, ushort, int   >*/,
                0/*addWeightedImpl<short, ushort, float >*/,
                0/*addWeightedImpl<short, ushort, double>*/
            },
            {
                addWeightedImpl<short, short, uchar >,
                addWeightedImpl<short, short, schar >,
                addWeightedImpl<short, short, ushort>,
                addWeightedImpl<short, short, short >,
                addWeightedImpl<short, short, int   >,
                addWeightedImpl<short, short, float >,
                addWeightedImpl<short, short, double>
            },
            {
                addWeightedImpl<short, int, uchar >,
                addWeightedImpl<short, int, schar >,
                addWeightedImpl<short, int, ushort>,
                addWeightedImpl<short, int, short >,
                addWeightedImpl<short, int, int   >,
                addWeightedImpl<short, int, float >,
                addWeightedImpl<short, int, double>
            },
            {
                addWeightedImpl<short, float, uchar >,
                addWeightedImpl<short, float, schar >,
                addWeightedImpl<short, float, ushort>,
                addWeightedImpl<short, float, short >,
                addWeightedImpl<short, float, int   >,
                addWeightedImpl<short, float, float >,
                addWeightedImpl<short, float, double>
            },
            {
                addWeightedImpl<short, double, uchar >,
                addWeightedImpl<short, double, schar >,
                addWeightedImpl<short, double, ushort>,
                addWeightedImpl<short, double, short >,
                addWeightedImpl<short, double, int   >,
                addWeightedImpl<short, double, float >,
                addWeightedImpl<short, double, double>
            }
        },
        {
            {
                0/*addWeightedImpl<int, uchar, uchar >*/,
                0/*addWeightedImpl<int, uchar, schar >*/,
                0/*addWeightedImpl<int, uchar, ushort>*/,
                0/*addWeightedImpl<int, uchar, short >*/,
                0/*addWeightedImpl<int, uchar, int   >*/,
                0/*addWeightedImpl<int, uchar, float >*/,
                0/*addWeightedImpl<int, uchar, double>*/
            },
            {
                0/*addWeightedImpl<int, schar, uchar >*/,
                0/*addWeightedImpl<int, schar, schar >*/,
                0/*addWeightedImpl<int, schar, ushort>*/,
                0/*addWeightedImpl<int, schar, short >*/,
                0/*addWeightedImpl<int, schar, int   >*/,
                0/*addWeightedImpl<int, schar, float >*/,
                0/*addWeightedImpl<int, schar, double>*/
            },
            {
                0/*addWeightedImpl<int, ushort, uchar >*/,
                0/*addWeightedImpl<int, ushort, schar >*/,
                0/*addWeightedImpl<int, ushort, ushort>*/,
                0/*addWeightedImpl<int, ushort, short >*/,
                0/*addWeightedImpl<int, ushort, int   >*/,
                0/*addWeightedImpl<int, ushort, float >*/,
                0/*addWeightedImpl<int, ushort, double>*/
            },
            {
                0/*addWeightedImpl<int, short, uchar >*/,
                0/*addWeightedImpl<int, short, schar >*/,
                0/*addWeightedImpl<int, short, ushort>*/,
                0/*addWeightedImpl<int, short, short >*/,
                0/*addWeightedImpl<int, short, int   >*/,
                0/*addWeightedImpl<int, short, float >*/,
                0/*addWeightedImpl<int, short, double>*/
            },
            {
                addWeightedImpl<int, int, uchar >,
                addWeightedImpl<int, int, schar >,
                addWeightedImpl<int, int, ushort>,
                addWeightedImpl<int, int, short >,
                addWeightedImpl<int, int, int   >,
                addWeightedImpl<int, int, float >,
                addWeightedImpl<int, int, double>
            },
            {
                addWeightedImpl<int, float, uchar >,
                addWeightedImpl<int, float, schar >,
                addWeightedImpl<int, float, ushort>,
                addWeightedImpl<int, float, short >,
                addWeightedImpl<int, float, int   >,
                addWeightedImpl<int, float, float >,
                addWeightedImpl<int, float, double>
            },
            {
                addWeightedImpl<int, double, uchar >,
                addWeightedImpl<int, double, schar >,
                addWeightedImpl<int, double, ushort>,
                addWeightedImpl<int, double, short >,
                addWeightedImpl<int, double, int   >,
                addWeightedImpl<int, double, float >,
                addWeightedImpl<int, double, double>
            }
        },
        {
            {
                0/*addWeightedImpl<float, uchar, uchar >*/,
                0/*addWeightedImpl<float, uchar, schar >*/,
                0/*addWeightedImpl<float, uchar, ushort>*/,
                0/*addWeightedImpl<float, uchar, short >*/,
                0/*addWeightedImpl<float, uchar, int   >*/,
                0/*addWeightedImpl<float, uchar, float >*/,
                0/*addWeightedImpl<float, uchar, double>*/
            },
            {
                0/*addWeightedImpl<float, schar, uchar >*/,
                0/*addWeightedImpl<float, schar, schar >*/,
                0/*addWeightedImpl<float, schar, ushort>*/,
                0/*addWeightedImpl<float, schar, short >*/,
                0/*addWeightedImpl<float, schar, int   >*/,
                0/*addWeightedImpl<float, schar, float >*/,
                0/*addWeightedImpl<float, schar, double>*/
            },
            {
                0/*addWeightedImpl<float, ushort, uchar >*/,
                0/*addWeightedImpl<float, ushort, schar >*/,
                0/*addWeightedImpl<float, ushort, ushort>*/,
                0/*addWeightedImpl<float, ushort, short >*/,
                0/*addWeightedImpl<float, ushort, int   >*/,
                0/*addWeightedImpl<float, ushort, float >*/,
                0/*addWeightedImpl<float, ushort, double>*/
            },
            {
                0/*addWeightedImpl<float, short, uchar >*/,
                0/*addWeightedImpl<float, short, schar >*/,
                0/*addWeightedImpl<float, short, ushort>*/,
                0/*addWeightedImpl<float, short, short >*/,
                0/*addWeightedImpl<float, short, int   >*/,
                0/*addWeightedImpl<float, short, float >*/,
                0/*addWeightedImpl<float, short, double>*/
            },
            {
                0/*addWeightedImpl<float, int, uchar >*/,
                0/*addWeightedImpl<float, int, schar >*/,
                0/*addWeightedImpl<float, int, ushort>*/,
                0/*addWeightedImpl<float, int, short >*/,
                0/*addWeightedImpl<float, int, int   >*/,
                0/*addWeightedImpl<float, int, float >*/,
                0/*addWeightedImpl<float, int, double>*/
            },
            {
                addWeightedImpl<float, float, uchar >,
                addWeightedImpl<float, float, schar >,
                addWeightedImpl<float, float, ushort>,
                addWeightedImpl<float, float, short >,
                addWeightedImpl<float, float, int   >,
                addWeightedImpl<float, float, float >,
                addWeightedImpl<float, float, double>
            },
            {
                addWeightedImpl<float, double, uchar >,
                addWeightedImpl<float, double, schar >,
                addWeightedImpl<float, double, ushort>,
                addWeightedImpl<float, double, short >,
                addWeightedImpl<float, double, int   >,
                addWeightedImpl<float, double, float >,
                addWeightedImpl<float, double, double>
            }
        },
        {
            {
                0/*addWeightedImpl<double, uchar, uchar >*/,
                0/*addWeightedImpl<double, uchar, schar >*/,
                0/*addWeightedImpl<double, uchar, ushort>*/,
                0/*addWeightedImpl<double, uchar, short >*/,
                0/*addWeightedImpl<double, uchar, int   >*/,
                0/*addWeightedImpl<double, uchar, float >*/,
                0/*addWeightedImpl<double, uchar, double>*/
            },
            {
                0/*addWeightedImpl<double, schar, uchar >*/,
                0/*addWeightedImpl<double, schar, schar >*/,
                0/*addWeightedImpl<double, schar, ushort>*/,
                0/*addWeightedImpl<double, schar, short >*/,
                0/*addWeightedImpl<double, schar, int   >*/,
                0/*addWeightedImpl<double, schar, float >*/,
                0/*addWeightedImpl<double, schar, double>*/
            },
            {
                0/*addWeightedImpl<double, ushort, uchar >*/,
                0/*addWeightedImpl<double, ushort, schar >*/,
                0/*addWeightedImpl<double, ushort, ushort>*/,
                0/*addWeightedImpl<double, ushort, short >*/,
                0/*addWeightedImpl<double, ushort, int   >*/,
                0/*addWeightedImpl<double, ushort, float >*/,
                0/*addWeightedImpl<double, ushort, double>*/
            },
            {
                0/*addWeightedImpl<double, short, uchar >*/,
                0/*addWeightedImpl<double, short, schar >*/,
                0/*addWeightedImpl<double, short, ushort>*/,
                0/*addWeightedImpl<double, short, short >*/,
                0/*addWeightedImpl<double, short, int   >*/,
                0/*addWeightedImpl<double, short, float >*/,
                0/*addWeightedImpl<double, short, double>*/
            },
            {
                0/*addWeightedImpl<double, int, uchar >*/,
                0/*addWeightedImpl<double, int, schar >*/,
                0/*addWeightedImpl<double, int, ushort>*/,
                0/*addWeightedImpl<double, int, short >*/,
                0/*addWeightedImpl<double, int, int   >*/,
                0/*addWeightedImpl<double, int, float >*/,
                0/*addWeightedImpl<double, int, double>*/
            },
            {
                0/*addWeightedImpl<double, float, uchar >*/,
                0/*addWeightedImpl<double, float, schar >*/,
                0/*addWeightedImpl<double, float, ushort>*/,
                0/*addWeightedImpl<double, float, short >*/,
                0/*addWeightedImpl<double, float, int   >*/,
                0/*addWeightedImpl<double, float, float >*/,
                0/*addWeightedImpl<double, float, double>*/
            },
            {
                addWeightedImpl<double, double, uchar >,
                addWeightedImpl<double, double, schar >,
                addWeightedImpl<double, double, ushort>,
                addWeightedImpl<double, double, short >,
                addWeightedImpl<double, double, int   >,
                addWeightedImpl<double, double, float >,
                addWeightedImpl<double, double, double>
            }
        }
    };

    template <typename T1, typename T2, typename D> struct TransformFunctorTraits< arithm::AddWeighted<T1, T2, D> > : AddWeightedTraits<T1, T2, D, sizeof(T1), sizeof(T2), sizeof(D)>
    {
    };
}}}
    GpuMat src1 = _src1.getGpuMat();
    GpuMat src2 = _src2.getGpuMat();

namespace arithm
{
    template <typename T1, typename T2, typename D>
    void addWeighted(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream)
    {
        AddWeighted<T1, T2, D> op(alpha, beta, gamma);
    int sdepth1 = src1.depth();
    int sdepth2 = src2.depth();

        device::transform((PtrStepSz<T1>) src1, (PtrStepSz<T2>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
    ddepth = ddepth >= 0 ? CV_MAT_DEPTH(ddepth) : std::max(sdepth1, sdepth2);
    const int cn = src1.channels();

    CV_DbgAssert( src2.size() == src1.size() && src2.channels() == cn );
    CV_DbgAssert( sdepth1 <= CV_64F && sdepth2 <= CV_64F && ddepth <= CV_64F );

    _dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
    GpuMat dst = _dst.getGpuMat();

    GpuMat src1_ = src1.reshape(1);
    GpuMat src2_ = src2.reshape(1);
    GpuMat dst_ = dst.reshape(1);

    if (sdepth1 > sdepth2)
    {
        src1_.swap(src2_);
        std::swap(alpha, beta);
        std::swap(sdepth1, sdepth2);
    }

    template void addWeighted<uchar, uchar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, uchar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, uchar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, uchar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, uchar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, uchar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, uchar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    const func_t func = funcs[sdepth1][sdepth2][ddepth];

    template void addWeighted<uchar, schar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, schar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, schar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, schar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, schar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, schar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, schar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    if (!func)
        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");

    template void addWeighted<uchar, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);

    template void addWeighted<uchar, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);

    template void addWeighted<uchar, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);

    template void addWeighted<uchar, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);

    template void addWeighted<uchar, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);

    template void addWeighted<schar, schar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, schar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, schar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, schar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, schar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, schar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, schar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);

    template void addWeighted<schar, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);

    template void addWeighted<schar, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);

    template void addWeighted<schar, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);

    template void addWeighted<schar, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);

    template void addWeighted<schar, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);

    template void addWeighted<ushort, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);

    template void addWeighted<ushort, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);

    template void addWeighted<ushort, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);

    template void addWeighted<ushort, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);

    template void addWeighted<ushort, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);

    template void addWeighted<short, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<short, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
|
||||
template void addWeighted<short, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
|
||||
template void addWeighted<short, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
|
||||
template void addWeighted<short, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<short, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
|
||||
|
||||
|
||||
template void addWeighted<int, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<int, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<int, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<int, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<int, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<int, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<int, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
|
||||
template void addWeighted<int, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<int, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<int, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<int, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<int, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<int, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<int, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
|
||||
template void addWeighted<int, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<int, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<int, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<int, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<int, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<int, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<int, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
|
||||
|
||||
|
||||
template void addWeighted<float, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<float, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<float, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<float, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<float, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<float, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<float, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
|
||||
template void addWeighted<float, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<float, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<float, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<float, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<float, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<float, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<float, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
|
||||
|
||||
|
||||
template void addWeighted<double, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<double, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<double, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<double, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<double, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<double, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
template void addWeighted<double, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
|
||||
func(src1_, alpha, src2_, beta, gamma, dst_, stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
||||
#endif
|
||||
|
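For reference, every instantiation above implements the same element-wise rule: dst(y,x) = saturate_cast<D>(src1(y,x) * alpha + src2(y,x) * beta + gamma). A minimal host-side sketch of that rule for uchar data follows (hypothetical helper names, not part of the patch; OpenCV's saturate_cast rounds to nearest, approximated here):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    // Scalar model of addWeighted on uchar: weighted sum in double,
    // then clamp to [0, 255]. Rounding approximated with +0.5 (assumption).
    static uint8_t add_weighted_u8(uint8_t a, uint8_t b, double alpha, double beta, double gamma)
    {
        const double v = a * alpha + b * beta + gamma;
        return static_cast<uint8_t>(std::min(std::max(v + 0.5, 0.0), 255.0));
    }

    int main()
    {
        assert(add_weighted_u8(100, 200, 0.5, 0.5, 0.0) == 150);
        assert(add_weighted_u8(255, 255, 1.0, 1.0, 0.0) == 255); // saturates at 255
        return 0;
    }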
@ -1,145 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __ARITHM_FUNC_TRAITS_HPP__
#define __ARITHM_FUNC_TRAITS_HPP__

#include <cstddef>

namespace arithm
{
    template <size_t src_size, size_t dst_size> struct ArithmFuncTraits
    {
        enum { simple_block_dim_x = 32 };
        enum { simple_block_dim_y = 8 };

        enum { smart_block_dim_x = 32 };
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 1 };
    };

    template <> struct ArithmFuncTraits<1, 1>
    {
        enum { simple_block_dim_x = 32 };
        enum { simple_block_dim_y = 8 };

        enum { smart_block_dim_x = 32 };
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    template <> struct ArithmFuncTraits<1, 2>
    {
        enum { simple_block_dim_x = 32 };
        enum { simple_block_dim_y = 8 };

        enum { smart_block_dim_x = 32 };
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    template <> struct ArithmFuncTraits<1, 4>
    {
        enum { simple_block_dim_x = 32 };
        enum { simple_block_dim_y = 8 };

        enum { smart_block_dim_x = 32 };
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };

    template <> struct ArithmFuncTraits<2, 1>
    {
        enum { simple_block_dim_x = 32 };
        enum { simple_block_dim_y = 8 };

        enum { smart_block_dim_x = 32 };
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    template <> struct ArithmFuncTraits<2, 2>
    {
        enum { simple_block_dim_x = 32 };
        enum { simple_block_dim_y = 8 };

        enum { smart_block_dim_x = 32 };
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    template <> struct ArithmFuncTraits<2, 4>
    {
        enum { simple_block_dim_x = 32 };
        enum { simple_block_dim_y = 8 };

        enum { smart_block_dim_x = 32 };
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };

    template <> struct ArithmFuncTraits<4, 1>
    {
        enum { simple_block_dim_x = 32 };
        enum { simple_block_dim_y = 8 };

        enum { smart_block_dim_x = 32 };
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    template <> struct ArithmFuncTraits<4, 2>
    {
        enum { simple_block_dim_x = 32 };
        enum { simple_block_dim_y = 8 };

        enum { smart_block_dim_x = 32 };
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    template <> struct ArithmFuncTraits<4, 4>
    {
        enum { simple_block_dim_x = 32 };
        enum { simple_block_dim_y = 8 };

        enum { smart_block_dim_x = 32 };
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
}

#endif // __ARITHM_FUNC_TRAITS_HPP__
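The traits file deleted above parameterized the legacy device::transform launches: a 32x8 thread block, with smart_shift plausibly the number of elements each thread handles along x (an inference from how such transform kernels are typically sized; the patch itself does not spell this out). A host-side sketch of the grid computation that reading implies:

    #include <cassert>

    static int div_up(int total, int grain) { return (total + grain - 1) / grain; }

    int main()
    {
        // Values from ArithmFuncTraits<1, 1>: 32x8 block, smart_shift = 4.
        const int block_x = 32, block_y = 8, shift = 4;
        const int cols = 1000, rows = 600;
        const int grid_x = div_up(cols, block_x * shift); // each thread covers 4 elements
        const int grid_y = div_up(rows, block_y);
        assert(grid_x == 8 && grid_y == 75);
        return 0;
    }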
@ -40,87 +40,187 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV

#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"

using namespace cv::cuda;
using namespace cv::cuda::device;
#else

namespace cv { namespace cuda { namespace device
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

void bitMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op);

//////////////////////////////////////////////////////////////////////////////
/// bitwise_not

void cv::cuda::bitwise_not(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream)
{
template <typename T> struct TransformFunctorTraits< bit_not<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();

template <typename T> struct TransformFunctorTraits< bit_and<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
const int depth = src.depth();

template <typename T> struct TransformFunctorTraits< bit_or<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
CV_DbgAssert( depth <= CV_32F );
CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );

template <typename T> struct TransformFunctorTraits< bit_xor<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();

namespace arithm
{
template <typename T> void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
if (mask.empty())
{
if (mask.data)
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), mask, stream);
const int bcols = (int) (src.cols * src.elemSize());

if ((bcols & 3) == 0)
{
const int vcols = bcols >> 2;

GlobPtrSz<uint> vsrc = globPtr((uint*) src.data, src.step, src.rows, vcols);
GlobPtrSz<uint> vdst = globPtr((uint*) dst.data, dst.step, src.rows, vcols);

gridTransformUnary(vsrc, vdst, bit_not<uint>(), stream);
}
else if ((bcols & 1) == 0)
{
const int vcols = bcols >> 1;

GlobPtrSz<ushort> vsrc = globPtr((ushort*) src.data, src.step, src.rows, vcols);
GlobPtrSz<ushort> vdst = globPtr((ushort*) dst.data, dst.step, src.rows, vcols);

gridTransformUnary(vsrc, vdst, bit_not<ushort>(), stream);
}
else
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), WithOutMask(), stream);
}
{
GlobPtrSz<uchar> vsrc = globPtr((uchar*) src.data, src.step, src.rows, bcols);
GlobPtrSz<uchar> vdst = globPtr((uchar*) dst.data, dst.step, src.rows, bcols);

template <typename T> void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
gridTransformUnary(vsrc, vdst, bit_not<uchar>(), stream);
}
}
else
{
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), mask, stream);
if (depth == CV_32F || depth == CV_32S)
{
GlobPtrSz<uint> vsrc = globPtr((uint*) src.data, src.step, src.rows, src.cols * src.channels());
GlobPtrSz<uint> vdst = globPtr((uint*) dst.data, dst.step, src.rows, src.cols * src.channels());

gridTransformUnary(vsrc, vdst, bit_not<uint>(), singleMaskChannels(globPtr<uchar>(mask), src.channels()), stream);
}
else if (depth == CV_16S || depth == CV_16U)
{
GlobPtrSz<ushort> vsrc = globPtr((ushort*) src.data, src.step, src.rows, src.cols * src.channels());
GlobPtrSz<ushort> vdst = globPtr((ushort*) dst.data, dst.step, src.rows, src.cols * src.channels());

gridTransformUnary(vsrc, vdst, bit_not<ushort>(), singleMaskChannels(globPtr<uchar>(mask), src.channels()), stream);
}
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), WithOutMask(), stream);
{
GlobPtrSz<uchar> vsrc = globPtr((uchar*) src.data, src.step, src.rows, src.cols * src.channels());
GlobPtrSz<uchar> vdst = globPtr((uchar*) dst.data, dst.step, src.rows, src.cols * src.channels());

gridTransformUnary(vsrc, vdst, bit_not<uchar>(), singleMaskChannels(globPtr<uchar>(mask), src.channels()), stream);
}
}

template <typename T> void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), mask, stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), WithOutMask(), stream);
}

template <typename T> void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), mask, stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), WithOutMask(), stream);
}

template void bitMatNot<uchar>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatNot<ushort>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatNot<uint>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);

template void bitMatAnd<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatAnd<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatAnd<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);

template void bitMatOr<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatOr<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatOr<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);

template void bitMatXor<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatXor<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatXor<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
}

#endif // CUDA_DISABLER
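Both the legacy bitMat* kernels above and the cudev-based paths share the same masking contract: an element is written only where the mask byte for its pixel is non-zero, and a single-channel mask gates every channel of that pixel (what singleMaskChannels expresses). A scalar model of that contract follows (hypothetical helper, not the cudev API):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Scalar model of a masked binary transform over a cn-channel row:
    // mask has one byte per pixel; a non-zero byte enables all cn channels.
    static void masked_and_row(const std::vector<uint8_t>& a, const std::vector<uint8_t>& b,
                               const std::vector<uint8_t>& mask, int cn, std::vector<uint8_t>& dst)
    {
        for (size_t i = 0; i < a.size(); ++i)
            if (mask[i / cn])           // one mask byte gates cn interleaved channels
                dst[i] = a[i] & b[i];   // masked-out elements keep their previous value
    }

    int main()
    {
        std::vector<uint8_t> a = {0xFF, 0x0F, 0xFF, 0x0F}, b = {0xF0, 0xF0, 0xF0, 0xF0};
        std::vector<uint8_t> mask = {1, 0};           // 2 pixels, cn = 2
        std::vector<uint8_t> dst = {0, 0, 0, 0};
        masked_and_row(a, b, mask, 2, dst);
        assert(dst[0] == 0xF0 && dst[1] == 0x00);     // pixel 0 written
        assert(dst[2] == 0 && dst[3] == 0);           // pixel 1 untouched
        return 0;
    }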
//////////////////////////////////////////////////////////////////////////////
/// Binary bitwise logical operations

namespace
{
template <template <typename> class Op, typename T>
void bitMatOp(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
{
GlobPtrSz<T> vsrc1 = globPtr((T*) src1.data, src1.step, src1.rows, src1.cols * src1.channels());
GlobPtrSz<T> vsrc2 = globPtr((T*) src2.data, src2.step, src1.rows, src1.cols * src1.channels());
GlobPtrSz<T> vdst = globPtr((T*) dst.data, dst.step, src1.rows, src1.cols * src1.channels());

if (mask.data)
gridTransformBinary(vsrc1, vsrc2, vdst, Op<T>(), singleMaskChannels(globPtr<uchar>(mask), src1.channels()), stream);
else
gridTransformBinary(vsrc1, vsrc2, vdst, Op<T>(), stream);
}
}

void bitMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream);
static const func_t funcs32[] =
{
bitMatOp<bit_and, uint>,
bitMatOp<bit_or, uint>,
bitMatOp<bit_xor, uint>
};
static const func_t funcs16[] =
{
bitMatOp<bit_and, ushort>,
bitMatOp<bit_or, ushort>,
bitMatOp<bit_xor, ushort>
};
static const func_t funcs8[] =
{
bitMatOp<bit_and, uchar>,
bitMatOp<bit_or, uchar>,
bitMatOp<bit_xor, uchar>
};

const int depth = src1.depth();

CV_DbgAssert( depth <= CV_32F );
CV_DbgAssert( op >= 0 && op < 3 );

if (mask.empty())
{
const int bcols = (int) (src1.cols * src1.elemSize());

if ((bcols & 3) == 0)
{
const int vcols = bcols >> 2;

GpuMat vsrc1(src1.rows, vcols, CV_32SC1, src1.data, src1.step);
GpuMat vsrc2(src1.rows, vcols, CV_32SC1, src2.data, src2.step);
GpuMat vdst(src1.rows, vcols, CV_32SC1, dst.data, dst.step);

funcs32[op](vsrc1, vsrc2, vdst, GpuMat(), stream);
}
else if ((bcols & 1) == 0)
{
const int vcols = bcols >> 1;

GpuMat vsrc1(src1.rows, vcols, CV_16UC1, src1.data, src1.step);
GpuMat vsrc2(src1.rows, vcols, CV_16UC1, src2.data, src2.step);
GpuMat vdst(src1.rows, vcols, CV_16UC1, dst.data, dst.step);

funcs16[op](vsrc1, vsrc2, vdst, GpuMat(), stream);
}
else
{
GpuMat vsrc1(src1.rows, bcols, CV_8UC1, src1.data, src1.step);
GpuMat vsrc2(src1.rows, bcols, CV_8UC1, src2.data, src2.step);
GpuMat vdst(src1.rows, bcols, CV_8UC1, dst.data, dst.step);

funcs8[op](vsrc1, vsrc2, vdst, GpuMat(), stream);
}
}
else
{
if (depth == CV_32F || depth == CV_32S)
{
funcs32[op](src1, src2, dst, mask, stream);
}
else if (depth == CV_16S || depth == CV_16U)
{
funcs16[op](src1, src2, dst, mask, stream);
}
else
{
funcs8[op](src1, src2, dst, mask, stream);
}
}
}

#endif
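The unmasked path above reinterprets each row as the widest unsigned type whose size divides the row byte count, so a single element covers up to 4 bytes. A host sketch of that width selection (hypothetical helper name):

    #include <cassert>

    // Mirrors the (bcols & 3) / (bcols & 1) cascade: returns the element
    // width in bytes used to reinterpret a row of `bcols` bytes.
    static int vector_width(int bcols)
    {
        if ((bcols & 3) == 0) return 4; // row length divisible by 4 -> uint
        if ((bcols & 1) == 0) return 2; // divisible by 2            -> ushort
        return 1;                       // otherwise                 -> uchar
    }

    int main()
    {
        assert(vector_width(640 * 3) == 4); // 1920-byte rows
        assert(vector_width(10) == 2);
        assert(vector_width(7) == 1);
        return 0;
    }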
@ -40,65 +40,132 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV

#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"

using namespace cv::cuda;
using namespace cv::cuda::device;
#else

namespace cv { namespace cuda { namespace device
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"

using namespace cv::cudev;

void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op);

namespace
{
template <typename T> struct TransformFunctorTraits< binder2nd< bit_and<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <template <typename> class Op, typename T>
void bitScalarOp(const GpuMat& src, uint value, GpuMat& dst, Stream& stream)
{
};

template <typename T> struct TransformFunctorTraits< binder2nd< bit_or<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};

template <typename T> struct TransformFunctorTraits< binder2nd< bit_xor<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}

namespace arithm
{
template <typename T> void bitScalarAnd(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_and<T>(), src2), WithOutMask(), stream);
gridTransformUnary(globPtr<T>(src), globPtr<T>(dst), bind2nd(Op<T>(), value), stream);
}

template <typename T> void bitScalarOr(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
typedef void (*bit_scalar_func_t)(const GpuMat& src, uint value, GpuMat& dst, Stream& stream);

template <typename T, bit_scalar_func_t func> struct BitScalar
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_or<T>(), src2), WithOutMask(), stream);
}
static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream)
{
func(src, cv::saturate_cast<T>(value[0]), dst, stream);
}
};

template <typename T> void bitScalarXor(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
template <bit_scalar_func_t func> struct BitScalar4
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_xor<T>(), src2), WithOutMask(), stream);
}
static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream)
{
uint packedVal = 0;

template void bitScalarAnd<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
packedVal |= cv::saturate_cast<uchar>(value[0]);
packedVal |= cv::saturate_cast<uchar>(value[1]) << 8;
packedVal |= cv::saturate_cast<uchar>(value[2]) << 16;
packedVal |= cv::saturate_cast<uchar>(value[3]) << 24;

template void bitScalarOr<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
func(src, packedVal, dst, stream);
}
};

template void bitScalarXor<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template <int DEPTH, int cn> struct NppBitwiseCFunc
{
typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;

typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const npp_type* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI);
};

template <int DEPTH, int cn, typename NppBitwiseCFunc<DEPTH, cn>::func_t func> struct NppBitwiseC
{
typedef typename NppBitwiseCFunc<DEPTH, cn>::npp_type npp_type;

static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& _stream)
{
cudaStream_t stream = StreamAccessor::getStream(_stream);
NppStreamHandler h(stream);

NppiSize oSizeROI;
oSizeROI.width = src.cols;
oSizeROI.height = src.rows;

const npp_type pConstants[] =
{
cv::saturate_cast<npp_type>(value[0]),
cv::saturate_cast<npp_type>(value[1]),
cv::saturate_cast<npp_type>(value[2]),
cv::saturate_cast<npp_type>(value[3])
};

nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), pConstants, dst.ptr<npp_type>(), static_cast<int>(dst.step), oSizeROI) );

if (stream == 0)
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}
};
}

#endif // CUDA_DISABLER
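BitScalar4 above turns a CV_8UC4 scalar operand into one 32-bit word so a whole four-channel pixel is processed as a single uint. A host sketch of the packing (hypothetical helper name):

    #include <cassert>
    #include <cstdint>

    // Little-endian channel packing used by BitScalar4: channel 0 lands in
    // the least-significant byte, matching the in-memory uchar4 layout.
    static uint32_t pack_uchar4(uint8_t c0, uint8_t c1, uint8_t c2, uint8_t c3)
    {
        return uint32_t(c0) | (uint32_t(c1) << 8) | (uint32_t(c2) << 16) | (uint32_t(c3) << 24);
    }

    int main()
    {
        assert(pack_uchar4(0x12, 0x34, 0x56, 0x78) == 0x78563412u);
        return 0;
    }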
void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op)
{
(void) mask;

typedef void (*func_t)(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream);
static const func_t funcs[3][6][4] =
{
{
{BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
{BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
{BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
{BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
},
{
{BitScalar<uchar, bitScalarOp<bit_or, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_or, uint> >::call},
{BitScalar<uchar, bitScalarOp<bit_or, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_or, uint> >::call},
{BitScalar<ushort, bitScalarOp<bit_or, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
{BitScalar<ushort, bitScalarOp<bit_or, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_or, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_or, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
},
{
{BitScalar<uchar, bitScalarOp<bit_xor, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_xor, uint> >::call},
{BitScalar<uchar, bitScalarOp<bit_xor, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_xor, uint> >::call},
{BitScalar<ushort, bitScalarOp<bit_xor, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
{BitScalar<ushort, bitScalarOp<bit_xor, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_xor, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_xor, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
}
};

const int depth = src.depth();
const int cn = src.channels();

CV_DbgAssert( depth <= CV_32F );
CV_DbgAssert( cn == 1 || cn == 3 || cn == 4 );
CV_DbgAssert( mask.empty() );
CV_DbgAssert( op >= 0 && op < 3 );

funcs[op][depth][cn - 1](src, value, dst, stream);
}

#endif
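The dispatch table is indexed as funcs[op][depth][cn - 1]: op selects and/or/xor, depth the element type, cn the channel count, and the zero entries mark the unsupported 2-channel column. A reduced sketch of that lookup shape (depth axis omitted; hypothetical function names):

    #include <cassert>

    // Shape of the bitScalar dispatch: funcs[op][cn - 1], with null entries
    // for unsupported combinations (here, 2-channel images).
    typedef int (*op_fn)(int, int);
    static int and_fn(int a, int b) { return a & b; }
    static int or_fn (int a, int b) { return a | b; }
    static int xor_fn(int a, int b) { return a ^ b; }

    int main()
    {
        static const op_fn funcs[3][4] = {
            { and_fn, 0, and_fn, and_fn },
            { or_fn,  0, or_fn,  or_fn  },
            { xor_fn, 0, xor_fn, xor_fn },
        };
        const int op = 2, cn = 1;   // xor on a single-channel image
        assert(funcs[op][cn - 1](0xF0, 0x0F) == 0xFF);
        return 0;
    }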
@ -40,64 +40,23 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV

#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"

using namespace cv::cuda;
using namespace cv::cuda::device;
#else

namespace arithm
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

void cmpMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int cmpop);

namespace
{
struct VCmpEq4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmpeq4(a, b);
}

__host__ __device__ __forceinline__ VCmpEq4() {}
__host__ __device__ __forceinline__ VCmpEq4(const VCmpEq4&) {}
};
struct VCmpNe4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmpne4(a, b);
}

__host__ __device__ __forceinline__ VCmpNe4() {}
__host__ __device__ __forceinline__ VCmpNe4(const VCmpNe4&) {}
};
struct VCmpLt4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmplt4(a, b);
}

__host__ __device__ __forceinline__ VCmpLt4() {}
__host__ __device__ __forceinline__ VCmpLt4(const VCmpLt4&) {}
};
struct VCmpLe4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmple4(a, b);
}

__host__ __device__ __forceinline__ VCmpLe4() {}
__host__ __device__ __forceinline__ VCmpLe4(const VCmpLe4&) {}
};

template <class Op, typename T>
struct Cmp : binary_function<T, T, uchar>
template <class Op, typename T> struct CmpOp : binary_function<T, T, uchar>
{
__device__ __forceinline__ uchar operator()(T a, T b) const
{
@ -105,102 +64,156 @@ namespace arithm
return -op(a, b);
}
};
}

namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits< arithm::VCmpEq4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
template <> struct TransformFunctorTraits< arithm::VCmpNe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};
template <> struct TransformFunctorTraits< arithm::VCmpLt4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpLe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};

template <class Op, typename T> struct TransformFunctorTraits< arithm::Cmp<Op, T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
{
};
}}}

namespace arithm
{
void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VCmpEq4(), WithOutMask(), stream);
}
void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VCmpNe4(), WithOutMask(), stream);
}
void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VCmpLt4(), WithOutMask(), stream);
}
void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VCmpLe4(), WithOutMask(), stream);
}

template <template <typename> class Op, typename T>
void cmpMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void cmpMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
Cmp<Op<T>, T> op;
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, dst, op, WithOutMask(), stream);
CmpOp<Op<T>, T> op;
gridTransformBinary_< TransformPolicy<T> >(globPtr<T>(src1), globPtr<T>(src2), globPtr<uchar>(dst), op, stream);
}

template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
struct VCmpEq4 : binary_function<uint, uint, uint>
{
cmpMat<equal_to, T>(src1, src2, dst, stream);
}
template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmpeq4(a, b);
}
};
struct VCmpNe4 : binary_function<uint, uint, uint>
{
cmpMat<not_equal_to, T>(src1, src2, dst, stream);
}
template <typename T> void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmpne4(a, b);
}
};
struct VCmpLt4 : binary_function<uint, uint, uint>
{
cmpMat<less, T>(src1, src2, dst, stream);
}
template <typename T> void cmpMatLe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmplt4(a, b);
}
};
struct VCmpLe4 : binary_function<uint, uint, uint>
{
cmpMat<less_equal, T>(src1, src2, dst, stream);
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmple4(a, b);
}
};

void cmpMatEq_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
const int vcols = src1.cols >> 2;

GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);

gridTransformBinary(src1_, src2_, dst_, VCmpEq4(), stream);
}
void cmpMatNe_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
const int vcols = src1.cols >> 2;

template void cmpMatEq<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);

template void cmpMatNe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
gridTransformBinary(src1_, src2_, dst_, VCmpNe4(), stream);
}
void cmpMatLt_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
const int vcols = src1.cols >> 2;

template void cmpMatLt<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);

template void cmpMatLe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
gridTransformBinary(src1_, src2_, dst_, VCmpLt4(), stream);
}
void cmpMatLe_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
const int vcols = src1.cols >> 2;

GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);

gridTransformBinary(src1_, src2_, dst_, VCmpLe4(), stream);
}
}

#endif // CUDA_DISABLER
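The VCmp*4 functors wrap per-byte SIMD intrinsics (vcmpeq4 and friends from simd_functions.hpp) that compare four packed uchar lanes of a 32-bit word at once, setting each result byte to 0xFF or 0x00. A host emulation of vcmpeq4's semantics (reference implementation, not the device intrinsic):

    #include <cassert>
    #include <cstdint>

    // Host model of vcmpeq4: byte lane i of the result is 0xFF where the
    // corresponding bytes of a and b are equal, else 0x00.
    static uint32_t vcmpeq4_ref(uint32_t a, uint32_t b)
    {
        uint32_t r = 0;
        for (int i = 0; i < 4; ++i)
        {
            const uint8_t x = uint8_t(a >> (8 * i));
            const uint8_t y = uint8_t(b >> (8 * i));
            if (x == y)
                r |= 0xFFu << (8 * i);
        }
        return r;
    }

    int main()
    {
        assert(vcmpeq4_ref(0x11223344u, 0x11FF3300u) == 0xFF00FF00u);
        return 0;
    }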
void cmpMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int cmpop)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);
static const func_t funcs[7][4] =
{
{cmpMat_v1<equal_to, uchar> , cmpMat_v1<not_equal_to, uchar> , cmpMat_v1<less, uchar> , cmpMat_v1<less_equal, uchar> },
{cmpMat_v1<equal_to, schar> , cmpMat_v1<not_equal_to, schar> , cmpMat_v1<less, schar> , cmpMat_v1<less_equal, schar> },
{cmpMat_v1<equal_to, ushort>, cmpMat_v1<not_equal_to, ushort>, cmpMat_v1<less, ushort>, cmpMat_v1<less_equal, ushort>},
{cmpMat_v1<equal_to, short> , cmpMat_v1<not_equal_to, short> , cmpMat_v1<less, short> , cmpMat_v1<less_equal, short> },
{cmpMat_v1<equal_to, int> , cmpMat_v1<not_equal_to, int> , cmpMat_v1<less, int> , cmpMat_v1<less_equal, int> },
{cmpMat_v1<equal_to, float> , cmpMat_v1<not_equal_to, float> , cmpMat_v1<less, float> , cmpMat_v1<less_equal, float> },
{cmpMat_v1<equal_to, double>, cmpMat_v1<not_equal_to, double>, cmpMat_v1<less, double>, cmpMat_v1<less_equal, double>}
};

typedef void (*func_v4_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);
static const func_v4_t funcs_v4[] =
{
cmpMatEq_v4, cmpMatNe_v4, cmpMatLt_v4, cmpMatLe_v4
};

const int depth = src1.depth();

CV_DbgAssert( depth <= CV_64F );

static const int codes[] =
{
0, 2, 3, 2, 3, 1
};
const GpuMat* psrc1[] =
{
&src1, &src2, &src2, &src1, &src1, &src1
};
const GpuMat* psrc2[] =
{
&src2, &src1, &src1, &src2, &src2, &src2
};

const int code = codes[cmpop];

GpuMat src1_ = psrc1[cmpop]->reshape(1);
GpuMat src2_ = psrc2[cmpop]->reshape(1);
GpuMat dst_ = dst.reshape(1);

if (depth == CV_8U && (src1_.cols & 3) == 0)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);

const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;

if (isAllAligned)
{
funcs_v4[code](src1_, src2_, dst_, stream);
return;
}
}

const func_t func = funcs[depth][code];

func(src1_, src2_, dst_, stream);
}

#endif
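cmpMat above services six OpenCV compare codes with only four kernels: codes[] folds CMP_GT into less and CMP_GE into less_equal by swapping the operand pointers in psrc1[]/psrc2[]. A sketch of that remapping table:

    #include <cassert>

    // Mirrors codes[]/psrc1[]/psrc2[] above. OpenCV compare codes are
    // CMP_EQ=0, CMP_GT=1, CMP_GE=2, CMP_LT=3, CMP_LE=4, CMP_NE=5;
    // kernel indices are 0=eq, 1=ne, 2=lt, 3=le.
    struct Remap { int kernel; bool swapped; };

    int main()
    {
        static const Remap remap[6] = {
            {0, false},  // EQ -> eq
            {2, true },  // GT -> lt with swapped operands (a > b  <=>  b < a)
            {3, true },  // GE -> le with swapped operands (a >= b <=>  b <= a)
            {2, false},  // LT -> lt
            {3, false},  // LE -> le
            {1, false},  // NE -> ne
        };
        assert(remap[1].kernel == 2 && remap[1].swapped);
        return 0;
    }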
@ -40,24 +40,23 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#ifndef HAVE_OPENCV_CUDEV

#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"

using namespace cv::cuda;
using namespace cv::cuda::device;
#else

namespace arithm
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

void cmpScalar(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, const GpuMat&, double, Stream& stream, int cmpop);

namespace
{
template <class Op, typename T>
struct Cmp : binary_function<T, T, uchar>
template <class Op, typename T> struct CmpOp : binary_function<T, T, uchar>
{
__device__ __forceinline__ uchar operator()(T a, T b) const
{
@ -66,219 +65,161 @@
}
};

#define TYPE_VEC(type, cn) typename TypeVec<type, cn>::vec_type
#define MAKE_VEC(_type, _cn) typename MakeVec<_type, _cn>::type

template <class Op, typename T, int cn> struct CmpScalarOp;

template <class Op, typename T, int cn> struct CmpScalar;
template <class Op, typename T>
struct CmpScalar<Op, T, 1> : unary_function<T, uchar>
struct CmpScalarOp<Op, T, 1> : unary_function<T, uchar>
{
T val;

__host__ explicit CmpScalar(T val_) : val(val_) {}

__device__ __forceinline__ uchar operator()(T src) const
{
Cmp<Op, T> op;
CmpOp<Op, T> op;
return op(src, val);
}
};

template <class Op, typename T>
struct CmpScalar<Op, T, 2> : unary_function<TYPE_VEC(T, 2), TYPE_VEC(uchar, 2)>
struct CmpScalarOp<Op, T, 2> : unary_function<MAKE_VEC(T, 2), MAKE_VEC(uchar, 2)>
{
TYPE_VEC(T, 2) val;
MAKE_VEC(T, 2) val;

__host__ explicit CmpScalar(TYPE_VEC(T, 2) val_) : val(val_) {}

__device__ __forceinline__ TYPE_VEC(uchar, 2) operator()(const TYPE_VEC(T, 2) & src) const
__device__ __forceinline__ MAKE_VEC(uchar, 2) operator()(const MAKE_VEC(T, 2) & src) const
{
Cmp<Op, T> op;
return VecTraits<TYPE_VEC(uchar, 2)>::make(op(src.x, val.x), op(src.y, val.y));
CmpOp<Op, T> op;
return VecTraits<MAKE_VEC(uchar, 2)>::make(op(src.x, val.x), op(src.y, val.y));
}
};

template <class Op, typename T>
struct CmpScalar<Op, T, 3> : unary_function<TYPE_VEC(T, 3), TYPE_VEC(uchar, 3)>
struct CmpScalarOp<Op, T, 3> : unary_function<MAKE_VEC(T, 3), MAKE_VEC(uchar, 3)>
{
TYPE_VEC(T, 3) val;
MAKE_VEC(T, 3) val;

__host__ explicit CmpScalar(TYPE_VEC(T, 3) val_) : val(val_) {}

__device__ __forceinline__ TYPE_VEC(uchar, 3) operator()(const TYPE_VEC(T, 3) & src) const
__device__ __forceinline__ MAKE_VEC(uchar, 3) operator()(const MAKE_VEC(T, 3) & src) const
{
Cmp<Op, T> op;
return VecTraits<TYPE_VEC(uchar, 3)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z));
CmpOp<Op, T> op;
return VecTraits<MAKE_VEC(uchar, 3)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z));
}
};

template <class Op, typename T>
struct CmpScalar<Op, T, 4> : unary_function<TYPE_VEC(T, 4), TYPE_VEC(uchar, 4)>
struct CmpScalarOp<Op, T, 4> : unary_function<MAKE_VEC(T, 4), MAKE_VEC(uchar, 4)>
{
TYPE_VEC(T, 4) val;
MAKE_VEC(T, 4) val;

__host__ explicit CmpScalar(TYPE_VEC(T, 4) val_) : val(val_) {}

__device__ __forceinline__ TYPE_VEC(uchar, 4) operator()(const TYPE_VEC(T, 4) & src) const
__device__ __forceinline__ MAKE_VEC(uchar, 4) operator()(const MAKE_VEC(T, 4) & src) const
{
Cmp<Op, T> op;
return VecTraits<TYPE_VEC(uchar, 4)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z), op(src.w, val.w));
CmpOp<Op, T> op;
return VecTraits<MAKE_VEC(uchar, 4)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z), op(src.w, val.w));
}
};

#undef TYPE_VEC
}

namespace cv { namespace cuda { namespace device
{
template <class Op, typename T> struct TransformFunctorTraits< arithm::CmpScalar<Op, T, 1> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
}}}
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};

namespace arithm
{
template <template <typename> class Op, typename T, int cn>
void cmpScalar(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream)
void cmpScalarImpl(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream)
|
||||
{
|
||||
typedef typename TypeVec<T, cn>::vec_type src_t;
|
||||
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
|
||||
typedef typename MakeVec<T, cn>::type src_type;
|
||||
typedef typename MakeVec<uchar, cn>::type dst_type;
|
||||
|
||||
T sval[] = {static_cast<T>(val[0]), static_cast<T>(val[1]), static_cast<T>(val[2]), static_cast<T>(val[3])};
|
||||
src_t val1 = VecTraits<src_t>::make(sval);
|
||||
cv::Scalar_<T> value_ = value;
|
||||
|
||||
CmpScalar<Op<T>, T, cn> op(val1);
|
||||
device::transform((PtrStepSz<src_t>) src, (PtrStepSz<dst_t>) dst, op, WithOutMask(), stream);
|
||||
CmpScalarOp<Op<T>, T, cn> op;
|
||||
op.val = VecTraits<src_type>::make(value_.val);
|
||||
|
||||
gridTransformUnary_< TransformPolicy<T> >(globPtr<src_type>(src), globPtr<dst_type>(dst), op, stream);
|
||||
}
|
||||
|
||||
template <typename T> void cmpScalarEq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
static const func_t funcs[] =
|
||||
{
|
||||
0,
|
||||
cmpScalar<equal_to, T, 1>,
|
||||
cmpScalar<equal_to, T, 2>,
|
||||
cmpScalar<equal_to, T, 3>,
|
||||
cmpScalar<equal_to, T, 4>
|
||||
};
|
||||
|
||||
funcs[cn](src, val, dst, stream);
|
||||
}
|
||||
template <typename T> void cmpScalarNe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
static const func_t funcs[] =
|
||||
{
|
||||
0,
|
||||
cmpScalar<not_equal_to, T, 1>,
|
||||
cmpScalar<not_equal_to, T, 2>,
|
||||
cmpScalar<not_equal_to, T, 3>,
|
||||
cmpScalar<not_equal_to, T, 4>
|
||||
};
|
||||
|
||||
funcs[cn](src, val, dst, stream);
|
||||
}
|
||||
template <typename T> void cmpScalarLt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
static const func_t funcs[] =
|
||||
{
|
||||
0,
|
||||
cmpScalar<less, T, 1>,
|
||||
cmpScalar<less, T, 2>,
|
||||
cmpScalar<less, T, 3>,
|
||||
cmpScalar<less, T, 4>
|
||||
};
|
||||
|
||||
funcs[cn](src, val, dst, stream);
|
||||
}
|
||||
template <typename T> void cmpScalarLe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
static const func_t funcs[] =
|
||||
{
|
||||
0,
|
||||
cmpScalar<less_equal, T, 1>,
|
||||
cmpScalar<less_equal, T, 2>,
|
||||
cmpScalar<less_equal, T, 3>,
|
||||
cmpScalar<less_equal, T, 4>
|
||||
};
|
||||
|
||||
funcs[cn](src, val, dst, stream);
|
||||
}
|
||||
template <typename T> void cmpScalarGt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
static const func_t funcs[] =
|
||||
{
|
||||
0,
|
||||
cmpScalar<greater, T, 1>,
|
||||
cmpScalar<greater, T, 2>,
|
||||
cmpScalar<greater, T, 3>,
|
||||
cmpScalar<greater, T, 4>
|
||||
};
|
||||
|
||||
funcs[cn](src, val, dst, stream);
|
||||
}
|
||||
template <typename T> void cmpScalarGe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
static const func_t funcs[] =
|
||||
{
|
||||
0,
|
||||
cmpScalar<greater_equal, T, 1>,
|
||||
cmpScalar<greater_equal, T, 2>,
|
||||
cmpScalar<greater_equal, T, 3>,
|
||||
cmpScalar<greater_equal, T, 4>
|
||||
};
|
||||
|
||||
funcs[cn](src, val, dst, stream);
|
||||
}
|
||||
|
||||
template void cmpScalarEq<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarEq<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarEq<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarEq<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarEq<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarEq<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarEq<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
|
||||
template void cmpScalarNe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarNe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarNe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarNe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarNe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarNe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarNe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
|
||||
template void cmpScalarLt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarLt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarLt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarLt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarLt<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarLt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarLt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
|
||||
template void cmpScalarLe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarLe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarLe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarLe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarLe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarLe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarLe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
|
||||
template void cmpScalarGt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarGt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarGt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarGt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarGt<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarGt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarGt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
|
||||
template void cmpScalarGe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarGe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarGe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarGe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarGe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarGe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
template void cmpScalarGe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif // CUDA_DISABLER
|
||||
void cmpScalar(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, const GpuMat&, double, Stream& stream, int cmpop)
|
||||
{
|
||||
typedef void (*func_t)(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream);
|
||||
static const func_t funcs[7][6][4] =
|
||||
{
|
||||
{
|
||||
{cmpScalarImpl<equal_to, uchar, 1>, cmpScalarImpl<equal_to, uchar, 2>, cmpScalarImpl<equal_to, uchar, 3>, cmpScalarImpl<equal_to, uchar, 4>},
|
||||
{cmpScalarImpl<greater, uchar, 1>, cmpScalarImpl<greater, uchar, 2>, cmpScalarImpl<greater, uchar, 3>, cmpScalarImpl<greater, uchar, 4>},
|
||||
{cmpScalarImpl<greater_equal, uchar, 1>, cmpScalarImpl<greater_equal, uchar, 2>, cmpScalarImpl<greater_equal, uchar, 3>, cmpScalarImpl<greater_equal, uchar, 4>},
|
||||
{cmpScalarImpl<less, uchar, 1>, cmpScalarImpl<less, uchar, 2>, cmpScalarImpl<less, uchar, 3>, cmpScalarImpl<less, uchar, 4>},
|
||||
{cmpScalarImpl<less_equal, uchar, 1>, cmpScalarImpl<less_equal, uchar, 2>, cmpScalarImpl<less_equal, uchar, 3>, cmpScalarImpl<less_equal, uchar, 4>},
|
||||
{cmpScalarImpl<not_equal_to, uchar, 1>, cmpScalarImpl<not_equal_to, uchar, 2>, cmpScalarImpl<not_equal_to, uchar, 3>, cmpScalarImpl<not_equal_to, uchar, 4>}
|
||||
},
|
||||
{
|
||||
{cmpScalarImpl<equal_to, schar, 1>, cmpScalarImpl<equal_to, schar, 2>, cmpScalarImpl<equal_to, schar, 3>, cmpScalarImpl<equal_to, schar, 4>},
|
||||
{cmpScalarImpl<greater, schar, 1>, cmpScalarImpl<greater, schar, 2>, cmpScalarImpl<greater, schar, 3>, cmpScalarImpl<greater, schar, 4>},
|
||||
{cmpScalarImpl<greater_equal, schar, 1>, cmpScalarImpl<greater_equal, schar, 2>, cmpScalarImpl<greater_equal, schar, 3>, cmpScalarImpl<greater_equal, schar, 4>},
|
||||
{cmpScalarImpl<less, schar, 1>, cmpScalarImpl<less, schar, 2>, cmpScalarImpl<less, schar, 3>, cmpScalarImpl<less, schar, 4>},
|
||||
{cmpScalarImpl<less_equal, schar, 1>, cmpScalarImpl<less_equal, schar, 2>, cmpScalarImpl<less_equal, schar, 3>, cmpScalarImpl<less_equal, schar, 4>},
|
||||
{cmpScalarImpl<not_equal_to, schar, 1>, cmpScalarImpl<not_equal_to, schar, 2>, cmpScalarImpl<not_equal_to, schar, 3>, cmpScalarImpl<not_equal_to, schar, 4>}
|
||||
},
|
||||
{
|
||||
{cmpScalarImpl<equal_to, ushort, 1>, cmpScalarImpl<equal_to, ushort, 2>, cmpScalarImpl<equal_to, ushort, 3>, cmpScalarImpl<equal_to, ushort, 4>},
|
||||
{cmpScalarImpl<greater, ushort, 1>, cmpScalarImpl<greater, ushort, 2>, cmpScalarImpl<greater, ushort, 3>, cmpScalarImpl<greater, ushort, 4>},
|
||||
{cmpScalarImpl<greater_equal, ushort, 1>, cmpScalarImpl<greater_equal, ushort, 2>, cmpScalarImpl<greater_equal, ushort, 3>, cmpScalarImpl<greater_equal, ushort, 4>},
|
||||
{cmpScalarImpl<less, ushort, 1>, cmpScalarImpl<less, ushort, 2>, cmpScalarImpl<less, ushort, 3>, cmpScalarImpl<less, ushort, 4>},
|
||||
{cmpScalarImpl<less_equal, ushort, 1>, cmpScalarImpl<less_equal, ushort, 2>, cmpScalarImpl<less_equal, ushort, 3>, cmpScalarImpl<less_equal, ushort, 4>},
|
||||
{cmpScalarImpl<not_equal_to, ushort, 1>, cmpScalarImpl<not_equal_to, ushort, 2>, cmpScalarImpl<not_equal_to, ushort, 3>, cmpScalarImpl<not_equal_to, ushort, 4>}
|
||||
},
|
||||
{
|
||||
{cmpScalarImpl<equal_to, short, 1>, cmpScalarImpl<equal_to, short, 2>, cmpScalarImpl<equal_to, short, 3>, cmpScalarImpl<equal_to, short, 4>},
|
||||
{cmpScalarImpl<greater, short, 1>, cmpScalarImpl<greater, short, 2>, cmpScalarImpl<greater, short, 3>, cmpScalarImpl<greater, short, 4>},
|
||||
{cmpScalarImpl<greater_equal, short, 1>, cmpScalarImpl<greater_equal, short, 2>, cmpScalarImpl<greater_equal, short, 3>, cmpScalarImpl<greater_equal, short, 4>},
|
||||
{cmpScalarImpl<less, short, 1>, cmpScalarImpl<less, short, 2>, cmpScalarImpl<less, short, 3>, cmpScalarImpl<less, short, 4>},
|
||||
{cmpScalarImpl<less_equal, short, 1>, cmpScalarImpl<less_equal, short, 2>, cmpScalarImpl<less_equal, short, 3>, cmpScalarImpl<less_equal, short, 4>},
|
||||
{cmpScalarImpl<not_equal_to, short, 1>, cmpScalarImpl<not_equal_to, short, 2>, cmpScalarImpl<not_equal_to, short, 3>, cmpScalarImpl<not_equal_to, short, 4>}
|
||||
},
|
||||
{
|
||||
{cmpScalarImpl<equal_to, int, 1>, cmpScalarImpl<equal_to, int, 2>, cmpScalarImpl<equal_to, int, 3>, cmpScalarImpl<equal_to, int, 4>},
|
||||
{cmpScalarImpl<greater, int, 1>, cmpScalarImpl<greater, int, 2>, cmpScalarImpl<greater, int, 3>, cmpScalarImpl<greater, int, 4>},
|
||||
{cmpScalarImpl<greater_equal, int, 1>, cmpScalarImpl<greater_equal, int, 2>, cmpScalarImpl<greater_equal, int, 3>, cmpScalarImpl<greater_equal, int, 4>},
|
||||
{cmpScalarImpl<less, int, 1>, cmpScalarImpl<less, int, 2>, cmpScalarImpl<less, int, 3>, cmpScalarImpl<less, int, 4>},
|
||||
{cmpScalarImpl<less_equal, int, 1>, cmpScalarImpl<less_equal, int, 2>, cmpScalarImpl<less_equal, int, 3>, cmpScalarImpl<less_equal, int, 4>},
|
||||
{cmpScalarImpl<not_equal_to, int, 1>, cmpScalarImpl<not_equal_to, int, 2>, cmpScalarImpl<not_equal_to, int, 3>, cmpScalarImpl<not_equal_to, int, 4>}
|
||||
},
|
||||
{
|
||||
{cmpScalarImpl<equal_to, float, 1>, cmpScalarImpl<equal_to, float, 2>, cmpScalarImpl<equal_to, float, 3>, cmpScalarImpl<equal_to, float, 4>},
|
||||
{cmpScalarImpl<greater, float, 1>, cmpScalarImpl<greater, float, 2>, cmpScalarImpl<greater, float, 3>, cmpScalarImpl<greater, float, 4>},
|
||||
{cmpScalarImpl<greater_equal, float, 1>, cmpScalarImpl<greater_equal, float, 2>, cmpScalarImpl<greater_equal, float, 3>, cmpScalarImpl<greater_equal, float, 4>},
|
||||
{cmpScalarImpl<less, float, 1>, cmpScalarImpl<less, float, 2>, cmpScalarImpl<less, float, 3>, cmpScalarImpl<less, float, 4>},
|
||||
{cmpScalarImpl<less_equal, float, 1>, cmpScalarImpl<less_equal, float, 2>, cmpScalarImpl<less_equal, float, 3>, cmpScalarImpl<less_equal, float, 4>},
|
||||
{cmpScalarImpl<not_equal_to, float, 1>, cmpScalarImpl<not_equal_to, float, 2>, cmpScalarImpl<not_equal_to, float, 3>, cmpScalarImpl<not_equal_to, float, 4>}
|
||||
},
|
||||
{
|
||||
{cmpScalarImpl<equal_to, double, 1>, cmpScalarImpl<equal_to, double, 2>, cmpScalarImpl<equal_to, double, 3>, cmpScalarImpl<equal_to, double, 4>},
|
||||
{cmpScalarImpl<greater, double, 1>, cmpScalarImpl<greater, double, 2>, cmpScalarImpl<greater, double, 3>, cmpScalarImpl<greater, double, 4>},
|
||||
{cmpScalarImpl<greater_equal, double, 1>, cmpScalarImpl<greater_equal, double, 2>, cmpScalarImpl<greater_equal, double, 3>, cmpScalarImpl<greater_equal, double, 4>},
|
||||
{cmpScalarImpl<less, double, 1>, cmpScalarImpl<less, double, 2>, cmpScalarImpl<less, double, 3>, cmpScalarImpl<less, double, 4>},
|
||||
{cmpScalarImpl<less_equal, double, 1>, cmpScalarImpl<less_equal, double, 2>, cmpScalarImpl<less_equal, double, 3>, cmpScalarImpl<less_equal, double, 4>},
|
||||
{cmpScalarImpl<not_equal_to, double, 1>, cmpScalarImpl<not_equal_to, double, 2>, cmpScalarImpl<not_equal_to, double, 3>, cmpScalarImpl<not_equal_to, double, 4>}
|
||||
}
|
||||
};
|
||||
|
||||
if (inv)
|
||||
{
|
||||
// src1 is a scalar; swap it with src2
|
||||
cmpop = cmpop == cv::CMP_LT ? cv::CMP_GT : cmpop == cv::CMP_LE ? cv::CMP_GE :
|
||||
cmpop == cv::CMP_GE ? cv::CMP_LE : cmpop == cv::CMP_GT ? cv::CMP_LT : cmpop;
|
||||
}
|
||||
|
||||
const int depth = src.depth();
|
||||
const int cn = src.channels();
|
||||
|
||||
CV_DbgAssert( depth <= CV_64F && cn <= 4 );
|
||||
|
||||
funcs[depth][cmpop][cn - 1](src, val, dst, stream);
|
||||
}
|
||||
|
||||
#endif
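
The same public API covers GpuMat-vs-Scalar comparison through the cmpScalar path above; a minimal sketch (identifiers illustrative):

// Sketch only: when the scalar is the first operand, the 'inv' flag above
// swaps CMP_LT/CMP_LE with CMP_GT/CMP_GE so one functor family suffices.
cv::cuda::GpuMat img;    // e.g. CV_8UC3 input
cv::cuda::GpuMat mask;   // CV_8UC3 output, 255/0 per channel
cv::cuda::compare(img, cv::Scalar(128, 128, 128), mask, cv::CMP_GT);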

@ -40,92 +40,116 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#ifndef HAVE_OPENCV_CUDEV

namespace cv { namespace cuda { namespace device
#error "opencv_cudev is required"

#else

#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

namespace
{
namespace imgproc
struct ShiftMap
{
template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, PtrStepSz<T> dst, int top, int left)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
typedef int2 value_type;
typedef int index_type;

if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = src(y - top, x - left);
int top;
int left;

__device__ __forceinline__ int2 operator ()(int y, int x) const
{
return make_int2(x - left, y - top);
}
};

template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher
struct ShiftMapSz : ShiftMap
{
int rows, cols;
};
}

namespace cv { namespace cudev {

template <> struct PtrTraits<ShiftMapSz> : PtrTraitsBase<ShiftMapSz, ShiftMap>
{
};

}}

namespace
{
template <typename T, int cn>
void copyMakeBorderImpl(const GpuMat& src, GpuMat& dst, int top, int left, int borderMode, cv::Scalar borderValue, Stream& stream)
{
typedef typename MakeVec<T, cn>::type src_type;

cv::Scalar_<T> borderValue_ = borderValue;
const src_type brdVal = VecTraits<src_type>::make(borderValue_.val);

ShiftMapSz map;
map.top = top;
map.left = left;
map.rows = dst.rows;
map.cols = dst.cols;

switch (borderMode)
{
static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, int top, int left,
const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue));
BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd);

copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
case cv::BORDER_CONSTANT:
gridCopy(remapPtr(brdConstant(globPtr<src_type>(src), brdVal), map), globPtr<src_type>(dst), stream);
break;
case cv::BORDER_REPLICATE:
gridCopy(remapPtr(brdReplicate(globPtr<src_type>(src)), map), globPtr<src_type>(dst), stream);
break;
case cv::BORDER_REFLECT:
gridCopy(remapPtr(brdReflect(globPtr<src_type>(src)), map), globPtr<src_type>(dst), stream);
break;
case cv::BORDER_WRAP:
gridCopy(remapPtr(brdWrap(globPtr<src_type>(src)), map), globPtr<src_type>(dst), stream);
break;
case cv::BORDER_REFLECT_101:
gridCopy(remapPtr(brdReflect101(globPtr<src_type>(src)), map), globPtr<src_type>(dst), stream);
break;
};
}
}

template <typename T, int cn> void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode,
const T* borderValue, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type vec_type;
void cv::cuda::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bottom, int left, int right, int borderType, Scalar value, Stream& stream)
{
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int top, int left, int borderMode, cv::Scalar borderValue, Stream& stream);
static const func_t funcs[6][4] =
{
{ copyMakeBorderImpl<uchar , 1> , copyMakeBorderImpl<uchar , 2> , copyMakeBorderImpl<uchar , 3> , copyMakeBorderImpl<uchar , 4> },
{0 /*copyMakeBorderImpl<schar , 1>*/, 0 /*copyMakeBorderImpl<schar , 2>*/, 0 /*copyMakeBorderImpl<schar , 3>*/, 0 /*copyMakeBorderImpl<schar , 4>*/},
{ copyMakeBorderImpl<ushort, 1> , 0 /*copyMakeBorderImpl<ushort, 2>*/, copyMakeBorderImpl<ushort, 3> , copyMakeBorderImpl<ushort, 4> },
{ copyMakeBorderImpl<short , 1> , 0 /*copyMakeBorderImpl<short , 2>*/, copyMakeBorderImpl<short , 3> , copyMakeBorderImpl<short , 4> },
{0 /*copyMakeBorderImpl<int , 1>*/, 0 /*copyMakeBorderImpl<int , 2>*/, 0 /*copyMakeBorderImpl<int , 3>*/, 0 /*copyMakeBorderImpl<int , 4>*/},
{ copyMakeBorderImpl<float , 1> , 0 /*copyMakeBorderImpl<float , 2>*/, copyMakeBorderImpl<float , 3> , copyMakeBorderImpl<float , 4> }
};

typedef void (*caller_t)(const PtrStepSz<vec_type>& src, const PtrStepSz<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);
GpuMat src = _src.getGpuMat();

static const caller_t callers[5] =
{
CopyMakeBorderDispatcher<BrdConstant, vec_type>::call,
CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call,
CopyMakeBorderDispatcher<BrdReflect, vec_type>::call,
CopyMakeBorderDispatcher<BrdWrap, vec_type>::call,
CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call
};
const int depth = src.depth();
const int cn = src.channels();

callers[borderMode](PtrStepSz<vec_type>(src), PtrStepSz<vec_type>(dst), top, left, borderValue, stream);
}
CV_Assert( depth <= CV_32F && cn <= 4 );
CV_Assert( borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );

template void copyMakeBorder_gpu<uchar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
_dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
GpuMat dst = _dst.getGpuMat();

//template void copyMakeBorder_gpu<schar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
const func_t func = funcs[depth][cn - 1];

template void copyMakeBorder_gpu<ushort, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<ushort, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");

template void copyMakeBorder_gpu<short, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<short, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
func(src, dst, top, left, borderType, value, stream);
}

//template void copyMakeBorder_gpu<int, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);

template void copyMakeBorder_gpu<float, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<float, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace cuda { namespace cudev

#endif /* CUDA_DISABLER */
#endif
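
A minimal usage sketch of the rewritten border routine (padding sizes and border mode are illustrative):

// Sketch only: pads src by 8 pixels on every side. The source type must be
// enabled in the funcs table above (e.g. CV_8UC1..C4, CV_32FC1/C3/C4).
cv::cuda::GpuMat src;    // e.g. CV_8UC3
cv::cuda::GpuMat dst;
cv::cuda::copyMakeBorder(src, dst, 8, 8, 8, 8, cv::BORDER_CONSTANT, cv::Scalar::all(0));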

@ -40,137 +40,57 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#ifndef HAVE_OPENCV_CUDEV

using namespace cv::cuda;
using namespace cv::cuda::device;
#error "opencv_cudev is required"

namespace countNonZero
#else

#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

namespace
{
__device__ unsigned int blocks_finished = 0;

template <int BLOCK_SIZE, typename T>
__global__ void kernel(const PtrStepSz<T> src, unsigned int* count, const int twidth, const int theight)
{
__shared__ unsigned int scount[BLOCK_SIZE];

const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;

const int tid = threadIdx.y * blockDim.x + threadIdx.x;

unsigned int mycount = 0;

for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const T* ptr = src.ptr(y);

for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
const T srcVal = ptr[x];

mycount += (srcVal != 0);
}
}

device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());

#if __CUDA_ARCH__ >= 200
if (tid == 0)
::atomicAdd(count, mycount);
#else
__shared__ bool is_last;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;

if (tid == 0)
{
count[bid] = mycount;

__threadfence();

unsigned int ticket = ::atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}

__syncthreads();

if (is_last)
{
mycount = tid < gridDim.x * gridDim.y ? count[tid] : 0;

device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());

if (tid == 0)
{
count[0] = mycount;

blocks_finished = 0;
}
}
#endif
}

const int threads_x = 32;
const int threads_y = 8;

void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
{
block = dim3(threads_x, threads_y);

grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));

grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}

void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);

bufcols = grid.x * grid.y * sizeof(int);
bufrows = 1;
}

template <typename T>
int run(const PtrStepSzb src, PtrStep<unsigned int> buf)
int countNonZeroImpl(const GpuMat& _src, GpuMat& _buf)
{
dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);
const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
GpuMat_<int>& buf = (GpuMat_<int>&) _buf;

const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);
gridCountNonZero(src, buf);

unsigned int* count_buf = buf.ptr(0);
int data;
buf.download(cv::Mat(1, 1, buf.type(), &data));

cudaSafeCall( cudaMemset(count_buf, 0, sizeof(unsigned int)) );

kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, count_buf, twidth, theight);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );

unsigned int count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost));

return count;
return data;
}

template int run<uchar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<schar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<ushort>(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<short >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<int >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<float >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<double>(const PtrStepSzb src, PtrStep<unsigned int> buf);
}

#endif // CUDA_DISABLER
int cv::cuda::countNonZero(InputArray _src, GpuMat& buf)
{
typedef int (*func_t)(const GpuMat& _src, GpuMat& _buf);
static const func_t funcs[] =
{
countNonZeroImpl<uchar>,
countNonZeroImpl<schar>,
countNonZeroImpl<ushort>,
countNonZeroImpl<short>,
countNonZeroImpl<int>,
countNonZeroImpl<float>,
countNonZeroImpl<double>
};

GpuMat src = _src.getGpuMat();

CV_Assert( src.channels() == 1 );

const func_t func = funcs[src.depth()];

return func(src, buf);
}

#endif
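
Usage of the buffered overload defined above, as a sketch:

// Sketch only: 'buf' is a reusable device buffer filled by gridCountNonZero;
// the single int result is downloaded to the host before returning.
cv::cuda::GpuMat src;    // single-channel, any depth listed in funcs[]
cv::cuda::GpuMat buf;
int nonZeros = cv::cuda::countNonZero(src, buf);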

@ -40,191 +40,203 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV

#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"

using namespace cv::cuda;
using namespace cv::cuda::device;
#else

namespace arithm
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

void divMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& stream, int);
void divMat_8uc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);
void divMat_16sc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);

namespace
{
struct Div_8uc4_32f : binary_function<uint, float, uint>
{
__device__ __forceinline__ uint operator ()(uint a, float b) const
{
uint res = 0;

if (b != 0)
{
b = 1.0f / b;
res |= (saturate_cast<uchar>((0xffu & (a )) * b) );
res |= (saturate_cast<uchar>((0xffu & (a >> 8)) * b) << 8);
res |= (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);
res |= (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);
}

return res;
}
};

struct Div_16sc4_32f : binary_function<short4, float, short4>
{
__device__ __forceinline__ short4 operator ()(short4 a, float b) const
{
return b != 0 ? make_short4(saturate_cast<short>(a.x / b), saturate_cast<short>(a.y / b),
saturate_cast<short>(a.z / b), saturate_cast<short>(a.w / b))
: make_short4(0,0,0,0);
}
};

template <typename T, typename D> struct Div : binary_function<T, T, D>
template <typename T, typename D> struct DivOp : binary_function<T, T, D>
{
__device__ __forceinline__ D operator ()(T a, T b) const
{
return b != 0 ? saturate_cast<D>(a / b) : 0;
}

__host__ __device__ __forceinline__ Div() {}
__host__ __device__ __forceinline__ Div(const Div&) {}
};
template <typename T> struct Div<T, float> : binary_function<T, T, float>
template <typename T> struct DivOp<T, float> : binary_function<T, T, float>
{
__device__ __forceinline__ float operator ()(T a, T b) const
{
return b != 0 ? static_cast<float>(a) / b : 0;
return b != 0 ? static_cast<float>(a) / b : 0.0f;
}

__host__ __device__ __forceinline__ Div() {}
__host__ __device__ __forceinline__ Div(const Div&) {}
};
template <typename T> struct Div<T, double> : binary_function<T, T, double>
template <typename T> struct DivOp<T, double> : binary_function<T, T, double>
{
__device__ __forceinline__ double operator ()(T a, T b) const
{
return b != 0 ? static_cast<double>(a) / b : 0;
return b != 0 ? static_cast<double>(a) / b : 0.0;
}

__host__ __device__ __forceinline__ Div() {}
__host__ __device__ __forceinline__ Div(const Div&) {}
};

template <typename T, typename S, typename D> struct DivScale : binary_function<T, T, D>
template <typename T, typename S, typename D> struct DivScaleOp : binary_function<T, T, D>
{
S scale;

__host__ explicit DivScale(S scale_) : scale(scale_) {}

__device__ __forceinline__ D operator ()(T a, T b) const
{
return b != 0 ? saturate_cast<D>(scale * a / b) : 0;
}
};
}

namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits<arithm::Div_8uc4_32f> : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};

template <typename T, typename D> struct TransformFunctorTraits< arithm::Div<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};

template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivScale<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}

namespace arithm
{
void divMat_8uc4_32f(PtrStepSz<uint> src1, PtrStepSzf src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, Div_8uc4_32f(), WithOutMask(), stream);
}

void divMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, Div_16sc4_32f(), WithOutMask(), stream);
}

template <typename T, typename S, typename D>
void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream)
void divMatImpl(const GpuMat& src1, const GpuMat& src2, const GpuMat& dst, double scale, Stream& stream)
{
if (scale == 1)
{
Div<T, D> op;
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
DivOp<T, D> op;
gridTransformBinary_< TransformPolicy<S> >(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), op, stream);
}
else
{
DivScale<T, S, D> op(static_cast<S>(scale));
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
DivScaleOp<T, S, D> op;
op.scale = static_cast<S>(scale);
gridTransformBinary_< TransformPolicy<S> >(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), op, stream);
}
}

template void divMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);

template void divMat<schar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);

//template void divMat<ushort, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<ushort, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);

//template void divMat<short, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<short, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);

//template void divMat<int, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<int, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<int, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<int, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<int, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<int, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<int, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);

//template void divMat<float, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);

//template void divMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
}

#endif // CUDA_DISABLER
void divMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& stream, int)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, const GpuMat& dst, double scale, Stream& stream);
static const func_t funcs[7][7] =
{
{
divMatImpl<uchar, float, uchar>,
divMatImpl<uchar, float, schar>,
divMatImpl<uchar, float, ushort>,
divMatImpl<uchar, float, short>,
divMatImpl<uchar, float, int>,
divMatImpl<uchar, float, float>,
divMatImpl<uchar, double, double>
},
{
divMatImpl<schar, float, uchar>,
divMatImpl<schar, float, schar>,
divMatImpl<schar, float, ushort>,
divMatImpl<schar, float, short>,
divMatImpl<schar, float, int>,
divMatImpl<schar, float, float>,
divMatImpl<schar, double, double>
},
{
0 /*divMatImpl<ushort, float, uchar>*/,
0 /*divMatImpl<ushort, float, schar>*/,
divMatImpl<ushort, float, ushort>,
divMatImpl<ushort, float, short>,
divMatImpl<ushort, float, int>,
divMatImpl<ushort, float, float>,
divMatImpl<ushort, double, double>
},
{
0 /*divMatImpl<short, float, uchar>*/,
0 /*divMatImpl<short, float, schar>*/,
divMatImpl<short, float, ushort>,
divMatImpl<short, float, short>,
divMatImpl<short, float, int>,
divMatImpl<short, float, float>,
divMatImpl<short, double, double>
},
{
0 /*divMatImpl<int, float, uchar>*/,
0 /*divMatImpl<int, float, schar>*/,
0 /*divMatImpl<int, float, ushort>*/,
0 /*divMatImpl<int, float, short>*/,
divMatImpl<int, float, int>,
divMatImpl<int, float, float>,
divMatImpl<int, double, double>
},
{
0 /*divMatImpl<float, float, uchar>*/,
0 /*divMatImpl<float, float, schar>*/,
0 /*divMatImpl<float, float, ushort>*/,
0 /*divMatImpl<float, float, short>*/,
0 /*divMatImpl<float, float, int>*/,
divMatImpl<float, float, float>,
divMatImpl<float, double, double>
},
{
0 /*divMatImpl<double, double, uchar>*/,
0 /*divMatImpl<double, double, schar>*/,
0 /*divMatImpl<double, double, ushort>*/,
0 /*divMatImpl<double, double, short>*/,
0 /*divMatImpl<double, double, int>*/,
0 /*divMatImpl<double, double, float>*/,
divMatImpl<double, double, double>
}
};

const int sdepth = src1.depth();
const int ddepth = dst.depth();

CV_DbgAssert( sdepth <= CV_64F && ddepth <= CV_64F );

GpuMat src1_ = src1.reshape(1);
GpuMat src2_ = src2.reshape(1);
GpuMat dst_ = dst.reshape(1);

const func_t func = funcs[sdepth][ddepth];

if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");

func(src1_, src2_, dst_, scale, stream);
}

namespace
{
template <typename T>
struct DivOpSpecial : binary_function<T, float, T>
{
__device__ __forceinline__ T operator ()(const T& a, float b) const
{
typedef typename VecTraits<T>::elem_type elem_type;

T res = VecTraits<T>::all(0);

if (b != 0)
{
b = 1.0f / b;
res.x = saturate_cast<elem_type>(a.x * b);
res.y = saturate_cast<elem_type>(a.y * b);
res.z = saturate_cast<elem_type>(a.z * b);
res.w = saturate_cast<elem_type>(a.w * b);
}

return res;
}
};
}

void divMat_8uc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
gridTransformBinary(globPtr<uchar4>(src1), globPtr<float>(src2), globPtr<uchar4>(dst), DivOpSpecial<uchar4>(), stream);
}

void divMat_16sc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
gridTransformBinary(globPtr<short4>(src1), globPtr<float>(src2), globPtr<short4>(dst), DivOpSpecial<short4>(), stream);
}

#endif
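
A sketch of the division entry point served by divMat above (operands and parameters are illustrative):

// Sketch only: element-wise division with zero divisors mapped to 0; a scale
// factor and destination depth select divMatImpl via funcs[sdepth][ddepth].
cv::cuda::GpuMat a, b, dst;
cv::cuda::divide(a, b, dst);               // same depth, scale = 1
cv::cuda::divide(a, b, dst, 0.5, CV_32F);  // scaled, CV_32F destination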
|
||||
|
@ -40,129 +40,221 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
#include "opencv2/opencv_modules.hpp"
|
||||
|
||||
#include "opencv2/core/cuda/common.hpp"
|
||||
#include "opencv2/core/cuda/functional.hpp"
|
||||
#include "opencv2/core/cuda/transform.hpp"
|
||||
#include "opencv2/core/cuda/saturate_cast.hpp"
|
||||
#include "opencv2/core/cuda/simd_functions.hpp"
|
||||
#ifndef HAVE_OPENCV_CUDEV
|
||||
|
||||
#include "arithm_func_traits.hpp"
|
||||
#error "opencv_cudev is required"
|
||||
|
||||
using namespace cv::cuda;
|
||||
using namespace cv::cuda::device;
|
||||
#else
|
||||
|
||||
namespace arithm
|
||||
#include "opencv2/cudev.hpp"
|
||||
|
||||
using namespace cv::cudev;
|
||||
|
||||
void divScalar(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, double scale, Stream& stream, int);
|
||||
|
||||
namespace
|
||||
{
|
||||
template <typename T, typename S, typename D> struct DivScalar : unary_function<T, D>
|
||||
template <typename T, int cn> struct SafeDiv;
|
||||
template <typename T> struct SafeDiv<T, 1>
|
||||
{
|
||||
S val;
|
||||
|
||||
__host__ explicit DivScalar(S val_) : val(val_) {}
|
||||
|
||||
__device__ __forceinline__ D operator ()(T a) const
|
||||
__device__ __forceinline__ static T op(T a, T b)
|
||||
{
|
||||
return saturate_cast<D>(a / val);
|
||||
return b != 0 ? a / b : 0;
|
||||
}
|
||||
};
|
||||
template <typename T> struct SafeDiv<T, 2>
|
||||
{
|
||||
__device__ __forceinline__ static T op(const T& a, const T& b)
|
||||
{
|
||||
T res;
|
||||
|
||||
res.x = b.x != 0 ? a.x / b.x : 0;
|
||||
res.y = b.y != 0 ? a.y / b.y : 0;
|
||||
|
||||
return res;
|
||||
}
|
||||
};
|
||||
template <typename T> struct SafeDiv<T, 3>
|
||||
{
|
||||
__device__ __forceinline__ static T op(const T& a, const T& b)
|
||||
{
|
||||
T res;
|
||||
|
||||
res.x = b.x != 0 ? a.x / b.x : 0;
|
||||
res.y = b.y != 0 ? a.y / b.y : 0;
|
||||
res.z = b.z != 0 ? a.z / b.z : 0;
|
||||
|
||||
return res;
|
||||
}
|
||||
};
|
||||
template <typename T> struct SafeDiv<T, 4>
|
||||
{
|
||||
__device__ __forceinline__ static T op(const T& a, const T& b)
|
||||
{
|
||||
T res;
|
||||
|
||||
res.x = b.x != 0 ? a.x / b.x : 0;
|
||||
res.y = b.y != 0 ? a.y / b.y : 0;
|
||||
res.z = b.z != 0 ? a.z / b.z : 0;
|
||||
res.w = b.w != 0 ? a.w / b.w : 0;
|
||||
|
||||
return res;
|
||||
}
|
||||
};

template <typename T, typename S, typename D> struct DivScalarInv : unary_function<T, D>
template <typename SrcType, typename ScalarType, typename DstType> struct DivScalarOp : unary_function<SrcType, DstType>
{
S val;
ScalarType val;

explicit DivScalarInv(S val_) : val(val_) {}

__device__ __forceinline__ D operator ()(T a) const
__device__ __forceinline__ DstType operator ()(SrcType a) const
{
return a != 0 ? saturate_cast<D>(val / a) : 0;
return saturate_cast<DstType>(SafeDiv<ScalarType, VecTraits<ScalarType>::cn>::op(saturate_cast<ScalarType>(a), val));
}
};
}

namespace cv { namespace cuda { namespace device
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <typename SrcType, typename ScalarType, typename DstType> struct DivScalarOpInv : unary_function<SrcType, DstType>
{
ScalarType val;

__device__ __forceinline__ DstType operator ()(SrcType a) const
{
return saturate_cast<DstType>(SafeDiv<ScalarType, VecTraits<ScalarType>::cn>::op(val, saturate_cast<ScalarType>(a)));
}
};

template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivScalarInv<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
}}}

namespace arithm
{
template <typename T, typename S, typename D>
void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream)
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};

template <typename SrcType, typename ScalarDepth, typename DstType>
void divScalarImpl(const GpuMat& src, cv::Scalar value, bool inv, GpuMat& dst, Stream& stream)
{
typedef typename MakeVec<ScalarDepth, VecTraits<SrcType>::cn>::type ScalarType;

cv::Scalar_<ScalarDepth> value_ = value;

if (inv)
{
DivScalarInv<T, S, D> op(static_cast<S>(val));
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
DivScalarOpInv<SrcType, ScalarType, DstType> op;
op.val = VecTraits<ScalarType>::make(value_.val);

gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, stream);
}
else
{
DivScalar<T, S, D> op(static_cast<S>(val));
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
DivScalarOp<SrcType, ScalarType, DstType> op;
op.val = VecTraits<ScalarType>::make(value_.val);

gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, stream);
}
}
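The inv flag selects the operand order: DivScalarOp computes dst = saturate_cast(src / val), while DivScalarOpInv computes dst = saturate_cast(val / src), both funnelled through SafeDiv so zero divisors produce zero. A scalar sketch of the selection (divScalarOnce and safeDiv are hypothetical helpers, not part of the patch):

template <typename T>
inline T divScalarOnce(T src, T val, bool inv)
{
    // inv == false: matrix / scalar;  inv == true: scalar / matrix
    return inv ? safeDiv(val, src) : safeDiv(src, val);
}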

template void divScalar<uchar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);

template void divScalar<schar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);

//template void divScalar<ushort, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<ushort, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);

//template void divScalar<short, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<short, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);

//template void divScalar<int, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<int, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<int, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<int, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<int, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<int, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<int, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);

//template void divScalar<float, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<float, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<float, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);

//template void divScalar<double, double, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<double, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
}

#endif // CUDA_DISABLER
void divScalar(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, const GpuMat&, double scale, Stream& stream, int)
{
typedef void (*func_t)(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, Stream& stream);
static const func_t funcs[7][7][4] =
{
{
{divScalarImpl<uchar, float, uchar>, divScalarImpl<uchar2, float, uchar2>, divScalarImpl<uchar3, float, uchar3>, divScalarImpl<uchar4, float, uchar4>},
{divScalarImpl<uchar, float, schar>, divScalarImpl<uchar2, float, char2>, divScalarImpl<uchar3, float, char3>, divScalarImpl<uchar4, float, char4>},
{divScalarImpl<uchar, float, ushort>, divScalarImpl<uchar2, float, ushort2>, divScalarImpl<uchar3, float, ushort3>, divScalarImpl<uchar4, float, ushort4>},
{divScalarImpl<uchar, float, short>, divScalarImpl<uchar2, float, short2>, divScalarImpl<uchar3, float, short3>, divScalarImpl<uchar4, float, short4>},
{divScalarImpl<uchar, float, int>, divScalarImpl<uchar2, float, int2>, divScalarImpl<uchar3, float, int3>, divScalarImpl<uchar4, float, int4>},
{divScalarImpl<uchar, float, float>, divScalarImpl<uchar2, float, float2>, divScalarImpl<uchar3, float, float3>, divScalarImpl<uchar4, float, float4>},
{divScalarImpl<uchar, double, double>, divScalarImpl<uchar2, double, double2>, divScalarImpl<uchar3, double, double3>, divScalarImpl<uchar4, double, double4>}
},
{
{divScalarImpl<schar, float, uchar>, divScalarImpl<char2, float, uchar2>, divScalarImpl<char3, float, uchar3>, divScalarImpl<char4, float, uchar4>},
{divScalarImpl<schar, float, schar>, divScalarImpl<char2, float, char2>, divScalarImpl<char3, float, char3>, divScalarImpl<char4, float, char4>},
{divScalarImpl<schar, float, ushort>, divScalarImpl<char2, float, ushort2>, divScalarImpl<char3, float, ushort3>, divScalarImpl<char4, float, ushort4>},
{divScalarImpl<schar, float, short>, divScalarImpl<char2, float, short2>, divScalarImpl<char3, float, short3>, divScalarImpl<char4, float, short4>},
{divScalarImpl<schar, float, int>, divScalarImpl<char2, float, int2>, divScalarImpl<char3, float, int3>, divScalarImpl<char4, float, int4>},
{divScalarImpl<schar, float, float>, divScalarImpl<char2, float, float2>, divScalarImpl<char3, float, float3>, divScalarImpl<char4, float, float4>},
{divScalarImpl<schar, double, double>, divScalarImpl<char2, double, double2>, divScalarImpl<char3, double, double3>, divScalarImpl<char4, double, double4>}
},
{
{0 /*divScalarImpl<ushort, float, uchar>*/, 0 /*divScalarImpl<ushort2, float, uchar2>*/, 0 /*divScalarImpl<ushort3, float, uchar3>*/, 0 /*divScalarImpl<ushort4, float, uchar4>*/},
{0 /*divScalarImpl<ushort, float, schar>*/, 0 /*divScalarImpl<ushort2, float, char2>*/, 0 /*divScalarImpl<ushort3, float, char3>*/, 0 /*divScalarImpl<ushort4, float, char4>*/},
{divScalarImpl<ushort, float, ushort>, divScalarImpl<ushort2, float, ushort2>, divScalarImpl<ushort3, float, ushort3>, divScalarImpl<ushort4, float, ushort4>},
{divScalarImpl<ushort, float, short>, divScalarImpl<ushort2, float, short2>, divScalarImpl<ushort3, float, short3>, divScalarImpl<ushort4, float, short4>},
{divScalarImpl<ushort, float, int>, divScalarImpl<ushort2, float, int2>, divScalarImpl<ushort3, float, int3>, divScalarImpl<ushort4, float, int4>},
{divScalarImpl<ushort, float, float>, divScalarImpl<ushort2, float, float2>, divScalarImpl<ushort3, float, float3>, divScalarImpl<ushort4, float, float4>},
{divScalarImpl<ushort, double, double>, divScalarImpl<ushort2, double, double2>, divScalarImpl<ushort3, double, double3>, divScalarImpl<ushort4, double, double4>}
},
{
{0 /*divScalarImpl<short, float, uchar>*/, 0 /*divScalarImpl<short2, float, uchar2>*/, 0 /*divScalarImpl<short3, float, uchar3>*/, 0 /*divScalarImpl<short4, float, uchar4>*/},
{0 /*divScalarImpl<short, float, schar>*/, 0 /*divScalarImpl<short2, float, char2>*/, 0 /*divScalarImpl<short3, float, char3>*/, 0 /*divScalarImpl<short4, float, char4>*/},
{divScalarImpl<short, float, ushort>, divScalarImpl<short2, float, ushort2>, divScalarImpl<short3, float, ushort3>, divScalarImpl<short4, float, ushort4>},
{divScalarImpl<short, float, short>, divScalarImpl<short2, float, short2>, divScalarImpl<short3, float, short3>, divScalarImpl<short4, float, short4>},
{divScalarImpl<short, float, int>, divScalarImpl<short2, float, int2>, divScalarImpl<short3, float, int3>, divScalarImpl<short4, float, int4>},
{divScalarImpl<short, float, float>, divScalarImpl<short2, float, float2>, divScalarImpl<short3, float, float3>, divScalarImpl<short4, float, float4>},
{divScalarImpl<short, double, double>, divScalarImpl<short2, double, double2>, divScalarImpl<short3, double, double3>, divScalarImpl<short4, double, double4>}
},
{
{0 /*divScalarImpl<int, float, uchar>*/, 0 /*divScalarImpl<int2, float, uchar2>*/, 0 /*divScalarImpl<int3, float, uchar3>*/, 0 /*divScalarImpl<int4, float, uchar4>*/},
{0 /*divScalarImpl<int, float, schar>*/, 0 /*divScalarImpl<int2, float, char2>*/, 0 /*divScalarImpl<int3, float, char3>*/, 0 /*divScalarImpl<int4, float, char4>*/},
{0 /*divScalarImpl<int, float, ushort>*/, 0 /*divScalarImpl<int2, float, ushort2>*/, 0 /*divScalarImpl<int3, float, ushort3>*/, 0 /*divScalarImpl<int4, float, ushort4>*/},
{0 /*divScalarImpl<int, float, short>*/, 0 /*divScalarImpl<int2, float, short2>*/, 0 /*divScalarImpl<int3, float, short3>*/, 0 /*divScalarImpl<int4, float, short4>*/},
{divScalarImpl<int, float, int>, divScalarImpl<int2, float, int2>, divScalarImpl<int3, float, int3>, divScalarImpl<int4, float, int4>},
{divScalarImpl<int, float, float>, divScalarImpl<int2, float, float2>, divScalarImpl<int3, float, float3>, divScalarImpl<int4, float, float4>},
{divScalarImpl<int, double, double>, divScalarImpl<int2, double, double2>, divScalarImpl<int3, double, double3>, divScalarImpl<int4, double, double4>}
},
{
{0 /*divScalarImpl<float, float, uchar>*/, 0 /*divScalarImpl<float2, float, uchar2>*/, 0 /*divScalarImpl<float3, float, uchar3>*/, 0 /*divScalarImpl<float4, float, uchar4>*/},
{0 /*divScalarImpl<float, float, schar>*/, 0 /*divScalarImpl<float2, float, char2>*/, 0 /*divScalarImpl<float3, float, char3>*/, 0 /*divScalarImpl<float4, float, char4>*/},
{0 /*divScalarImpl<float, float, ushort>*/, 0 /*divScalarImpl<float2, float, ushort2>*/, 0 /*divScalarImpl<float3, float, ushort3>*/, 0 /*divScalarImpl<float4, float, ushort4>*/},
{0 /*divScalarImpl<float, float, short>*/, 0 /*divScalarImpl<float2, float, short2>*/, 0 /*divScalarImpl<float3, float, short3>*/, 0 /*divScalarImpl<float4, float, short4>*/},
{0 /*divScalarImpl<float, float, int>*/, 0 /*divScalarImpl<float2, float, int2>*/, 0 /*divScalarImpl<float3, float, int3>*/, 0 /*divScalarImpl<float4, float, int4>*/},
{divScalarImpl<float, float, float>, divScalarImpl<float2, float, float2>, divScalarImpl<float3, float, float3>, divScalarImpl<float4, float, float4>},
{divScalarImpl<float, double, double>, divScalarImpl<float2, double, double2>, divScalarImpl<float3, double, double3>, divScalarImpl<float4, double, double4>}
},
{
{0 /*divScalarImpl<double, double, uchar>*/, 0 /*divScalarImpl<double2, double, uchar2>*/, 0 /*divScalarImpl<double3, double, uchar3>*/, 0 /*divScalarImpl<double4, double, uchar4>*/},
{0 /*divScalarImpl<double, double, schar>*/, 0 /*divScalarImpl<double2, double, char2>*/, 0 /*divScalarImpl<double3, double, char3>*/, 0 /*divScalarImpl<double4, double, char4>*/},
{0 /*divScalarImpl<double, double, ushort>*/, 0 /*divScalarImpl<double2, double, ushort2>*/, 0 /*divScalarImpl<double3, double, ushort3>*/, 0 /*divScalarImpl<double4, double, ushort4>*/},
{0 /*divScalarImpl<double, double, short>*/, 0 /*divScalarImpl<double2, double, short2>*/, 0 /*divScalarImpl<double3, double, short3>*/, 0 /*divScalarImpl<double4, double, short4>*/},
{0 /*divScalarImpl<double, double, int>*/, 0 /*divScalarImpl<double2, double, int2>*/, 0 /*divScalarImpl<double3, double, int3>*/, 0 /*divScalarImpl<double4, double, int4>*/},
{0 /*divScalarImpl<double, double, float>*/, 0 /*divScalarImpl<double2, double, float2>*/, 0 /*divScalarImpl<double3, double, float3>*/, 0 /*divScalarImpl<double4, double, float4>*/},
{divScalarImpl<double, double, double>, divScalarImpl<double2, double, double2>, divScalarImpl<double3, double, double3>, divScalarImpl<double4, double, double4>}
}
};

const int sdepth = src.depth();
const int ddepth = dst.depth();
const int cn = src.channels();

CV_DbgAssert( sdepth <= CV_64F && ddepth <= CV_64F && cn <= 4 );

if (inv)
{
val[0] *= scale;
val[1] *= scale;
val[2] *= scale;
val[3] *= scale;
}
else
{
val[0] /= scale;
val[1] /= scale;
val[2] /= scale;
val[3] /= scale;
}

const func_t func = funcs[sdepth][ddepth][cn - 1];

if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");

func(src, val, inv, dst, stream);
}

#endif
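Note how the wrapper folds the scale factor into val before dispatch, so the device functor performs exactly one division per element. The algebra behind the two branches, as a worked note (comments only, not code from the patch):

// forward:  dst = src * scale / val == src / (val / scale)  =>  val /= scale
// inverse:  dst = scale * val / src == (val * scale) / src  =>  val *= scale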

@ -40,433 +40,61 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#ifndef HAVE_OPENCV_CUDEV

namespace cv { namespace cuda { namespace device
#error "opencv_cudev is required"

#else

#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

////////////////////////////////////////////////////////////////////////
// integral

void cv::cuda::integral(InputArray _src, OutputArray _dst, GpuMat& buffer, Stream& stream)
{
namespace imgproc
{
// Utility function to extract unsigned chars from an unsigned integer
__device__ uchar4 int_to_uchar4(unsigned int in)
{
uchar4 bytes;
bytes.x = (in & 0x000000ff) >> 0;
bytes.y = (in & 0x0000ff00) >> 8;
bytes.z = (in & 0x00ff0000) >> 16;
bytes.w = (in & 0xff000000) >> 24;
return bytes;
}
GpuMat src = _src.getGpuMat();

__global__ void shfl_integral_horizontal(const PtrStep<uint4> img, PtrStep<uint4> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ int sums[128];
CV_Assert( src.type() == CV_8UC1 );

const int id = threadIdx.x;
const int lane_id = id % warpSize;
const int warp_id = id / warpSize;
GpuMat_<int>& res = (GpuMat_<int>&) buffer;

const uint4 data = img(blockIdx.x, id);
gridIntegral(globPtr<uchar>(src), res, stream);

const uchar4 a = int_to_uchar4(data.x);
const uchar4 b = int_to_uchar4(data.y);
const uchar4 c = int_to_uchar4(data.z);
const uchar4 d = int_to_uchar4(data.w);
_dst.create(src.rows + 1, src.cols + 1, CV_32SC1);
GpuMat dst = _dst.getGpuMat();

int result[16];
dst.setTo(Scalar::all(0), stream);

result[0] = a.x;
result[1] = result[0] + a.y;
result[2] = result[1] + a.z;
result[3] = result[2] + a.w;
GpuMat inner = dst(Rect(1, 1, src.cols, src.rows));
res.copyTo(inner, stream);
}

result[4] = result[3] + b.x;
result[5] = result[4] + b.y;
result[6] = result[5] + b.z;
result[7] = result[6] + b.w;
//////////////////////////////////////////////////////////////////////////////
// sqrIntegral

result[8] = result[7] + c.x;
result[9] = result[8] + c.y;
result[10] = result[9] + c.z;
result[11] = result[10] + c.w;
void cv::cuda::sqrIntegral(InputArray _src, OutputArray _dst, GpuMat& buf, Stream& stream)
{
GpuMat src = _src.getGpuMat();

result[12] = result[11] + d.x;
result[13] = result[12] + d.y;
result[14] = result[13] + d.z;
result[15] = result[14] + d.w;
CV_Assert( src.type() == CV_8UC1 );

int sum = result[15];
GpuMat_<double>& res = (GpuMat_<double>&) buf;

// the prefix sum for each thread's 16 values is computed,
// now the final sums (result[15]) need to be shared
// with the other threads and added. To do this,
// the __shfl_up() instruction is used and a shuffle scan
// operation is performed to distribute the sums to the correct
// threads
#pragma unroll
for (int i = 1; i < 32; i *= 2)
{
const int n = __shfl_up(sum, i, 32);
gridIntegral(sqr_(cvt_<int>(globPtr<uchar>(src))), res, stream);

if (lane_id >= i)
{
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] += n;
_dst.create(src.rows + 1, src.cols + 1, CV_64FC1);
GpuMat dst = _dst.getGpuMat();

sum += n;
}
}
dst.setTo(Scalar::all(0), stream);

// Now the final sum for the warp must be shared
// between warps. This is done by each warp
// having a thread store to shared memory, then
// having some other warp load the values and
// compute a prefix sum, again by using __shfl_up.
// The results are uniformly added back to the warps.
// The last thread in each warp, which holds the warp's sum,
// places it in shared memory.
if (threadIdx.x % warpSize == warpSize - 1)
sums[warp_id] = result[15];
GpuMat inner = dst(Rect(1, 1, src.cols, src.rows));
res.copyTo(inner, stream);
}

__syncthreads();

if (warp_id == 0)
{
int warp_sum = sums[lane_id];

#pragma unroll
for (int i = 1; i <= 32; i *= 2)
{
const int n = __shfl_up(warp_sum, i, 32);

if (lane_id >= i)
warp_sum += n;
}

sums[lane_id] = warp_sum;
}

__syncthreads();

int blockSum = 0;

// fold in unused warp
if (warp_id > 0)
{
blockSum = sums[warp_id - 1];

#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] += blockSum;
}

// assemble result
// Each thread has 16 values to write, which are
// now integer data (to avoid overflow). Instead of
// each thread writing consecutive uint4s, the
// approach shown here experiments with using
// the shuffle command to reformat the data
// inside the registers so that each thread holds
// consecutive data to be written so larger contiguous
// segments can be assembled for writing.

/*
For example data that needs to be written as

GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3
but is stored in registers (r0..r3), in four threads (0..3) as:

threadId 0 1 2 3
r0 x0 y0 z0 w0
r1 x1 y1 z1 w1
r2 x2 y2 z2 w2
r3 x3 y3 z3 w3

after applying __shfl_xor operations to move data between registers r1..r3:

threadId 00 01 10 11
x0 y0 z0 w0
xor(01)->y1 x1 w1 z1
xor(10)->z2 w2 x2 y2
xor(11)->w3 z3 y3 x3

and now x0..x3, and z0..z3 can be written out in order by all threads.

In the current code, each register above actually represents
four integers to be written as uint4's to GMEM.
*/

result[4] = __shfl_xor(result[4] , 1, 32);
result[5] = __shfl_xor(result[5] , 1, 32);
result[6] = __shfl_xor(result[6] , 1, 32);
result[7] = __shfl_xor(result[7] , 1, 32);

result[8] = __shfl_xor(result[8] , 2, 32);
result[9] = __shfl_xor(result[9] , 2, 32);
result[10] = __shfl_xor(result[10], 2, 32);
result[11] = __shfl_xor(result[11], 2, 32);

result[12] = __shfl_xor(result[12], 3, 32);
result[13] = __shfl_xor(result[13], 3, 32);
result[14] = __shfl_xor(result[14], 3, 32);
result[15] = __shfl_xor(result[15], 3, 32);

uint4* integral_row = integral.ptr(blockIdx.x);
uint4 output;

///////

if (threadIdx.x % 4 == 0)
output = make_uint4(result[0], result[1], result[2], result[3]);

if (threadIdx.x % 4 == 1)
output = make_uint4(result[4], result[5], result[6], result[7]);

if (threadIdx.x % 4 == 2)
output = make_uint4(result[8], result[9], result[10], result[11]);

if (threadIdx.x % 4 == 3)
output = make_uint4(result[12], result[13], result[14], result[15]);

integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;

///////

if (threadIdx.x % 4 == 2)
output = make_uint4(result[0], result[1], result[2], result[3]);

if (threadIdx.x % 4 == 3)
output = make_uint4(result[4], result[5], result[6], result[7]);

if (threadIdx.x % 4 == 0)
output = make_uint4(result[8], result[9], result[10], result[11]);

if (threadIdx.x % 4 == 1)
output = make_uint4(result[12], result[13], result[14], result[15]);

integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;

// continuing from the above example,
// this use of __shfl_xor() places the y0..y3 and w0..w3 data
// in order.

#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] = __shfl_xor(result[i], 1, 32);

if (threadIdx.x % 4 == 0)
output = make_uint4(result[0], result[1], result[2], result[3]);

if (threadIdx.x % 4 == 1)
output = make_uint4(result[4], result[5], result[6], result[7]);

if (threadIdx.x % 4 == 2)
output = make_uint4(result[8], result[9], result[10], result[11]);

if (threadIdx.x % 4 == 3)
output = make_uint4(result[12], result[13], result[14], result[15]);

integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;

///////

if (threadIdx.x % 4 == 2)
output = make_uint4(result[0], result[1], result[2], result[3]);

if (threadIdx.x % 4 == 3)
output = make_uint4(result[4], result[5], result[6], result[7]);

if (threadIdx.x % 4 == 0)
output = make_uint4(result[8], result[9], result[10], result[11]);

if (threadIdx.x % 4 == 1)
output = make_uint4(result[12], result[13], result[14], result[15]);

integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
#endif
}

// This kernel computes columnwise prefix sums. When the data input is
// the row sums from above, this completes the integral image.
// The approach here is to have each block compute a local set of sums.
// First, the data covered by the block is loaded into shared memory;
// then, instead of performing a sum in shared memory using __syncthreads
// between stages, the data is reformatted so that the necessary sums
// occur inside warps and the shuffle scan operation is used.
// The final set of sums from the block is then propagated, with the block
// computing "down" the image and adding the running sum to the local
// block sums.
__global__ void shfl_integral_vertical(PtrStepSz<unsigned int> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ unsigned int sums[32][9];

const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
const int lane_id = tidx % 8;

if (tidx >= integral.cols)
return;

sums[threadIdx.x][threadIdx.y] = 0;
__syncthreads();

unsigned int stepSum = 0;

for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
{
unsigned int* p = integral.ptr(y) + tidx;

unsigned int sum = *p;

sums[threadIdx.x][threadIdx.y] = sum;
__syncthreads();

// place into SMEM
// shfl scan reduce the SMEM, reformatting so the column
// sums are computed in a warp
// then read out properly
const int j = threadIdx.x % 8;
const int k = threadIdx.x / 8 + threadIdx.y * 4;

int partial_sum = sums[k][j];

for (int i = 1; i <= 8; i *= 2)
{
int n = __shfl_up(partial_sum, i, 32);

if (lane_id >= i)
partial_sum += n;
}

sums[k][j] = partial_sum;
__syncthreads();

if (threadIdx.y > 0)
sum += sums[threadIdx.x][threadIdx.y - 1];

sum += stepSum;
stepSum += sums[threadIdx.x][blockDim.y - 1];

__syncthreads();

*p = sum;
}
#endif
}

void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream)
{
{
// each thread handles 16 values, use 1 block/row
// safe, because the row step actually can't be less than 512 bytes
int block = integral.cols / 16;

// launch 1 block / row
const int grid = img.rows;

cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );

shfl_integral_horizontal<<<grid, block, 0, stream>>>((const PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral);
cudaSafeCall( cudaGetLastError() );
}

{
const dim3 block(32, 8);
const dim3 grid(divUp(integral.cols, block.x), 1);

shfl_integral_vertical<<<grid, block, 0, stream>>>(integral);
cudaSafeCall( cudaGetLastError() );
}

if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

__global__ void shfl_integral_vertical(PtrStepSz<unsigned int> buffer, PtrStepSz<unsigned int> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ unsigned int sums[32][9];

const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
const int lane_id = tidx % 8;

if (tidx >= integral.cols)
return;

sums[threadIdx.x][threadIdx.y] = 0;
__syncthreads();

unsigned int stepSum = 0;

for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
{
unsigned int* p = buffer.ptr(y) + tidx;
unsigned int* dst = integral.ptr(y + 1) + tidx + 1;

unsigned int sum = *p;

sums[threadIdx.x][threadIdx.y] = sum;
__syncthreads();

// place into SMEM
// shfl scan reduce the SMEM, reformatting so the column
// sums are computed in a warp
// then read out properly
const int j = threadIdx.x % 8;
const int k = threadIdx.x / 8 + threadIdx.y * 4;

int partial_sum = sums[k][j];

for (int i = 1; i <= 8; i *= 2)
{
int n = __shfl_up(partial_sum, i, 32);

if (lane_id >= i)
partial_sum += n;
}

sums[k][j] = partial_sum;
__syncthreads();

if (threadIdx.y > 0)
sum += sums[threadIdx.x][threadIdx.y - 1];

sum += stepSum;
stepSum += sums[threadIdx.x][blockDim.y - 1];

__syncthreads();

*dst = sum;
}
#endif
}

// used for frame preprocessing before Soft Cascade evaluation: no synchronization needed
void shfl_integral_gpu_buffered(PtrStepSzb img, PtrStepSz<uint4> buffer, PtrStepSz<unsigned int> integral,
int blockStep, cudaStream_t stream)
{
{
const int block = blockStep;
const int grid = img.rows;

cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );

shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, buffer);
cudaSafeCall( cudaGetLastError() );
}

{
const dim3 block(32, 8);
const dim3 grid(divUp(integral.cols, block.x), 1);

shfl_integral_vertical<<<grid, block, 0, stream>>>((PtrStepSz<uint>)buffer, integral);
cudaSafeCall( cudaGetLastError() );
}
}
}
}}}

#endif /* CUDA_DISABLER */
#endif
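Both kernels above are built on the same warp-level primitive: an inclusive prefix sum assembled from shuffle-up operations. A standalone sketch of that scan (a minimal illustration using the pre-CUDA-9 __shfl_up spelling this file targets; CUDA 9 and later renames it __shfl_up_sync):

__device__ int warpInclusiveScan(int value)
{
    const int lane_id = threadIdx.x % warpSize;

    // at step i, pull the running sum from the lane i positions below
    for (int i = 1; i < warpSize; i *= 2)
    {
        const int n = __shfl_up(value, i, 32);

        if (lane_id >= i)
            value += n;
    }

    return value; // lane k now holds the sum of lanes 0..k
}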

207
modules/cudaarithm/src/cuda/lut.cu
Normal file
@ -0,0 +1,207 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "opencv2/opencv_modules.hpp"

#ifndef HAVE_OPENCV_CUDEV

#error "opencv_cudev is required"

#else

#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"

using namespace cv;
using namespace cv::cudev;

namespace
{
texture<uchar, cudaTextureType1D, cudaReadModeElementType> texLutTable;

class LookUpTableImpl : public LookUpTable
{
public:
LookUpTableImpl(InputArray lut);
~LookUpTableImpl();

void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null());

private:
GpuMat d_lut;
cudaTextureObject_t texLutTableObj;
bool cc30;
};

LookUpTableImpl::LookUpTableImpl(InputArray _lut)
{
if (_lut.kind() == _InputArray::GPU_MAT)
{
d_lut = _lut.getGpuMat();
}
else
{
Mat h_lut = _lut.getMat();
d_lut.upload(Mat(1, 256, h_lut.type(), h_lut.data));
}

CV_Assert( d_lut.depth() == CV_8U );
CV_Assert( d_lut.rows == 1 && d_lut.cols == 256 );

cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);

if (cc30)
{
// Use the texture object
cudaResourceDesc texRes;
std::memset(&texRes, 0, sizeof(texRes));
texRes.resType = cudaResourceTypeLinear;
texRes.res.linear.devPtr = d_lut.data;
texRes.res.linear.desc = cudaCreateChannelDesc<uchar>();
texRes.res.linear.sizeInBytes = 256 * d_lut.channels() * sizeof(uchar);

cudaTextureDesc texDescr;
std::memset(&texDescr, 0, sizeof(texDescr));

CV_CUDEV_SAFE_CALL( cudaCreateTextureObject(&texLutTableObj, &texRes, &texDescr, 0) );
}
else
{
// Use the texture reference
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar>();
CV_CUDEV_SAFE_CALL( cudaBindTexture(0, &texLutTable, d_lut.data, &desc) );
}
}

LookUpTableImpl::~LookUpTableImpl()
{
if (cc30)
{
// Use the texture object
cudaDestroyTextureObject(texLutTableObj);
}
else
{
// Use the texture reference
cudaUnbindTexture(texLutTable);
}
}

struct LutTablePtrC1
{
typedef uchar value_type;
typedef uchar index_type;

cudaTextureObject_t texLutTableObj;

__device__ __forceinline__ uchar operator ()(uchar, uchar x) const
{
#if CV_CUDEV_ARCH < 300
// Use the texture reference
return tex1Dfetch(texLutTable, x);
#else
// Use the texture object
return tex1Dfetch<uchar>(texLutTableObj, x);
#endif
}
};
struct LutTablePtrC3
{
typedef uchar3 value_type;
typedef uchar3 index_type;

cudaTextureObject_t texLutTableObj;

__device__ __forceinline__ uchar3 operator ()(const uchar3&, const uchar3& x) const
{
#if CV_CUDEV_ARCH < 300
// Use the texture reference
return make_uchar3(tex1Dfetch(texLutTable, x.x * 3), tex1Dfetch(texLutTable, x.y * 3 + 1), tex1Dfetch(texLutTable, x.z * 3 + 2));
#else
// Use the texture object
return make_uchar3(tex1Dfetch<uchar>(texLutTableObj, x.x * 3), tex1Dfetch<uchar>(texLutTableObj, x.y * 3 + 1), tex1Dfetch<uchar>(texLutTableObj, x.z * 3 + 2));
#endif
}
};

void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& stream)
{
GpuMat src = _src.getGpuMat();

const int cn = src.channels();
const int lut_cn = d_lut.channels();

CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
CV_Assert( lut_cn == 1 || lut_cn == cn );

_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();

if (lut_cn == 1)
{
GpuMat_<uchar> src1(src.reshape(1));
GpuMat_<uchar> dst1(dst.reshape(1));

LutTablePtrC1 tbl;
tbl.texLutTableObj = texLutTableObj;

dst1.assign(lut_(src1, tbl), stream);
}
else if (lut_cn == 3)
{
GpuMat_<uchar3>& src3 = (GpuMat_<uchar3>&) src;
GpuMat_<uchar3>& dst3 = (GpuMat_<uchar3>&) dst;

LutTablePtrC3 tbl;
tbl.texLutTableObj = texLutTableObj;

dst3.assign(lut_(src3, tbl), stream);
}
}
}

Ptr<LookUpTable> cv::cuda::createLookUpTable(InputArray lut)
{
return makePtr<LookUpTableImpl>(lut);
}

#endif
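For reference, the public entry point added by this file is used much like the CPU-side cv::LUT: build the table once, then apply it to any number of 8-bit images. A minimal usage sketch (d_src and d_dst are hypothetical GpuMats):

cv::Mat table(1, 256, CV_8UC1);
for (int i = 0; i < 256; ++i)
    table.at<uchar>(i) = 255 - i; // e.g. an inversion LUT

cv::Ptr<cv::cuda::LookUpTable> lut = cv::cuda::createLookUpTable(table);
lut->transform(d_src, d_dst); // d_dst(y, x) = table(d_src(y, x))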

@ -40,196 +40,248 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/type_traits.hpp"
#ifndef HAVE_OPENCV_CUDEV

#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"

using namespace cv::cuda;
using namespace cv::cuda::device;
#else

//////////////////////////////////////////////////////////////////////////
// absMat
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"

namespace cv { namespace cuda { namespace device
using namespace cv::cudev;

namespace
{
template <typename T> struct TransformFunctorTraits< abs_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
}}}

namespace arithm
{
template <typename T>
void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, abs_func<T>(), WithOutMask(), stream);
}

template void absMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
enum {
shift = 1
};
};
}

//////////////////////////////////////////////////////////////////////////
// sqrMat
//////////////////////////////////////////////////////////////////////////////
/// abs

namespace arithm
namespace
{
template <typename T> struct Sqr : unary_function<T, T>
template <typename T>
void absMat(const GpuMat& src, const GpuMat& dst, Stream& stream)
{
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), abs_func<T>(), stream);
}
}

void cv::cuda::abs(InputArray _src, OutputArray _dst, Stream& stream)
{
typedef void (*func_t)(const GpuMat& src, const GpuMat& dst, Stream& stream);
static const func_t funcs[] =
{
absMat<uchar>,
absMat<schar>,
absMat<ushort>,
absMat<short>,
absMat<int>,
absMat<float>,
absMat<double>
};

GpuMat src = _src.getGpuMat();

const int depth = src.depth();

CV_DbgAssert( depth <= CV_64F );

_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();

funcs[depth](src.reshape(1), dst.reshape(1), stream);
}

//////////////////////////////////////////////////////////////////////////////
/// sqr

namespace
{
template <typename T> struct SqrOp : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(T x) const
{
return saturate_cast<T>(x * x);
}

__host__ __device__ __forceinline__ Sqr() {}
__host__ __device__ __forceinline__ Sqr(const Sqr&) {}
};

template <typename T>
void sqrMat(const GpuMat& src, const GpuMat& dst, Stream& stream)
{
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), SqrOp<T>(), stream);
}
}

namespace cv { namespace cuda { namespace device
void cv::cuda::sqr(InputArray _src, OutputArray _dst, Stream& stream)
{
template <typename T> struct TransformFunctorTraits< arithm::Sqr<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
typedef void (*func_t)(const GpuMat& src, const GpuMat& dst, Stream& stream);
static const func_t funcs[] =
{
sqrMat<uchar>,
sqrMat<schar>,
sqrMat<ushort>,
sqrMat<short>,
sqrMat<int>,
sqrMat<float>,
sqrMat<double>
};
}}}

namespace arithm
GpuMat src = _src.getGpuMat();

const int depth = src.depth();

CV_DbgAssert( depth <= CV_64F );

_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();

funcs[depth](src.reshape(1), dst.reshape(1), stream);
}

//////////////////////////////////////////////////////////////////////////////
/// sqrt

namespace
{
template <typename T>
void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
void sqrtMat(const GpuMat& src, const GpuMat& dst, Stream& stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Sqr<T>(), WithOutMask(), stream);
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), sqrt_func<T>(), stream);
}

template void sqrMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}

//////////////////////////////////////////////////////////////////////////
// sqrtMat

namespace cv { namespace cuda { namespace device
void cv::cuda::sqrt(InputArray _src, OutputArray _dst, Stream& stream)
{
template <typename T> struct TransformFunctorTraits< sqrt_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
typedef void (*func_t)(const GpuMat& src, const GpuMat& dst, Stream& stream);
static const func_t funcs[] =
{
sqrtMat<uchar>,
sqrtMat<schar>,
sqrtMat<ushort>,
sqrtMat<short>,
sqrtMat<int>,
sqrtMat<float>,
sqrtMat<double>
};
}}}

namespace arithm
{
template <typename T>
void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, sqrt_func<T>(), WithOutMask(), stream);
}
GpuMat src = _src.getGpuMat();

template void sqrtMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
const int depth = src.depth();

CV_DbgAssert( depth <= CV_64F );

_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();

funcs[depth](src.reshape(1), dst.reshape(1), stream);
}

//////////////////////////////////////////////////////////////////////////
// logMat
////////////////////////////////////////////////////////////////////////
/// exp

namespace cv { namespace cuda { namespace device
namespace
{
template <typename T> struct TransformFunctorTraits< log_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}

namespace arithm
{
template <typename T>
void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, log_func<T>(), WithOutMask(), stream);
}

template void logMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}

//////////////////////////////////////////////////////////////////////////
// expMat

namespace arithm
{
template <typename T> struct Exp : unary_function<T, T>
template <typename T> struct ExpOp : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(T x) const
{
exp_func<T> f;
return saturate_cast<T>(f(x));
}

__host__ __device__ __forceinline__ Exp() {}
__host__ __device__ __forceinline__ Exp(const Exp&) {}
};

template <typename T>
void expMat(const GpuMat& src, const GpuMat& dst, Stream& stream)
{
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), ExpOp<T>(), stream);
}
}

namespace cv { namespace cuda { namespace device
void cv::cuda::exp(InputArray _src, OutputArray _dst, Stream& stream)
{
template <typename T> struct TransformFunctorTraits< arithm::Exp<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
typedef void (*func_t)(const GpuMat& src, const GpuMat& dst, Stream& stream);
static const func_t funcs[] =
{
expMat<uchar>,
expMat<schar>,
expMat<ushort>,
expMat<short>,
expMat<int>,
expMat<float>,
expMat<double>
};
}}}

namespace arithm
GpuMat src = _src.getGpuMat();

const int depth = src.depth();

CV_DbgAssert( depth <= CV_64F );

_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();

funcs[depth](src.reshape(1), dst.reshape(1), stream);
}

////////////////////////////////////////////////////////////////////////
// log

namespace
{
template <typename T>
void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
void logMat(const GpuMat& src, const GpuMat& dst, Stream& stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Exp<T>(), WithOutMask(), stream);
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), log_func<T>(), stream);
}

template void expMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}

//////////////////////////////////////////////////////////////////////////
void cv::cuda::log(InputArray _src, OutputArray _dst, Stream& stream)
{
typedef void (*func_t)(const GpuMat& src, const GpuMat& dst, Stream& stream);
static const func_t funcs[] =
{
logMat<uchar>,
logMat<schar>,
logMat<ushort>,
logMat<short>,
logMat<int>,
logMat<float>,
logMat<double>
};

GpuMat src = _src.getGpuMat();

const int depth = src.depth();

CV_DbgAssert( depth <= CV_64F );

_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();

funcs[depth](src.reshape(1), dst.reshape(1), stream);
}
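abs, sqr, sqrt, exp and log all follow the same dispatch shape: reshape the matrix to one channel, index a function table by depth, and let gridTransformUnary_ fuse the functor into a single kernel, which is why multi-channel inputs come for free. A minimal usage sketch (d_src and d_dst are hypothetical GpuMats):

cv::cuda::GpuMat d_src, d_dst;
d_src.upload(cv::Mat(480, 640, CV_32FC1, cv::Scalar(4.0f)));
cv::cuda::sqrt(d_src, d_dst); // elementwise: d_dst(y, x) = sqrt(d_src(y, x))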
////////////////////////////////////////////////////////////////////////
// pow

namespace arithm
namespace
{
template<typename T, bool Signed = numeric_limits<T>::is_signed> struct PowOp : unary_function<T, T>
{
float power;

__host__ explicit PowOp(double power_) : power(static_cast<float>(power_)) {}

__device__ __forceinline__ T operator()(T e) const
{
return saturate_cast<T>(__powf((float)e, power));
@ -239,8 +291,6 @@ namespace arithm
{
float power;

__host__ explicit PowOp(double power_) : power(static_cast<float>(power_)) {}

__device__ __forceinline__ T operator()(T e) const
{
T res = saturate_cast<T>(__powf((float)e, power));
@ -255,8 +305,6 @@ namespace arithm
{
float power;

__host__ explicit PowOp(double power_) : power(static_cast<float>(power_)) {}

__device__ __forceinline__ float operator()(float e) const
{
return __powf(::fabs(e), power);
@ -266,37 +314,46 @@ namespace arithm
{
double power;

__host__ explicit PowOp(double power_) : power(power_) {}

__device__ __forceinline__ double operator()(double e) const
{
return ::pow(::fabs(e), power);
}
};
}

namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< arithm::PowOp<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}

namespace arithm
{
template<typename T>
void pow(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream)
void powMat(const GpuMat& src, double power, const GpuMat& dst, Stream& stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, PowOp<T>(power), WithOutMask(), stream);
}
PowOp<T> op;
op.power = static_cast<typename LargerType<T, float>::type>(power);

template void pow<uchar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<schar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<short>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<ushort>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<int>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<float>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<double>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), op, stream);
}
}

#endif // CUDA_DISABLER
void cv::cuda::pow(InputArray _src, double power, OutputArray _dst, Stream& stream)
{
typedef void (*func_t)(const GpuMat& src, double power, const GpuMat& dst, Stream& stream);
static const func_t funcs[] =
{
powMat<uchar>,
powMat<schar>,
powMat<ushort>,
powMat<short>,
powMat<int>,
powMat<float>,
powMat<double>
};

GpuMat src = _src.getGpuMat();

const int depth = src.depth();

CV_DbgAssert(depth <= CV_64F);

_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();

funcs[depth](src.reshape(1), power, dst.reshape(1), stream);
}

#endif
|
||||
|
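The powMat helper above collapses the per-depth explicit instantiations into one functor plus a single gridTransformUnary_ call; the public entry point then only dispatches by depth. A minimal caller-side sketch (hypothetical data; the default Stream argument declared in opencv2/cudaarithm.hpp is assumed):

    #include "opencv2/cudaarithm.hpp"

    void powExample(const cv::cuda::GpuMat& d_src)
    {
        cv::cuda::GpuMat d_dst;
        cv::cuda::pow(d_src, 2.0, d_dst);   // per-element x^2, saturated to the source depth
    }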
@ -40,208 +40,72 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/utility.hpp"
#ifndef HAVE_OPENCV_CUDEV

using namespace cv::cuda;
using namespace cv::cuda::device;
#error "opencv_cudev is required"

namespace minMax
#else

#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

namespace
{
    __device__ unsigned int blocks_finished = 0;

    // To avoid shared bank conflicts we convert each value into value of
    // appropriate type (32 bits minimum)
    template <typename T> struct MinMaxTypeTraits;
    template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<schar> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
    template <> struct MinMaxTypeTraits<double> { typedef double best_type; };

    template <int BLOCK_SIZE, typename R>
    struct GlobalReduce
    {
        static __device__ void run(R& mymin, R& mymax, R* minval, R* maxval, int tid, int bid, R* sminval, R* smaxval)
        {
        #if __CUDA_ARCH__ >= 200
            if (tid == 0)
            {
                Emulation::glob::atomicMin(minval, mymin);
                Emulation::glob::atomicMax(maxval, mymax);
            }
        #else
            __shared__ bool is_last;

            if (tid == 0)
            {
                minval[bid] = mymin;
                maxval[bid] = mymax;

                __threadfence();

                unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
                is_last = (ticket == gridDim.x * gridDim.y - 1);
            }

            __syncthreads();

            if (is_last)
            {
                int idx = ::min(tid, gridDim.x * gridDim.y - 1);

                mymin = minval[idx];
                mymax = maxval[idx];

                const minimum<R> minOp;
                const maximum<R> maxOp;
                device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));

                if (tid == 0)
                {
                    minval[0] = mymin;
                    maxval[0] = mymax;

                    blocks_finished = 0;
                }
            }
        #endif
        }
    };

    template <int BLOCK_SIZE, typename T, typename R, class Mask>
    __global__ void kernel(const PtrStepSz<T> src, const Mask mask, R* minval, R* maxval, const int twidth, const int theight)
    {
        __shared__ R sminval[BLOCK_SIZE];
        __shared__ R smaxval[BLOCK_SIZE];

        const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
        const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;

        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
        const int bid = blockIdx.y * gridDim.x + blockIdx.x;

        R mymin = numeric_limits<R>::max();
        R mymax = -numeric_limits<R>::max();

        const minimum<R> minOp;
        const maximum<R> maxOp;

        for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
        {
            const T* ptr = src.ptr(y);

            for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
            {
                if (mask(y, x))
                {
                    const R srcVal = ptr[x];

                    mymin = minOp(mymin, srcVal);
                    mymax = maxOp(mymax, srcVal);
                }
            }
        }

        device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));

        GlobalReduce<BLOCK_SIZE, R>::run(mymin, mymax, minval, maxval, tid, bid, sminval, smaxval);
    }
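The GlobalReduce fallback above hinges on a "ticket" drawn with atomicAdd on blocks_finished: each block publishes its partial min/max, and the one block that draws the final ticket folds all partials. A stripped-down, self-contained sketch of that pattern (hypothetical kernel; sum instead of min/max):

    __device__ unsigned int blocks_done = 0;

    __global__ void finalizeSum(float* partial, float* out, int numBlocks)
    {
        // each block is assumed to have already written partial[blockIdx.x]
        __shared__ bool is_last;

        if (threadIdx.x == 0)
        {
            __threadfence();                                 // publish this block's partial result
            unsigned int ticket = atomicAdd(&blocks_done, 1);
            is_last = (ticket == numBlocks - 1);             // true in exactly one block
        }
        __syncthreads();

        if (is_last && threadIdx.x == 0)
        {
            float sum = 0.f;
            for (int i = 0; i < numBlocks; ++i)
                sum += partial[i];
            *out = sum;
            blocks_done = 0;                                 // reset for the next launch
        }
    }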
    const int threads_x = 32;
    const int threads_y = 8;

    void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
    {
        block = dim3(threads_x, threads_y);

        grid = dim3(divUp(cols, block.x * block.y),
                    divUp(rows, block.y * block.x));

        grid.x = ::min(grid.x, block.x);
        grid.y = ::min(grid.y, block.y);
    }

    void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
    {
        dim3 block, grid;
        getLaunchCfg(cols, rows, block, grid);

        bufcols = grid.x * grid.y * sizeof(double);
        bufrows = 2;
    }

    __global__ void setDefaultKernel(int* minval_buf, int* maxval_buf)
    {
        *minval_buf = numeric_limits<int>::max();
        *maxval_buf = numeric_limits<int>::min();
    }
    __global__ void setDefaultKernel(float* minval_buf, float* maxval_buf)
    {
        *minval_buf = numeric_limits<float>::max();
        *maxval_buf = -numeric_limits<float>::max();
    }
    __global__ void setDefaultKernel(double* minval_buf, double* maxval_buf)
    {
        *minval_buf = numeric_limits<double>::max();
        *maxval_buf = -numeric_limits<double>::max();
    }

    template <typename R>
    void setDefault(R* minval_buf, R* maxval_buf)
    {
        setDefaultKernel<<<1, 1>>>(minval_buf, maxval_buf);
    }

    template <typename T>
    void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)
    void minMaxImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf, double* minVal, double* maxVal)
    {
        typedef typename MinMaxTypeTraits<T>::best_type R;
        typedef typename SelectIf<
                TypesEquals<T, double>::value,
                double,
                typename SelectIf<TypesEquals<T, float>::value, float, int>::type
            >::type work_type;

        dim3 block, grid;
        getLaunchCfg(src.cols, src.rows, block, grid);
        const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
        GpuMat_<work_type>& buf = (GpuMat_<work_type>&) _buf;

        const int twidth = divUp(divUp(src.cols, grid.x), block.x);
        const int theight = divUp(divUp(src.rows, grid.y), block.y);

        R* minval_buf = (R*) buf.ptr(0);
        R* maxval_buf = (R*) buf.ptr(1);

        setDefault(minval_buf, maxval_buf);

        if (mask.data)
            kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, twidth, theight);
        if (mask.empty())
            gridFindMinMaxVal(src, buf);
        else
            kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, twidth, theight);
            gridFindMinMaxVal(src, buf, globPtr<uchar>(mask));

        cudaSafeCall( cudaGetLastError() );
        work_type data[2];
        buf.download(cv::Mat(1, 2, buf.type(), data));

        cudaSafeCall( cudaDeviceSynchronize() );
        if (minVal)
            *minVal = data[0];

        R minval_, maxval_;
        cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(R), cudaMemcpyDeviceToHost) );
        cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(R), cudaMemcpyDeviceToHost) );
        *minval = minval_;
        *maxval = maxval_;
        if (maxVal)
            *maxVal = data[1];
    }

    template void run<uchar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    template void run<schar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    template void run<ushort>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    template void run<int >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
}

#endif // CUDA_DISABLER
void cv::cuda::minMax(InputArray _src, double* minVal, double* maxVal, InputArray _mask, GpuMat& buf)
{
    typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf, double* minVal, double* maxVal);
    static const func_t funcs[] =
    {
        minMaxImpl<uchar>,
        minMaxImpl<schar>,
        minMaxImpl<ushort>,
        minMaxImpl<short>,
        minMaxImpl<int>,
        minMaxImpl<float>,
        minMaxImpl<double>
    };

    GpuMat src = _src.getGpuMat();
    GpuMat mask = _mask.getGpuMat();

    CV_Assert( src.channels() == 1 );
    CV_DbgAssert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );

    const func_t func = funcs[src.depth()];

    func(src, mask, buf, minVal, maxVal);
}

#endif
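Caller's view of the rewritten minMax, matching the signature above (a hedged sketch with hypothetical data; the header's default arguments are assumed):

    double minv = 0.0, maxv = 0.0;
    cv::cuda::GpuMat buf;   // scratch buffer; downloaded as data[0] = min, data[1] = max
    cv::cuda::minMax(d_src, &minv, &maxv, cv::noArray(), buf);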
@ -40,189 +40,204 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV

#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"

using namespace cv::cuda;
using namespace cv::cuda::device;
#else

//////////////////////////////////////////////////////////////////////////
// min
#include "opencv2/cudev.hpp"

namespace arithm
using namespace cv::cudev;

void minMaxMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int op);

void minMaxScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int op);

///////////////////////////////////////////////////////////////////////
/// minMaxMat

namespace
{
    struct VMin4 : binary_function<uint, uint, uint>
    template <template <typename> class Op, typename T>
    void minMaxMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vmin4(a, b);
        }
        gridTransformBinary(globPtr<T>(src1), globPtr<T>(src2), globPtr<T>(dst), Op<T>(), stream);
    }

        __host__ __device__ __forceinline__ VMin4() {}
        __host__ __device__ __forceinline__ VMin4(const VMin4&) {}
    };

    struct VMin2 : binary_function<uint, uint, uint>
    struct MinOp2 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vmin2(a, b);
        }

        __host__ __device__ __forceinline__ VMin2() {}
        __host__ __device__ __forceinline__ VMin2(const VMin2&) {}
    };
}

namespace cv { namespace cuda { namespace device
{
    template <> struct TransformFunctorTraits< arithm::VMin4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };

    template <> struct TransformFunctorTraits< arithm::VMin2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };

    template <typename T> struct TransformFunctorTraits< minimum<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
    {
    };

    template <typename T> struct TransformFunctorTraits< binder2nd< minimum<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
    {
    };
}}}

namespace arithm
{
    void minMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
        device::transform(src1, src2, dst, VMin4(), WithOutMask(), stream);
    }

    void minMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
        device::transform(src1, src2, dst, VMin2(), WithOutMask(), stream);
    }

    template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
    {
        device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, minimum<T>(), WithOutMask(), stream);
    }

    template void minMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void minMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void minMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void minMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void minMat<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void minMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void minMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);

    template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
    {
        device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(minimum<T>(), src2), WithOutMask(), stream);
    }

    template void minScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void minScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void minScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void minScalar<short >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void minScalar<int >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void minScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void minScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
}

//////////////////////////////////////////////////////////////////////////
// max

namespace arithm
{
    struct VMax4 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vmax4(a, b);
        }

        __host__ __device__ __forceinline__ VMax4() {}
        __host__ __device__ __forceinline__ VMax4(const VMax4&) {}
    };

    struct VMax2 : binary_function<uint, uint, uint>
    struct MaxOp2 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vmax2(a, b);
        }

        __host__ __device__ __forceinline__ VMax2() {}
        __host__ __device__ __forceinline__ VMax2(const VMax2&) {}
    };

    template <class Op2>
    void minMaxMat_v2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
    {
        const int vcols = src1.cols >> 1;

        GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
        GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
        GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);

        gridTransformBinary(src1_, src2_, dst_, Op2(), stream);
    }

    struct MinOp4 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vmin4(a, b);
        }
    };

    struct MaxOp4 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vmax4(a, b);
        }
    };

    template <class Op4>
    void minMaxMat_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
    {
        const int vcols = src1.cols >> 2;

        GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
        GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
        GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);

        gridTransformBinary(src1_, src2_, dst_, Op4(), stream);
    }
}

namespace cv { namespace cuda { namespace device
void minMaxMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int op)
{
    template <> struct TransformFunctorTraits< arithm::VMax4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);
    static const func_t funcs_v1[2][7] =
    {
        {
            minMaxMat_v1<minimum, uchar>,
            minMaxMat_v1<minimum, schar>,
            minMaxMat_v1<minimum, ushort>,
            minMaxMat_v1<minimum, short>,
            minMaxMat_v1<minimum, int>,
            minMaxMat_v1<minimum, float>,
            minMaxMat_v1<minimum, double>
        },
        {
            minMaxMat_v1<maximum, uchar>,
            minMaxMat_v1<maximum, schar>,
            minMaxMat_v1<maximum, ushort>,
            minMaxMat_v1<maximum, short>,
            minMaxMat_v1<maximum, int>,
            minMaxMat_v1<maximum, float>,
            minMaxMat_v1<maximum, double>
        }
    };

    template <> struct TransformFunctorTraits< arithm::VMax2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    static const func_t funcs_v2[2] =
    {
        minMaxMat_v2<MinOp2>, minMaxMat_v2<MaxOp2>
    };

    template <typename T> struct TransformFunctorTraits< maximum<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
    static const func_t funcs_v4[2] =
    {
        minMaxMat_v4<MinOp4>, minMaxMat_v4<MaxOp4>
    };

    template <typename T> struct TransformFunctorTraits< binder2nd< maximum<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
    {
    };
}}}
    const int depth = src1.depth();

namespace arithm
{
    void maxMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    CV_DbgAssert( depth <= CV_64F );

    GpuMat src1_ = src1.reshape(1);
    GpuMat src2_ = src2.reshape(1);
    GpuMat dst_ = dst.reshape(1);

    if (depth == CV_8U || depth == CV_16U)
    {
        device::transform(src1, src2, dst, VMax4(), WithOutMask(), stream);
        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
        const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);

        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;

        if (isAllAligned)
        {
            if (depth == CV_8U && (src1_.cols & 3) == 0)
            {
                funcs_v4[op](src1_, src2_, dst_, stream);
                return;
            }
            else if (depth == CV_16U && (src1_.cols & 1) == 0)
            {
                funcs_v2[op](src1_, src2_, dst_, stream);
                return;
            }
        }
    }

    void maxMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
        device::transform(src1, src2, dst, VMax2(), WithOutMask(), stream);
    }
    const func_t func = funcs_v1[op][depth];

    template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
    {
        device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, maximum<T>(), WithOutMask(), stream);
    }

    template void maxMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxMat<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);

    template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
    {
        device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(maximum<T>(), src2), WithOutMask(), stream);
    }

    template void maxScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxScalar<short >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxScalar<int >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    func(src1_, src2_, dst_, stream);
}
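The fast paths above only fire when every row can be reinterpreted as uint: 32-byte-aligned base pointers plus a column count divisible by the packing factor (four uchar per vmin4/vmax4, two ushort per vmin2/vmax2). A hedged restatement of that gate as a standalone helper (hypothetical name, CV_8U case only):

    // Returns true when the CV_8U SIMD path (four pixels per uint) is legal.
    static bool canUseV4(const cv::cuda::GpuMat& m)
    {
        const bool aligned = (reinterpret_cast<intptr_t>(m.data) & 31) == 0;
        return aligned && m.depth() == CV_8U && (m.cols & 3) == 0;
    }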
#endif // CUDA_DISABLER
///////////////////////////////////////////////////////////////////////
/// minMaxScalar

namespace
{
    template <template <typename> class Op, typename T>
    void minMaxScalar(const GpuMat& src, double value, GpuMat& dst, Stream& stream)
    {
        gridTransformUnary(globPtr<T>(src), globPtr<T>(dst), bind2nd(Op<T>(), cv::saturate_cast<T>(value)), stream);
    }
}

void minMaxScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int op)
{
    typedef void (*func_t)(const GpuMat& src, double value, GpuMat& dst, Stream& stream);
    static const func_t funcs[2][7] =
    {
        {
            minMaxScalar<minimum, uchar>,
            minMaxScalar<minimum, schar>,
            minMaxScalar<minimum, ushort>,
            minMaxScalar<minimum, short>,
            minMaxScalar<minimum, int>,
            minMaxScalar<minimum, float>,
            minMaxScalar<minimum, double>
        },
        {
            minMaxScalar<maximum, uchar>,
            minMaxScalar<maximum, schar>,
            minMaxScalar<maximum, ushort>,
            minMaxScalar<maximum, short>,
            minMaxScalar<maximum, int>,
            minMaxScalar<maximum, float>,
            minMaxScalar<maximum, double>
        }
    };

    const int depth = src.depth();

    CV_DbgAssert( depth <= CV_64F );
    CV_DbgAssert( src.channels() == 1 );

    funcs[op][depth](src, value[0], dst, stream);
}

#endif
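A hedged caller-side sketch of the scalar path (hypothetical size; it is assumed here that the public cv::cuda::min/max scalar overloads forward to minMaxScalar above):

    cv::cuda::GpuMat d_src(sz, CV_8UC1), d_dst;
    cv::cuda::min(d_src, 128.0, d_dst);   // per-pixel min(src, 128), i.e. clamp from above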
@ -40,197 +40,88 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/utility.hpp"
#ifndef HAVE_OPENCV_CUDEV

using namespace cv::cuda;
using namespace cv::cuda::device;
#error "opencv_cudev is required"

namespace minMaxLoc
#else

#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

namespace
{
    // To avoid shared bank conflicts we convert each value into value of
    // appropriate type (32 bits minimum)
    template <typename T> struct MinMaxTypeTraits;
    template <> struct MinMaxTypeTraits<unsigned char> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<signed char> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<unsigned short> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
    template <> struct MinMaxTypeTraits<double> { typedef double best_type; };

    template <int BLOCK_SIZE, typename T, class Mask>
    __global__ void kernel_pass_1(const PtrStepSz<T> src, const Mask mask, T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, const int twidth, const int theight)
    {
        typedef typename MinMaxTypeTraits<T>::best_type work_type;

        __shared__ work_type sminval[BLOCK_SIZE];
        __shared__ work_type smaxval[BLOCK_SIZE];
        __shared__ unsigned int sminloc[BLOCK_SIZE];
        __shared__ unsigned int smaxloc[BLOCK_SIZE];

        const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
        const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;

        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
        const int bid = blockIdx.y * gridDim.x + blockIdx.x;

        work_type mymin = numeric_limits<work_type>::max();
        work_type mymax = -numeric_limits<work_type>::max();
        unsigned int myminloc = 0;
        unsigned int mymaxloc = 0;

        for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
        {
            const T* ptr = src.ptr(y);

            for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
            {
                if (mask(y, x))
                {
                    const work_type srcVal = ptr[x];

                    if (srcVal < mymin)
                    {
                        mymin = srcVal;
                        myminloc = y * src.cols + x;
                    }

                    if (srcVal > mymax)
                    {
                        mymax = srcVal;
                        mymaxloc = y * src.cols + x;
                    }
                }
            }
        }

        reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
                                 smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
                                 tid,
                                 thrust::make_tuple(less<work_type>(), greater<work_type>()));

        if (tid == 0)
        {
            minval[bid] = (T) mymin;
            maxval[bid] = (T) mymax;
            minloc[bid] = myminloc;
            maxloc[bid] = mymaxloc;
        }
    }
    template <int BLOCK_SIZE, typename T>
    __global__ void kernel_pass_2(T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, int count)
    {
        typedef typename MinMaxTypeTraits<T>::best_type work_type;

        __shared__ work_type sminval[BLOCK_SIZE];
        __shared__ work_type smaxval[BLOCK_SIZE];
        __shared__ unsigned int sminloc[BLOCK_SIZE];
        __shared__ unsigned int smaxloc[BLOCK_SIZE];

        unsigned int idx = ::min(threadIdx.x, count - 1);

        work_type mymin = minval[idx];
        work_type mymax = maxval[idx];
        unsigned int myminloc = minloc[idx];
        unsigned int mymaxloc = maxloc[idx];

        reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
                                 smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
                                 threadIdx.x,
                                 thrust::make_tuple(less<work_type>(), greater<work_type>()));

        if (threadIdx.x == 0)
        {
            minval[0] = (T) mymin;
            maxval[0] = (T) mymax;
            minloc[0] = myminloc;
            maxloc[0] = mymaxloc;
        }
    }

    const int threads_x = 32;
    const int threads_y = 8;

    void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
    {
        block = dim3(threads_x, threads_y);

        grid = dim3(divUp(cols, block.x * block.y),
                    divUp(rows, block.y * block.x));

        grid.x = ::min(grid.x, block.x);
        grid.y = ::min(grid.y, block.y);
    }

    void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows)
    {
        dim3 block, grid;
        getLaunchCfg(cols, rows, block, grid);

        // For values
        b1cols = (int)(grid.x * grid.y * elem_size);
        b1rows = 2;

        // For locations
        b2cols = grid.x * grid.y * sizeof(int);
        b2rows = 2;
    }

    template <typename T>
    void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf)
    void minMaxLocImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, double* minVal, double* maxVal, cv::Point* minLoc, cv::Point* maxLoc)
    {
        dim3 block, grid;
        getLaunchCfg(src.cols, src.rows, block, grid);
        typedef typename SelectIf<
                TypesEquals<T, double>::value,
                double,
                typename SelectIf<TypesEquals<T, float>::value, float, int>::type
            >::type work_type;

        const int twidth = divUp(divUp(src.cols, grid.x), block.x);
        const int theight = divUp(divUp(src.rows, grid.y), block.y);
        const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
        GpuMat_<work_type>& valBuf = (GpuMat_<work_type>&) _valBuf;
        GpuMat_<int>& locBuf = (GpuMat_<int>&) _locBuf;

        T* minval_buf = (T*) valbuf.ptr(0);
        T* maxval_buf = (T*) valbuf.ptr(1);
        unsigned int* minloc_buf = locbuf.ptr(0);
        unsigned int* maxloc_buf = locbuf.ptr(1);

        if (mask.data)
            kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
        if (mask.empty())
            gridMinMaxLoc(src, valBuf, locBuf);
        else
            kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
            gridMinMaxLoc(src, valBuf, locBuf, globPtr<uchar>(mask));

        cudaSafeCall( cudaGetLastError() );
        cv::Mat_<work_type> h_valBuf;
        cv::Mat_<int> h_locBuf;

        kernel_pass_2<threads_x * threads_y><<<1, threads_x * threads_y>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
        cudaSafeCall( cudaGetLastError() );
        valBuf.download(h_valBuf);
        locBuf.download(h_locBuf);

        cudaSafeCall( cudaDeviceSynchronize() );
        if (minVal)
            *minVal = h_valBuf(0, 0);

        T minval_, maxval_;
        cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
        cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
        *minval = minval_;
        *maxval = maxval_;
        if (maxVal)
            *maxVal = h_valBuf(1, 0);

        unsigned int minloc_, maxloc_;
        cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
        cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
        minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
        maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
        if (minLoc)
        {
            const int idx = h_locBuf(0, 0);
            *minLoc = cv::Point(idx % src.cols, idx / src.cols);
        }

        if (maxLoc)
        {
            const int idx = h_locBuf(1, 0);
            *maxLoc = cv::Point(idx % src.cols, idx / src.cols);
        }
    }

    template void run<unsigned char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
    template void run<signed char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
    template void run<unsigned short>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
    template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
    template void run<int >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
    template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
    template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
}

#endif // CUDA_DISABLER
void cv::cuda::minMaxLoc(InputArray _src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, InputArray _mask, GpuMat& valBuf, GpuMat& locBuf)
{
    typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, double* minVal, double* maxVal, cv::Point* minLoc, cv::Point* maxLoc);
    static const func_t funcs[] =
    {
        minMaxLocImpl<uchar>,
        minMaxLocImpl<schar>,
        minMaxLocImpl<ushort>,
        minMaxLocImpl<short>,
        minMaxLocImpl<int>,
        minMaxLocImpl<float>,
        minMaxLocImpl<double>
    };

    GpuMat src = _src.getGpuMat();
    GpuMat mask = _mask.getGpuMat();

    CV_Assert( src.channels() == 1 );
    CV_DbgAssert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );

    const func_t func = funcs[src.depth()];

    func(src, mask, valBuf, locBuf, minVal, maxVal, minLoc, maxLoc);
}

#endif
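Caller's view of the rewritten minMaxLoc, matching the signature above (a hedged sketch with hypothetical data; the header's default arguments are assumed). The flat index stored in locBuf is decoded into (x, y) exactly as in minMaxLocImpl:

    double minv = 0.0, maxv = 0.0;
    cv::Point minp, maxp;
    cv::cuda::GpuMat valBuf, locBuf;   // reusable scratch, sized on first use
    cv::cuda::minMaxLoc(d_src, &minv, &maxv, &minp, &maxp, cv::noArray(), valBuf, locBuf);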
@ -40,172 +40,185 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV

#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"

using namespace cv::cuda;
using namespace cv::cuda::device;
#else

namespace arithm
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

void mulMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& stream, int);
void mulMat_8uc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);
void mulMat_16sc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);

namespace
{
    struct Mul_8uc4_32f : binary_function<uint, float, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, float b) const
        {
            uint res = 0;

            res |= (saturate_cast<uchar>((0xffu & (a      )) * b)      );
            res |= (saturate_cast<uchar>((0xffu & (a >>  8)) * b) <<  8);
            res |= (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);
            res |= (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);

            return res;
        }

        __host__ __device__ __forceinline__ Mul_8uc4_32f() {}
        __host__ __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f&) {}
    };

    struct Mul_16sc4_32f : binary_function<short4, float, short4>
    {
        __device__ __forceinline__ short4 operator ()(short4 a, float b) const
        {
            return make_short4(saturate_cast<short>(a.x * b), saturate_cast<short>(a.y * b),
                               saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b));
        }

        __host__ __device__ __forceinline__ Mul_16sc4_32f() {}
        __host__ __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f&) {}
    };

    template <typename T, typename D> struct Mul : binary_function<T, T, D>
    template <typename T, typename D> struct MulOp : binary_function<T, T, D>
    {
        __device__ __forceinline__ D operator ()(T a, T b) const
        {
            return saturate_cast<D>(a * b);
        }

        __host__ __device__ __forceinline__ Mul() {}
        __host__ __device__ __forceinline__ Mul(const Mul&) {}
    };

    template <typename T, typename S, typename D> struct MulScale : binary_function<T, T, D>
    template <typename T, typename S, typename D> struct MulScaleOp : binary_function<T, T, D>
    {
        S scale;

        __host__ explicit MulScale(S scale_) : scale(scale_) {}

        __device__ __forceinline__ D operator ()(T a, T b) const
        {
            return saturate_cast<D>(scale * a * b);
        }
    };
}

namespace cv { namespace cuda { namespace device
{
    template <> struct TransformFunctorTraits<arithm::Mul_8uc4_32f> : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
    {
    };

    template <typename T, typename D> struct TransformFunctorTraits< arithm::Mul<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
    template <> struct TransformPolicy<double> : DefaultTransformPolicy
    {
        enum {
            shift = 1
        };
    };

    template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::MulScale<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
    {
    };
}}}

namespace arithm
{
    void mulMat_8uc4_32f(PtrStepSz<uint> src1, PtrStepSzf src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
        device::transform(src1, src2, dst, Mul_8uc4_32f(), WithOutMask(), stream);
    }

    void mulMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream)
    {
        device::transform(src1, src2, dst, Mul_16sc4_32f(), WithOutMask(), stream);
    }

    template <typename T, typename S, typename D>
    void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream)
    void mulMatImpl(const GpuMat& src1, const GpuMat& src2, const GpuMat& dst, double scale, Stream& stream)
    {
        if (scale == 1)
        {
            Mul<T, D> op;
            device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
            MulOp<T, D> op;
            gridTransformBinary_< TransformPolicy<S> >(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), op, stream);
        }
        else
        {
            MulScale<T, S, D> op(static_cast<S>(scale));
            device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
            MulScaleOp<T, S, D> op;
            op.scale = static_cast<S>(scale);
            gridTransformBinary_< TransformPolicy<S> >(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), op, stream);
        }
    }

    template void mulMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<uchar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<uchar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<uchar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);

    template void mulMat<schar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<schar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<schar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<schar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<schar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<schar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<schar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);

    //template void mulMat<ushort, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void mulMat<ushort, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<ushort, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<ushort, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<ushort, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<ushort, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<ushort, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);

    //template void mulMat<short, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void mulMat<short, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<short, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<short, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<short, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<short, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<short, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);

    //template void mulMat<int, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void mulMat<int, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void mulMat<int, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void mulMat<int, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<int, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<int, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<int, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);

    //template void mulMat<float, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void mulMat<float, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void mulMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void mulMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void mulMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);

    //template void mulMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void mulMat<double, double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void mulMat<double, double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void mulMat<double, double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void mulMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void mulMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
}

#endif // CUDA_DISABLER
void mulMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& stream, int)
{
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, const GpuMat& dst, double scale, Stream& stream);
    static const func_t funcs[7][7] =
    {
        {
            mulMatImpl<uchar, float, uchar>,
            mulMatImpl<uchar, float, schar>,
            mulMatImpl<uchar, float, ushort>,
            mulMatImpl<uchar, float, short>,
            mulMatImpl<uchar, float, int>,
            mulMatImpl<uchar, float, float>,
            mulMatImpl<uchar, double, double>
        },
        {
            mulMatImpl<schar, float, uchar>,
            mulMatImpl<schar, float, schar>,
            mulMatImpl<schar, float, ushort>,
            mulMatImpl<schar, float, short>,
            mulMatImpl<schar, float, int>,
            mulMatImpl<schar, float, float>,
            mulMatImpl<schar, double, double>
        },
        {
            0 /*mulMatImpl<ushort, float, uchar>*/,
            0 /*mulMatImpl<ushort, float, schar>*/,
            mulMatImpl<ushort, float, ushort>,
            mulMatImpl<ushort, float, short>,
            mulMatImpl<ushort, float, int>,
            mulMatImpl<ushort, float, float>,
            mulMatImpl<ushort, double, double>
        },
        {
            0 /*mulMatImpl<short, float, uchar>*/,
            0 /*mulMatImpl<short, float, schar>*/,
            mulMatImpl<short, float, ushort>,
            mulMatImpl<short, float, short>,
            mulMatImpl<short, float, int>,
            mulMatImpl<short, float, float>,
            mulMatImpl<short, double, double>
        },
        {
            0 /*mulMatImpl<int, float, uchar>*/,
            0 /*mulMatImpl<int, float, schar>*/,
            0 /*mulMatImpl<int, float, ushort>*/,
            0 /*mulMatImpl<int, float, short>*/,
            mulMatImpl<int, float, int>,
            mulMatImpl<int, float, float>,
            mulMatImpl<int, double, double>
        },
        {
            0 /*mulMatImpl<float, float, uchar>*/,
            0 /*mulMatImpl<float, float, schar>*/,
            0 /*mulMatImpl<float, float, ushort>*/,
            0 /*mulMatImpl<float, float, short>*/,
            0 /*mulMatImpl<float, float, int>*/,
            mulMatImpl<float, float, float>,
            mulMatImpl<float, double, double>
        },
        {
            0 /*mulMatImpl<double, double, uchar>*/,
            0 /*mulMatImpl<double, double, schar>*/,
            0 /*mulMatImpl<double, double, ushort>*/,
            0 /*mulMatImpl<double, double, short>*/,
            0 /*mulMatImpl<double, double, int>*/,
            0 /*mulMatImpl<double, double, float>*/,
            mulMatImpl<double, double, double>
        }
    };

    const int sdepth = src1.depth();
    const int ddepth = dst.depth();

    CV_DbgAssert( sdepth <= CV_64F && ddepth <= CV_64F );

    GpuMat src1_ = src1.reshape(1);
    GpuMat src2_ = src2.reshape(1);
    GpuMat dst_ = dst.reshape(1);

    const func_t func = funcs[sdepth][ddepth];

    if (!func)
        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");

    func(src1_, src2_, dst_, scale, stream);
}
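funcs[sdepth][ddepth] above pairs each source depth with a destination depth and fixes the intermediate type S (float, or double once doubles are involved); null slots reject narrowing combinations at runtime. A hedged illustration of one lookup (hypothetical sz; it is assumed that cv::cuda::multiply forwards to this mulMat):

    cv::cuda::GpuMat a(sz, CV_8UC1), b(sz, CV_8UC1), prod;
    cv::cuda::multiply(a, b, prod, 1.0, CV_16U);
    // would land on funcs[CV_8U][CV_16U] == mulMatImpl<uchar, float, ushort>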
namespace
{
    template <typename T>
    struct MulOpSpecial : binary_function<T, float, T>
    {
        __device__ __forceinline__ T operator ()(const T& a, float b) const
        {
            typedef typename VecTraits<T>::elem_type elem_type;

            T res;

            res.x = saturate_cast<elem_type>(a.x * b);
            res.y = saturate_cast<elem_type>(a.y * b);
            res.z = saturate_cast<elem_type>(a.z * b);
            res.w = saturate_cast<elem_type>(a.w * b);

            return res;
        }
    };
}

void mulMat_8uc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
    gridTransformBinary(globPtr<uchar4>(src1), globPtr<float>(src2), globPtr<uchar4>(dst), MulOpSpecial<uchar4>(), stream);
}

void mulMat_16sc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
    gridTransformBinary(globPtr<short4>(src1), globPtr<float>(src2), globPtr<short4>(dst), MulOpSpecial<short4>(), stream);
}

#endif
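mulMat_8uc4_32f and mulMat_16sc4_32f above cover the one asymmetric pairing the generic table cannot express: a 4-channel image scaled per pixel by a single-channel float map. A hedged usage sketch (hypothetical sz; the dispatch from cv::cuda::multiply to these kernels is assumed):

    cv::cuda::GpuMat rgba(sz, CV_8UC4), weights(sz, CV_32FC1), out;
    cv::cuda::multiply(rgba, weights, out);   // each uchar4 pixel scaled by its float weight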
@ -40,105 +40,143 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV

#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"

using namespace cv::cuda;
using namespace cv::cuda::device;
#else

namespace arithm
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

void mulScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat& mask, double scale, Stream& stream, int);

namespace
{
    template <typename T, typename S, typename D> struct MulScalar : unary_function<T, D>
    template <typename SrcType, typename ScalarType, typename DstType> struct MulScalarOp : unary_function<SrcType, DstType>
    {
        S val;
        ScalarType val;

        __host__ explicit MulScalar(S val_) : val(val_) {}

        __device__ __forceinline__ D operator ()(T a) const
        __device__ __forceinline__ DstType operator ()(SrcType a) const
        {
            return saturate_cast<D>(a * val);
            return saturate_cast<DstType>(saturate_cast<ScalarType>(a) * val);
        }
    };
}

namespace cv { namespace cuda { namespace device
{
    template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::MulScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
    template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
    {
    };
}}}

namespace arithm
{
    template <typename T, typename S, typename D>
    void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
    template <> struct TransformPolicy<double> : DefaultTransformPolicy
    {
        MulScalar<T, S, D> op(static_cast<S>(val));
        device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
        enum {
            shift = 1
        };
    };

    template <typename SrcType, typename ScalarDepth, typename DstType>
    void mulScalarImpl(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream)
    {
        typedef typename MakeVec<ScalarDepth, VecTraits<SrcType>::cn>::type ScalarType;

        cv::Scalar_<ScalarDepth> value_ = value;

        MulScalarOp<SrcType, ScalarType, DstType> op;
        op.val = VecTraits<ScalarType>::make(value_.val);

        gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, stream);
    }

    template void mulScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);

    template void mulScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);

    //template void mulScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void mulScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);

    //template void mulScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void mulScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);

    //template void mulScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void mulScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void mulScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void mulScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);

    //template void mulScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void mulScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void mulScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void mulScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void mulScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);

    //template void mulScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void mulScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void mulScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void mulScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void mulScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void mulScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
}

#endif // CUDA_DISABLER
|
||||
void mulScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat&, double scale, Stream& stream, int)
|
||||
{
|
||||
typedef void (*func_t)(const GpuMat& src, cv::Scalar val, GpuMat& dst, Stream& stream);
|
||||
static const func_t funcs[7][7][4] =
|
||||
{
|
||||
{
|
||||
{mulScalarImpl<uchar, float, uchar>, mulScalarImpl<uchar2, float, uchar2>, mulScalarImpl<uchar3, float, uchar3>, mulScalarImpl<uchar4, float, uchar4>},
|
||||
{mulScalarImpl<uchar, float, schar>, mulScalarImpl<uchar2, float, char2>, mulScalarImpl<uchar3, float, char3>, mulScalarImpl<uchar4, float, char4>},
|
||||
{mulScalarImpl<uchar, float, ushort>, mulScalarImpl<uchar2, float, ushort2>, mulScalarImpl<uchar3, float, ushort3>, mulScalarImpl<uchar4, float, ushort4>},
|
||||
{mulScalarImpl<uchar, float, short>, mulScalarImpl<uchar2, float, short2>, mulScalarImpl<uchar3, float, short3>, mulScalarImpl<uchar4, float, short4>},
|
||||
{mulScalarImpl<uchar, float, int>, mulScalarImpl<uchar2, float, int2>, mulScalarImpl<uchar3, float, int3>, mulScalarImpl<uchar4, float, int4>},
|
||||
{mulScalarImpl<uchar, float, float>, mulScalarImpl<uchar2, float, float2>, mulScalarImpl<uchar3, float, float3>, mulScalarImpl<uchar4, float, float4>},
|
||||
{mulScalarImpl<uchar, double, double>, mulScalarImpl<uchar2, double, double2>, mulScalarImpl<uchar3, double, double3>, mulScalarImpl<uchar4, double, double4>}
|
||||
},
|
||||
{
|
||||
{mulScalarImpl<schar, float, uchar>, mulScalarImpl<char2, float, uchar2>, mulScalarImpl<char3, float, uchar3>, mulScalarImpl<char4, float, uchar4>},
|
||||
{mulScalarImpl<schar, float, schar>, mulScalarImpl<char2, float, char2>, mulScalarImpl<char3, float, char3>, mulScalarImpl<char4, float, char4>},
|
||||
{mulScalarImpl<schar, float, ushort>, mulScalarImpl<char2, float, ushort2>, mulScalarImpl<char3, float, ushort3>, mulScalarImpl<char4, float, ushort4>},
|
||||
{mulScalarImpl<schar, float, short>, mulScalarImpl<char2, float, short2>, mulScalarImpl<char3, float, short3>, mulScalarImpl<char4, float, short4>},
|
||||
{mulScalarImpl<schar, float, int>, mulScalarImpl<char2, float, int2>, mulScalarImpl<char3, float, int3>, mulScalarImpl<char4, float, int4>},
|
||||
{mulScalarImpl<schar, float, float>, mulScalarImpl<char2, float, float2>, mulScalarImpl<char3, float, float3>, mulScalarImpl<char4, float, float4>},
|
||||
{mulScalarImpl<schar, double, double>, mulScalarImpl<char2, double, double2>, mulScalarImpl<char3, double, double3>, mulScalarImpl<char4, double, double4>}
|
||||
},
|
||||
{
|
||||
{0 /*mulScalarImpl<ushort, float, uchar>*/, 0 /*mulScalarImpl<ushort2, float, uchar2>*/, 0 /*mulScalarImpl<ushort3, float, uchar3>*/, 0 /*mulScalarImpl<ushort4, float, uchar4>*/},
|
||||
{0 /*mulScalarImpl<ushort, float, schar>*/, 0 /*mulScalarImpl<ushort2, float, char2>*/, 0 /*mulScalarImpl<ushort3, float, char3>*/, 0 /*mulScalarImpl<ushort4, float, char4>*/},
|
||||
{mulScalarImpl<ushort, float, ushort>, mulScalarImpl<ushort2, float, ushort2>, mulScalarImpl<ushort3, float, ushort3>, mulScalarImpl<ushort4, float, ushort4>},
|
||||
{mulScalarImpl<ushort, float, short>, mulScalarImpl<ushort2, float, short2>, mulScalarImpl<ushort3, float, short3>, mulScalarImpl<ushort4, float, short4>},
|
||||
{mulScalarImpl<ushort, float, int>, mulScalarImpl<ushort2, float, int2>, mulScalarImpl<ushort3, float, int3>, mulScalarImpl<ushort4, float, int4>},
|
||||
{mulScalarImpl<ushort, float, float>, mulScalarImpl<ushort2, float, float2>, mulScalarImpl<ushort3, float, float3>, mulScalarImpl<ushort4, float, float4>},
|
||||
{mulScalarImpl<ushort, double, double>, mulScalarImpl<ushort2, double, double2>, mulScalarImpl<ushort3, double, double3>, mulScalarImpl<ushort4, double, double4>}
|
||||
},
|
||||
{
|
||||
{0 /*mulScalarImpl<short, float, uchar>*/, 0 /*mulScalarImpl<short2, float, uchar2>*/, 0 /*mulScalarImpl<short3, float, uchar3>*/, 0 /*mulScalarImpl<short4, float, uchar4>*/},
|
||||
{0 /*mulScalarImpl<short, float, schar>*/, 0 /*mulScalarImpl<short2, float, char2>*/, 0 /*mulScalarImpl<short3, float, char3>*/, 0 /*mulScalarImpl<short4, float, char4>*/},
|
||||
{mulScalarImpl<short, float, ushort>, mulScalarImpl<short2, float, ushort2>, mulScalarImpl<short3, float, ushort3>, mulScalarImpl<short4, float, ushort4>},
|
||||
{mulScalarImpl<short, float, short>, mulScalarImpl<short2, float, short2>, mulScalarImpl<short3, float, short3>, mulScalarImpl<short4, float, short4>},
|
||||
{mulScalarImpl<short, float, int>, mulScalarImpl<short2, float, int2>, mulScalarImpl<short3, float, int3>, mulScalarImpl<short4, float, int4>},
|
||||
{mulScalarImpl<short, float, float>, mulScalarImpl<short2, float, float2>, mulScalarImpl<short3, float, float3>, mulScalarImpl<short4, float, float4>},
|
||||
{mulScalarImpl<short, double, double>, mulScalarImpl<short2, double, double2>, mulScalarImpl<short3, double, double3>, mulScalarImpl<short4, double, double4>}
|
||||
},
|
||||
{
|
||||
{0 /*mulScalarImpl<int, float, uchar>*/, 0 /*mulScalarImpl<int2, float, uchar2>*/, 0 /*mulScalarImpl<int3, float, uchar3>*/, 0 /*mulScalarImpl<int4, float, uchar4>*/},
|
||||
{0 /*mulScalarImpl<int, float, schar>*/, 0 /*mulScalarImpl<int2, float, char2>*/, 0 /*mulScalarImpl<int3, float, char3>*/, 0 /*mulScalarImpl<int4, float, char4>*/},
|
||||
{0 /*mulScalarImpl<int, float, ushort>*/, 0 /*mulScalarImpl<int2, float, ushort2>*/, 0 /*mulScalarImpl<int3, float, ushort3>*/, 0 /*mulScalarImpl<int4, float, ushort4>*/},
|
||||
{0 /*mulScalarImpl<int, float, short>*/, 0 /*mulScalarImpl<int2, float, short2>*/, 0 /*mulScalarImpl<int3, float, short3>*/, 0 /*mulScalarImpl<int4, float, short4>*/},
|
||||
{mulScalarImpl<int, float, int>, mulScalarImpl<int2, float, int2>, mulScalarImpl<int3, float, int3>, mulScalarImpl<int4, float, int4>},
|
||||
{mulScalarImpl<int, float, float>, mulScalarImpl<int2, float, float2>, mulScalarImpl<int3, float, float3>, mulScalarImpl<int4, float, float4>},
|
||||
{mulScalarImpl<int, double, double>, mulScalarImpl<int2, double, double2>, mulScalarImpl<int3, double, double3>, mulScalarImpl<int4, double, double4>}
|
||||
},
|
||||
{
|
||||
{0 /*mulScalarImpl<float, float, uchar>*/, 0 /*mulScalarImpl<float2, float, uchar2>*/, 0 /*mulScalarImpl<float3, float, uchar3>*/, 0 /*mulScalarImpl<float4, float, uchar4>*/},
|
||||
{0 /*mulScalarImpl<float, float, schar>*/, 0 /*mulScalarImpl<float2, float, char2>*/, 0 /*mulScalarImpl<float3, float, char3>*/, 0 /*mulScalarImpl<float4, float, char4>*/},
|
||||
{0 /*mulScalarImpl<float, float, ushort>*/, 0 /*mulScalarImpl<float2, float, ushort2>*/, 0 /*mulScalarImpl<float3, float, ushort3>*/, 0 /*mulScalarImpl<float4, float, ushort4>*/},
|
||||
{0 /*mulScalarImpl<float, float, short>*/, 0 /*mulScalarImpl<float2, float, short2>*/, 0 /*mulScalarImpl<float3, float, short3>*/, 0 /*mulScalarImpl<float4, float, short4>*/},
|
||||
{0 /*mulScalarImpl<float, float, int>*/, 0 /*mulScalarImpl<float2, float, int2>*/, 0 /*mulScalarImpl<float3, float, int3>*/, 0 /*mulScalarImpl<float4, float, int4>*/},
|
||||
{mulScalarImpl<float, float, float>, mulScalarImpl<float2, float, float2>, mulScalarImpl<float3, float, float3>, mulScalarImpl<float4, float, float4>},
|
||||
{mulScalarImpl<float, double, double>, mulScalarImpl<float2, double, double2>, mulScalarImpl<float3, double, double3>, mulScalarImpl<float4, double, double4>}
|
||||
},
|
||||
{
|
||||
{0 /*mulScalarImpl<double, double, uchar>*/, 0 /*mulScalarImpl<double2, double, uchar2>*/, 0 /*mulScalarImpl<double3, double, uchar3>*/, 0 /*mulScalarImpl<double4, double, uchar4>*/},
|
||||
{0 /*mulScalarImpl<double, double, schar>*/, 0 /*mulScalarImpl<double2, double, char2>*/, 0 /*mulScalarImpl<double3, double, char3>*/, 0 /*mulScalarImpl<double4, double, char4>*/},
|
||||
{0 /*mulScalarImpl<double, double, ushort>*/, 0 /*mulScalarImpl<double2, double, ushort2>*/, 0 /*mulScalarImpl<double3, double, ushort3>*/, 0 /*mulScalarImpl<double4, double, ushort4>*/},
|
||||
{0 /*mulScalarImpl<double, double, short>*/, 0 /*mulScalarImpl<double2, double, short2>*/, 0 /*mulScalarImpl<double3, double, short3>*/, 0 /*mulScalarImpl<double4, double, short4>*/},
|
||||
{0 /*mulScalarImpl<double, double, int>*/, 0 /*mulScalarImpl<double2, double, int2>*/, 0 /*mulScalarImpl<double3, double, int3>*/, 0 /*mulScalarImpl<double4, double, int4>*/},
|
||||
{0 /*mulScalarImpl<double, double, float>*/, 0 /*mulScalarImpl<double2, double, float2>*/, 0 /*mulScalarImpl<double3, double, float3>*/, 0 /*mulScalarImpl<double4, double, float4>*/},
|
||||
{mulScalarImpl<double, double, double>, mulScalarImpl<double2, double, double2>, mulScalarImpl<double3, double, double3>, mulScalarImpl<double4, double, double4>}
|
||||
}
|
||||
};
|
||||
|
||||
const int sdepth = src.depth();
|
||||
const int ddepth = dst.depth();
|
||||
const int cn = src.channels();
|
||||
|
||||
CV_DbgAssert( sdepth <= CV_64F && ddepth <= CV_64F && cn <= 4 );
|
||||
|
||||
val[0] *= scale;
|
||||
val[1] *= scale;
|
||||
val[2] *= scale;
|
||||
val[3] *= scale;
|
||||
|
||||
const func_t func = funcs[sdepth][ddepth][cn - 1];
|
||||
|
||||
if (!func)
|
||||
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
|
||||
|
||||
func(src, val, dst, stream);
|
||||
}
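
For orientation (not part of this commit): the public entry point that reaches the dispatch table above is cv::cuda::multiply with a cv::Scalar operand; mulScalar folds the scale factor into val before selecting funcs[sdepth][ddepth][cn - 1]. A minimal host-side sketch, with a hypothetical helper name:

#include "opencv2/cudaarithm.hpp"

// Hypothetical helper: scale every pixel of an image by 1.5 on a stream.
// Passing a Scalar as the second operand routes into the mulScalar path above.
void scaleBy15(cv::cuda::GpuMat& img, cv::cuda::Stream& stream)
{
    cv::cuda::multiply(img, cv::Scalar::all(1.5), img, /*scale*/ 1.0, /*dtype*/ -1, stream);
}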

#endif

@ -40,132 +40,126 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "cvconfig.h"
#ifndef HAVE_OPENCV_CUDEV

#ifdef HAVE_CUFFT
#error "opencv_cudev is required"

#include <cufft.h>
#else

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"

namespace cv { namespace cuda { namespace device
using namespace cv::cudev;

//////////////////////////////////////////////////////////////////////////////
// mulSpectrums

namespace
{
//////////////////////////////////////////////////////////////////////////
// mulSpectrums

__global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
__device__ __forceinline__ float real(const float2& val)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
return val.x;
}

if (x < c.cols && y < c.rows)
__device__ __forceinline__ float imag(const float2& val)
{
return val.y;
}

__device__ __forceinline__ float2 cmul(const float2& a, const float2& b)
{
return make_float2((real(a) * real(b)) - (imag(a) * imag(b)),
(real(a) * imag(b)) + (imag(a) * real(b)));
}

__device__ __forceinline__ float2 conj(const float2& a)
{
return make_float2(real(a), -imag(a));
}
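
For reference, cmul and conj above are the textbook complex operations: for a = a_x + i a_y and b = b_x + i b_y,

\[
ab = (a_x b_x - a_y b_y) + i\,(a_x b_y + a_y b_x), \qquad \overline{b} = b_x - i\,b_y.
\]

The *_conj functors below therefore compute a * conj(b), the correlation (rather than convolution) pairing of two spectra.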

struct comlex_mul : binary_function<float2, float2, float2>
{
__device__ __forceinline__ float2 operator ()(const float2& a, const float2& b) const
{
c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
return cmul(a, b);
}
}
};

void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
struct comlex_mul_conj : binary_function<float2, float2, float2>
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));

mulSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, c);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaDeviceSynchronize() );
}

//////////////////////////////////////////////////////////////////////////
// mulSpectrums_CONJ

__global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;

if (x < c.cols && y < c.rows)
__device__ __forceinline__ float2 operator ()(const float2& a, const float2& b) const
{
c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
return cmul(a, conj(b));
}
}
};

void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
struct comlex_mul_scale : binary_function<float2, float2, float2>
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
float scale;

mulSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, c);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaDeviceSynchronize() );
}

//////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums

__global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;

if (x < c.cols && y < c.rows)
__device__ __forceinline__ float2 operator ()(const float2& a, const float2& b) const
{
cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
return scale * cmul(a, b);
}
}
};

void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
struct comlex_mul_conj_scale : binary_function<float2, float2, float2>
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
float scale;

mulAndScaleSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, scale, c);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
    cudaSafeCall( cudaDeviceSynchronize() );
}

//////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums_CONJ

__global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;

if (x < c.cols && y < c.rows)
__device__ __forceinline__ float2 operator ()(const float2& a, const float2& b) const
{
cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
return scale * cmul(a, conj(b));
}
}
};
}

void cv::cuda::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, bool conjB, Stream& stream)
{
(void) flags;

void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();

CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2 );
CV_Assert( src1.size() == src2.size() );

_dst.create(src1.size(), CV_32FC2);
GpuMat dst = _dst.getGpuMat();

if (conjB)
    gridTransformBinary(globPtr<float2>(src1), globPtr<float2>(src2), globPtr<float2>(dst), comlex_mul_conj(), stream);
else
    gridTransformBinary(globPtr<float2>(src1), globPtr<float2>(src2), globPtr<float2>(dst), comlex_mul(), stream);
}
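
An illustrative call sequence (not part of this commit) for frequency-domain cross-correlation through the rewritten entry point; with conjB = true the second spectrum is conjugated, as in the comlex_mul_conj functor. Variable names are placeholders:

cv::cuda::GpuMat a, b;                  // CV_32FC1 inputs of equal size
cv::cuda::GpuMat specA, specB, specC, corr;

cv::cuda::dft(a, specA, a.size());      // CV_32FC2 spectra
cv::cuda::dft(b, specB, b.size());
cv::cuda::mulSpectrums(specA, specB, specC, 0, /*conjB=*/true);
cv::cuda::dft(specC, corr, a.size(), cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT);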

void cv::cuda::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, float scale, bool conjB, Stream& stream)
{
(void) flags;

GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();

CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2);
CV_Assert( src1.size() == src2.size() );

_dst.create(src1.size(), CV_32FC2);
GpuMat dst = _dst.getGpuMat();

if (conjB)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));

mulAndScaleSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, scale, c);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaDeviceSynchronize() );
comlex_mul_conj_scale op;
op.scale = scale;
gridTransformBinary(globPtr<float2>(src1), globPtr<float2>(src2), globPtr<float2>(dst), op, stream);
}
}}} // namespace cv { namespace cuda { namespace cudev
else
{
comlex_mul_scale op;
op.scale = scale;
gridTransformBinary(globPtr<float2>(src1), globPtr<float2>(src2), globPtr<float2>(dst), op, stream);
}
}

#endif // HAVE_CUFFT

#endif /* CUDA_DISABLER */
#endif

modules/cudaarithm/src/cuda/norm.cu (new file, +119 lines)
@ -0,0 +1,119 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "opencv2/opencv_modules.hpp"

#ifndef HAVE_OPENCV_CUDEV

#error "opencv_cudev is required"

#else

#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

namespace
{
double normDiffInf(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _buf)
{
const GpuMat_<uchar>& src1 = (const GpuMat_<uchar>&) _src1;
const GpuMat_<uchar>& src2 = (const GpuMat_<uchar>&) _src2;
GpuMat_<int>& buf = (GpuMat_<int>&) _buf;

gridFindMinMaxVal(abs_(cvt_<int>(src1) - cvt_<int>(src2)), buf);

int data[2];
buf.download(cv::Mat(1, 2, buf.type(), data));

return data[1];
}

double normDiffL1(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _buf)
{
const GpuMat_<uchar>& src1 = (const GpuMat_<uchar>&) _src1;
const GpuMat_<uchar>& src2 = (const GpuMat_<uchar>&) _src2;
GpuMat_<int>& buf = (GpuMat_<int>&) _buf;

gridCalcSum(abs_(cvt_<int>(src1) - cvt_<int>(src2)), buf);

int data;
buf.download(cv::Mat(1, 1, buf.type(), &data));

return data;
}

double normDiffL2(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _buf)
{
const GpuMat_<uchar>& src1 = (const GpuMat_<uchar>&) _src1;
const GpuMat_<uchar>& src2 = (const GpuMat_<uchar>&) _src2;
GpuMat_<double>& buf = (GpuMat_<double>&) _buf;

gridCalcSum(sqr_(cvt_<double>(src1) - cvt_<double>(src2)), buf);

double data;
buf.download(cv::Mat(1, 1, buf.type(), &data));

return std::sqrt(data);
}
}
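
For reference, the three helpers compute the standard difference norms for 8-bit inputs a and b (the widened int/double work types avoid overflow):

\[
\|a-b\|_\infty = \max_i |a_i - b_i|, \qquad
\|a-b\|_1 = \sum_i |a_i - b_i|, \qquad
\|a-b\|_2 = \sqrt{\textstyle\sum_i (a_i - b_i)^2}.
\]

gridFindMinMaxVal writes a {min, max} pair into buf, which is why normDiffInf downloads two values and returns data[1].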

double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normType)
{
typedef double (*func_t)(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _buf);
static const func_t funcs[] =
{
0, normDiffInf, normDiffL1, 0, normDiffL2
};

GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();

CV_Assert( src1.type() == CV_8UC1 );
CV_Assert( src1.size() == src2.size() && src1.type() == src2.type() );
CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 );

return funcs[normType](src1, src2, buf);
}
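
A minimal usage sketch (illustrative, not from this patch); buf is caller-provided scratch that the reduction can reuse across calls:

cv::cuda::GpuMat a, b;   // CV_8UC1, same size
cv::cuda::GpuMat buf;    // reusable scratch buffer
double dist = cv::cuda::norm(a, b, buf, cv::NORM_L2);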

#endif

@ -40,178 +40,172 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#ifndef HAVE_OPENCV_CUDEV

namespace cv { namespace cuda { namespace device
#error "opencv_cudev is required"

#else

#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

void cv::cuda::magnitude(InputArray _x, InputArray _y, OutputArray _dst, Stream& stream)
{
namespace mathfunc
GpuMat x = _x.getGpuMat();
GpuMat y = _y.getGpuMat();

CV_DbgAssert( x.depth() == CV_32F );
CV_DbgAssert( y.type() == x.type() && y.size() == x.size() );

_dst.create(x.size(), CV_32FC1);
GpuMat dst = _dst.getGpuMat();

GpuMat_<float> xc(x.reshape(1));
GpuMat_<float> yc(y.reshape(1));
GpuMat_<float> magc(dst.reshape(1));

gridTransformBinary(xc, yc, magc, magnitude_func<float>(), stream);
}

void cv::cuda::magnitudeSqr(InputArray _x, InputArray _y, OutputArray _dst, Stream& stream)
{
GpuMat x = _x.getGpuMat();
GpuMat y = _y.getGpuMat();

CV_DbgAssert( x.depth() == CV_32F );
CV_DbgAssert( y.type() == x.type() && y.size() == x.size() );

_dst.create(x.size(), CV_32FC1);
GpuMat dst = _dst.getGpuMat();

GpuMat_<float> xc(x.reshape(1));
GpuMat_<float> yc(y.reshape(1));
GpuMat_<float> magc(dst.reshape(1));

gridTransformBinary(xc, yc, magc, magnitude_sqr_func<float>(), stream);
}

void cv::cuda::phase(InputArray _x, InputArray _y, OutputArray _dst, bool angleInDegrees, Stream& stream)
{
GpuMat x = _x.getGpuMat();
GpuMat y = _y.getGpuMat();

CV_DbgAssert( x.depth() == CV_32F );
CV_DbgAssert( y.type() == x.type() && y.size() == x.size() );

_dst.create(x.size(), CV_32FC1);
GpuMat dst = _dst.getGpuMat();

GpuMat_<float> xc(x.reshape(1));
GpuMat_<float> yc(y.reshape(1));
GpuMat_<float> anglec(dst.reshape(1));

if (angleInDegrees)
    gridTransformBinary(xc, yc, anglec, direction_func<float, true>(), stream);
else
    gridTransformBinary(xc, yc, anglec, direction_func<float, false>(), stream);
}

void cv::cuda::cartToPolar(InputArray _x, InputArray _y, OutputArray _mag, OutputArray _angle, bool angleInDegrees, Stream& stream)
{
GpuMat x = _x.getGpuMat();
GpuMat y = _y.getGpuMat();

CV_DbgAssert( x.depth() == CV_32F );
CV_DbgAssert( y.type() == x.type() && y.size() == x.size() );

_mag.create(x.size(), CV_32FC1);
GpuMat mag = _mag.getGpuMat();

_angle.create(x.size(), CV_32FC1);
GpuMat angle = _angle.getGpuMat();

GpuMat_<float> xc(x.reshape(1));
GpuMat_<float> yc(y.reshape(1));
GpuMat_<float> magc(mag.reshape(1));
GpuMat_<float> anglec(angle.reshape(1));

if (angleInDegrees)
{
//////////////////////////////////////////////////////////////////////////////////////
// Cart <-> Polar
gridTransformTuple(zipPtr(xc, yc),
tie(magc, anglec),
make_tuple(
binaryTupleAdapter<0, 1>(magnitude_func<float>()),
binaryTupleAdapter<0, 1>(direction_func<float, true>())),
stream);
}
else
{
gridTransformTuple(zipPtr(xc, yc),
tie(magc, anglec),
make_tuple(
binaryTupleAdapter<0, 1>(magnitude_func<float>()),
binaryTupleAdapter<0, 1>(direction_func<float, false>())),
stream);
}
}
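
An illustrative round trip through the rewritten functions (not part of this commit); cartToPolar fuses the magnitude and angle computations into one pass via gridTransformTuple:

cv::cuda::GpuMat x, y;               // CV_32FC1 coordinates
cv::cuda::GpuMat mag, ang, x2, y2;

cv::cuda::cartToPolar(x, y, mag, ang, /*angleInDegrees=*/true);
cv::cuda::polarToCart(mag, ang, x2, y2, /*angleInDegrees=*/true);   // x2 ~ x, y2 ~ y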

struct Nothing
{
static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)
{
}
};
struct Magnitude
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);
}
};
struct MagnitudeSqr
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
dst[y * dst_step + x] = x_data * x_data + y_data * y_data;
}
};
struct Atan2
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
{
float angle = ::atan2f(y_data, x_data);
angle += (angle < 0) * 2.0f * CV_PI_F;
dst[y * dst_step + x] = scale * angle;
}
};
template <typename Mag, typename Angle>
__global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step,
float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
namespace
{
template <bool useMag>
__global__ void polarToCartImpl(const GlobPtr<float> mag, const GlobPtr<float> angle, GlobPtr<float> xmat, GlobPtr<float> ymat, const float scale, const int rows, const int cols)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;

if (x < width && y < height)
{
float x_data = xptr[y * x_step + x];
float y_data = yptr[y * y_step + x];
if (x >= cols || y >= rows)
    return;

Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);
Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);
}
}
const float mag_val = useMag ? mag(y, x) : 1.0f;
const float angle_val = angle(y, x);

struct NonEmptyMag
{
static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)
{
return mag[y * mag_step + x];
}
};
struct EmptyMag
{
static __device__ __forceinline__ float get(const float*, size_t, int, int)
{
return 1.0f;
}
};
template <typename Mag>
__global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,
float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
float sin_a, cos_a;
::sincosf(scale * angle_val, &sin_a, &cos_a);

if (x < width && y < height)
{
float mag_data = Mag::get(mag, mag_step, x, y);
float angle_data = angle[y * angle_step + x];
float sin_a, cos_a;
xmat(y, x) = mag_val * cos_a;
ymat(y, x) = mag_val * sin_a;
}
}

::sincosf(scale * angle_data, &sin_a, &cos_a);
void cv::cuda::polarToCart(InputArray _mag, InputArray _angle, OutputArray _x, OutputArray _y, bool angleInDegrees, Stream& _stream)
{
GpuMat mag = _mag.getGpuMat();
GpuMat angle = _angle.getGpuMat();

xptr[y * x_step + x] = mag_data * cos_a;
yptr[y * y_step + x] = mag_data * sin_a;
}
}
CV_DbgAssert( angle.depth() == CV_32F );
CV_DbgAssert( mag.empty() || (mag.type() == angle.type() && mag.size() == angle.size()) );

template <typename Mag, typename Angle>
void cartToPolar_caller(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
_x.create(angle.size(), CV_32FC1);
GpuMat x = _x.getGpuMat();

grid.x = divUp(x.cols, threads.x);
grid.y = divUp(x.rows, threads.y);
_y.create(angle.size(), CV_32FC1);
GpuMat y = _y.getGpuMat();

const float scale = angleInDegrees ? (180.0f / CV_PI_F) : 1.f;
GpuMat_<float> xc(x.reshape(1));
GpuMat_<float> yc(y.reshape(1));
GpuMat_<float> magc(mag.reshape(1));
GpuMat_<float> anglec(angle.reshape(1));

cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
cudaSafeCall( cudaGetLastError() );
const dim3 block(32, 8);
const dim3 grid(divUp(anglec.cols, block.x), divUp(anglec.rows, block.y));

if (stream == 0)
    cudaSafeCall( cudaDeviceSynchronize() );
}
const float scale = angleInDegrees ? (CV_PI_F / 180.0f) : 1.0f;

void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2][2][2] =
{
{
{
cartToPolar_caller<Magnitude, Atan2>,
cartToPolar_caller<Magnitude, Nothing>
},
{
cartToPolar_caller<MagnitudeSqr, Atan2>,
cartToPolar_caller<MagnitudeSqr, Nothing>,
}
},
{
{
cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<Nothing, Nothing>
},
{
cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<Nothing, Nothing>,
}
}
};
cudaStream_t stream = StreamAccessor::getStream(_stream);

callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);
}
if (magc.empty())
    polarToCartImpl<false><<<grid, block, 0, stream>>>(shrinkPtr(magc), shrinkPtr(anglec), shrinkPtr(xc), shrinkPtr(yc), scale, anglec.rows, anglec.cols);
else
    polarToCartImpl<true><<<grid, block, 0, stream>>>(shrinkPtr(magc), shrinkPtr(anglec), shrinkPtr(xc), shrinkPtr(yc), scale, anglec.rows, anglec.cols);

template <typename Mag>
void polarToCart_caller(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
CV_CUDEV_SAFE_CALL( cudaGetLastError() );

grid.x = divUp(mag.cols, threads.x);
grid.y = divUp(mag.rows, threads.y);
if (stream == 0)
    CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}

const float scale = angleInDegrees ? (CV_PI_F / 180.0f) : 1.0f;

polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaDeviceSynchronize() );
}

void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2] =
{
polarToCart_caller<NonEmptyMag>,
polarToCart_caller<EmptyMag>
};

callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
}
} // namespace mathfunc
}}} // namespace cv { namespace cuda { namespace cudev

#endif /* CUDA_DISABLER */
#endif
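
For reference, polarToCartImpl above evaluates, per pixel and with a single ::sincosf call,

\[
x = m \cos(s\,\theta), \qquad y = m \sin(s\,\theta),
\]

where s = CV_PI_F / 180 when angleInDegrees and s = 1 otherwise, and m = 1 when the magnitude matrix is empty (the useMag = false instantiation).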

@ -40,301 +40,258 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/limits.hpp"
#ifndef HAVE_OPENCV_CUDEV

#include "unroll_detail.hpp"
#error "opencv_cudev is required"

using namespace cv::cuda;
using namespace cv::cuda::device;
#else

namespace reduce
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

namespace
{
struct Sum
template <typename T, typename S, typename D>
void reduceToRowImpl(const GpuMat& _src, GpuMat& _dst, int reduceOp, Stream& stream)
{
template <typename T>
__device__ __forceinline__ T startValue() const
const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
GpuMat_<D>& dst = (GpuMat_<D>&) _dst;

switch (reduceOp)
{
return VecTraits<T>::all(0);
}
case cv::REDUCE_SUM:
    gridReduceToRow< Sum<S> >(src, dst, stream);
    break;

template <typename T>
__device__ __forceinline__ T operator ()(T a, T b) const
{
return a + b;
}
case cv::REDUCE_AVG:
    gridReduceToRow< Avg<S> >(src, dst, stream);
    break;

template <typename T>
__device__ __forceinline__ T result(T r, int) const
{
return r;
}
case cv::REDUCE_MIN:
    gridReduceToRow< Min<S> >(src, dst, stream);
    break;

__host__ __device__ __forceinline__ Sum() {}
__host__ __device__ __forceinline__ Sum(const Sum&) {}
};

template <typename T> struct OutputType
{
typedef float type;
};
template <> struct OutputType<double>
{
typedef double type;
};

struct Avg
{
template <typename T>
__device__ __forceinline__ T startValue() const
{
return VecTraits<T>::all(0);
}

template <typename T>
__device__ __forceinline__ T operator ()(T a, T b) const
{
return a + b;
}

template <typename T>
__device__ __forceinline__ typename TypeVec<typename OutputType<typename VecTraits<T>::elem_type>::type, VecTraits<T>::cn>::vec_type result(T r, float sz) const
{
return r / sz;
}

__host__ __device__ __forceinline__ Avg() {}
__host__ __device__ __forceinline__ Avg(const Avg&) {}
};

struct Min
{
template <typename T>
__device__ __forceinline__ T startValue() const
{
return VecTraits<T>::all(numeric_limits<typename VecTraits<T>::elem_type>::max());
}

template <typename T>
__device__ __forceinline__ T operator ()(T a, T b) const
{
minimum<T> minOp;
return minOp(a, b);
}

template <typename T>
__device__ __forceinline__ T result(T r, int) const
{
return r;
}

__host__ __device__ __forceinline__ Min() {}
__host__ __device__ __forceinline__ Min(const Min&) {}
};

struct Max
{
template <typename T>
__device__ __forceinline__ T startValue() const
{
return VecTraits<T>::all(-numeric_limits<typename VecTraits<T>::elem_type>::max());
}

template <typename T>
__device__ __forceinline__ T operator ()(T a, T b) const
{
maximum<T> maxOp;
return maxOp(a, b);
}

template <typename T>
__device__ __forceinline__ T result(T r, int) const
{
return r;
}

__host__ __device__ __forceinline__ Max() {}
__host__ __device__ __forceinline__ Max(const Max&) {}
};
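
The startValue / operator() / result triple above is the reduction contract shared by the legacy rowsKernel below and the new gridReduceToRow / gridReduceToColumn calls. A hedged CPU sketch of the same contract (reduceRowCpu is hypothetical, assuming a float accumulator):

// Fold one row with any of the functors above; Avg::result divides by the
// element count, while Sum/Min/Max pass the accumulator through unchanged.
template <class Op>
float reduceRowCpu(const float* row, int n, Op op)
{
    float acc = op.template startValue<float>();
    for (int i = 0; i < n; ++i)
        acc = op(acc, row[i]);
    return op.result(acc, n);
}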

///////////////////////////////////////////////////////////

template <typename T, typename S, typename D, class Op>
__global__ void rowsKernel(const PtrStepSz<T> src, D* dst, const Op op)
{
__shared__ S smem[16 * 16];

const int x = blockIdx.x * 16 + threadIdx.x;

S myVal = op.template startValue<S>();

if (x < src.cols)
{
for (int y = threadIdx.y; y < src.rows; y += 16)
{
S srcVal = src(y, x);
myVal = op(myVal, srcVal);
}
}

smem[threadIdx.x * 16 + threadIdx.y] = myVal;

__syncthreads();

volatile S* srow = smem + threadIdx.y * 16;

myVal = srow[threadIdx.x];
device::reduce<16>(srow, myVal, threadIdx.x, op);

if (threadIdx.x == 0)
    srow[0] = myVal;

__syncthreads();

if (threadIdx.y == 0 && x < src.cols)
    dst[x] = (D) op.result(smem[threadIdx.x * 16], src.rows);
}

template <typename T, typename S, typename D, class Op>
void rowsCaller(PtrStepSz<T> src, D* dst, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(src.cols, block.x));

Op op;
rowsKernel<T, S, D, Op><<<grid, block, 0, stream>>>(src, dst, op);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaDeviceSynchronize() );
case cv::REDUCE_MAX:
    gridReduceToRow< Max<S> >(src, dst, stream);
    break;
};
}

template <typename T, typename S, typename D>
void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream)
void reduceToColumnImpl_(const GpuMat& _src, GpuMat& _dst, int reduceOp, Stream& stream)
{
typedef void (*func_t)(PtrStepSz<T> src, D* dst, cudaStream_t stream);
static const func_t funcs[] =
const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
GpuMat_<D>& dst = (GpuMat_<D>&) _dst;

switch (reduceOp)
{
rowsCaller<T, S, D, Sum>,
rowsCaller<T, S, D, Avg>,
rowsCaller<T, S, D, Max>,
rowsCaller<T, S, D, Min>
case cv::REDUCE_SUM:
    gridReduceToColumn< Sum<S> >(src, dst, stream);
    break;

case cv::REDUCE_AVG:
    gridReduceToColumn< Avg<S> >(src, dst, stream);
    break;

case cv::REDUCE_MIN:
    gridReduceToColumn< Min<S> >(src, dst, stream);
    break;

case cv::REDUCE_MAX:
    gridReduceToColumn< Max<S> >(src, dst, stream);
    break;
};
}

template <typename T, typename S, typename D>
void reduceToColumnImpl(const GpuMat& src, GpuMat& dst, int reduceOp, Stream& stream)
{
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int reduceOp, Stream& stream);
static const func_t funcs[4] =
{
reduceToColumnImpl_<T, S, D>,
reduceToColumnImpl_<typename MakeVec<T, 2>::type, typename MakeVec<S, 2>::type, typename MakeVec<D, 2>::type>,
reduceToColumnImpl_<typename MakeVec<T, 3>::type, typename MakeVec<S, 3>::type, typename MakeVec<D, 3>::type>,
reduceToColumnImpl_<typename MakeVec<T, 4>::type, typename MakeVec<S, 4>::type, typename MakeVec<D, 4>::type>
};

funcs[op]((PtrStepSz<T>) src, (D*) dst, stream);
funcs[src.channels() - 1](src, dst, reduceOp, stream);
}

template void rows<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned char, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned char, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned char, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

template void rows<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

template void rows<short, int, short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

template void rows<int, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<int, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<int, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

template void rows<float, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<float, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

template void rows<double, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

///////////////////////////////////////////////////////////

template <int BLOCK_SIZE, typename T, typename S, typename D, int cn, class Op>
__global__ void colsKernel(const PtrStepSz<typename TypeVec<T, cn>::vec_type> src, typename TypeVec<D, cn>::vec_type* dst, const Op op)
{
typedef typename TypeVec<T, cn>::vec_type src_type;
typedef typename TypeVec<S, cn>::vec_type work_type;
typedef typename TypeVec<D, cn>::vec_type dst_type;

__shared__ S smem[BLOCK_SIZE * cn];

const int y = blockIdx.x;

const src_type* srcRow = src.ptr(y);

work_type myVal = op.template startValue<work_type>();

for (int x = threadIdx.x; x < src.cols; x += BLOCK_SIZE)
    myVal = op(myVal, saturate_cast<work_type>(srcRow[x]));

device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(myVal), threadIdx.x, detail::Unroll<cn>::op(op));

if (threadIdx.x == 0)
    dst[y] = saturate_cast<dst_type>(op.result(myVal, src.cols));
}

template <typename T, typename S, typename D, int cn, class Op> void colsCaller(PtrStepSzb src, void* dst, cudaStream_t stream)
{
const int BLOCK_SIZE = 256;

const dim3 block(BLOCK_SIZE);
const dim3 grid(src.rows);

Op op;
colsKernel<BLOCK_SIZE, T, S, D, cn, Op><<<grid, block, 0, stream>>>((PtrStepSz<typename TypeVec<T, cn>::vec_type>) src, (typename TypeVec<D, cn>::vec_type*) dst, op);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaDeviceSynchronize() );

}

template <typename T, typename S, typename D> void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, void* dst, cudaStream_t stream);
static const func_t funcs[5][4] =
{
{0,0,0,0},
{colsCaller<T, S, D, 1, Sum>, colsCaller<T, S, D, 1, Avg>, colsCaller<T, S, D, 1, Max>, colsCaller<T, S, D, 1, Min>},
{colsCaller<T, S, D, 2, Sum>, colsCaller<T, S, D, 2, Avg>, colsCaller<T, S, D, 2, Max>, colsCaller<T, S, D, 2, Min>},
{colsCaller<T, S, D, 3, Sum>, colsCaller<T, S, D, 3, Avg>, colsCaller<T, S, D, 3, Max>, colsCaller<T, S, D, 3, Min>},
{colsCaller<T, S, D, 4, Sum>, colsCaller<T, S, D, 4, Avg>, colsCaller<T, S, D, 4, Max>, colsCaller<T, S, D, 4, Min>},
};

funcs[cn][op](src, dst, stream);
}

template void cols<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned char, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned char, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned char, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

template void cols<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

template void cols<short, int, short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

template void cols<int, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<int, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<int, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

template void cols<float, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<float, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

template void cols<double, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp, int dtype, Stream& stream)
{
GpuMat src = _src.getGpuMat();

CV_Assert( src.channels() <= 4 );
CV_Assert( dim == 0 || dim == 1 );
CV_Assert( reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_MAX || reduceOp == REDUCE_MIN );

if (dtype < 0)
    dtype = src.depth();

_dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
GpuMat dst = _dst.getGpuMat();

if (dim == 0)
{
typedef void (*func_t)(const GpuMat& _src, GpuMat& _dst, int reduceOp, Stream& stream);
static const func_t funcs[7][7] =
{
{
reduceToRowImpl<uchar, int, uchar>,
0 /*reduceToRowImpl<uchar, int, schar>*/,
0 /*reduceToRowImpl<uchar, int, ushort>*/,
0 /*reduceToRowImpl<uchar, int, short>*/,
reduceToRowImpl<uchar, int, int>,
reduceToRowImpl<uchar, float, float>,
reduceToRowImpl<uchar, double, double>
},
{
0 /*reduceToRowImpl<schar, int, uchar>*/,
0 /*reduceToRowImpl<schar, int, schar>*/,
0 /*reduceToRowImpl<schar, int, ushort>*/,
0 /*reduceToRowImpl<schar, int, short>*/,
0 /*reduceToRowImpl<schar, int, int>*/,
0 /*reduceToRowImpl<schar, float, float>*/,
0 /*reduceToRowImpl<schar, double, double>*/
},
{
0 /*reduceToRowImpl<ushort, int, uchar>*/,
0 /*reduceToRowImpl<ushort, int, schar>*/,
reduceToRowImpl<ushort, int, ushort>,
0 /*reduceToRowImpl<ushort, int, short>*/,
reduceToRowImpl<ushort, int, int>,
reduceToRowImpl<ushort, float, float>,
reduceToRowImpl<ushort, double, double>
},
{
0 /*reduceToRowImpl<short, int, uchar>*/,
0 /*reduceToRowImpl<short, int, schar>*/,
0 /*reduceToRowImpl<short, int, ushort>*/,
reduceToRowImpl<short, int, short>,
reduceToRowImpl<short, int, int>,
reduceToRowImpl<short, float, float>,
reduceToRowImpl<short, double, double>
},
{
0 /*reduceToRowImpl<int, int, uchar>*/,
0 /*reduceToRowImpl<int, int, schar>*/,
0 /*reduceToRowImpl<int, int, ushort>*/,
0 /*reduceToRowImpl<int, int, short>*/,
reduceToRowImpl<int, int, int>,
reduceToRowImpl<int, float, float>,
reduceToRowImpl<int, double, double>
},
{
0 /*reduceToRowImpl<float, float, uchar>*/,
0 /*reduceToRowImpl<float, float, schar>*/,
0 /*reduceToRowImpl<float, float, ushort>*/,
0 /*reduceToRowImpl<float, float, short>*/,
0 /*reduceToRowImpl<float, float, int>*/,
reduceToRowImpl<float, float, float>,
reduceToRowImpl<float, double, double>
},
{
0 /*reduceToRowImpl<double, double, uchar>*/,
0 /*reduceToRowImpl<double, double, schar>*/,
0 /*reduceToRowImpl<double, double, ushort>*/,
0 /*reduceToRowImpl<double, double, short>*/,
0 /*reduceToRowImpl<double, double, int>*/,
0 /*reduceToRowImpl<double, double, float>*/,
reduceToRowImpl<double, double, double>
}
};

const func_t func = funcs[src.depth()][dst.depth()];

if (!func)
    CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");

GpuMat dst_cont = dst.reshape(1);
func(src.reshape(1), dst_cont, reduceOp, stream);
}
else
{
typedef void (*func_t)(const GpuMat& _src, GpuMat& _dst, int reduceOp, Stream& stream);
static const func_t funcs[7][7] =
{
{
reduceToColumnImpl<uchar, int, uchar>,
0 /*reduceToColumnImpl<uchar, int, schar>*/,
0 /*reduceToColumnImpl<uchar, int, ushort>*/,
0 /*reduceToColumnImpl<uchar, int, short>*/,
reduceToColumnImpl<uchar, int, int>,
reduceToColumnImpl<uchar, float, float>,
reduceToColumnImpl<uchar, double, double>
},
{
0 /*reduceToColumnImpl<schar, int, uchar>*/,
0 /*reduceToColumnImpl<schar, int, schar>*/,
0 /*reduceToColumnImpl<schar, int, ushort>*/,
0 /*reduceToColumnImpl<schar, int, short>*/,
0 /*reduceToColumnImpl<schar, int, int>*/,
0 /*reduceToColumnImpl<schar, float, float>*/,
0 /*reduceToColumnImpl<schar, double, double>*/
},
{
0 /*reduceToColumnImpl<ushort, int, uchar>*/,
0 /*reduceToColumnImpl<ushort, int, schar>*/,
reduceToColumnImpl<ushort, int, ushort>,
0 /*reduceToColumnImpl<ushort, int, short>*/,
reduceToColumnImpl<ushort, int, int>,
reduceToColumnImpl<ushort, float, float>,
reduceToColumnImpl<ushort, double, double>
},
{
0 /*reduceToColumnImpl<short, int, uchar>*/,
0 /*reduceToColumnImpl<short, int, schar>*/,
0 /*reduceToColumnImpl<short, int, ushort>*/,
reduceToColumnImpl<short, int, short>,
reduceToColumnImpl<short, int, int>,
reduceToColumnImpl<short, float, float>,
reduceToColumnImpl<short, double, double>
},
{
0 /*reduceToColumnImpl<int, int, uchar>*/,
0 /*reduceToColumnImpl<int, int, schar>*/,
0 /*reduceToColumnImpl<int, int, ushort>*/,
0 /*reduceToColumnImpl<int, int, short>*/,
reduceToColumnImpl<int, int, int>,
reduceToColumnImpl<int, float, float>,
reduceToColumnImpl<int, double, double>
},
{
0 /*reduceToColumnImpl<float, float, uchar>*/,
0 /*reduceToColumnImpl<float, float, schar>*/,
0 /*reduceToColumnImpl<float, float, ushort>*/,
0 /*reduceToColumnImpl<float, float, short>*/,
0 /*reduceToColumnImpl<float, float, int>*/,
reduceToColumnImpl<float, float, float>,
reduceToColumnImpl<float, double, double>
},
{
0 /*reduceToColumnImpl<double, double, uchar>*/,
0 /*reduceToColumnImpl<double, double, schar>*/,
0 /*reduceToColumnImpl<double, double, ushort>*/,
0 /*reduceToColumnImpl<double, double, short>*/,
0 /*reduceToColumnImpl<double, double, int>*/,
0 /*reduceToColumnImpl<double, double, float>*/,
reduceToColumnImpl<double, double, double>
}
};

const func_t func = funcs[src.depth()][dst.depth()];

if (!func)
    CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");

func(src, dst, reduceOp, stream);
}
}
|
||||
|
||||
#endif
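
The funcs[src.depth()][dst.depth()] lookup used in both branches above is the dispatch idiom this PR applies throughout: a statically initialized table of function pointers indexed by CV_* depth codes, with 0 marking unsupported (src, dst) pairs. A minimal standalone sketch of the pattern (all names hypothetical, not from this PR):

#include <cstdio>
#include <stdexcept>

// Hypothetical stand-ins for OpenCV's depth codes and impl functions.
enum Depth { DEPTH_8U = 0, DEPTH_32F = 1 };

static void reduce_8u_to_8u()   { std::puts("reduce<uchar, uchar>"); }
static void reduce_32f_to_32f() { std::puts("reduce<float, float>"); }

typedef void (*func_t)();

void dispatch(Depth sdepth, Depth ddepth)
{
    // Unsupported (src, dst) pairs hold 0, exactly like the tables above.
    static const func_t funcs[2][2] =
    {
        { reduce_8u_to_8u, 0 },
        { 0, reduce_32f_to_32f }
    };

    const func_t func = funcs[sdepth][ddepth];
    if (!func)
        throw std::runtime_error("unsupported combination of formats");
    func();
}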
@ -40,472 +40,209 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#ifndef HAVE_OPENCV_CUDEV

namespace cv { namespace cuda { namespace device
#error "opencv_cudev is required"

#else

#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

////////////////////////////////////////////////////////////////////////
/// merge

namespace
{
namespace split_merge
template <int cn, typename T> struct MergeFunc;

template <typename T> struct MergeFunc<2, T>
{
template <typename T, size_t elem_size = sizeof(T)>
struct TypeTraits
static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
{
typedef T type;
typedef T type2;
typedef T type3;
typedef T type4;
};
gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1])),
globPtr<typename MakeVec<T, 2>::type>(dst),
stream);
}
};

template <typename T>
struct TypeTraits<T, 1>
template <typename T> struct MergeFunc<3, T>
{
static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
{
typedef char type;
typedef char2 type2;
typedef char3 type3;
typedef char4 type4;
};
gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2])),
globPtr<typename MakeVec<T, 3>::type>(dst),
stream);
}
};

template <typename T>
struct TypeTraits<T, 2>
template <typename T> struct MergeFunc<4, T>
{
static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
{
typedef short type;
typedef short2 type2;
typedef short3 type3;
typedef short4 type4;
};
gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2]), globPtr<T>(src[3])),
globPtr<typename MakeVec<T, 4>::type>(dst),
stream);
}
};

template <typename T>
struct TypeTraits<T, 4>
void mergeImpl(const GpuMat* src, size_t n, cv::OutputArray _dst, Stream& stream)
{
CV_DbgAssert( src != 0 );
CV_DbgAssert( n > 0 && n <= 4 );

const int depth = src[0].depth();
const cv::Size size = src[0].size();

#ifdef _DEBUG
for (size_t i = 0; i < n; ++i)
{
typedef int type;
typedef int2 type2;
typedef int3 type3;
typedef int4 type4;
};
CV_Assert( src[i].size() == size );
CV_Assert( src[i].depth() == depth );
CV_Assert( src[i].channels() == 1 );
}
#endif

template <typename T>
struct TypeTraits<T, 8>
if (n == 1)
{
typedef double type;
typedef double2 type2;
//typedef double3 type3;
//typedef double4 type3;
};

typedef void (*MergeFunction)(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream);
typedef void (*SplitFunction)(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream);

//------------------------------------------------------------
// Merge

template <typename T>
__global__ void mergeC2_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
int rows, int cols, uchar* dst, size_t dst_step)
src[0].copyTo(_dst, stream);
}
else
{
typedef typename TypeTraits<T>::type2 dst_type;

const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;

const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);

if (x < cols && y < rows)
typedef void (*func_t)(const GpuMat* src, GpuMat& dst, Stream& stream);
static const func_t funcs[3][5] =
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_y[x] = dst_elem;
}
}

template <typename T>
__global__ void mergeC3_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type3 dst_type;

const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;

const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
const T* src2_y = (const T*)(src2 + y * src2_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);

if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_elem.z = src2_y[x];
dst_y[x] = dst_elem;
}
}

template <>
__global__ void mergeC3_<double>(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;

const double* src0_y = (const double*)(src0 + y * src0_step);
const double* src1_y = (const double*)(src1 + y * src1_step);
const double* src2_y = (const double*)(src2 + y * src2_step);
double* dst_y = (double*)(dst + y * dst_step);

if (x < cols && y < rows)
{
dst_y[3 * x] = src0_y[x];
dst_y[3 * x + 1] = src1_y[x];
dst_y[3 * x + 2] = src2_y[x];
}
}

template <typename T>
__global__ void mergeC4_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
const uchar* src3, size_t src3_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type4 dst_type;

const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;

const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
const T* src2_y = (const T*)(src2 + y * src2_step);
const T* src3_y = (const T*)(src3 + y * src3_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);

if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_elem.z = src2_y[x];
dst_elem.w = src3_y[x];
dst_y[x] = dst_elem;
}
}

template <>
__global__ void mergeC4_<double>(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
const uchar* src3, size_t src3_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;

const double* src0_y = (const double*)(src0 + y * src0_step);
const double* src1_y = (const double*)(src1 + y * src1_step);
const double* src2_y = (const double*)(src2 + y * src2_step);
const double* src3_y = (const double*)(src3 + y * src3_step);
double2* dst_y = (double2*)(dst + y * dst_step);

if (x < cols && y < rows)
{
dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);
dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
}
}

template <typename T>
static void mergeC2_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC2_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}

template <typename T>
static void mergeC3_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC3_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
src[2].data, src[2].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}

template <typename T>
static void mergeC4_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC4_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
src[2].data, src[2].step,
src[3].data, src[3].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}

void merge(const PtrStepSzb* src, PtrStepSzb& dst,
int total_channels, size_t elem_size,
const cudaStream_t& stream)
{
static MergeFunction merge_func_tbl[] =
{
mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,
mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
{MergeFunc<2, uchar>::call, MergeFunc<2, ushort>::call, MergeFunc<2, int>::call, 0, MergeFunc<2, double>::call},
{MergeFunc<3, uchar>::call, MergeFunc<3, ushort>::call, MergeFunc<3, int>::call, 0, MergeFunc<3, double>::call},
{MergeFunc<4, uchar>::call, MergeFunc<4, ushort>::call, MergeFunc<4, int>::call, 0, MergeFunc<4, double>::call}
};

size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
MergeFunction merge_func = merge_func_tbl[merge_func_id];
const int channels = static_cast<int>(n);

if (merge_func == 0)
_dst.create(size, CV_MAKE_TYPE(depth, channels));
GpuMat dst = _dst.getGpuMat();

const func_t func = funcs[channels - 2][CV_ELEM_SIZE(depth) / 2];

if (func == 0)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported channel count or data type");

merge_func(src, dst, stream);
func(src, dst, stream);
}
}
}

void cv::cuda::merge(const GpuMat* src, size_t n, OutputArray dst, Stream& stream)
{
mergeImpl(src, n, dst, stream);
}

void cv::cuda::merge(const std::vector<GpuMat>& src, OutputArray dst, Stream& stream)
{
mergeImpl(&src[0], src.size(), dst, stream);
}
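
The new cv::cuda::merge wrappers above delegate everything to mergeImpl, which allocates the destination and dispatches on funcs[channels - 2][CV_ELEM_SIZE(depth) / 2]: element sizes 1, 2, 4 and 8 map to columns 0, 1, 2 and 4, with column 3 the unused gap holding 0. A minimal host-side usage sketch, assuming three single-channel CV_8UC1 planes of equal size:

#include <vector>
#include "opencv2/cudaarithm.hpp"

// Recombine three single-channel GPU planes into one CV_8UC3 image,
// enqueued asynchronously on the given stream.
void mergePlanes(const std::vector<cv::cuda::GpuMat>& planes,
                 cv::cuda::GpuMat& bgr,
                 cv::cuda::Stream& stream)
{
    CV_Assert(planes.size() == 3);
    cv::cuda::merge(planes, bgr, stream); // bgr is (re)allocated by mergeImpl
}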

//------------------------------------------------------------
// Split
////////////////////////////////////////////////////////////////////////
/// split

namespace
{
template <int cn, typename T> struct SplitFunc;

template <typename T>
__global__ void splitC2_(const uchar* src, size_t src_step,
int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step)
template <typename T> struct SplitFunc<2, T>
{
static void call(const GpuMat& src, GpuMat* dst, Stream& stream)
{
typedef typename TypeTraits<T>::type2 src_type;

const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;

const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);

if (x < cols && y < rows)
GlobPtrSz<T> dstarr[2] =
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
}
}

template <typename T>
__global__ void splitC3_(const uchar* src, size_t src_step,
int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step)
{
typedef typename TypeTraits<T>::type3 src_type;

const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;

const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
T* dst2_y = (T*)(dst2 + y * dst2_step);

if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
dst2_y[x] = src_elem.z;
}
}

template <>
__global__ void splitC3_<double>(
const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;

const double* src_y = (const double*)(src + y * src_step);
double* dst0_y = (double*)(dst0 + y * dst0_step);
double* dst1_y = (double*)(dst1 + y * dst1_step);
double* dst2_y = (double*)(dst2 + y * dst2_step);

if (x < cols && y < rows)
{
dst0_y[x] = src_y[3 * x];
dst1_y[x] = src_y[3 * x + 1];
dst2_y[x] = src_y[3 * x + 2];
}
}

template <typename T>
__global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step,
uchar* dst3, size_t dst3_step)
{
typedef typename TypeTraits<T>::type4 src_type;

const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;

const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
T* dst2_y = (T*)(dst2 + y * dst2_step);
T* dst3_y = (T*)(dst3 + y * dst3_step);

if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
dst2_y[x] = src_elem.z;
dst3_y[x] = src_elem.w;
}
}

template <>
__global__ void splitC4_<double>(
const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step,
uchar* dst3, size_t dst3_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;

const double2* src_y = (const double2*)(src + y * src_step);
double* dst0_y = (double*)(dst0 + y * dst0_step);
double* dst1_y = (double*)(dst1 + y * dst1_step);
double* dst2_y = (double*)(dst2 + y * dst2_step);
double* dst3_y = (double*)(dst3 + y * dst3_step);

if (x < cols && y < rows)
{
double2 src_elem1 = src_y[2 * x];
double2 src_elem2 = src_y[2 * x + 1];
dst0_y[x] = src_elem1.x;
dst1_y[x] = src_elem1.y;
dst2_y[x] = src_elem2.x;
dst3_y[x] = src_elem2.y;
}
}

template <typename T>
static void splitC2_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC2_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}

template <typename T>
static void splitC3_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC3_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
dst[2].data, dst[2].step);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}

template <typename T>
static void splitC4_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC4_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
dst[2].data, dst[2].step,
dst[3].data, dst[3].step);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}

void split(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
{
static SplitFunction split_func_tbl[] =
{
splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,
splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
globPtr<T>(dst[0]), globPtr<T>(dst[1])
};

size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
SplitFunction split_func = split_func_tbl[split_func_id];

if (split_func == 0)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported channel count or data type");

split_func(src, dst, stream);
gridSplit(globPtr<typename MakeVec<T, 2>::type>(src), dstarr, stream);
}
} // namespace split_merge
}}} // namespace cv { namespace cuda { namespace cudev
};

template <typename T> struct SplitFunc<3, T>
{
static void call(const GpuMat& src, GpuMat* dst, Stream& stream)
{
GlobPtrSz<T> dstarr[3] =
{
globPtr<T>(dst[0]), globPtr<T>(dst[1]), globPtr<T>(dst[2])
};

#endif /* CUDA_DISABLER */
gridSplit(globPtr<typename MakeVec<T, 3>::type>(src), dstarr, stream);
}
};

template <typename T> struct SplitFunc<4, T>
{
static void call(const GpuMat& src, GpuMat* dst, Stream& stream)
{
GlobPtrSz<T> dstarr[4] =
{
globPtr<T>(dst[0]), globPtr<T>(dst[1]), globPtr<T>(dst[2]), globPtr<T>(dst[3])
};

gridSplit(globPtr<typename MakeVec<T, 4>::type>(src), dstarr, stream);
}
};

void splitImpl(const GpuMat& src, GpuMat* dst, Stream& stream)
{
typedef void (*func_t)(const GpuMat& src, GpuMat* dst, Stream& stream);
static const func_t funcs[3][5] =
{
{SplitFunc<2, uchar>::call, SplitFunc<2, ushort>::call, SplitFunc<2, int>::call, 0, SplitFunc<2, double>::call},
{SplitFunc<3, uchar>::call, SplitFunc<3, ushort>::call, SplitFunc<3, int>::call, 0, SplitFunc<3, double>::call},
{SplitFunc<4, uchar>::call, SplitFunc<4, ushort>::call, SplitFunc<4, int>::call, 0, SplitFunc<4, double>::call}
};

CV_DbgAssert( dst != 0 );

const int depth = src.depth();
const int channels = src.channels();

CV_DbgAssert( channels <= 4 );

if (channels == 0)
return;

if (channels == 1)
{
src.copyTo(dst[0], stream);
return;
}

for (int i = 0; i < channels; ++i)
dst[i].create(src.size(), depth);

const func_t func = funcs[channels - 2][CV_ELEM_SIZE(depth) / 2];

if (func == 0)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported channel count or data type");

func(src, dst, stream);
}
}

void cv::cuda::split(InputArray _src, GpuMat* dst, Stream& stream)
{
GpuMat src = _src.getGpuMat();
splitImpl(src, dst, stream);
}

void cv::cuda::split(InputArray _src, std::vector<GpuMat>& dst, Stream& stream)
{
GpuMat src = _src.getGpuMat();
dst.resize(src.channels());
if (src.channels() > 0)
splitImpl(src, &dst[0], stream);
}
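
cv::cuda::split is the inverse wrapper: it resizes the output vector to src.channels() and hands per-channel destinations to splitImpl, which allocates them and dispatches through the same funcs[channels - 2][CV_ELEM_SIZE(depth) / 2] indexing as merge. A short usage sketch:

#include <vector>
#include "opencv2/cudaarithm.hpp"

// Split a CV_8UC3 GPU image into three CV_8UC1 planes on the given stream.
void splitPlanes(const cv::cuda::GpuMat& bgr,
                 std::vector<cv::cuda::GpuMat>& planes,
                 cv::cuda::Stream& stream)
{
    cv::cuda::split(bgr, planes, stream); // planes is resized to bgr.channels()
    CV_Assert(planes.size() == 3 && planes[0].type() == CV_8UC1);
}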

#endif

@ -40,146 +40,186 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV

#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"

using namespace cv::cuda;
using namespace cv::cuda::device;
#else

namespace arithm
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

void subMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int);

namespace
{
struct VSub4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vsub4(a, b);
}

__host__ __device__ __forceinline__ VSub4() {}
__host__ __device__ __forceinline__ VSub4(const VSub4&) {}
};

struct VSub2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vsub2(a, b);
}

__host__ __device__ __forceinline__ VSub2() {}
__host__ __device__ __forceinline__ VSub2(const VSub2&) {}
};

template <typename T, typename D> struct SubMat : binary_function<T, T, D>
template <typename T, typename D> struct SubOp1 : binary_function<T, T, D>
{
__device__ __forceinline__ D operator ()(T a, T b) const
{
return saturate_cast<D>(a - b);
}

__host__ __device__ __forceinline__ SubMat() {}
__host__ __device__ __forceinline__ SubMat(const SubMat&) {}
};
}

namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits< arithm::VSub4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};

template <> struct TransformFunctorTraits< arithm::VSub2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};

template <typename T, typename D> struct TransformFunctorTraits< arithm::SubMat<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}

namespace arithm
{
void subMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VSub4(), WithOutMask(), stream);
}

void subMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VSub2(), WithOutMask(), stream);
}

template <typename T, typename D>
void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
void subMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
{
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, SubMat<T, D>(), mask, stream);
gridTransformBinary(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), SubOp1<T, D>(), globPtr<uchar>(mask), stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, SubMat<T, D>(), WithOutMask(), stream);
gridTransformBinary(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), SubOp1<T, D>(), stream);
}
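
subMat_v1 above shows the general cudev pattern this PR adopts: one device functor plus masked and unmasked gridTransformBinary overloads. A sketch of the same pattern for saturated addition (hypothetical AddOp1/addMat_v1 names, assuming the cudev includes at the top of this file):

// Same gridTransformBinary idiom, applied to saturated addition.
template <typename T, typename D> struct AddOp1 : binary_function<T, T, D>
{
    __device__ __forceinline__ D operator ()(T a, T b) const
    {
        return saturate_cast<D>(a + b);
    }
};

template <typename T, typename D>
void addMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
{
    if (mask.data)
        gridTransformBinary(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), AddOp1<T, D>(), globPtr<uchar>(mask), stream);
    else
        gridTransformBinary(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), AddOp1<T, D>(), stream);
}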

template void subMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
struct SubOp2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vsub2(a, b);
}
};

template void subMat<schar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
void subMat_v2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
const int vcols = src1.cols >> 1;

//template void subMat<ushort, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<ushort, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);

//template void subMat<short, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<short, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
gridTransformBinary(src1_, src2_, dst_, SubOp2(), stream);
}

//template void subMat<int, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<int, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<int, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<int, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<int, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<int, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
struct SubOp4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vsub4(a, b);
}
};

//template void subMat<float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
void subMat_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
const int vcols = src1.cols >> 2;

//template void subMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);

gridTransformBinary(src1_, src2_, dst_, SubOp4(), stream);
}
}

#endif // CUDA_DISABLER
void subMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream);
static const func_t funcs[7][7] =
{
{
subMat_v1<uchar, uchar>,
subMat_v1<uchar, schar>,
subMat_v1<uchar, ushort>,
subMat_v1<uchar, short>,
subMat_v1<uchar, int>,
subMat_v1<uchar, float>,
subMat_v1<uchar, double>
},
{
subMat_v1<schar, uchar>,
subMat_v1<schar, schar>,
subMat_v1<schar, ushort>,
subMat_v1<schar, short>,
subMat_v1<schar, int>,
subMat_v1<schar, float>,
subMat_v1<schar, double>
},
{
0 /*subMat_v1<ushort, uchar>*/,
0 /*subMat_v1<ushort, schar>*/,
subMat_v1<ushort, ushort>,
subMat_v1<ushort, short>,
subMat_v1<ushort, int>,
subMat_v1<ushort, float>,
subMat_v1<ushort, double>
},
{
0 /*subMat_v1<short, uchar>*/,
0 /*subMat_v1<short, schar>*/,
subMat_v1<short, ushort>,
subMat_v1<short, short>,
subMat_v1<short, int>,
subMat_v1<short, float>,
subMat_v1<short, double>
},
{
0 /*subMat_v1<int, uchar>*/,
0 /*subMat_v1<int, schar>*/,
0 /*subMat_v1<int, ushort>*/,
0 /*subMat_v1<int, short>*/,
subMat_v1<int, int>,
subMat_v1<int, float>,
subMat_v1<int, double>
},
{
0 /*subMat_v1<float, uchar>*/,
0 /*subMat_v1<float, schar>*/,
0 /*subMat_v1<float, ushort>*/,
0 /*subMat_v1<float, short>*/,
0 /*subMat_v1<float, int>*/,
subMat_v1<float, float>,
subMat_v1<float, double>
},
{
0 /*subMat_v1<double, uchar>*/,
0 /*subMat_v1<double, schar>*/,
0 /*subMat_v1<double, ushort>*/,
0 /*subMat_v1<double, short>*/,
0 /*subMat_v1<double, int>*/,
0 /*subMat_v1<double, float>*/,
subMat_v1<double, double>
}
};

const int sdepth = src1.depth();
const int ddepth = dst.depth();

CV_DbgAssert( sdepth <= CV_64F && ddepth <= CV_64F );

GpuMat src1_ = src1.reshape(1);
GpuMat src2_ = src2.reshape(1);
GpuMat dst_ = dst.reshape(1);

if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);

const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;

if (isAllAligned)
{
if (sdepth == CV_8U && (src1_.cols & 3) == 0)
{
subMat_v4(src1_, src2_, dst_, stream);
return;
}
else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
{
subMat_v2(src1_, src2_, dst_, stream);
return;
}
}
}

const func_t func = funcs[sdepth][ddepth];

if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");

func(src1_, src2_, dst_, mask, stream);
}
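
The gate above only takes the packed vsub2/vsub4 paths when there is no mask, the source and destination depths match, all three base pointers are 32-byte aligned, and the (reshaped) row width divides evenly into 32-bit words. A host-only sketch of that decision, with hypothetical names:

#include <cstdint>

enum SubPath { SUB_SCALAR, SUB_PACKED_X2, SUB_PACKED_X4 };

// Mirrors the gating above; the real code also requires sdepth == ddepth.
SubPath chooseSubPath(const void* src1, const void* src2, const void* dst,
                      int cols, bool is8U, bool is16U, bool hasMask)
{
    const bool aligned =
        (reinterpret_cast<std::intptr_t>(src1) & 31) == 0 &&
        (reinterpret_cast<std::intptr_t>(src2) & 31) == 0 &&
        (reinterpret_cast<std::intptr_t>(dst)  & 31) == 0;

    if (!hasMask && aligned)
    {
        if (is8U  && (cols & 3) == 0) return SUB_PACKED_X4; // four uchars per uint
        if (is16U && (cols & 1) == 0) return SUB_PACKED_X2; // two ushorts per uint
    }
    return SUB_SCALAR;
}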

#endif

@ -40,110 +40,164 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV

#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"

using namespace cv::cuda;
using namespace cv::cuda::device;
#else

namespace arithm
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

void subScalar(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int);

namespace
{
template <typename T, typename S, typename D> struct SubScalar : unary_function<T, D>
template <typename SrcType, typename ScalarType, typename DstType> struct SubScalarOp : unary_function<SrcType, DstType>
{
S val;
int scale;
ScalarType val;

__host__ SubScalar(S val_, int scale_) : val(val_), scale(scale_) {}

__device__ __forceinline__ D operator ()(T a) const
__device__ __forceinline__ DstType operator ()(SrcType a) const
{
return saturate_cast<D>(scale * (a - val));
return saturate_cast<DstType>(saturate_cast<ScalarType>(a) - val);
}
};
}

namespace cv { namespace cuda { namespace device
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::SubScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <typename SrcType, typename ScalarType, typename DstType> struct SubScalarOpInv : unary_function<SrcType, DstType>
{
ScalarType val;

__device__ __forceinline__ DstType operator ()(SrcType a) const
{
return saturate_cast<DstType>(val - saturate_cast<ScalarType>(a));
}
};

template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
}}}

namespace arithm
{
template <typename T, typename S, typename D>
void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
SubScalar<T, S, D> op(static_cast<S>(val), inv ? -1 : 1);
enum {
shift = 1
};
};

if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
template <typename SrcType, typename ScalarDepth, typename DstType>
void subScalarImpl(const GpuMat& src, cv::Scalar value, bool inv, GpuMat& dst, const GpuMat& mask, Stream& stream)
{
typedef typename MakeVec<ScalarDepth, VecTraits<SrcType>::cn>::type ScalarType;

cv::Scalar_<ScalarDepth> value_ = value;

if (inv)
{
SubScalarOpInv<SrcType, ScalarType, DstType> op;
op.val = VecTraits<ScalarType>::make(value_.val);

if (mask.data)
gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, globPtr<uchar>(mask), stream);
else
gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, stream);
}
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
{
SubScalarOp<SrcType, ScalarType, DstType> op;
op.val = VecTraits<ScalarType>::make(value_.val);

if (mask.data)
gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, globPtr<uchar>(mask), stream);
else
gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, stream);
}
}
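
SubScalarOp and SubScalarOpInv differ only in operand order: the forward functor computes saturate_cast<DstType>(a - val), while the inv path, presumably taken when the scalar is the minuend (subtract(Scalar, mat)), computes saturate_cast<DstType>(val - a). A host-side illustration with concrete numbers (hypothetical helper names, assuming cv::saturate_cast from the core headers):

#include "opencv2/core.hpp"

// Forward: pixel minus scalar, clamped to the destination range.
uchar subForward(uchar a, float val) { return cv::saturate_cast<uchar>(a - val); }
// Inverse: scalar minus pixel, as selected by the 'inv' flag above.
uchar subInverse(uchar a, float val) { return cv::saturate_cast<uchar>(val - a); }

// e.g. subForward(10, 20.f) == 0 (clamped at zero), subInverse(10, 20.f) == 10.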
|
||||
|
||||
template void subScalar<uchar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<uchar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<uchar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<uchar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<uchar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<uchar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<uchar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
|
||||
template void subScalar<schar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<schar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<schar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<schar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<schar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<schar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<schar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
|
||||
//template void subScalar<ushort, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void subScalar<ushort, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<ushort, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<ushort, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<ushort, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<ushort, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<ushort, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
|
||||
//template void subScalar<short, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void subScalar<short, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<short, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<short, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<short, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<short, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<short, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
|
||||
//template void subScalar<int, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void subScalar<int, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void subScalar<int, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void subScalar<int, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<int, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<int, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<int, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
|
||||
//template void subScalar<float, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void subScalar<float, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void subScalar<float, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void subScalar<float, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void subScalar<float, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<float, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<float, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
|
||||
//template void subScalar<double, double, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void subScalar<double, double, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void subScalar<double, double, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void subScalar<double, double, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void subScalar<double, double, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
//template void subScalar<double, double, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
template void subScalar<double, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif // CUDA_DISABLER
|
||||
void subScalar(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int)
|
||||
{
|
||||
typedef void (*func_t)(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, Stream& stream);
|
||||
static const func_t funcs[7][7][4] =
|
||||
{
|
||||
{
|
||||
{subScalarImpl<uchar, float, uchar>, subScalarImpl<uchar2, float, uchar2>, subScalarImpl<uchar3, float, uchar3>, subScalarImpl<uchar4, float, uchar4>},
|
||||
{subScalarImpl<uchar, float, schar>, subScalarImpl<uchar2, float, char2>, subScalarImpl<uchar3, float, char3>, subScalarImpl<uchar4, float, char4>},
|
||||
{subScalarImpl<uchar, float, ushort>, subScalarImpl<uchar2, float, ushort2>, subScalarImpl<uchar3, float, ushort3>, subScalarImpl<uchar4, float, ushort4>},
|
||||
{subScalarImpl<uchar, float, short>, subScalarImpl<uchar2, float, short2>, subScalarImpl<uchar3, float, short3>, subScalarImpl<uchar4, float, short4>},
|
||||
{subScalarImpl<uchar, float, int>, subScalarImpl<uchar2, float, int2>, subScalarImpl<uchar3, float, int3>, subScalarImpl<uchar4, float, int4>},
|
||||
{subScalarImpl<uchar, float, float>, subScalarImpl<uchar2, float, float2>, subScalarImpl<uchar3, float, float3>, subScalarImpl<uchar4, float, float4>},
|
||||
{subScalarImpl<uchar, double, double>, subScalarImpl<uchar2, double, double2>, subScalarImpl<uchar3, double, double3>, subScalarImpl<uchar4, double, double4>}
|
||||
},
|
||||
{
|
||||
{subScalarImpl<schar, float, uchar>, subScalarImpl<char2, float, uchar2>, subScalarImpl<char3, float, uchar3>, subScalarImpl<char4, float, uchar4>},
|
||||
{subScalarImpl<schar, float, schar>, subScalarImpl<char2, float, char2>, subScalarImpl<char3, float, char3>, subScalarImpl<char4, float, char4>},
|
||||
{subScalarImpl<schar, float, ushort>, subScalarImpl<char2, float, ushort2>, subScalarImpl<char3, float, ushort3>, subScalarImpl<char4, float, ushort4>},
|
||||
{subScalarImpl<schar, float, short>, subScalarImpl<char2, float, short2>, subScalarImpl<char3, float, short3>, subScalarImpl<char4, float, short4>},
|
||||
{subScalarImpl<schar, float, int>, subScalarImpl<char2, float, int2>, subScalarImpl<char3, float, int3>, subScalarImpl<char4, float, int4>},
|
||||
{subScalarImpl<schar, float, float>, subScalarImpl<char2, float, float2>, subScalarImpl<char3, float, float3>, subScalarImpl<char4, float, float4>},
|
||||
{subScalarImpl<schar, double, double>, subScalarImpl<char2, double, double2>, subScalarImpl<char3, double, double3>, subScalarImpl<char4, double, double4>}
|
||||
},
|
||||
{
|
||||
{0 /*subScalarImpl<ushort, float, uchar>*/, 0 /*subScalarImpl<ushort2, float, uchar2>*/, 0 /*subScalarImpl<ushort3, float, uchar3>*/, 0 /*subScalarImpl<ushort4, float, uchar4>*/},
|
||||
{0 /*subScalarImpl<ushort, float, schar>*/, 0 /*subScalarImpl<ushort2, float, char2>*/, 0 /*subScalarImpl<ushort3, float, char3>*/, 0 /*subScalarImpl<ushort4, float, char4>*/},
|
||||
{subScalarImpl<ushort, float, ushort>, subScalarImpl<ushort2, float, ushort2>, subScalarImpl<ushort3, float, ushort3>, subScalarImpl<ushort4, float, ushort4>},
|
||||
{subScalarImpl<ushort, float, short>, subScalarImpl<ushort2, float, short2>, subScalarImpl<ushort3, float, short3>, subScalarImpl<ushort4, float, short4>},
|
||||
{subScalarImpl<ushort, float, int>, subScalarImpl<ushort2, float, int2>, subScalarImpl<ushort3, float, int3>, subScalarImpl<ushort4, float, int4>},
|
||||
{subScalarImpl<ushort, float, float>, subScalarImpl<ushort2, float, float2>, subScalarImpl<ushort3, float, float3>, subScalarImpl<ushort4, float, float4>},
{subScalarImpl<ushort, double, double>, subScalarImpl<ushort2, double, double2>, subScalarImpl<ushort3, double, double3>, subScalarImpl<ushort4, double, double4>}
},
{
{0 /*subScalarImpl<short, float, uchar>*/, 0 /*subScalarImpl<short2, float, uchar2>*/, 0 /*subScalarImpl<short3, float, uchar3>*/, 0 /*subScalarImpl<short4, float, uchar4>*/},
{0 /*subScalarImpl<short, float, schar>*/, 0 /*subScalarImpl<short2, float, char2>*/, 0 /*subScalarImpl<short3, float, char3>*/, 0 /*subScalarImpl<short4, float, char4>*/},
{subScalarImpl<short, float, ushort>, subScalarImpl<short2, float, ushort2>, subScalarImpl<short3, float, ushort3>, subScalarImpl<short4, float, ushort4>},
{subScalarImpl<short, float, short>, subScalarImpl<short2, float, short2>, subScalarImpl<short3, float, short3>, subScalarImpl<short4, float, short4>},
{subScalarImpl<short, float, int>, subScalarImpl<short2, float, int2>, subScalarImpl<short3, float, int3>, subScalarImpl<short4, float, int4>},
{subScalarImpl<short, float, float>, subScalarImpl<short2, float, float2>, subScalarImpl<short3, float, float3>, subScalarImpl<short4, float, float4>},
{subScalarImpl<short, double, double>, subScalarImpl<short2, double, double2>, subScalarImpl<short3, double, double3>, subScalarImpl<short4, double, double4>}
},
{
{0 /*subScalarImpl<int, float, uchar>*/, 0 /*subScalarImpl<int2, float, uchar2>*/, 0 /*subScalarImpl<int3, float, uchar3>*/, 0 /*subScalarImpl<int4, float, uchar4>*/},
{0 /*subScalarImpl<int, float, schar>*/, 0 /*subScalarImpl<int2, float, char2>*/, 0 /*subScalarImpl<int3, float, char3>*/, 0 /*subScalarImpl<int4, float, char4>*/},
{0 /*subScalarImpl<int, float, ushort>*/, 0 /*subScalarImpl<int2, float, ushort2>*/, 0 /*subScalarImpl<int3, float, ushort3>*/, 0 /*subScalarImpl<int4, float, ushort4>*/},
{0 /*subScalarImpl<int, float, short>*/, 0 /*subScalarImpl<int2, float, short2>*/, 0 /*subScalarImpl<int3, float, short3>*/, 0 /*subScalarImpl<int4, float, short4>*/},
{subScalarImpl<int, float, int>, subScalarImpl<int2, float, int2>, subScalarImpl<int3, float, int3>, subScalarImpl<int4, float, int4>},
{subScalarImpl<int, float, float>, subScalarImpl<int2, float, float2>, subScalarImpl<int3, float, float3>, subScalarImpl<int4, float, float4>},
{subScalarImpl<int, double, double>, subScalarImpl<int2, double, double2>, subScalarImpl<int3, double, double3>, subScalarImpl<int4, double, double4>}
},
{
{0 /*subScalarImpl<float, float, uchar>*/, 0 /*subScalarImpl<float2, float, uchar2>*/, 0 /*subScalarImpl<float3, float, uchar3>*/, 0 /*subScalarImpl<float4, float, uchar4>*/},
{0 /*subScalarImpl<float, float, schar>*/, 0 /*subScalarImpl<float2, float, char2>*/, 0 /*subScalarImpl<float3, float, char3>*/, 0 /*subScalarImpl<float4, float, char4>*/},
{0 /*subScalarImpl<float, float, ushort>*/, 0 /*subScalarImpl<float2, float, ushort2>*/, 0 /*subScalarImpl<float3, float, ushort3>*/, 0 /*subScalarImpl<float4, float, ushort4>*/},
{0 /*subScalarImpl<float, float, short>*/, 0 /*subScalarImpl<float2, float, short2>*/, 0 /*subScalarImpl<float3, float, short3>*/, 0 /*subScalarImpl<float4, float, short4>*/},
{0 /*subScalarImpl<float, float, int>*/, 0 /*subScalarImpl<float2, float, int2>*/, 0 /*subScalarImpl<float3, float, int3>*/, 0 /*subScalarImpl<float4, float, int4>*/},
{subScalarImpl<float, float, float>, subScalarImpl<float2, float, float2>, subScalarImpl<float3, float, float3>, subScalarImpl<float4, float, float4>},
{subScalarImpl<float, double, double>, subScalarImpl<float2, double, double2>, subScalarImpl<float3, double, double3>, subScalarImpl<float4, double, double4>}
},
{
{0 /*subScalarImpl<double, double, uchar>*/, 0 /*subScalarImpl<double2, double, uchar2>*/, 0 /*subScalarImpl<double3, double, uchar3>*/, 0 /*subScalarImpl<double4, double, uchar4>*/},
{0 /*subScalarImpl<double, double, schar>*/, 0 /*subScalarImpl<double2, double, char2>*/, 0 /*subScalarImpl<double3, double, char3>*/, 0 /*subScalarImpl<double4, double, char4>*/},
{0 /*subScalarImpl<double, double, ushort>*/, 0 /*subScalarImpl<double2, double, ushort2>*/, 0 /*subScalarImpl<double3, double, ushort3>*/, 0 /*subScalarImpl<double4, double, ushort4>*/},
{0 /*subScalarImpl<double, double, short>*/, 0 /*subScalarImpl<double2, double, short2>*/, 0 /*subScalarImpl<double3, double, short3>*/, 0 /*subScalarImpl<double4, double, short4>*/},
{0 /*subScalarImpl<double, double, int>*/, 0 /*subScalarImpl<double2, double, int2>*/, 0 /*subScalarImpl<double3, double, int3>*/, 0 /*subScalarImpl<double4, double, int4>*/},
{0 /*subScalarImpl<double, double, float>*/, 0 /*subScalarImpl<double2, double, float2>*/, 0 /*subScalarImpl<double3, double, float3>*/, 0 /*subScalarImpl<double4, double, float4>*/},
{subScalarImpl<double, double, double>, subScalarImpl<double2, double, double2>, subScalarImpl<double3, double, double3>, subScalarImpl<double4, double, double4>}
}
};

const int sdepth = src.depth();
const int ddepth = dst.depth();
const int cn = src.channels();

CV_DbgAssert( sdepth <= CV_64F && ddepth <= CV_64F && cn <= 4 );

const func_t func = funcs[sdepth][ddepth][cn - 1];

if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");

func(src, val, inv, dst, mask, stream);
}

#endif
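
// A minimal usage sketch for the scalar path dispatched through the
// funcs[sdepth][ddepth][cn - 1] table above, assuming the public
// cv::cuda::subtract overload from opencv2/cudaarithm.hpp; sizes and
// values are illustrative only:
//
//   cv::cuda::GpuMat src(480, 640, CV_16SC3, cv::Scalar::all(100));
//   cv::cuda::GpuMat dst;
//   // CV_16S source, CV_32S destination, 3 channels
//   //   -> funcs[CV_16S][CV_32S][2] == subScalarImpl<short, float, int>
//   cv::cuda::subtract(src, cv::Scalar(1, 2, 3), dst, cv::noArray(), CV_32S);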

@ -40,342 +40,155 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/utility.hpp"
#ifndef HAVE_OPENCV_CUDEV

#include "unroll_detail.hpp"
#error "opencv_cudev is required"

using namespace cv::cuda;
using namespace cv::cuda::device;
#else

namespace sum
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

namespace
{
__device__ unsigned int blocks_finished = 0;

template <typename R, int cn> struct AtomicAdd;
template <typename R> struct AtomicAdd<R, 1>
template <typename T, typename R, int cn>
cv::Scalar sumImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
{
static __device__ void run(R* ptr, R val)
{
Emulation::glob::atomicAdd(ptr, val);
}
};
template <typename R> struct AtomicAdd<R, 2>
{
typedef typename TypeVec<R, 2>::vec_type val_type;
typedef typename MakeVec<T, cn>::type src_type;
typedef typename MakeVec<R, cn>::type res_type;

static __device__ void run(R* ptr, val_type val)
{
Emulation::glob::atomicAdd(ptr, val.x);
Emulation::glob::atomicAdd(ptr + 1, val.y);
}
};
template <typename R> struct AtomicAdd<R, 3>
{
typedef typename TypeVec<R, 3>::vec_type val_type;
const GpuMat_<src_type>& src = (const GpuMat_<src_type>&) _src;
GpuMat_<res_type>& buf = (GpuMat_<res_type>&) _buf;

static __device__ void run(R* ptr, val_type val)
{
Emulation::glob::atomicAdd(ptr, val.x);
Emulation::glob::atomicAdd(ptr + 1, val.y);
Emulation::glob::atomicAdd(ptr + 2, val.z);
}
};
template <typename R> struct AtomicAdd<R, 4>
{
typedef typename TypeVec<R, 4>::vec_type val_type;

static __device__ void run(R* ptr, val_type val)
{
Emulation::glob::atomicAdd(ptr, val.x);
Emulation::glob::atomicAdd(ptr + 1, val.y);
Emulation::glob::atomicAdd(ptr + 2, val.z);
Emulation::glob::atomicAdd(ptr + 3, val.w);
}
};

template <int BLOCK_SIZE, typename R, int cn>
struct GlobalReduce
{
typedef typename TypeVec<R, cn>::vec_type result_type;

static __device__ void run(result_type& sum, result_type* result, int tid, int bid, R* smem)
{
#if __CUDA_ARCH__ >= 200
if (tid == 0)
AtomicAdd<R, cn>::run((R*) result, sum);
#else
__shared__ bool is_last;

if (tid == 0)
{
result[bid] = sum;

__threadfence();

unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}

__syncthreads();

if (is_last)
{
sum = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<result_type>::all(0);

device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));

if (tid == 0)
{
result[0] = sum;
blocks_finished = 0;
}
}
#endif
}
};

template <int BLOCK_SIZE, typename src_type, typename result_type, class Mask, class Op>
__global__ void kernel(const PtrStepSz<src_type> src, result_type* result, const Mask mask, const Op op, const int twidth, const int theight)
{
typedef typename VecTraits<src_type>::elem_type T;
typedef typename VecTraits<result_type>::elem_type R;
const int cn = VecTraits<src_type>::cn;

__shared__ R smem[BLOCK_SIZE * cn];

const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;

const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;

result_type sum = VecTraits<result_type>::all(0);

for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const src_type* ptr = src.ptr(y);

for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
if (mask(y, x))
{
const src_type srcVal = ptr[x];
sum = sum + op(saturate_cast<result_type>(srcVal));
}
}
}

device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));

GlobalReduce<BLOCK_SIZE, R, cn>::run(sum, result, tid, bid, smem);
}

const int threads_x = 32;
const int threads_y = 8;

void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
{
block = dim3(threads_x, threads_y);

grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));

grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}

void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);

bufcols = grid.x * grid.y * sizeof(double) * cn;
bufrows = 1;
}

template <typename T, typename R, int cn, template <typename> class Op>
void caller(PtrStepSzb src_, void* buf_, double* out, PtrStepSzb mask)
{
typedef typename TypeVec<T, cn>::vec_type src_type;
typedef typename TypeVec<R, cn>::vec_type result_type;

PtrStepSz<src_type> src(src_);
result_type* buf = (result_type*) buf_;

dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);

const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);

Op<result_type> op;

if (mask.data)
kernel<threads_x * threads_y><<<grid, block>>>(src, buf, SingleMask(mask), op, twidth, theight);
if (mask.empty())
gridCalcSum(src, buf);
else
kernel<threads_x * threads_y><<<grid, block>>>(src, buf, WithOutMask(), op, twidth, theight);
cudaSafeCall( cudaGetLastError() );
gridCalcSum(src, buf, globPtr<uchar>(mask));

cudaSafeCall( cudaDeviceSynchronize() );
cv::Scalar_<R> res;
cv::Mat res_mat(buf.size(), buf.type(), res.val);
buf.download(res_mat);

R result[4] = {0, 0, 0, 0};
cudaSafeCall( cudaMemcpy(&result, buf, sizeof(result_type), cudaMemcpyDeviceToHost) );

out[0] = result[0];
out[1] = result[1];
out[2] = result[2];
out[3] = result[3];
return res;
}

template <typename T> struct SumType;
template <> struct SumType<uchar> { typedef unsigned int R; };
template <> struct SumType<schar> { typedef int R; };
template <> struct SumType<ushort> { typedef unsigned int R; };
template <> struct SumType<short> { typedef int R; };
template <> struct SumType<int> { typedef int R; };
template <> struct SumType<float> { typedef float R; };
template <> struct SumType<double> { typedef double R; };

template <typename T, int cn>
void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
template <typename T, typename R, int cn>
cv::Scalar sumAbsImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
{
typedef typename SumType<T>::R R;
caller<T, R, cn, identity>(src, buf, out, mask);
typedef typename MakeVec<T, cn>::type src_type;
typedef typename MakeVec<R, cn>::type res_type;

const GpuMat_<src_type>& src = (const GpuMat_<src_type>&) _src;
GpuMat_<res_type>& buf = (GpuMat_<res_type>&) _buf;

if (mask.empty())
gridCalcSum(abs_(cvt_<res_type>(src)), buf);
else
gridCalcSum(abs_(cvt_<res_type>(src)), buf, globPtr<uchar>(mask));

cv::Scalar_<R> res;
cv::Mat res_mat(buf.size(), buf.type(), res.val);
buf.download(res_mat);

return res;
}

template void run<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

template void run<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

template void run<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

template void run<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

template void run<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

template void run<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

template void run<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

template <typename T, int cn>
void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
template <typename T, typename R, int cn>
cv::Scalar sumSqrImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
{
typedef typename SumType<T>::R R;
caller<T, R, cn, abs_func>(src, buf, out, mask);
typedef typename MakeVec<T, cn>::type src_type;
typedef typename MakeVec<R, cn>::type res_type;

const GpuMat_<src_type>& src = (const GpuMat_<src_type>&) _src;
GpuMat_<res_type>& buf = (GpuMat_<res_type>&) _buf;

if (mask.empty())
gridCalcSum(sqr_(cvt_<res_type>(src)), buf);
else
gridCalcSum(sqr_(cvt_<res_type>(src)), buf, globPtr<uchar>(mask));

cv::Scalar_<R> res;
cv::Mat res_mat(buf.size(), buf.type(), res.val);
buf.download(res_mat);

return res;
}

template void runAbs<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

template void runAbs<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

template void runAbs<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

template void runAbs<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

template void runAbs<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

template void runAbs<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

template void runAbs<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

template <typename T> struct Sqr : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(T x) const
{
return x * x;
}
};

template <typename T, int cn>
void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
{
caller<T, double, cn, Sqr>(src, buf, out, mask);
}

template void runSqr<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

template void runSqr<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

template void runSqr<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

template void runSqr<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

template void runSqr<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

template void runSqr<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

template void runSqr<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
}

#endif // CUDA_DISABLER
cv::Scalar cv::cuda::sum(InputArray _src, InputArray _mask, GpuMat& buf)
{
typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
static const func_t funcs[7][4] =
{
{sumImpl<uchar , uint , 1>, sumImpl<uchar , uint , 2>, sumImpl<uchar , uint , 3>, sumImpl<uchar , uint , 4>},
{sumImpl<schar , int , 1>, sumImpl<schar , int , 2>, sumImpl<schar , int , 3>, sumImpl<schar , int , 4>},
{sumImpl<ushort, uint , 1>, sumImpl<ushort, uint , 2>, sumImpl<ushort, uint , 3>, sumImpl<ushort, uint , 4>},
{sumImpl<short , int , 1>, sumImpl<short , int , 2>, sumImpl<short , int , 3>, sumImpl<short , int , 4>},
{sumImpl<int   , int , 1>, sumImpl<int   , int , 2>, sumImpl<int   , int , 3>, sumImpl<int   , int , 4>},
{sumImpl<float , float , 1>, sumImpl<float , float , 2>, sumImpl<float , float , 3>, sumImpl<float , float , 4>},
{sumImpl<double, double, 1>, sumImpl<double, double, 2>, sumImpl<double, double, 3>, sumImpl<double, double, 4>}
};

GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();

CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );

const func_t func = funcs[src.depth()][src.channels() - 1];

return func(src, mask, buf);
}

cv::Scalar cv::cuda::absSum(InputArray _src, InputArray _mask, GpuMat& buf)
{
typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
static const func_t funcs[7][4] =
{
{sumAbsImpl<uchar , uint , 1>, sumAbsImpl<uchar , uint , 2>, sumAbsImpl<uchar , uint , 3>, sumAbsImpl<uchar , uint , 4>},
{sumAbsImpl<schar , int , 1>, sumAbsImpl<schar , int , 2>, sumAbsImpl<schar , int , 3>, sumAbsImpl<schar , int , 4>},
{sumAbsImpl<ushort, uint , 1>, sumAbsImpl<ushort, uint , 2>, sumAbsImpl<ushort, uint , 3>, sumAbsImpl<ushort, uint , 4>},
{sumAbsImpl<short , int , 1>, sumAbsImpl<short , int , 2>, sumAbsImpl<short , int , 3>, sumAbsImpl<short , int , 4>},
{sumAbsImpl<int   , int , 1>, sumAbsImpl<int   , int , 2>, sumAbsImpl<int   , int , 3>, sumAbsImpl<int   , int , 4>},
{sumAbsImpl<float , float , 1>, sumAbsImpl<float , float , 2>, sumAbsImpl<float , float , 3>, sumAbsImpl<float , float , 4>},
{sumAbsImpl<double, double, 1>, sumAbsImpl<double, double, 2>, sumAbsImpl<double, double, 3>, sumAbsImpl<double, double, 4>}
};

GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();

CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );

const func_t func = funcs[src.depth()][src.channels() - 1];

return func(src, mask, buf);
}

cv::Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf)
{
typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
static const func_t funcs[7][4] =
{
{sumSqrImpl<uchar , double, 1>, sumSqrImpl<uchar , double, 2>, sumSqrImpl<uchar , double, 3>, sumSqrImpl<uchar , double, 4>},
{sumSqrImpl<schar , double, 1>, sumSqrImpl<schar , double, 2>, sumSqrImpl<schar , double, 3>, sumSqrImpl<schar , double, 4>},
{sumSqrImpl<ushort, double, 1>, sumSqrImpl<ushort, double, 2>, sumSqrImpl<ushort, double, 3>, sumSqrImpl<ushort, double, 4>},
{sumSqrImpl<short , double, 1>, sumSqrImpl<short , double, 2>, sumSqrImpl<short , double, 3>, sumSqrImpl<short , double, 4>},
{sumSqrImpl<int   , double, 1>, sumSqrImpl<int   , double, 2>, sumSqrImpl<int   , double, 3>, sumSqrImpl<int   , double, 4>},
{sumSqrImpl<float , double, 1>, sumSqrImpl<float , double, 2>, sumSqrImpl<float , double, 3>, sumSqrImpl<float , double, 4>},
{sumSqrImpl<double, double, 1>, sumSqrImpl<double, double, 2>, sumSqrImpl<double, double, 3>, sumSqrImpl<double, double, 4>}
};

GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();

CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );

const func_t func = funcs[src.depth()][src.channels() - 1];

return func(src, mask, buf);
}

#endif
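
// A minimal usage sketch for the rewritten reductions above, using the
// buffered cv::cuda::sum/absSum/sqrSum signatures defined in this file
// (the image size and type are illustrative only):
//
//   cv::cuda::GpuMat d_src(480, 640, CV_8UC3, cv::Scalar::all(1));
//   cv::cuda::GpuMat buf;   // scratch buffer, reused across calls
//   cv::Scalar s  = cv::cuda::sum(d_src, cv::noArray(), buf);
//   cv::Scalar sa = cv::cuda::absSum(d_src, cv::noArray(), buf);
//   cv::Scalar s2 = cv::cuda::sqrSum(d_src, cv::noArray(), buf);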

@ -40,75 +40,109 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV

#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"

using namespace cv::cuda;
using namespace cv::cuda::device;
#else

namespace cv { namespace cuda { namespace device
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"

using namespace cv::cudev;

namespace
{
template <typename T> struct TransformFunctorTraits< thresh_binary_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};

template <typename T> struct TransformFunctorTraits< thresh_binary_inv_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};

template <typename T> struct TransformFunctorTraits< thresh_trunc_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};

template <typename T> struct TransformFunctorTraits< thresh_to_zero_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};

template <typename T> struct TransformFunctorTraits< thresh_to_zero_inv_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}

namespace arithm
{
template <template <typename> class Op, typename T>
void threshold_caller(PtrStepSz<T> src, PtrStepSz<T> dst, T thresh, T maxVal, cudaStream_t stream)
{
Op<T> op(thresh, maxVal);
device::transform(src, dst, op, WithOutMask(), stream);
}

template <typename T>
void threshold(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream)
void thresholdImpl(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, Stream& stream)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> dst, T thresh, T maxVal, cudaStream_t stream);
const T thresh_ = static_cast<T>(thresh);
const T maxVal_ = static_cast<T>(maxVal);

static const caller_t callers[] =
switch (type)
{
threshold_caller<thresh_binary_func, T>,
threshold_caller<thresh_binary_inv_func, T>,
threshold_caller<thresh_trunc_func, T>,
threshold_caller<thresh_to_zero_func, T>,
threshold_caller<thresh_to_zero_inv_func, T>
case 0:
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), thresh_binary_func(thresh_, maxVal_), stream);
break;
case 1:
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), thresh_binary_inv_func(thresh_, maxVal_), stream);
break;
case 2:
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), thresh_trunc_func(thresh_), stream);
break;
case 3:
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), thresh_to_zero_func(thresh_), stream);
break;
case 4:
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), thresh_to_zero_inv_func(thresh_), stream);
break;
};

callers[type]((PtrStepSz<T>) src, (PtrStepSz<T>) dst, static_cast<T>(thresh), static_cast<T>(maxVal), stream);
}

template void threshold<uchar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<schar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<ushort>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<short>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<int>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<float>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<double>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
}

#endif // CUDA_DISABLER
double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream& stream)
{
GpuMat src = _src.getGpuMat();

const int depth = src.depth();

CV_DbgAssert( src.channels() == 1 && depth <= CV_64F );
CV_DbgAssert( type <= 4 /*THRESH_TOZERO_INV*/ );

_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();

if (depth == CV_32F && type == 2 /*THRESH_TRUNC*/)
{
NppStreamHandler h(StreamAccessor::getStream(stream));

NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;

nppSafeCall( nppiThreshold_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, static_cast<Npp32f>(thresh), NPP_CMP_GREATER) );

if (!stream)
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}
else
{
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, Stream& stream);
static const func_t funcs[] =
{
thresholdImpl<uchar>,
thresholdImpl<schar>,
thresholdImpl<ushort>,
thresholdImpl<short>,
thresholdImpl<int>,
thresholdImpl<float>,
thresholdImpl<double>
};

if (depth != CV_32F && depth != CV_64F)
{
thresh = cvFloor(thresh);
maxVal = cvRound(maxVal);
}

funcs[depth](src, dst, thresh, maxVal, type, stream);
}

return thresh;
}

#endif
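
// A minimal usage sketch for cv::cuda::threshold as implemented above,
// assuming the default-stream overload declared in opencv2/cudaarithm.hpp;
// the cv::THRESH_* constants map to the type codes 0..4 handled by
// thresholdImpl (0 == THRESH_BINARY, 2 == THRESH_TRUNC):
//
//   cv::cuda::GpuMat src(480, 640, CV_32FC1, cv::Scalar(0.75));
//   cv::cuda::GpuMat dst;
//   // CV_32F + THRESH_TRUNC takes the NPP branch above; every other
//   // combination dispatches through funcs[depth] to thresholdImpl<T>.
//   cv::cuda::threshold(src, dst, 0.5, 1.0, cv::THRESH_TRUNC);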

@ -40,83 +40,53 @@
//
//M*/

#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"

#include "opencv2/core/cuda/common.hpp"
#ifndef HAVE_OPENCV_CUDEV

using namespace cv::cuda;
using namespace cv::cuda::device;
#error "opencv_cudev is required"

namespace arithm
#else

#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"

using namespace cv::cudev;

void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
{
const int TRANSPOSE_TILE_DIM = 16;
const int TRANSPOSE_BLOCK_ROWS = 16;
GpuMat src = _src.getGpuMat();

template <typename T>
__global__ void transposeKernel(const PtrStepSz<T> src, PtrStep<T> dst)
const size_t elemSize = src.elemSize();

CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 );

_dst.create( src.cols, src.rows, src.type() );
GpuMat dst = _dst.getGpuMat();

if (elemSize == 1)
{
__shared__ T tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM + 1];
NppStreamHandler h(StreamAccessor::getStream(stream));

int blockIdx_x, blockIdx_y;
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;

// do diagonal reordering
if (gridDim.x == gridDim.y)
{
blockIdx_y = blockIdx.x;
blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
}
else
{
int bid = blockIdx.x + gridDim.x * blockIdx.y;
blockIdx_y = bid % gridDim.y;
blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x;
}
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );

int xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x;
int yIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y;

if (xIndex < src.cols)
{
for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
{
if (yIndex + i < src.rows)
{
tile[threadIdx.y + i][threadIdx.x] = src(yIndex + i, xIndex);
}
}
}

__syncthreads();

xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x;
yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y;

if (xIndex < src.rows)
{
for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
{
if (yIndex + i < src.cols)
{
dst(yIndex + i, xIndex) = tile[threadIdx.x][threadIdx.y + i];
}
}
}
if (!stream)
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}

template <typename T> void transpose(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
else if (elemSize == 4)
{
const dim3 block(TRANSPOSE_TILE_DIM, TRANSPOSE_TILE_DIM);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));

transposeKernel<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
gridTranspose(globPtr<int>(src), globPtr<int>(dst), stream);
}
else // if (elemSize == 8)
{
gridTranspose(globPtr<double>(src), globPtr<double>(dst), stream);
}

template void transpose<int>(PtrStepSz<int> src, PtrStepSz<int> dst, cudaStream_t stream);
template void transpose<double>(PtrStepSz<double> src, PtrStepSz<double> dst, cudaStream_t stream);
}

#endif // CUDA_DISABLER
#endif
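
// A minimal usage sketch for cv::cuda::transpose as implemented above;
// only 1-, 4- and 8-byte elements are accepted (e.g. CV_8UC1, CV_32FC1,
// CV_8UC4, CV_64FC1), so a CV_8UC3 input fails the CV_Assert:
//
//   cv::cuda::GpuMat src(480, 640, CV_32FC1);
//   cv::cuda::GpuMat dst;
//   cv::cuda::transpose(src, dst);   // dst becomes 640x480, 4-byte path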

@ -1,135 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __UNROLL_DETAIL_HPP__
#define __UNROLL_DETAIL_HPP__

#include <thrust/tuple.h>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"

namespace detail
{
template <int cn> struct Unroll;
template <> struct Unroll<1>
{
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ volatile R* smem_tuple(R* smem)
{
return smem;
}

template <typename R>
static __device__ __forceinline__ R& tie(R& val)
{
return val;
}

template <class Op>
static __device__ __forceinline__ const Op& op(const Op& op)
{
return op;
}
};
template <> struct Unroll<2>
{
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*> smem_tuple(R* smem)
{
return cv::cuda::device::smem_tuple(smem, smem + BLOCK_SIZE);
}

template <typename R>
static __device__ __forceinline__ thrust::tuple<typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&> tie(R& val)
{
return thrust::tie(val.x, val.y);
}

template <class Op>
static __device__ __forceinline__ const thrust::tuple<Op, Op> op(const Op& op)
{
return thrust::make_tuple(op, op);
}
};
template <> struct Unroll<3>
{
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*, volatile R*> smem_tuple(R* smem)
{
return cv::cuda::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
}

template <typename R>
static __device__ __forceinline__ thrust::tuple<typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&> tie(R& val)
{
return thrust::tie(val.x, val.y, val.z);
}

template <class Op>
static __device__ __forceinline__ const thrust::tuple<Op, Op, Op> op(const Op& op)
{
return thrust::make_tuple(op, op, op);
}
};
template <> struct Unroll<4>
{
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*, volatile R*, volatile R*> smem_tuple(R* smem)
{
return cv::cuda::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
}

template <typename R>
static __device__ __forceinline__ thrust::tuple<typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&> tie(R& val)
{
return thrust::tie(val.x, val.y, val.z, val.w);
}

template <class Op>
static __device__ __forceinline__ const thrust::tuple<Op, Op, Op, Op> op(const Op& op)
{
return thrust::make_tuple(op, op, op, op);
}
};
}

#endif // __UNROLL_DETAIL_HPP__
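
// Note on the deleted helper: detail::Unroll<cn> let a single block-wide
// reduction handle all cn channels at once by fanning a vector value out
// into thrust tuples. For cn == 2 and R == float2 the calls expand roughly to
//
//   Unroll<2>::tie(sum)                // thrust::tie(sum.x, sum.y)
//   Unroll<2>::op(plus<float>())       // thrust::make_tuple(op, op)
//   Unroll<2>::smem_tuple<256>(smem)   // (smem, smem + 256) channel banks
//
// With opencv_cudev this role is taken over by gridCalcSum, as seen in the
// rewritten sum.cu above.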

File diff suppressed because it is too large

@ -52,13 +52,6 @@

#include "opencv2/core/private.cuda.hpp"

#include "opencv2/opencv_modules.hpp"

#ifdef HAVE_OPENCV_CUDALEGACY
#  include "opencv2/cudalegacy.hpp"
#  include "opencv2/cudalegacy/private.hpp"
#endif

#ifdef HAVE_CUBLAS
#  include <cublas.h>
#endif
@ -133,513 +133,6 @@ double cv::cuda::norm(InputArray _src, int normType, InputArray _mask, GpuMat& b
|
||||
return std::max(std::abs(min_val), std::abs(max_val));
|
||||
}
|
||||
|
||||
double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normType)
|
||||
{
|
||||
#if CUDA_VERSION < 5050
|
||||
(void) buf;
|
||||
|
||||
typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2, NppiSize oSizeROI, Npp64f* pRetVal);
|
||||
|
||||
static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
|
||||
#else
|
||||
typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
|
||||
NppiSize oSizeROI, Npp64f* pRetVal, Npp8u * pDeviceBuffer);
|
||||
|
||||
typedef NppStatus (*buf_size_func_t)(NppiSize oSizeROI, int* hpBufferSize);
|
||||
|
||||
static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
|
||||
|
||||
static const buf_size_func_t buf_size_funcs[] = {nppiNormDiffInfGetBufferHostSize_8u_C1R, nppiNormDiffL1GetBufferHostSize_8u_C1R, nppiNormDiffL2GetBufferHostSize_8u_C1R};
|
||||
#endif
|
||||
|
||||
GpuMat src1 = _src1.getGpuMat();
|
||||
GpuMat src2 = _src2.getGpuMat();
|
||||
|
||||
CV_Assert( src1.type() == CV_8UC1 );
|
||||
CV_Assert( src1.size() == src2.size() && src1.type() == src2.type() );
|
||||
CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 );
|
||||
|
||||
NppiSize sz;
|
||||
sz.width = src1.cols;
|
||||
sz.height = src1.rows;
|
||||
|
||||
const int funcIdx = normType >> 1;
|
||||
|
||||
DeviceBuffer dbuf;
|
||||
|
||||
#if CUDA_VERSION < 5050
|
||||
nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf) );
|
||||
#else
|
||||
int bufSize;
|
||||
buf_size_funcs[funcIdx](sz, &bufSize);
|
||||
|
||||
ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);
|
||||
|
||||
nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf, buf.data) );
|
||||
#endif
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
|
||||
double retVal;
|
||||
dbuf.download(&retVal);
|
||||
|
||||
return retVal;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Sum
|
||||
|
||||
namespace sum
|
||||
{
|
||||
void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows);
|
||||
|
||||
template <typename T, int cn>
|
||||
void run(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
|
||||
|
||||
template <typename T, int cn>
|
||||
void runAbs(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
|
||||
|
||||
template <typename T, int cn>
|
||||
void runSqr(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
|
||||
}
|
||||
|
||||
Scalar cv::cuda::sum(InputArray _src, InputArray _mask, GpuMat& buf)
|
||||
{
|
||||
GpuMat src = _src.getGpuMat();
|
||||
GpuMat mask = _mask.getGpuMat();
|
||||
|
||||
typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
|
||||
static const func_t funcs[7][5] =
|
||||
{
|
||||
{0, ::sum::run<uchar , 1>, ::sum::run<uchar , 2>, ::sum::run<uchar , 3>, ::sum::run<uchar , 4>},
|
||||
{0, ::sum::run<schar , 1>, ::sum::run<schar , 2>, ::sum::run<schar , 3>, ::sum::run<schar , 4>},
|
||||
{0, ::sum::run<ushort, 1>, ::sum::run<ushort, 2>, ::sum::run<ushort, 3>, ::sum::run<ushort, 4>},
|
||||
{0, ::sum::run<short , 1>, ::sum::run<short , 2>, ::sum::run<short , 3>, ::sum::run<short , 4>},
|
||||
{0, ::sum::run<int , 1>, ::sum::run<int , 2>, ::sum::run<int , 3>, ::sum::run<int , 4>},
|
||||
{0, ::sum::run<float , 1>, ::sum::run<float , 2>, ::sum::run<float , 3>, ::sum::run<float , 4>},
|
||||
{0, ::sum::run<double, 1>, ::sum::run<double, 2>, ::sum::run<double, 3>, ::sum::run<double, 4>}
|
||||
};
|
||||
|
||||
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
|
||||
|
||||
if (src.depth() == CV_64F)
|
||||
{
|
||||
if (!deviceSupports(NATIVE_DOUBLE))
|
||||
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
|
||||
}
|
||||
|
||||
Size buf_size;
|
||||
::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
|
||||
ensureSizeIsEnough(buf_size, CV_8U, buf);
|
||||
buf.setTo(Scalar::all(0));
|
||||
|
||||
const func_t func = funcs[src.depth()][src.channels()];
|
||||
|
||||
double result[4];
|
||||
func(src, buf.data, result, mask);
|
||||
|
||||
return Scalar(result[0], result[1], result[2], result[3]);
|
||||
}
|
||||
|
||||
Scalar cv::cuda::absSum(InputArray _src, InputArray _mask, GpuMat& buf)
|
||||
{
|
||||
GpuMat src = _src.getGpuMat();
|
||||
GpuMat mask = _mask.getGpuMat();
|
||||
|
||||
typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
|
||||
static const func_t funcs[7][5] =
|
||||
{
|
||||
{0, ::sum::runAbs<uchar , 1>, ::sum::runAbs<uchar , 2>, ::sum::runAbs<uchar , 3>, ::sum::runAbs<uchar , 4>},
|
||||
{0, ::sum::runAbs<schar , 1>, ::sum::runAbs<schar , 2>, ::sum::runAbs<schar , 3>, ::sum::runAbs<schar , 4>},
|
||||
{0, ::sum::runAbs<ushort, 1>, ::sum::runAbs<ushort, 2>, ::sum::runAbs<ushort, 3>, ::sum::runAbs<ushort, 4>},
|
||||
{0, ::sum::runAbs<short , 1>, ::sum::runAbs<short , 2>, ::sum::runAbs<short , 3>, ::sum::runAbs<short , 4>},
|
||||
{0, ::sum::runAbs<int , 1>, ::sum::runAbs<int , 2>, ::sum::runAbs<int , 3>, ::sum::runAbs<int , 4>},
|
||||
{0, ::sum::runAbs<float , 1>, ::sum::runAbs<float , 2>, ::sum::runAbs<float , 3>, ::sum::runAbs<float , 4>},
|
||||
{0, ::sum::runAbs<double, 1>, ::sum::runAbs<double, 2>, ::sum::runAbs<double, 3>, ::sum::runAbs<double, 4>}
|
||||
};
|
||||
|
||||
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
|
||||
|
||||
if (src.depth() == CV_64F)
|
||||
{
|
||||
if (!deviceSupports(NATIVE_DOUBLE))
|
||||
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
|
||||
}
|
||||
|
||||
Size buf_size;
|
||||
::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
|
||||
ensureSizeIsEnough(buf_size, CV_8U, buf);
|
||||
buf.setTo(Scalar::all(0));
|
||||
|
||||
const func_t func = funcs[src.depth()][src.channels()];
|
||||
|
||||
double result[4];
|
||||
func(src, buf.data, result, mask);
|
||||
|
||||
return Scalar(result[0], result[1], result[2], result[3]);
|
||||
}
|
||||
|
||||
Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf)
|
||||
{
|
||||
GpuMat src = _src.getGpuMat();
|
||||
GpuMat mask = _mask.getGpuMat();
|
||||
|
||||
typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
|
||||
static const func_t funcs[7][5] =
|
||||
{
|
||||
{0, ::sum::runSqr<uchar , 1>, ::sum::runSqr<uchar , 2>, ::sum::runSqr<uchar , 3>, ::sum::runSqr<uchar , 4>},
|
||||
{0, ::sum::runSqr<schar , 1>, ::sum::runSqr<schar , 2>, ::sum::runSqr<schar , 3>, ::sum::runSqr<schar , 4>},
|
||||
{0, ::sum::runSqr<ushort, 1>, ::sum::runSqr<ushort, 2>, ::sum::runSqr<ushort, 3>, ::sum::runSqr<ushort, 4>},
|
||||
{0, ::sum::runSqr<short , 1>, ::sum::runSqr<short , 2>, ::sum::runSqr<short , 3>, ::sum::runSqr<short , 4>},
|
||||
{0, ::sum::runSqr<int , 1>, ::sum::runSqr<int , 2>, ::sum::runSqr<int , 3>, ::sum::runSqr<int , 4>},
|
||||
{0, ::sum::runSqr<float , 1>, ::sum::runSqr<float , 2>, ::sum::runSqr<float , 3>, ::sum::runSqr<float , 4>},
|
||||
{0, ::sum::runSqr<double, 1>, ::sum::runSqr<double, 2>, ::sum::runSqr<double, 3>, ::sum::runSqr<double, 4>}
|
||||
};
|
||||
|
||||
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
|
||||
|
||||
if (src.depth() == CV_64F)
|
||||
{
|
||||
if (!deviceSupports(NATIVE_DOUBLE))
|
||||
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
|
||||
}
|
||||
|
||||
Size buf_size;
|
||||
::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
|
||||
ensureSizeIsEnough(buf_size, CV_8U, buf);
|
||||
buf.setTo(Scalar::all(0));
|
||||
|
||||
const func_t func = funcs[src.depth()][src.channels()];
|
||||
|
||||
double result[4];
|
||||
func(src, buf.data, result, mask);
|
||||
|
||||
return Scalar(result[0], result[1], result[2], result[3]);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// minMax
|
||||
|
||||
namespace minMax
|
||||
{
|
||||
void getBufSize(int cols, int rows, int& bufcols, int& bufrows);
|
||||
|
||||
template <typename T>
|
||||
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
|
||||
}
|
||||
|
||||
void cv::cuda::minMax(InputArray _src, double* minVal, double* maxVal, InputArray _mask, GpuMat& buf)
|
||||
{
|
||||
GpuMat src = _src.getGpuMat();
|
||||
GpuMat mask = _mask.getGpuMat();
|
||||
|
||||
typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
|
||||
static const func_t funcs[] =
|
||||
{
|
||||
::minMax::run<uchar>,
|
||||
::minMax::run<schar>,
|
||||
::minMax::run<ushort>,
|
||||
::minMax::run<short>,
|
||||
::minMax::run<int>,
|
||||
::minMax::run<float>,
|
||||
::minMax::run<double>
|
||||
};
|
||||
|
||||
CV_Assert( src.channels() == 1 );
|
||||
CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
|
||||
|
||||
if (src.depth() == CV_64F)
|
||||
{
|
||||
if (!deviceSupports(NATIVE_DOUBLE))
|
||||
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
|
||||
}
|
||||
|
||||
Size buf_size;
|
||||
::minMax::getBufSize(src.cols, src.rows, buf_size.width, buf_size.height);
|
||||
ensureSizeIsEnough(buf_size, CV_8U, buf);
|
||||
|
||||
const func_t func = funcs[src.depth()];
|
||||
|
||||
double temp1, temp2;
|
||||
func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, buf);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// minMaxLoc
|
||||
|
||||
namespace minMaxLoc
|
||||
{
|
||||
void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows);
|
||||
|
||||
template <typename T>
|
||||
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
|
||||
}
|
||||
|
||||
void cv::cuda::minMaxLoc(InputArray _src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
|
||||
InputArray _mask, GpuMat& valBuf, GpuMat& locBuf)
|
||||
{
|
||||
GpuMat src = _src.getGpuMat();
|
||||
GpuMat mask = _mask.getGpuMat();
|
||||
|
||||
typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
|
||||
static const func_t funcs[] =
|
||||
{
|
||||
::minMaxLoc::run<uchar>,
|
||||
::minMaxLoc::run<schar>,
|
||||
::minMaxLoc::run<ushort>,
|
||||
::minMaxLoc::run<short>,
|
||||
::minMaxLoc::run<int>,
|
||||
::minMaxLoc::run<float>,
|
||||
::minMaxLoc::run<double>
|
||||
};
|
||||
|
||||
CV_Assert( src.channels() == 1 );
|
||||
CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
|
||||
|
||||
if (src.depth() == CV_64F)
|
||||
{
|
||||
if (!deviceSupports(NATIVE_DOUBLE))
|
||||
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
|
||||
}
|
||||
|
||||
Size valbuf_size, locbuf_size;
|
||||
::minMaxLoc::getBufSize(src.cols, src.rows, src.elemSize(), valbuf_size.width, valbuf_size.height, locbuf_size.width, locbuf_size.height);
|
||||
ensureSizeIsEnough(valbuf_size, CV_8U, valBuf);
|
||||
ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);
|
||||
|
||||
const func_t func = funcs[src.depth()];
|
||||
|
||||
double temp1, temp2;
|
||||
Point temp3, temp4;
|
||||
func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, minLoc ? &minLoc->x : &temp3.x, maxLoc ? &maxLoc->x : &temp4.x, valBuf, locBuf);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// countNonZero
|
||||
|
||||
namespace countNonZero
|
||||
{
|
||||
void getBufSize(int cols, int rows, int& bufcols, int& bufrows);
|
||||
|
||||
template <typename T>
|
||||
int run(const PtrStepSzb src, PtrStep<unsigned int> buf);
|
||||
}
|
||||
|
||||
int cv::cuda::countNonZero(InputArray _src, GpuMat& buf)
|
||||
{
|
||||
GpuMat src = _src.getGpuMat();
|
||||
|
||||
typedef int (*func_t)(const PtrStepSzb src, PtrStep<unsigned int> buf);
|
||||
static const func_t funcs[] =
|
||||
{
|
||||
::countNonZero::run<uchar>,
|
||||
::countNonZero::run<schar>,
|
||||
::countNonZero::run<ushort>,
|
||||
::countNonZero::run<short>,
|
||||
::countNonZero::run<int>,
|
||||
::countNonZero::run<float>,
|
||||
::countNonZero::run<double>
|
||||
};
|
||||
|
||||
CV_Assert(src.channels() == 1);
|
||||
|
||||
if (src.depth() == CV_64F)
|
||||
{
|
||||
if (!deviceSupports(NATIVE_DOUBLE))
|
||||
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
|
||||
}
|
||||
|
||||
Size buf_size;
|
||||
::countNonZero::getBufSize(src.cols, src.rows, buf_size.width, buf_size.height);
|
||||
ensureSizeIsEnough(buf_size, CV_8U, buf);
|
||||
|
||||
const func_t func = funcs[src.depth()];
|
||||
|
||||
return func(src, buf);
|
||||
}

//////////////////////////////////////////////////////////////////////////////
// reduce

namespace reduce
{
    template <typename T, typename S, typename D>
    void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template <typename T, typename S, typename D>
    void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
}

void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp, int dtype, Stream& stream)
{
    GpuMat src = _src.getGpuMat();

    CV_Assert( src.channels() <= 4 );
    CV_Assert( dim == 0 || dim == 1 );
    CV_Assert( reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_MAX || reduceOp == REDUCE_MIN );

    if (dtype < 0)
        dtype = src.depth();

    _dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
    GpuMat dst = _dst.getGpuMat();

    if (dim == 0)
    {
        typedef void (*func_t)(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
        static const func_t funcs[7][7] =
        {
            {
                ::reduce::rows<unsigned char, int, unsigned char>,
                0/*::reduce::rows<unsigned char, int, signed char>*/,
                0/*::reduce::rows<unsigned char, int, unsigned short>*/,
                0/*::reduce::rows<unsigned char, int, short>*/,
                ::reduce::rows<unsigned char, int, int>,
                ::reduce::rows<unsigned char, float, float>,
                ::reduce::rows<unsigned char, double, double>
            },
            {
                0/*::reduce::rows<signed char, int, unsigned char>*/,
                0/*::reduce::rows<signed char, int, signed char>*/,
                0/*::reduce::rows<signed char, int, unsigned short>*/,
                0/*::reduce::rows<signed char, int, short>*/,
                0/*::reduce::rows<signed char, int, int>*/,
                0/*::reduce::rows<signed char, float, float>*/,
                0/*::reduce::rows<signed char, double, double>*/
            },
            {
                0/*::reduce::rows<unsigned short, int, unsigned char>*/,
                0/*::reduce::rows<unsigned short, int, signed char>*/,
                ::reduce::rows<unsigned short, int, unsigned short>,
                0/*::reduce::rows<unsigned short, int, short>*/,
                ::reduce::rows<unsigned short, int, int>,
                ::reduce::rows<unsigned short, float, float>,
                ::reduce::rows<unsigned short, double, double>
            },
            {
                0/*::reduce::rows<short, int, unsigned char>*/,
                0/*::reduce::rows<short, int, signed char>*/,
                0/*::reduce::rows<short, int, unsigned short>*/,
                ::reduce::rows<short, int, short>,
                ::reduce::rows<short, int, int>,
                ::reduce::rows<short, float, float>,
                ::reduce::rows<short, double, double>
            },
            {
                0/*::reduce::rows<int, int, unsigned char>*/,
                0/*::reduce::rows<int, int, signed char>*/,
                0/*::reduce::rows<int, int, unsigned short>*/,
                0/*::reduce::rows<int, int, short>*/,
                ::reduce::rows<int, int, int>,
                ::reduce::rows<int, float, float>,
                ::reduce::rows<int, double, double>
            },
            {
                0/*::reduce::rows<float, float, unsigned char>*/,
                0/*::reduce::rows<float, float, signed char>*/,
                0/*::reduce::rows<float, float, unsigned short>*/,
                0/*::reduce::rows<float, float, short>*/,
                0/*::reduce::rows<float, float, int>*/,
                ::reduce::rows<float, float, float>,
                ::reduce::rows<float, double, double>
            },
            {
                0/*::reduce::rows<double, double, unsigned char>*/,
                0/*::reduce::rows<double, double, signed char>*/,
                0/*::reduce::rows<double, double, unsigned short>*/,
                0/*::reduce::rows<double, double, short>*/,
                0/*::reduce::rows<double, double, int>*/,
                0/*::reduce::rows<double, double, float>*/,
                ::reduce::rows<double, double, double>
            }
        };

        const func_t func = funcs[src.depth()][dst.depth()];

        if (!func)
            CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");

        func(src.reshape(1), dst.data, reduceOp, StreamAccessor::getStream(stream));
    }
    else
    {
        typedef void (*func_t)(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
        static const func_t funcs[7][7] =
        {
            {
                ::reduce::cols<unsigned char, int, unsigned char>,
                0/*::reduce::cols<unsigned char, int, signed char>*/,
                0/*::reduce::cols<unsigned char, int, unsigned short>*/,
                0/*::reduce::cols<unsigned char, int, short>*/,
                ::reduce::cols<unsigned char, int, int>,
                ::reduce::cols<unsigned char, float, float>,
                ::reduce::cols<unsigned char, double, double>
            },
            {
                0/*::reduce::cols<signed char, int, unsigned char>*/,
                0/*::reduce::cols<signed char, int, signed char>*/,
                0/*::reduce::cols<signed char, int, unsigned short>*/,
                0/*::reduce::cols<signed char, int, short>*/,
                0/*::reduce::cols<signed char, int, int>*/,
                0/*::reduce::cols<signed char, float, float>*/,
                0/*::reduce::cols<signed char, double, double>*/
            },
            {
                0/*::reduce::cols<unsigned short, int, unsigned char>*/,
                0/*::reduce::cols<unsigned short, int, signed char>*/,
                ::reduce::cols<unsigned short, int, unsigned short>,
                0/*::reduce::cols<unsigned short, int, short>*/,
                ::reduce::cols<unsigned short, int, int>,
                ::reduce::cols<unsigned short, float, float>,
                ::reduce::cols<unsigned short, double, double>
            },
            {
                0/*::reduce::cols<short, int, unsigned char>*/,
                0/*::reduce::cols<short, int, signed char>*/,
                0/*::reduce::cols<short, int, unsigned short>*/,
                ::reduce::cols<short, int, short>,
                ::reduce::cols<short, int, int>,
                ::reduce::cols<short, float, float>,
                ::reduce::cols<short, double, double>
            },
            {
                0/*::reduce::cols<int, int, unsigned char>*/,
                0/*::reduce::cols<int, int, signed char>*/,
                0/*::reduce::cols<int, int, unsigned short>*/,
                0/*::reduce::cols<int, int, short>*/,
                ::reduce::cols<int, int, int>,
                ::reduce::cols<int, float, float>,
                ::reduce::cols<int, double, double>
            },
            {
                0/*::reduce::cols<float, float, unsigned char>*/,
                0/*::reduce::cols<float, float, signed char>*/,
                0/*::reduce::cols<float, float, unsigned short>*/,
                0/*::reduce::cols<float, float, short>*/,
                0/*::reduce::cols<float, float, int>*/,
                ::reduce::cols<float, float, float>,
                ::reduce::cols<float, double, double>
            },
            {
                0/*::reduce::cols<double, double, unsigned char>*/,
                0/*::reduce::cols<double, double, signed char>*/,
                0/*::reduce::cols<double, double, unsigned short>*/,
                0/*::reduce::cols<double, double, short>*/,
                0/*::reduce::cols<double, double, int>*/,
                0/*::reduce::cols<double, double, float>*/,
                ::reduce::cols<double, double, double>
            }
        };

        const func_t func = funcs[src.depth()][dst.depth()];

        if (!func)
            CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");

        func(src, dst.data, src.channels(), reduceOp, StreamAccessor::getStream(stream));
    }
}
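
// Editor's sketch, not part of the patch: summing each column of an 8-bit
// image into a CV_32S row to avoid overflow. dim == 0 collapses the rows, so
// the result is a 1 x src.cols vector; the names below are illustrative.
static void columnSumsExample(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& sums)
{
    cv::cuda::reduce(src, sums, 0, cv::REDUCE_SUM, CV_32S, cv::cuda::Stream::Null());
}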

////////////////////////////////////////////////////////////////////////
// meanStdDev

@ -748,116 +241,4 @@ void cv::cuda::normalize(InputArray _src, OutputArray dst, double a, double b, i
    }
}

////////////////////////////////////////////////////////////////////////
// integral

namespace cv { namespace cuda { namespace device
{
    namespace imgproc
    {
        void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
    }
}}}

void cv::cuda::integral(InputArray _src, OutputArray _dst, GpuMat& buffer, Stream& _stream)
{
    GpuMat src = _src.getGpuMat();

    CV_Assert( src.type() == CV_8UC1 );

    cudaStream_t stream = StreamAccessor::getStream(_stream);

    cv::Size whole;
    cv::Point offset;
    src.locateROI(whole, offset);

    if (deviceSupports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
        && offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (static_cast<int>(src.step) - offset.x))
    {
        ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);

        cv::cuda::device::imgproc::shfl_integral_gpu(src, buffer, stream);

        _dst.create(src.rows + 1, src.cols + 1, CV_32SC1);
        GpuMat dst = _dst.getGpuMat();

        dst.setTo(Scalar::all(0), _stream);

        GpuMat inner = dst(Rect(1, 1, src.cols, src.rows));
        GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));

        res.copyTo(inner, _stream);
    }
    else
    {
#ifndef HAVE_OPENCV_CUDALEGACY
        throw_no_cuda();
#else
        _dst.create(src.rows + 1, src.cols + 1, CV_32SC1);
        GpuMat dst = _dst.getGpuMat();

        NcvSize32u roiSize;
        roiSize.width = src.cols;
        roiSize.height = src.rows;

        cudaDeviceProp prop;
        cudaSafeCall( cudaGetDeviceProperties(&prop, cv::cuda::getDevice()) );

        Ncv32u bufSize;
        ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
        ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);

        NppStStreamHandler h(stream);

        ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
            dst.ptr<Ncv32u>(), static_cast<int>(dst.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
#endif
    }
}
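
// Editor's sketch, not part of the patch: the produced integral image has one
// extra leading row and column of zeros, so dst(y, x) holds the sum of src
// over [0, y) x [0, x). Variable names are illustrative.
static void integralExample(const cv::cuda::GpuMat& src8u)
{
    cv::cuda::GpuMat sum, scratch;
    cv::cuda::integral(src8u, sum, scratch, cv::cuda::Stream::Null());
    CV_Assert( sum.rows == src8u.rows + 1 && sum.cols == src8u.cols + 1 );
}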

//////////////////////////////////////////////////////////////////////////////
// sqrIntegral

void cv::cuda::sqrIntegral(InputArray _src, OutputArray _dst, GpuMat& buf, Stream& _stream)
{
#ifndef HAVE_OPENCV_CUDALEGACY
    (void) _src;
    (void) _dst;
    (void) _stream;
    throw_no_cuda();
#else
    GpuMat src = _src.getGpuMat();

    CV_Assert( src.type() == CV_8U );

    NcvSize32u roiSize;
    roiSize.width = src.cols;
    roiSize.height = src.rows;

    cudaDeviceProp prop;
    cudaSafeCall( cudaGetDeviceProperties(&prop, cv::cuda::getDevice()) );

    Ncv32u bufSize;
    ncvSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));

    ensureSizeIsEnough(1, bufSize, CV_8U, buf);

    cudaStream_t stream = StreamAccessor::getStream(_stream);

    NppStStreamHandler h(stream);

    _dst.create(src.rows + 1, src.cols + 1, CV_64F);
    GpuMat dst = _dst.getGpuMat();

    ncvSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), static_cast<int>(src.step),
        dst.ptr<Ncv64u>(0), static_cast<int>(dst.step), roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
#endif
}
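
// Editor's note, not part of the patch: with an integral image S, the sum of
// the source over the rectangle [x0, x1) x [y0, y1) is recovered with four
// reads:
//     S(y1, x1) - S(y0, x1) - S(y1, x0) + S(y0, x0)
// which is what makes the extra leading row and column above convenient.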

#endif

@ -125,43 +125,6 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, GEMM, testing::Combine(
    ALL_GEMM_FLAGS,
    WHOLE_SUBMAT));

///////////////////////////////////////////////////////////////////////////////////////////////////////
// Integral

PARAM_TEST_CASE(Integral, cv::cuda::DeviceInfo, cv::Size, UseRoi)
{
    cv::cuda::DeviceInfo devInfo;
    cv::Size size;
    bool useRoi;

    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
        size = GET_PARAM(1);
        useRoi = GET_PARAM(2);

        cv::cuda::setDevice(devInfo.deviceID());
    }
};

CUDA_TEST_P(Integral, Accuracy)
{
    cv::Mat src = randomMat(size, CV_8UC1);

    cv::cuda::GpuMat dst = createMat(cv::Size(src.cols + 1, src.rows + 1), CV_32SC1, useRoi);
    cv::cuda::integral(loadMat(src, useRoi), dst);

    cv::Mat dst_gold;
    cv::integral(src, dst_gold, CV_32S);

    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}

INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Integral, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    WHOLE_SUBMAT));

////////////////////////////////////////////////////////////////////////////
// MulSpectrums

@ -816,4 +816,78 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, MeanStdDev, testing::Combine(
    DIFFERENT_SIZES,
    WHOLE_SUBMAT));

///////////////////////////////////////////////////////////////////////////////////////////////////////
// Integral

PARAM_TEST_CASE(Integral, cv::cuda::DeviceInfo, cv::Size, UseRoi)
{
    cv::cuda::DeviceInfo devInfo;
    cv::Size size;
    bool useRoi;

    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
        size = GET_PARAM(1);
        useRoi = GET_PARAM(2);

        cv::cuda::setDevice(devInfo.deviceID());
    }
};

CUDA_TEST_P(Integral, Accuracy)
{
    cv::Mat src = randomMat(size, CV_8UC1);

    cv::cuda::GpuMat dst = createMat(cv::Size(src.cols + 1, src.rows + 1), CV_32SC1, useRoi);
    cv::cuda::integral(loadMat(src, useRoi), dst);

    cv::Mat dst_gold;
    cv::integral(src, dst_gold, CV_32S);

    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}

INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Integral, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    WHOLE_SUBMAT));

///////////////////////////////////////////////////////////////////////////////////////////////////////
// IntegralSqr

PARAM_TEST_CASE(IntegralSqr, cv::cuda::DeviceInfo, cv::Size, UseRoi)
{
    cv::cuda::DeviceInfo devInfo;
    cv::Size size;
    bool useRoi;

    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
        size = GET_PARAM(1);
        useRoi = GET_PARAM(2);

        cv::cuda::setDevice(devInfo.deviceID());
    }
};

CUDA_TEST_P(IntegralSqr, Accuracy)
{
    cv::Mat src = randomMat(size, CV_8UC1);

    cv::cuda::GpuMat dst = createMat(cv::Size(src.cols + 1, src.rows + 1), CV_64FC1, useRoi);
    cv::cuda::sqrIntegral(loadMat(src, useRoi), dst);

    cv::Mat dst_gold, temp;
    cv::integral(src, temp, dst_gold);

    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}

INSTANTIATE_TEST_CASE_P(CUDA_Arithm, IntegralSqr, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    WHOLE_SUBMAT));

#endif // HAVE_CUDA

@ -4,7 +4,7 @@ endif()

set(the_description "CUDA device layer")

ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4189 /wd4505 -Wundef -Wmissing-declarations -Wunused-function -Wunused-variable)
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4189 /wd4505 -Wundef -Wmissing-declarations -Wunused-function -Wunused-variable -Wenum-compare)

ocv_add_module(cudev)

@ -73,7 +73,7 @@
#include "cudev/block/vec_distance.hpp"

#include "cudev/grid/copy.hpp"
#include "cudev/grid/glob_reduce.hpp"
#include "cudev/grid/reduce.hpp"
#include "cudev/grid/histogram.hpp"
#include "cudev/grid/integral.hpp"
#include "cudev/grid/pyramids.hpp"

@ -47,7 +47,7 @@
#define __OPENCV_CUDEV_EXPR_REDUCTION_HPP__

#include "../common.hpp"
#include "../grid/glob_reduce.hpp"
#include "../grid/reduce.hpp"
#include "../grid/histogram.hpp"
#include "../grid/integral.hpp"
#include "../grid/reduce_to_vec.hpp"

@ -616,6 +616,30 @@ template <typename T> struct magnitude_func : binary_function<T, T, typename fun
    }
};

template <typename T> struct magnitude_sqr_func : binary_function<T, T, typename functional_detail::FloatType<T>::type>
{
    __device__ __forceinline__ typename functional_detail::FloatType<T>::type operator ()(typename TypeTraits<T>::parameter_type a, typename TypeTraits<T>::parameter_type b) const
    {
        return a * a + b * b;
    }
};

template <typename T, bool angleInDegrees> struct direction_func : binary_function<T, T, T>
{
    __device__ T operator ()(T x, T y) const
    {
        atan2_func<T> f;
        typename atan2_func<T>::result_type angle = f(y, x);

        angle += (angle < 0) * (2.0f * CV_PI_F);

        if (angleInDegrees)
            angle *= (180.0f / CV_PI_F);

        return saturate_cast<T>(angle);
    }
};
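
// Editor's note, not part of the patch: "(angle < 0) * (2.0f * CV_PI_F)" is a
// branchless wrap that shifts atan2's (-pi, pi] output into [0, 2*pi) without
// introducing warp divergence.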

template <typename T> struct pow_func : binary_function<T, float, float>
{
    __device__ __forceinline__ float operator ()(T val, float power) const

@ -594,7 +594,7 @@ namespace integral_detail
        CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }

    __host__ static void integral(const GlobPtr<uchar> src, GlobPtr<uint> dst, int rows, int cols, cudaStream_t stream)
    __host__ static void integral(const GlobPtr<uchar>& src, const GlobPtr<uint>& dst, int rows, int cols, cudaStream_t stream)
    {
        if (deviceSupports(FEATURE_SET_COMPUTE_30)
            && (cols % 16 == 0)
@ -614,7 +614,7 @@ namespace integral_detail
        CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }

    __host__ static void integral(const GlobPtr<uchar> src, GlobPtr<int> dst, int rows, int cols, cudaStream_t stream)
    __host__ __forceinline__ void integral(const GlobPtr<uchar>& src, const GlobPtr<int>& dst, int rows, int cols, cudaStream_t stream)
    {
        GlobPtr<uint> dstui = globPtr((uint*) dst.data, dst.step);
        integral(src, dstui, rows, cols, stream);

177 modules/cudev/include/opencv2/cudev/grid/detail/minmaxloc.hpp (new file)
@ -0,0 +1,177 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#pragma once

#ifndef __OPENCV_CUDEV_GRID_MINMAXLOC_DETAIL_HPP__
#define __OPENCV_CUDEV_GRID_MINMAXLOC_DETAIL_HPP__

#include "../../common.hpp"
#include "../../util/vec_traits.hpp"
#include "../../util/type_traits.hpp"
#include "../../util/limits.hpp"
#include "../../block/reduce.hpp"

namespace cv { namespace cudev {

namespace grid_minmaxloc_detail
{
    template <int BLOCK_SIZE, class SrcPtr, typename ResType, class MaskPtr>
    __global__ void minMaxLoc_pass_1(const SrcPtr src, ResType* minVal, ResType* maxVal, int* minLoc, int* maxLoc, const MaskPtr mask, const int rows, const int cols, const int patch_y, const int patch_x)
    {
        __shared__ ResType sMinVal[BLOCK_SIZE];
        __shared__ ResType sMaxVal[BLOCK_SIZE];
        __shared__ uint sMinLoc[BLOCK_SIZE];
        __shared__ uint sMaxLoc[BLOCK_SIZE];

        const int x0 = blockIdx.x * blockDim.x * patch_x + threadIdx.x;
        const int y0 = blockIdx.y * blockDim.y * patch_y + threadIdx.y;

        ResType myMin = numeric_limits<ResType>::max();
        ResType myMax = -numeric_limits<ResType>::max();
        int myMinLoc = -1;
        int myMaxLoc = -1;

        for (int i = 0, y = y0; i < patch_y && y < rows; ++i, y += blockDim.y)
        {
            for (int j = 0, x = x0; j < patch_x && x < cols; ++j, x += blockDim.x)
            {
                if (mask(y, x))
                {
                    const ResType srcVal = src(y, x);

                    if (srcVal < myMin)
                    {
                        myMin = srcVal;
                        myMinLoc = y * cols + x;
                    }

                    if (srcVal > myMax)
                    {
                        myMax = srcVal;
                        myMaxLoc = y * cols + x;
                    }
                }
            }
        }

        const int tid = threadIdx.y * blockDim.x + threadIdx.x;

        blockReduceKeyVal<BLOCK_SIZE>(smem_tuple(sMinVal, sMaxVal), tie(myMin, myMax),
                                      smem_tuple(sMinLoc, sMaxLoc), tie(myMinLoc, myMaxLoc),
                                      tid,
                                      make_tuple(less<ResType>(), greater<ResType>()));

        const int bid = blockIdx.y * gridDim.x + blockIdx.x;

        if (tid == 0)
        {
            minVal[bid] = myMin;
            maxVal[bid] = myMax;
            minLoc[bid] = myMinLoc;
            maxLoc[bid] = myMaxLoc;
        }
    }

    template <int BLOCK_SIZE, typename T>
    __global__ void minMaxLoc_pass_2(T* minVal, T* maxVal, int* minLoc, int* maxLoc, int count)
    {
        __shared__ T sMinVal[BLOCK_SIZE];
        __shared__ T sMaxVal[BLOCK_SIZE];
        __shared__ int sMinLoc[BLOCK_SIZE];
        __shared__ int sMaxLoc[BLOCK_SIZE];

        const int idx = ::min(threadIdx.x, count - 1);

        T myMin = minVal[idx];
        T myMax = maxVal[idx];
        int myMinLoc = minLoc[idx];
        int myMaxLoc = maxLoc[idx];

        blockReduceKeyVal<BLOCK_SIZE>(smem_tuple(sMinVal, sMaxVal), tie(myMin, myMax),
                                      smem_tuple(sMinLoc, sMaxLoc), tie(myMinLoc, myMaxLoc),
                                      threadIdx.x,
                                      make_tuple(less<T>(), greater<T>()));

        if (threadIdx.x == 0)
        {
            minVal[0] = myMin;
            maxVal[0] = myMax;
            minLoc[0] = myMinLoc;
            maxLoc[0] = myMaxLoc;
        }
    }

    template <class Policy>
    void getLaunchCfg(int rows, int cols, dim3& block, dim3& grid)
    {
        block = dim3(Policy::block_size_x, Policy::block_size_y);
        grid = dim3(divUp(cols, block.x * Policy::patch_size_x), divUp(rows, block.y * Policy::patch_size_y));

        grid.x = ::min(grid.x, block.x);
        grid.y = ::min(grid.y, block.y);
    }
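
    // Editor's note, not part of the patch: clamping grid.x and grid.y to the
    // block dimensions caps the number of per-block partial results at
    // block_size_x * block_size_y, so the single-block second pass can always
    // consume them with one thread per partial result.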

    template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void minMaxLoc(const SrcPtr& src, ResType* minVal, ResType* maxVal, int* minLoc, int* maxLoc, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        dim3 block, grid;
        getLaunchCfg<Policy>(rows, cols, block, grid);

        const int patch_x = divUp(divUp(cols, grid.x), block.x);
        const int patch_y = divUp(divUp(rows, grid.y), block.y);

        minMaxLoc_pass_1<Policy::block_size_x * Policy::block_size_y><<<grid, block, 0, stream>>>(src, minVal, maxVal, minLoc, maxLoc, mask, rows, cols, patch_y, patch_x);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );

        minMaxLoc_pass_2<Policy::block_size_x * Policy::block_size_y><<<1, Policy::block_size_x * Policy::block_size_y, 0, stream>>>(minVal, maxVal, minLoc, maxLoc, grid.x * grid.y);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );

        if (stream == 0)
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }
}

}}

#endif

@ -43,8 +43,8 @@

#pragma once

#ifndef __OPENCV_CUDEV_GRID_GLOB_REDUCE_DETAIL_HPP__
#define __OPENCV_CUDEV_GRID_GLOB_REDUCE_DETAIL_HPP__
#ifndef __OPENCV_CUDEV_GRID_REDUCE_DETAIL_HPP__
#define __OPENCV_CUDEV_GRID_REDUCE_DETAIL_HPP__

#include "../../common.hpp"
#include "../../util/tuple.hpp"
@ -59,7 +59,7 @@

namespace cv { namespace cudev {

namespace grid_glob_reduce_detail
namespace grid_reduce_detail
{
    // Unroll

@ -389,7 +389,7 @@ namespace grid_glob_reduce_detail
    // glob_reduce

    template <class Reductor, int BLOCK_SIZE, int PATCH_X, int PATCH_Y, class SrcPtr, typename ResType, class MaskPtr>
    __global__ void glob_reduce(const SrcPtr src, ResType* result, const MaskPtr mask, const int rows, const int cols)
    __global__ void reduce(const SrcPtr src, ResType* result, const MaskPtr mask, const int rows, const int cols)
    {
        const int x0 = blockIdx.x * blockDim.x * PATCH_X + threadIdx.x;
        const int y0 = blockIdx.y * blockDim.y * PATCH_Y + threadIdx.y;
@ -413,14 +413,12 @@ namespace grid_glob_reduce_detail
    }

    template <class Reductor, class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void glob_reduce(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    __host__ void reduce(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        const dim3 block(Policy::block_size_x, Policy::block_size_y);
        const dim3 grid(divUp(cols, block.x * Policy::patch_size_x), divUp(rows, block.y * Policy::patch_size_y));

        const int BLOCK_SIZE = Policy::block_size_x * Policy::block_size_y;

        glob_reduce<Reductor, BLOCK_SIZE, Policy::patch_size_x, Policy::patch_size_y><<<grid, block, 0, stream>>>(src, result, mask, rows, cols);
        reduce<Reductor, Policy::block_size_x * Policy::block_size_y, Policy::patch_size_x, Policy::patch_size_y><<<grid, block, 0, stream>>>(src, result, mask, rows, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );

        if (stream == 0)
@ -433,40 +431,33 @@ namespace grid_glob_reduce_detail
    __host__ void sum(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;
        const int cn = VecTraits<src_type>::cn;
        typedef typename MakeVec<ResType, cn>::type work_type;
        typedef typename VecTraits<ResType>::elem_type res_elem_type;

        glob_reduce<SumReductor<src_type, work_type>, Policy>(src, result, mask, rows, cols, stream);
        reduce<SumReductor<src_type, ResType>, Policy>(src, (res_elem_type*) result, mask, rows, cols, stream);
    }

    template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void minVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;
        const int cn = VecTraits<src_type>::cn;
        typedef typename MakeVec<ResType, cn>::type work_type;

        glob_reduce<MinMaxReductor<minop<work_type>, src_type, work_type>, Policy>(src, result, mask, rows, cols, stream);
        reduce<MinMaxReductor<minop<ResType>, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
    }

    template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void maxVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;
        const int cn = VecTraits<src_type>::cn;
        typedef typename MakeVec<ResType, cn>::type work_type;

        glob_reduce<MinMaxReductor<maxop<work_type>, src_type, work_type>, Policy>(src, result, mask, rows, cols, stream);
        reduce<MinMaxReductor<maxop<ResType>, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
    }

    template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void minMaxVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;
        const int cn = VecTraits<src_type>::cn;
        typedef typename MakeVec<ResType, cn>::type work_type;

        glob_reduce<MinMaxReductor<both, src_type, work_type>, Policy>(src, result, mask, rows, cols, stream);
        reduce<MinMaxReductor<both, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
    }
}

@ -54,12 +54,52 @@ namespace cv { namespace cudev {

namespace grid_reduce_to_vec_detail
{
    template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor, int cn> struct Reduce;

    template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor> struct Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, 1>
    {
        __device__ __forceinline__ static void call(work_elem_type smem[1][BLOCK_SIZE], work_type& myVal)
        {
            typename Reductor::template rebind<work_elem_type>::other op;
            blockReduce<BLOCK_SIZE>(smem[0], myVal, threadIdx.x, op);
        }
    };

    template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor> struct Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, 2>
    {
        __device__ __forceinline__ static void call(work_elem_type smem[2][BLOCK_SIZE], work_type& myVal)
        {
            typename Reductor::template rebind<work_elem_type>::other op;
            blockReduce<BLOCK_SIZE>(smem_tuple(smem[0], smem[1]), tie(myVal.x, myVal.y), threadIdx.x, make_tuple(op, op));
        }
    };

    template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor> struct Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, 3>
    {
        __device__ __forceinline__ static void call(work_elem_type smem[3][BLOCK_SIZE], work_type& myVal)
        {
            typename Reductor::template rebind<work_elem_type>::other op;
            blockReduce<BLOCK_SIZE>(smem_tuple(smem[0], smem[1], smem[2]), tie(myVal.x, myVal.y, myVal.z), threadIdx.x, make_tuple(op, op, op));
        }
    };

    template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor> struct Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, 4>
    {
        __device__ __forceinline__ static void call(work_elem_type smem[4][BLOCK_SIZE], work_type& myVal)
        {
            typename Reductor::template rebind<work_elem_type>::other op;
            blockReduce<BLOCK_SIZE>(smem_tuple(smem[0], smem[1], smem[2], smem[3]), tie(myVal.x, myVal.y, myVal.z, myVal.w), threadIdx.x, make_tuple(op, op, op, op));
        }
    };

    template <class Reductor, int BLOCK_SIZE, class SrcPtr, typename ResType, class MaskPtr>
    __global__ void reduceToColumn(const SrcPtr src, ResType* dst, const MaskPtr mask, const int cols)
    {
        typedef typename Reductor::work_type work_type;
        typedef typename VecTraits<work_type>::elem_type work_elem_type;
        const int cn = VecTraits<work_type>::cn;

        __shared__ work_type smem[BLOCK_SIZE];
        __shared__ work_elem_type smem[cn][BLOCK_SIZE];

        const int y = blockIdx.x;

@ -75,7 +115,7 @@ namespace grid_reduce_to_vec_detail
            }
        }

        blockReduce<BLOCK_SIZE>(smem, myVal, threadIdx.x, op);
        Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, cn>::call(smem, myVal);

        if (threadIdx.x == 0)
            dst[y] = saturate_cast<ResType>(Reductor::result(myVal, cols));

@ -217,7 +217,7 @@ namespace grid_transform_detail
    }

    template <int SHIFT, typename SrcType1, typename SrcType2, typename DstType, class BinOp, class MaskPtr>
    __global__ void transformSmart(const GlobPtr<SrcType1> src1_, const GlobPtr<SrcType2> src2_, PtrStep<DstType> dst_, const BinOp op, const MaskPtr mask, const int rows, const int cols)
    __global__ void transformSmart(const GlobPtr<SrcType1> src1_, const GlobPtr<SrcType2> src2_, GlobPtr<DstType> dst_, const BinOp op, const MaskPtr mask, const int rows, const int cols)
    {
        typedef typename MakeVec<SrcType1, SHIFT>::type read_type1;
        typedef typename MakeVec<SrcType2, SHIFT>::type read_type2;
@ -345,25 +345,25 @@ namespace grid_transform_detail
    };

    template <class Policy, class SrcPtr, typename DstType, class UnOp, class MaskPtr>
    __host__ void transform(const SrcPtr& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    __host__ void transform_unary(const SrcPtr& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        TransformDispatcher<false, Policy>::call(src, dst, op, mask, rows, cols, stream);
    }

    template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp, class MaskPtr>
    __host__ void transform(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtr<DstType>& dst, const BinOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    __host__ void transform_binary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtr<DstType>& dst, const BinOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        TransformDispatcher<false, Policy>::call(src1, src2, dst, op, mask, rows, cols, stream);
    }

    template <class Policy, typename SrcType, typename DstType, class UnOp, class MaskPtr>
    __host__ void transform(const GlobPtr<SrcType>& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    __host__ void transform_unary(const GlobPtr<SrcType>& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        TransformDispatcher<VecTraits<SrcType>::cn == 1 && VecTraits<DstType>::cn == 1 && Policy::shift != 1, Policy>::call(src, dst, op, mask, rows, cols, stream);
    }

    template <class Policy, typename SrcType1, typename SrcType2, typename DstType, class BinOp, class MaskPtr>
    __host__ void transform(const GlobPtr<SrcType1>& src1, const GlobPtr<SrcType2>& src2, const GlobPtr<DstType>& dst, const BinOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    __host__ void transform_binary(const GlobPtr<SrcType1>& src1, const GlobPtr<SrcType2>& src2, const GlobPtr<DstType>& dst, const BinOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        TransformDispatcher<VecTraits<SrcType1>::cn == 1 && VecTraits<SrcType2>::cn == 1 && VecTraits<DstType>::cn == 1 && Policy::shift != 1, Policy>::call(src1, src2, dst, op, mask, rows, cols, stream);
    }
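
    // Editor's note, not part of the patch: the boolean dispatcher argument
    // above selects the vectorized "smart" kernel only for single-channel
    // types with Policy::shift != 1, where MakeVec<T, shift> wide loads are
    // valid; all other cases take the plain per-element path.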

@ -55,15 +55,12 @@ namespace cv { namespace cudev {

namespace transpose_detail
{
    const int TRANSPOSE_TILE_DIM = 16;
    const int TRANSPOSE_BLOCK_ROWS = 16;

    template <class SrcPtr, typename DstType>
    template <int TILE_DIM, int BLOCK_DIM_Y, class SrcPtr, typename DstType>
    __global__ void transpose(const SrcPtr src, GlobPtr<DstType> dst, const int rows, const int cols)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;

        __shared__ src_type tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM + 1];
        __shared__ src_type tile[TILE_DIM][TILE_DIM + 1];
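        // Editor's note, not part of the patch: the "+ 1" pads each
        // shared-memory row so that the column-wise reads issued after the
        // transpose fall in distinct banks, avoiding bank conflicts.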

        int blockIdx_x, blockIdx_y;

@ -80,12 +77,12 @@ namespace transpose_detail
            blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x;
        }

        int xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x;
        int yIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y;
        int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
        int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;

        if (xIndex < cols)
        {
            for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
            for (int i = 0; i < TILE_DIM; i += BLOCK_DIM_Y)
            {
                if (yIndex + i < rows)
                {
@ -96,12 +93,12 @@ namespace transpose_detail

        __syncthreads();

        xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x;
        yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y;
        xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
        yIndex = blockIdx_x * TILE_DIM + threadIdx.y;

        if (xIndex < rows)
        {
            for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
            for (int i = 0; i < TILE_DIM; i += BLOCK_DIM_Y)
            {
                if (yIndex + i < cols)
                {
@ -111,13 +108,13 @@ namespace transpose_detail
        }
    }

    template <class SrcPtr, typename DstType>
    template <class Policy, class SrcPtr, typename DstType>
    __host__ void transpose(const SrcPtr& src, const GlobPtr<DstType>& dst, int rows, int cols, cudaStream_t stream)
    {
        const dim3 block(TRANSPOSE_TILE_DIM, TRANSPOSE_TILE_DIM);
        const dim3 block(Policy::tile_dim, Policy::block_dim_y);
        const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));

        transpose<<<grid, block, 0, stream>>>(src, dst, rows, cols);
        transpose<Policy::tile_dim, Policy::block_dim_y><<<grid, block, 0, stream>>>(src, dst, rows, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );

        if (stream == 0)

@ -43,8 +43,8 @@

#pragma once

#ifndef __OPENCV_CUDEV_GRID_GLOB_REDUCE_HPP__
#define __OPENCV_CUDEV_GRID_GLOB_REDUCE_HPP__
#ifndef __OPENCV_CUDEV_GRID_REDUCE_HPP__
#define __OPENCV_CUDEV_GRID_REDUCE_HPP__

#include <limits>
#include "../common.hpp"
@ -52,13 +52,18 @@
#include "../ptr2d/gpumat.hpp"
#include "../ptr2d/mask.hpp"
#include "../ptr2d/transform.hpp"
#include "detail/glob_reduce.hpp"
#include "detail/reduce.hpp"
#include "detail/minmaxloc.hpp"

namespace cv { namespace cudev {

template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
__host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    typedef typename PtrTraits<SrcPtr>::value_type src_type;

    CV_StaticAssert( unsigned(VecTraits<src_type>::cn) == unsigned(VecTraits<ResType>::cn), "" );

    dst.create(1, 1);
    dst.setTo(0, stream);

@ -67,27 +72,31 @@ __host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, const MaskP

    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );

    grid_glob_reduce_detail::sum<Policy>(shrinkPtr(src),
                                         dst[0],
                                         shrinkPtr(mask),
                                         rows, cols,
                                         StreamAccessor::getStream(stream));
    grid_reduce_detail::sum<Policy>(shrinkPtr(src),
                                    dst[0],
                                    shrinkPtr(mask),
                                    rows, cols,
                                    StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename ResType>
__host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, Stream& stream = Stream::Null())
{
    typedef typename PtrTraits<SrcPtr>::value_type src_type;

    CV_StaticAssert( unsigned(VecTraits<src_type>::cn) == unsigned(VecTraits<ResType>::cn), "" );

    dst.create(1, 1);
    dst.setTo(0, stream);

    const int rows = getRows(src);
    const int cols = getCols(src);

    grid_glob_reduce_detail::sum<Policy>(shrinkPtr(src),
                                         dst[0],
                                         WithOutMask(),
                                         rows, cols,
                                         StreamAccessor::getStream(stream));
    grid_reduce_detail::sum<Policy>(shrinkPtr(src),
                                    dst[0],
                                    WithOutMask(),
                                    rows, cols,
                                    StreamAccessor::getStream(stream));
}
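
// Editor's sketch, not part of the patch: summing a single-channel float
// image with the mask-less overload above; the 1x1 destination receives the
// total. Names are illustrative.
//
//     GpuMat_<float> img = ...;
//     GpuMat_<float> total;
//     gridCalcSum_<DefaultGlobReducePolicy>(img, total);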

template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
@ -101,11 +110,11 @@ __host__ void gridFindMinVal_(const SrcPtr& src, GpuMat_<ResType>& dst, const Ma

    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );

    grid_glob_reduce_detail::minVal<Policy>(shrinkPtr(src),
                                            dst[0],
                                            shrinkPtr(mask),
                                            rows, cols,
                                            StreamAccessor::getStream(stream));
    grid_reduce_detail::minVal<Policy>(shrinkPtr(src),
                                       dst[0],
                                       shrinkPtr(mask),
                                       rows, cols,
                                       StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename ResType>
@ -117,11 +126,11 @@ __host__ void gridFindMinVal_(const SrcPtr& src, GpuMat_<ResType>& dst, Stream&
    const int rows = getRows(src);
    const int cols = getCols(src);

    grid_glob_reduce_detail::minVal<Policy>(shrinkPtr(src),
                                            dst[0],
                                            WithOutMask(),
                                            rows, cols,
                                            StreamAccessor::getStream(stream));
    grid_reduce_detail::minVal<Policy>(shrinkPtr(src),
                                       dst[0],
                                       WithOutMask(),
                                       rows, cols,
                                       StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
@ -135,11 +144,11 @@ __host__ void gridFindMaxVal_(const SrcPtr& src, GpuMat_<ResType>& dst, const Ma

    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );

    grid_glob_reduce_detail::maxVal<Policy>(shrinkPtr(src),
                                            dst[0],
                                            shrinkPtr(mask),
                                            rows, cols,
                                            StreamAccessor::getStream(stream));
    grid_reduce_detail::maxVal<Policy>(shrinkPtr(src),
                                       dst[0],
                                       shrinkPtr(mask),
                                       rows, cols,
                                       StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename ResType>
@ -151,11 +160,11 @@ __host__ void gridFindMaxVal_(const SrcPtr& src, GpuMat_<ResType>& dst, Stream&
    const int rows = getRows(src);
    const int cols = getCols(src);

    grid_glob_reduce_detail::maxVal<Policy>(shrinkPtr(src),
                                            dst[0],
                                            WithOutMask(),
                                            rows, cols,
                                            StreamAccessor::getStream(stream));
    grid_reduce_detail::maxVal<Policy>(shrinkPtr(src),
                                       dst[0],
                                       WithOutMask(),
                                       rows, cols,
                                       StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
@ -170,11 +179,11 @@ __host__ void gridFindMinMaxVal_(const SrcPtr& src, GpuMat_<ResType>& dst, const

    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );

    grid_glob_reduce_detail::minMaxVal<Policy>(shrinkPtr(src),
                                               dst[0],
                                               shrinkPtr(mask),
                                               rows, cols,
                                               StreamAccessor::getStream(stream));
    grid_reduce_detail::minMaxVal<Policy>(shrinkPtr(src),
                                          dst[0],
                                          shrinkPtr(mask),
                                          rows, cols,
                                          StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename ResType>
@ -187,11 +196,51 @@ __host__ void gridFindMinMaxVal_(const SrcPtr& src, GpuMat_<ResType>& dst, Strea
    const int rows = getRows(src);
    const int cols = getCols(src);

    grid_glob_reduce_detail::minMaxVal<Policy>(shrinkPtr(src),
                                               dst[0],
                                               WithOutMask(),
                                               rows, cols,
                                               StreamAccessor::getStream(stream));
    grid_reduce_detail::minMaxVal<Policy>(shrinkPtr(src),
                                          dst[0],
                                          WithOutMask(),
                                          rows, cols,
                                          StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
__host__ void gridMinMaxLoc_(const SrcPtr& src, GpuMat_<ResType>& valBuf, GpuMat_<int>& locBuf, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    const int rows = getRows(src);
    const int cols = getCols(src);

    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );

    dim3 grid, block;
    grid_minmaxloc_detail::getLaunchCfg<Policy>(rows, cols, block, grid);

    valBuf.create(2, grid.x * grid.y);
    locBuf.create(2, grid.x * grid.y);

    grid_minmaxloc_detail::minMaxLoc<Policy>(shrinkPtr(src),
                                             valBuf[0], valBuf[1], locBuf[0], locBuf[1],
                                             shrinkPtr(mask),
                                             rows, cols,
                                             StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename ResType>
__host__ void gridMinMaxLoc_(const SrcPtr& src, GpuMat_<ResType>& valBuf, GpuMat_<int>& locBuf, Stream& stream = Stream::Null())
{
    const int rows = getRows(src);
    const int cols = getCols(src);

    dim3 grid, block;
    grid_minmaxloc_detail::getLaunchCfg<Policy>(rows, cols, block, grid);

    valBuf.create(2, grid.x * grid.y);
    locBuf.create(2, grid.x * grid.y);

    grid_minmaxloc_detail::minMaxLoc<Policy>(shrinkPtr(src),
                                             valBuf[0], valBuf[1], locBuf[0], locBuf[1],
                                             WithOutMask(),
                                             rows, cols,
                                             StreamAccessor::getStream(stream));
}
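
// Editor's note, not part of the patch: both buffers are laid out as two rows
// of grid.x * grid.y partial results; row 0 carries the minima and row 1 the
// maxima, matching how minMaxLoc_pass_2 collapses them to a single value each.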

template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
@ -209,11 +258,11 @@ __host__ void gridCountNonZero_(const SrcPtr& src, GpuMat_<ResType>& dst, const
    not_equal_to<src_type> ne_op;
    const src_type zero = VecTraits<src_type>::all(0);

    grid_glob_reduce_detail::sum<Policy>(shrinkPtr(transformPtr(src, bind2nd(ne_op, zero))),
                                         dst[0],
                                         shrinkPtr(mask),
                                         rows, cols,
                                         StreamAccessor::getStream(stream));
    grid_reduce_detail::sum<Policy>(shrinkPtr(transformPtr(src, bind2nd(ne_op, zero))),
                                    dst[0],
                                    shrinkPtr(mask),
                                    rows, cols,
                                    StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename ResType>
@ -229,11 +278,11 @@ __host__ void gridCountNonZero_(const SrcPtr& src, GpuMat_<ResType>& dst, Stream
    not_equal_to<src_type> ne_op;
    const src_type zero = VecTraits<src_type>::all(0);

    grid_glob_reduce_detail::sum<Policy>(shrinkPtr(transformPtr(src, bind2nd(ne_op, zero))),
                                         dst[0],
                                         WithOutMask(),
                                         rows, cols,
                                         StreamAccessor::getStream(stream));
    grid_reduce_detail::sum<Policy>(shrinkPtr(transformPtr(src, bind2nd(ne_op, zero))),
                                    dst[0],
                                    WithOutMask(),
                                    rows, cols,
                                    StreamAccessor::getStream(stream));
}
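
// Editor's note, not part of the patch: non-zeros are counted by mapping each
// element through "x != 0" (yielding 0 or 1) with transformPtr and reusing the
// sum reduction, so no dedicated counting kernel is needed.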

// default policy
@ -297,6 +346,18 @@ __host__ void gridFindMinMaxVal(const SrcPtr& src, GpuMat_<ResType>& dst, Stream
    gridFindMinMaxVal_<DefaultGlobReducePolicy>(src, dst, stream);
}

template <class SrcPtr, typename ResType, class MaskPtr>
__host__ void gridMinMaxLoc(const SrcPtr& src, GpuMat_<ResType>& valBuf, GpuMat_<int>& locBuf, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    gridMinMaxLoc_<DefaultGlobReducePolicy>(src, valBuf, locBuf, mask, stream);
}

template <class SrcPtr, typename ResType>
__host__ void gridMinMaxLoc(const SrcPtr& src, GpuMat_<ResType>& valBuf, GpuMat_<int>& locBuf, Stream& stream = Stream::Null())
{
    gridMinMaxLoc_<DefaultGlobReducePolicy>(src, valBuf, locBuf, stream);
}

template <class SrcPtr, typename ResType, class MaskPtr>
__host__ void gridCountNonZero(const SrcPtr& src, GpuMat_<ResType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
{
@ -49,6 +49,7 @@
#include "../common.hpp"
#include "../util/vec_traits.hpp"
#include "../util/limits.hpp"
#include "../util/saturate_cast.hpp"
#include "../ptr2d/traits.hpp"
#include "../ptr2d/gpumat.hpp"
#include "../ptr2d/mask.hpp"
@ -62,6 +63,11 @@ template <typename T> struct Sum : plus<T>
{
    typedef T work_type;

    template <typename U> struct rebind
    {
        typedef Sum<U> other;
    };

    __device__ __forceinline__ static T initialValue()
    {
        return VecTraits<T>::all(0);
@ -77,14 +83,19 @@ template <typename T> struct Avg : plus<T>
{
    typedef T work_type;

    template <typename U> struct rebind
    {
        typedef Avg<U> other;
    };

    __device__ __forceinline__ static T initialValue()
    {
        return VecTraits<T>::all(0);
    }

    __device__ __forceinline__ static T result(T r, int sz)
    __device__ __forceinline__ static T result(T r, float sz)
    {
        return r / sz;
        return saturate_cast<T>(r / sz);
    }
};
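
// Editor's note, not part of the patch: taking the divisor as float and
// saturate_cast-ing the quotient lets Avg round the mean correctly for
// integer work types instead of truncating an integer division.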

@ -92,6 +103,11 @@ template <typename T> struct Min : minimum<T>
{
    typedef T work_type;

    template <typename U> struct rebind
    {
        typedef Min<U> other;
    };

    __device__ __forceinline__ static T initialValue()
    {
        return VecTraits<T>::all(numeric_limits<typename VecTraits<T>::elem_type>::max());
@ -107,6 +123,11 @@ template <typename T> struct Max : maximum<T>
{
    typedef T work_type;

    template <typename U> struct rebind
    {
        typedef Max<U> other;
    };

    __device__ __forceinline__ static T initialValue()
    {
        return VecTraits<T>::all(-numeric_limits<typename VecTraits<T>::elem_type>::max());
@ -158,7 +179,7 @@ __host__ void gridReduceToColumn_(const SrcPtr& src, GpuMat_<ResType>& dst, cons

    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );

    createContinuous(rows, 1, DataType<ResType>::type, dst);
    dst.create(1, rows);

    grid_reduce_to_vec_detail::reduceToColumn<Reductor, Policy>(shrinkPtr(src),
                                                                dst[0],
@ -173,7 +194,7 @@ __host__ void gridReduceToColumn_(const SrcPtr& src, GpuMat_<ResType>& dst, Stre
    const int rows = getRows(src);
    const int cols = getCols(src);

    createContinuous(rows, 1, DataType<ResType>::type, dst);
    dst.create(1, rows);

    grid_reduce_to_vec_detail::reduceToColumn<Reductor, Policy>(shrinkPtr(src),
                                                                dst[0],

@ -51,6 +51,7 @@
#include "../util/vec_traits.hpp"
#include "../ptr2d/traits.hpp"
#include "../ptr2d/gpumat.hpp"
#include "../ptr2d/glob.hpp"
#include "../ptr2d/mask.hpp"
#include "detail/split_merge.hpp"

@ -75,6 +76,24 @@ __host__ void gridMerge_(const SrcPtrTuple& src, GpuMat_<DstType>& dst, const Ma
                                                                              StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtrTuple, typename DstType, class MaskPtr>
__host__ void gridMerge_(const SrcPtrTuple& src, const GlobPtrSz<DstType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    CV_StaticAssert( VecTraits<DstType>::cn == tuple_size<SrcPtrTuple>::value, "" );

    const int rows = getRows(src);
    const int cols = getCols(src);

    CV_Assert( getRows(dst) == rows && getCols(dst) == cols );
    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );

    grid_split_merge_detail::MergeImpl<VecTraits<DstType>::cn, Policy>::merge(shrinkPtr(src),
                                                                              shrinkPtr(dst),
                                                                              shrinkPtr(mask),
                                                                              rows, cols,
                                                                              StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtrTuple, typename DstType>
__host__ void gridMerge_(const SrcPtrTuple& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
{
@ -92,6 +111,23 @@ __host__ void gridMerge_(const SrcPtrTuple& src, GpuMat_<DstType>& dst, Stream&
                                                                              StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtrTuple, typename DstType>
__host__ void gridMerge_(const SrcPtrTuple& src, const GlobPtrSz<DstType>& dst, Stream& stream = Stream::Null())
{
    CV_StaticAssert( VecTraits<DstType>::cn == tuple_size<SrcPtrTuple>::value, "" );

    const int rows = getRows(src);
    const int cols = getCols(src);

    CV_Assert( getRows(dst) == rows && getCols(dst) == cols );

    grid_split_merge_detail::MergeImpl<VecTraits<DstType>::cn, Policy>::merge(shrinkPtr(src),
                                                                              shrinkPtr(dst),
                                                                              WithOutMask(),
                                                                              rows, cols,
                                                                              StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
__host__ void gridSplit_(const SrcPtr& src, const tuple< GpuMat_<DstType>&, GpuMat_<DstType>& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
{
@ -132,6 +168,25 @@ __host__ void gridSplit_(const SrcPtr& src, GpuMat_<DstType> (&dst)[2], const Ma
                                           StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
__host__ void gridSplit_(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[2], const MaskPtr& mask, Stream& stream = Stream::Null())
{
    CV_StaticAssert( VecTraits<typename PtrTraits<SrcPtr>::value_type>::cn == 2, "" );

    const int rows = getRows(src);
    const int cols = getCols(src);

    CV_Assert( getRows(dst[0]) == rows && getCols(dst[0]) == cols );
    CV_Assert( getRows(dst[1]) == rows && getCols(dst[1]) == cols );
    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );

    grid_split_merge_detail::split<Policy>(shrinkPtr(src),
                                           shrinkPtr(dst[0]), shrinkPtr(dst[1]),
                                           shrinkPtr(mask),
                                           rows, cols,
                                           StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename DstType>
__host__ void gridSplit_(const SrcPtr& src, const tuple< GpuMat_<DstType>&, GpuMat_<DstType>& >& dst, Stream& stream = Stream::Null())
{
@ -168,6 +223,24 @@ __host__ void gridSplit_(const SrcPtr& src, GpuMat_<DstType> (&dst)[2], Stream&
                                           StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename DstType>
__host__ void gridSplit_(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[2], Stream& stream = Stream::Null())
{
    CV_StaticAssert( VecTraits<typename PtrTraits<SrcPtr>::value_type>::cn == 2, "" );

    const int rows = getRows(src);
    const int cols = getCols(src);

    CV_Assert( getRows(dst[0]) == rows && getCols(dst[0]) == cols );
    CV_Assert( getRows(dst[1]) == rows && getCols(dst[1]) == cols );

    grid_split_merge_detail::split<Policy>(shrinkPtr(src),
                                           shrinkPtr(dst[0]), shrinkPtr(dst[1]),
                                           WithOutMask(),
                                           rows, cols,
                                           StreamAccessor::getStream(stream));
}
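
// Editor's note, not part of the patch: the GlobPtrSz overloads mirror the
// GpuMat_ ones but take fixed-size destination views, letting callers split
// or merge into preallocated raw device buffers without wrapping them in
// GpuMat_.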

template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
__host__ void gridSplit_(const SrcPtr& src, const tuple< GpuMat_<DstType>&, GpuMat_<DstType>&, GpuMat_<DstType>& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
{
@ -210,6 +283,26 @@ __host__ void gridSplit_(const SrcPtr& src, GpuMat_<DstType> (&dst)[3], const Ma
                                           StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
__host__ void gridSplit_(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[3], const MaskPtr& mask, Stream& stream = Stream::Null())
{
    CV_StaticAssert( VecTraits<typename PtrTraits<SrcPtr>::value_type>::cn == 3, "" );

    const int rows = getRows(src);
    const int cols = getCols(src);

    CV_Assert( getRows(dst[0]) == rows && getCols(dst[0]) == cols );
    CV_Assert( getRows(dst[1]) == rows && getCols(dst[1]) == cols );
    CV_Assert( getRows(dst[2]) == rows && getCols(dst[2]) == cols );
    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );

    grid_split_merge_detail::split<Policy>(shrinkPtr(src),
                                           shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]),
                                           shrinkPtr(mask),
                                           rows, cols,
                                           StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename DstType>
__host__ void gridSplit_(const SrcPtr& src, const tuple< GpuMat_<DstType>&, GpuMat_<DstType>&, GpuMat_<DstType>& >& dst, Stream& stream = Stream::Null())
{
@ -248,6 +341,25 @@ __host__ void gridSplit_(const SrcPtr& src, GpuMat_<DstType> (&dst)[3], Stream&
                                           StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename DstType>
__host__ void gridSplit_(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[3], Stream& stream = Stream::Null())
{
    CV_StaticAssert( VecTraits<typename PtrTraits<SrcPtr>::value_type>::cn == 3, "" );

    const int rows = getRows(src);
    const int cols = getCols(src);

    CV_Assert( getRows(dst[0]) == rows && getCols(dst[0]) == cols );
    CV_Assert( getRows(dst[1]) == rows && getCols(dst[1]) == cols );
    CV_Assert( getRows(dst[2]) == rows && getCols(dst[2]) == cols );

    grid_split_merge_detail::split<Policy>(shrinkPtr(src),
                                           shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]),
                                           WithOutMask(),
                                           rows, cols,
                                           StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
__host__ void gridSplit_(const SrcPtr& src, const tuple< GpuMat_<DstType>&, GpuMat_<DstType>&, GpuMat_<DstType>&, GpuMat_<DstType>& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
{
@ -283,10 +395,31 @@ __host__ void gridSplit_(const SrcPtr& src, GpuMat_<DstType> (&dst)[4], const Ma
    dst[0].create(rows, cols);
    dst[1].create(rows, cols);
    dst[2].create(rows, cols);
    dst[4].create(rows, cols);
    dst[3].create(rows, cols);

    grid_split_merge_detail::split<Policy>(shrinkPtr(src),
                                           shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]), shrinkPtr(dst[4]),
                                           shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]), shrinkPtr(dst[3]),
                                           shrinkPtr(mask),
                                           rows, cols,
                                           StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
__host__ void gridSplit_(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[4], const MaskPtr& mask, Stream& stream = Stream::Null())
{
    CV_StaticAssert( VecTraits<typename PtrTraits<SrcPtr>::value_type>::cn == 4, "" );

    const int rows = getRows(src);
    const int cols = getCols(src);

    CV_Assert( getRows(dst[0]) == rows && getCols(dst[0]) == cols );
    CV_Assert( getRows(dst[1]) == rows && getCols(dst[1]) == cols );
    CV_Assert( getRows(dst[2]) == rows && getCols(dst[2]) == cols );
    CV_Assert( getRows(dst[3]) == rows && getCols(dst[3]) == cols );
    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );

    grid_split_merge_detail::split<Policy>(shrinkPtr(src),
                                           shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]), shrinkPtr(dst[3]),
                                           shrinkPtr(mask),
                                           rows, cols,
                                           StreamAccessor::getStream(stream));
@ -323,10 +456,30 @@ __host__ void gridSplit_(const SrcPtr& src, GpuMat_<DstType> (&dst)[4], Stream&
    dst[0].create(rows, cols);
    dst[1].create(rows, cols);
    dst[2].create(rows, cols);
    dst[4].create(rows, cols);
    dst[3].create(rows, cols);

    grid_split_merge_detail::split<Policy>(shrinkPtr(src),
                                           shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]), shrinkPtr(dst[4]),
                                           shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]), shrinkPtr(dst[3]),
                                           WithOutMask(),
                                           rows, cols,
                                           StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename DstType>
__host__ void gridSplit_(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[4], Stream& stream = Stream::Null())
{
    CV_StaticAssert( VecTraits<typename PtrTraits<SrcPtr>::value_type>::cn == 4, "" );

    const int rows = getRows(src);
    const int cols = getCols(src);

    CV_Assert( getRows(dst[0]) == rows && getCols(dst[0]) == cols );
    CV_Assert( getRows(dst[1]) == rows && getCols(dst[1]) == cols );
    CV_Assert( getRows(dst[2]) == rows && getCols(dst[2]) == cols );
    CV_Assert( getRows(dst[3]) == rows && getCols(dst[3]) == cols );

    grid_split_merge_detail::split<Policy>(shrinkPtr(src),
                                           shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]), shrinkPtr(dst[3]),
                                           WithOutMask(),
                                           rows, cols,
                                           StreamAccessor::getStream(stream));

@ -348,12 +501,24 @@ __host__ void gridMerge(const SrcPtrTuple& src, GpuMat_<DstType>& dst, const Mas
|
||||
gridMerge_<DefaultSplitMergePolicy>(src, dst, mask, stream);
|
||||
}
|
||||
|
||||
template <class SrcPtrTuple, typename DstType, class MaskPtr>
|
||||
__host__ void gridMerge(const SrcPtrTuple& src, const GlobPtrSz<DstType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
|
||||
{
|
||||
gridMerge_<DefaultSplitMergePolicy>(src, dst, mask, stream);
|
||||
}
|
||||
|
||||
template <class SrcPtrTuple, typename DstType>
|
||||
__host__ void gridMerge(const SrcPtrTuple& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
|
||||
{
|
||||
gridMerge_<DefaultSplitMergePolicy>(src, dst, stream);
|
||||
}
|
||||
|
||||
template <class SrcPtrTuple, typename DstType>
|
||||
__host__ void gridMerge(const SrcPtrTuple& src, const GlobPtrSz<DstType>& dst, Stream& stream = Stream::Null())
|
||||
{
|
||||
gridMerge_<DefaultSplitMergePolicy>(src, dst, stream);
|
||||
}
|
||||
|
||||
template <class SrcPtr, typename DstType, class MaskPtr>
|
||||
__host__ void gridSplit(const SrcPtr& src, const tuple< GpuMat_<DstType>&, GpuMat_<DstType>& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
|
||||
{
|
||||
@ -396,12 +561,24 @@ __host__ void gridSplit(const SrcPtr& src, GpuMat_<DstType> (&dst)[COUNT], const
|
||||
gridSplit_<DefaultSplitMergePolicy>(src, dst, mask, stream);
|
||||
}
|
||||
|
||||
template <class SrcPtr, typename DstType, int COUNT, class MaskPtr>
|
||||
__host__ void gridSplit(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[COUNT], const MaskPtr& mask, Stream& stream = Stream::Null())
|
||||
{
|
||||
gridSplit_<DefaultSplitMergePolicy>(src, dst, mask, stream);
|
||||
}
|
||||
|
||||
template <class SrcPtr, typename DstType, int COUNT>
|
||||
__host__ void gridSplit(const SrcPtr& src, GpuMat_<DstType> (&dst)[COUNT], Stream& stream = Stream::Null())
|
||||
{
|
||||
gridSplit_<DefaultSplitMergePolicy>(src, dst, stream);
|
||||
}
|
||||
|
||||
template <class SrcPtr, typename DstType, int COUNT>
|
||||
__host__ void gridSplit(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[COUNT], Stream& stream = Stream::Null())
|
||||
{
|
||||
gridSplit_<DefaultSplitMergePolicy>(src, dst, stream);
|
||||
}
|
||||
|
||||
}}
|
||||
|
||||
#endif
|
||||
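Taken together, the new overloads accept both GpuMat_ targets (allocated on demand) and caller-managed GlobPtrSz planes. A minimal usage sketch of the public wrappers, assuming only the cudev headers touched by this patch; the image sizes and element types are illustrative, and zipPtr is the existing cudev zip adapter:

    #include <opencv2/cudev.hpp>

    using namespace cv::cudev;

    void splitMergeExample(const GpuMat_<uchar3>& src)
    {
        // Split a 3-channel image into three single-channel planes;
        // the GpuMat_ overload allocates the planes itself.
        GpuMat_<uchar> planes[3];
        gridSplit(src, planes);

        // Recombine the planes into a 3-channel image.
        GpuMat_<uchar3> merged;
        gridMerge(zipPtr(planes[0], planes[1], planes[2]), merged);
    }

Both calls run under DefaultSplitMergePolicy; calling gridSplit_/gridMerge_ with an explicit policy selects a different block shape.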
@ -58,7 +58,7 @@
namespace cv { namespace cudev {

template <class Policy, class SrcPtr, typename DstType, class UnOp, class MaskPtr>
__host__ void gridTransform_(const SrcPtr& src, GpuMat_<DstType>& dst, const UnOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformUnary_(const SrcPtr& src, GpuMat_<DstType>& dst, const UnOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    const int rows = getRows(src);
    const int cols = getCols(src);
@ -67,11 +67,11 @@ __host__ void gridTransform_(const SrcPtr& src, GpuMat_<DstType>& dst, const UnO

    dst.create(rows, cols);

    grid_transform_detail::transform<Policy>(shrinkPtr(src), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
    grid_transform_detail::transform_unary<Policy>(shrinkPtr(src), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename DstType, class UnOp, class MaskPtr>
__host__ void gridTransform_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const UnOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformUnary_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const UnOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    const int rows = getRows(src);
    const int cols = getCols(src);
@ -79,33 +79,33 @@ __host__ void gridTransform_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, c
    CV_Assert( getRows(dst) == rows && getCols(dst) == cols );
    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );

    grid_transform_detail::transform<Policy>(shrinkPtr(src), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
    grid_transform_detail::transform_unary<Policy>(shrinkPtr(src), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename DstType, class UnOp>
__host__ void gridTransform_(const SrcPtr& src, GpuMat_<DstType>& dst, const UnOp& op, Stream& stream = Stream::Null())
__host__ void gridTransformUnary_(const SrcPtr& src, GpuMat_<DstType>& dst, const UnOp& op, Stream& stream = Stream::Null())
{
    const int rows = getRows(src);
    const int cols = getCols(src);

    dst.create(rows, cols);

    grid_transform_detail::transform<Policy>(shrinkPtr(src), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
    grid_transform_detail::transform_unary<Policy>(shrinkPtr(src), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename DstType, class UnOp>
__host__ void gridTransform_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const UnOp& op, Stream& stream = Stream::Null())
__host__ void gridTransformUnary_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const UnOp& op, Stream& stream = Stream::Null())
{
    const int rows = getRows(src);
    const int cols = getCols(src);

    CV_Assert( getRows(dst) == rows && getCols(dst) == cols );

    grid_transform_detail::transform<Policy>(shrinkPtr(src), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
    grid_transform_detail::transform_unary<Policy>(shrinkPtr(src), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp, class MaskPtr>
__host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<DstType>& dst, const BinOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<DstType>& dst, const BinOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    const int rows = getRows(src1);
    const int cols = getCols(src1);
@ -115,11 +115,11 @@ __host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<D

    dst.create(rows, cols);

    grid_transform_detail::transform<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
    grid_transform_detail::transform_binary<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp, class MaskPtr>
__host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz<DstType>& dst, const BinOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz<DstType>& dst, const BinOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    const int rows = getRows(src1);
    const int cols = getCols(src1);
@ -128,11 +128,11 @@ __host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, const Glo
    CV_Assert( getRows(src2) == rows && getCols(src2) == cols );
    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );

    grid_transform_detail::transform<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
    grid_transform_detail::transform_binary<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp>
__host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<DstType>& dst, const BinOp& op, Stream& stream = Stream::Null())
__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<DstType>& dst, const BinOp& op, Stream& stream = Stream::Null())
{
    const int rows = getRows(src1);
    const int cols = getCols(src1);
@ -141,11 +141,11 @@ __host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<D

    dst.create(rows, cols);

    grid_transform_detail::transform<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
    grid_transform_detail::transform_binary<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp>
__host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GlobPtrSz<DstType>& dst, const BinOp& op, Stream& stream = Stream::Null())
__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz<DstType>& dst, const BinOp& op, Stream& stream = Stream::Null())
{
    const int rows = getRows(src1);
    const int cols = getCols(src1);
@ -153,11 +153,11 @@ __host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GlobPtrSz
    CV_Assert( getRows(dst) == rows && getCols(dst) == cols );
    CV_Assert( getRows(src2) == rows && getCols(src2) == cols );

    grid_transform_detail::transform<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
    grid_transform_detail::transform_binary<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename D0, typename D1, class OpTuple, class MaskPtr>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    CV_StaticAssert( tuple_size<OpTuple>::value == 2, "" );

@ -178,7 +178,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
}

template <class Policy, class SrcPtr, typename D0, typename D1, class OpTuple, class MaskPtr>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    CV_StaticAssert( tuple_size<OpTuple>::value == 2, "" );

@ -198,7 +198,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, Glob
}

template <class Policy, class SrcPtr, typename D0, typename D1, class OpTuple>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
    CV_StaticAssert( tuple_size<OpTuple>::value == 2, "" );

@ -217,7 +217,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
}

template <class Policy, class SrcPtr, typename D0, typename D1, class OpTuple>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
    CV_StaticAssert( tuple_size<OpTuple>::value == 2, "" );

@ -236,7 +236,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, Glob
}

template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, class OpTuple, class MaskPtr>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    CV_StaticAssert( tuple_size<OpTuple>::value == 3, "" );

@ -258,7 +258,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
}

template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, class OpTuple, class MaskPtr>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    CV_StaticAssert( tuple_size<OpTuple>::value == 3, "" );

@ -279,7 +279,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, Glob
}

template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, class OpTuple>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
    CV_StaticAssert( tuple_size<OpTuple>::value == 3, "" );

@ -299,7 +299,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
}

template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, class OpTuple>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
    CV_StaticAssert( tuple_size<OpTuple>::value == 3, "" );

@ -319,7 +319,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, Glob
}

template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple, class MaskPtr>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    CV_StaticAssert( tuple_size<OpTuple>::value == 4, "" );

@ -342,7 +342,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
}

template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple, class MaskPtr>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    CV_StaticAssert( tuple_size<OpTuple>::value == 4, "" );

@ -364,7 +364,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, Glob
}

template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
    CV_StaticAssert( tuple_size<OpTuple>::value == 4, "" );

@ -385,7 +385,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
}

template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
    CV_StaticAssert( tuple_size<OpTuple>::value == 4, "" );

@ -417,123 +417,123 @@ struct DefaultTransformPolicy
};

template <class SrcPtr, typename DstType, class Op, class MaskPtr>
__host__ void gridTransform(const SrcPtr& src, GpuMat_<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformUnary(const SrcPtr& src, GpuMat_<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
    gridTransformUnary_<DefaultTransformPolicy>(src, dst, op, mask, stream);
}

template <class SrcPtr, typename DstType, class Op, class MaskPtr>
__host__ void gridTransform(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformUnary(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
    gridTransformUnary_<DefaultTransformPolicy>(src, dst, op, mask, stream);
}

template <class SrcPtr, typename DstType, class Op>
__host__ void gridTransform(const SrcPtr& src, GpuMat_<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
__host__ void gridTransformUnary(const SrcPtr& src, GpuMat_<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
{
    gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
    gridTransformUnary_<DefaultTransformPolicy>(src, dst, op, stream);
}

template <class SrcPtr, typename DstType, class Op>
__host__ void gridTransform(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
__host__ void gridTransformUnary(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
{
    gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
    gridTransformUnary_<DefaultTransformPolicy>(src, dst, op, stream);
}

template <class SrcPtr1, class SrcPtr2, typename DstType, class Op, class MaskPtr>
__host__ void gridTransform(const SrcPtr1& src1, const SrcPtr1& src2, GpuMat_<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    gridTransform_<DefaultTransformPolicy>(src1, src2, dst, op, mask, stream);
    gridTransformBinary_<DefaultTransformPolicy>(src1, src2, dst, op, mask, stream);
}

template <class SrcPtr1, class SrcPtr2, typename DstType, class Op, class MaskPtr>
__host__ void gridTransform(const SrcPtr1& src1, const SrcPtr1& src2, const GlobPtrSz<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    gridTransform_<DefaultTransformPolicy>(src1, src2, dst, op, mask, stream);
    gridTransformBinary_<DefaultTransformPolicy>(src1, src2, dst, op, mask, stream);
}

template <class SrcPtr1, class SrcPtr2, typename DstType, class Op>
__host__ void gridTransform(const SrcPtr1& src1, const SrcPtr1& src2, GpuMat_<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
{
    gridTransform_<DefaultTransformPolicy>(src1, src2, dst, op, stream);
    gridTransformBinary_<DefaultTransformPolicy>(src1, src2, dst, op, stream);
}

template <class SrcPtr1, class SrcPtr2, typename DstType, class Op>
__host__ void gridTransform(const SrcPtr1& src1, const SrcPtr1& src2, const GlobPtrSz<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
{
    gridTransform_<DefaultTransformPolicy>(src1, src2, dst, op, stream);
    gridTransformBinary_<DefaultTransformPolicy>(src1, src2, dst, op, stream);
}

template <class SrcPtr, typename D0, typename D1, class OpTuple, class MaskPtr>
__host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, mask, stream);
}

template <class SrcPtr, typename D0, typename D1, class OpTuple, class MaskPtr>
__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, mask, stream);
}

template <class SrcPtr, typename D0, typename D1, class OpTuple>
__host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
    gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, stream);
}

template <class SrcPtr, typename D0, typename D1, class OpTuple>
__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
    gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, stream);
}

template <class SrcPtr, typename D0, typename D1, typename D2, class OpTuple, class MaskPtr>
__host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, mask, stream);
}

template <class SrcPtr, typename D0, typename D1, typename D2, class OpTuple, class MaskPtr>
__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, mask, stream);
}

template <class SrcPtr, typename D0, typename D1, typename D2, class OpTuple>
__host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
    gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, stream);
}

template <class SrcPtr, typename D0, typename D1, typename D2, class OpTuple>
__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
    gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, stream);
}

template <class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple, class MaskPtr>
__host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, mask, stream);
}

template <class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple, class MaskPtr>
__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
    gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, mask, stream);
}

template <class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple>
__host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
    gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, stream);
}

template <class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple>
__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
    gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, stream);
}

}}
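With the overload set split by arity, the public entry points read unambiguously at the call site. A hedged sketch of calling the renamed wrappers; the two functors are hypothetical and merely follow the unary_function/binary_function convention already used elsewhere in this patch (e.g. Convertor):

    #include <opencv2/cudev.hpp>

    using namespace cv::cudev;

    // Illustrative device functors for the example.
    struct InvertOp : unary_function<uchar, uchar>
    {
        __device__ __forceinline__ uchar operator ()(uchar v) const
        {
            return 255 - v;
        }
    };

    struct AddOp : binary_function<uchar, uchar, uchar>
    {
        __device__ __forceinline__ uchar operator ()(uchar a, uchar b) const
        {
            return saturate_cast<uchar>((int) a + (int) b);
        }
    };

    void transformExample(const GpuMat_<uchar>& a, const GpuMat_<uchar>& b)
    {
        GpuMat_<uchar> inverted, sum;
        gridTransformUnary(a, inverted, InvertOp());   // one source
        gridTransformBinary(a, b, sum, AddOp());       // two sources
    }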
@ -49,19 +49,53 @@
#include "../common.hpp"
#include "../ptr2d/traits.hpp"
#include "../ptr2d/gpumat.hpp"
#include "../ptr2d/glob.hpp"
#include "detail/transpose.hpp"

namespace cv { namespace cudev {

template <class SrcPtr, typename DstType>
__host__ void gridTranspose(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
template <class Policy, class SrcPtr, typename DstType>
__host__ void gridTranspose_(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
{
    const int rows = getRows(src);
    const int cols = getCols(src);

    dst.create(cols, rows);

    transpose_detail::transpose(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream));
    transpose_detail::transpose<Policy>(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream));
}

template <class Policy, class SrcPtr, typename DstType>
__host__ void gridTranspose_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, Stream& stream = Stream::Null())
{
    const int rows = getRows(src);
    const int cols = getCols(src);

    CV_Assert( getRows(dst) == cols && getCols(dst) == rows );

    transpose_detail::transpose<Policy>(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream));
}

// Default Policy

struct DefaultTransposePolicy
{
    enum {
        tile_dim    = 16,
        block_dim_y = 16
    };
};

template <class SrcPtr, typename DstType>
__host__ void gridTranspose(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
{
    gridTranspose_<DefaultTransposePolicy>(src, dst, stream);
}

template <class SrcPtr, typename DstType>
__host__ void gridTranspose(const SrcPtr& src, const GlobPtrSz<DstType>& dst, Stream& stream = Stream::Null())
{
    gridTranspose_<DefaultTransposePolicy>(src, dst, stream);
}

}}
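gridTranspose now follows the same two-layer pattern as the other grid functions: a policy-parameterized gridTranspose_ plus thin wrappers that supply DefaultTransposePolicy. A sketch of plugging in a custom tile shape; the 32x8 numbers are illustrative, and the enum members mirror DefaultTransposePolicy above:

    struct WideTilePolicy
    {
        enum {
            tile_dim    = 32,
            block_dim_y = 8
        };
    };

    void transposeExample(const cv::cudev::GpuMat_<float>& src)
    {
        cv::cudev::GpuMat_<float> dst;
        cv::cudev::gridTranspose_<WideTilePolicy>(src, dst); // explicit policy
        // cv::cudev::gridTranspose(src, dst);               // default policy
    }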
@ -47,6 +47,7 @@
#define __OPENCV_CUDEV_PTR2D_LUT_HPP__

#include "../common.hpp"
#include "../util/vec_traits.hpp"
#include "../grid/copy.hpp"
#include "traits.hpp"
#include "gpumat.hpp"
@ -63,7 +64,8 @@ template <class SrcPtr, class TablePtr> struct LutPtr

    __device__ __forceinline__ typename PtrTraits<TablePtr>::value_type operator ()(typename PtrTraits<SrcPtr>::index_type y, typename PtrTraits<SrcPtr>::index_type x) const
    {
        return tbl(0, src(y, x));
        typedef typename PtrTraits<TablePtr>::index_type tbl_index_type;
        return tbl(VecTraits<tbl_index_type>::all(0), src(y, x));
    }
};

@ -81,8 +83,6 @@ template <class SrcPtr, class TablePtr> struct LutPtrSz : LutPtr<SrcPtr, TablePt
template <class SrcPtr, class TablePtr>
__host__ LutPtrSz<typename PtrTraits<SrcPtr>::ptr_type, typename PtrTraits<TablePtr>::ptr_type> lutPtr(const SrcPtr& src, const TablePtr& tbl)
{
    CV_Assert( getRows(tbl) == 1 );

    LutPtrSz<typename PtrTraits<SrcPtr>::ptr_type, typename PtrTraits<TablePtr>::ptr_type> ptr;
    ptr.src = shrinkPtr(src);
    ptr.tbl = shrinkPtr(tbl);
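The operator() change matters once the table pointer's index_type is a vector type: VecTraits<tbl_index_type>::all(0) produces a zero of whatever that type is, scalar or vector, where the old literal 0 only compiled for scalar indices. Typical use through the existing factory is unchanged; a sketch with illustrative types, where identity is the cudev pass-through functor:

    void lutExample(const cv::cudev::GpuMat_<uchar>& src,
                    const cv::cudev::GpuMat_<uchar>& table) // a 1 x 256 row
    {
        using namespace cv::cudev;
        GpuMat_<uchar> dst;
        gridTransformUnary(lutPtr(src, table), dst, identity<uchar>());
    }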
@ -62,6 +62,42 @@ struct WithOutMask
    }
};

template <class MaskPtr> struct SingleMaskChannels
{
    typedef typename PtrTraits<MaskPtr>::value_type value_type;
    typedef typename PtrTraits<MaskPtr>::index_type index_type;

    MaskPtr mask;
    int channels;

    __device__ __forceinline__ value_type operator()(index_type y, index_type x) const
    {
        return mask(y, x / channels);
    }

};

template <class MaskPtr> struct SingleMaskChannelsSz : SingleMaskChannels<MaskPtr>
{
    int rows, cols;
};

template <class MaskPtr>
__host__ SingleMaskChannelsSz<typename PtrTraits<MaskPtr>::ptr_type>
singleMaskChannels(const MaskPtr& mask, int channels)
{
    SingleMaskChannelsSz<typename PtrTraits<MaskPtr>::ptr_type> ptr;
    ptr.mask = shrinkPtr(mask);
    ptr.channels = channels;
    ptr.rows = getRows(mask);
    ptr.cols = getCols(mask) * channels;
    return ptr;
}

template <class MaskPtr> struct PtrTraits< SingleMaskChannelsSz<MaskPtr> > : PtrTraitsBase<SingleMaskChannelsSz<MaskPtr>, SingleMaskChannels<MaskPtr> >
{
};

}}

#endif
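SingleMaskChannels divides the x coordinate by the channel count, so a single-channel mask can gate a destination that is addressed with channels elements per pixel; every channel of one pixel sees the same mask cell. A hedged sketch of the intended pattern, assuming a 3-channel image viewed as a flat single-channel surface (the types and the zero-fill operation are illustrative):

    void maskedFillExample(cv::cudev::GpuMat_<uchar3>& img,
                           const cv::cudev::GpuMat_<uchar>& mask)
    {
        using namespace cv::cudev;

        // View the uchar3 image as rows of 3 * cols uchar elements.
        GlobPtrSz<uchar> flat = globPtr((uchar*) img.data, img.step,
                                        img.rows, img.cols * 3);

        // Zero every channel of the pixels selected by the mask.
        gridTransformUnary(constantPtr((uchar) 0, img.rows, img.cols * 3),
                           flat,
                           identity<uchar>(),
                           singleMaskChannels(globPtr<uchar>(mask), 3));
    }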
@ -194,10 +194,23 @@ CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, uint, uint)
        return VecTraits<output_type ## 4>::make(func (a.x), func (a.y), func (a.z), func (a.w)); \
    }

namespace vec_math_detail
{
    __device__ __forceinline__ schar abs_(schar val)
    {
        return (schar) ::abs((int) val);
    }

    __device__ __forceinline__ short abs_(short val)
    {
        return (short) ::abs((int) val);
    }
}

CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, uchar, uchar)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, char, char)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, vec_math_detail::abs_, char, char)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, ushort, ushort)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, short, short)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, vec_math_detail::abs_, short, short)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, int, int)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, uint, uint)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::fabsf, float, float)
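The macro is instantiated with a plain function name, and routing the schar/short cases through vec_math_detail::abs_ presumably avoids the ambiguous or ill-formed ::abs call for those types: the helpers widen to int, call the int overload, and cast back. In effect, the generated two-element overload now expands to this sketch:

    // What CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, vec_math_detail::abs_, char, char)
    // produces for the two-element case, in effect:
    __device__ __forceinline__ char2 abs(const char2& a)
    {
        return VecTraits<char2>::make(vec_math_detail::abs_(a.x),
                                      vec_math_detail::abs_(a.y));
    }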
@ -70,7 +70,7 @@ CV_CUDEV_MAKE_VEC_INST(double)

#undef CV_CUDEV_MAKE_VEC_INST

template<> struct MakeVec<schar, 1> { typedef char type; };
template<> struct MakeVec<schar, 1> { typedef schar type; };
template<> struct MakeVec<schar, 2> { typedef char2 type; };
template<> struct MakeVec<schar, 3> { typedef char3 type; };
template<> struct MakeVec<schar, 4> { typedef char4 type; };
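MakeVec<schar, 1> previously yielded plain char, which is a distinct type from schar (OpenCV's typedef for signed char) on every conforming compiler, so round trips through VecTraits could silently change the element type for single-channel data. The fixed mapping is the identity, consistent with the n > 1 specializations. A minimal compile-time check, written in illustrative C++11 syntax:

    #include <type_traits>
    #include <opencv2/cudev.hpp>

    static_assert(std::is_same<cv::cudev::MakeVec<signed char, 1>::type,
                               signed char>::value,
                  "single-channel MakeVec is now the identity");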
@ -228,6 +228,9 @@ TEST(ReduceToColumn, Sum)

    Mat dst_gold;
    cv::reduce(src, dst_gold, 1, REDUCE_SUM, CV_32S);
    dst_gold.cols = dst_gold.rows;
    dst_gold.rows = 1;
    dst_gold.step = dst_gold.cols * dst_gold.elemSize();

    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
@ -244,6 +247,9 @@ TEST(ReduceToColumn, Avg)

    Mat dst_gold;
    cv::reduce(src, dst_gold, 1, REDUCE_AVG, CV_32F);
    dst_gold.cols = dst_gold.rows;
    dst_gold.rows = 1;
    dst_gold.step = dst_gold.cols * dst_gold.elemSize();

    EXPECT_MAT_NEAR(dst_gold, dst, 1e-4);
}
@ -260,6 +266,9 @@ TEST(ReduceToColumn, Min)

    Mat dst_gold;
    cv::reduce(src, dst_gold, 1, REDUCE_MIN);
    dst_gold.cols = dst_gold.rows;
    dst_gold.rows = 1;
    dst_gold.step = dst_gold.cols * dst_gold.elemSize();

    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
@ -276,6 +285,9 @@ TEST(ReduceToColumn, Max)

    Mat dst_gold;
    cv::reduce(src, dst_gold, 1, REDUCE_MAX);
    dst_gold.cols = dst_gold.rows;
    dst_gold.rows = 1;
    dst_gold.step = dst_gold.cols * dst_gold.elemSize();

    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
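cv::reduce with dim = 1 produces an N x 1 column, while the CUDA result being checked is laid out as a single row, so each test retags the gold Mat's geometry: swapping rows and cols and recomputing step reinterprets the contiguous column as a 1 x N row without copying. For a continuous single-channel Mat this is equivalent to the shorter form below, which may read more clearly:

    Mat dst_gold;
    cv::reduce(src, dst_gold, 1, REDUCE_SUM, CV_32S);
    dst_gold = dst_gold.reshape(1, 1); // view the N x 1 column as a 1 x N row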