diff --git a/modules/cudaarithm/src/cuda/div_mat.cu b/modules/cudaarithm/src/cuda/div_mat.cu index 4a62a44ae3..e139cb4ae9 100644 --- a/modules/cudaarithm/src/cuda/div_mat.cu +++ b/modules/cudaarithm/src/cuda/div_mat.cu @@ -40,191 +40,203 @@ // //M*/ -#if !defined CUDA_DISABLER +#include "opencv2/opencv_modules.hpp" -#include "opencv2/core/cuda/common.hpp" -#include "opencv2/core/cuda/functional.hpp" -#include "opencv2/core/cuda/transform.hpp" -#include "opencv2/core/cuda/saturate_cast.hpp" -#include "opencv2/core/cuda/simd_functions.hpp" +#ifndef HAVE_OPENCV_CUDEV -#include "arithm_func_traits.hpp" +#error "opencv_cudev is required" -using namespace cv::cuda; -using namespace cv::cuda::device; +#else -namespace arithm +#include "opencv2/cudev.hpp" + +using namespace cv::cudev; + +void divMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& stream, int); +void divMat_8uc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream); +void divMat_16sc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream); + +namespace { - struct Div_8uc4_32f : binary_function - { - __device__ __forceinline__ uint operator ()(uint a, float b) const - { - uint res = 0; - - if (b != 0) - { - b = 1.0f / b; - res |= (saturate_cast((0xffu & (a )) * b) ); - res |= (saturate_cast((0xffu & (a >> 8)) * b) << 8); - res |= (saturate_cast((0xffu & (a >> 16)) * b) << 16); - res |= (saturate_cast((0xffu & (a >> 24)) * b) << 24); - } - - return res; - } - }; - - struct Div_16sc4_32f : binary_function - { - __device__ __forceinline__ short4 operator ()(short4 a, float b) const - { - return b != 0 ? make_short4(saturate_cast(a.x / b), saturate_cast(a.y / b), - saturate_cast(a.z / b), saturate_cast(a.w / b)) - : make_short4(0,0,0,0); - } - }; - - template struct Div : binary_function + template struct DivOp : binary_function { __device__ __forceinline__ D operator ()(T a, T b) const { return b != 0 ? saturate_cast(a / b) : 0; } - - __host__ __device__ __forceinline__ Div() {} - __host__ __device__ __forceinline__ Div(const Div&) {} }; - template struct Div : binary_function + template struct DivOp : binary_function { __device__ __forceinline__ float operator ()(T a, T b) const { - return b != 0 ? static_cast(a) / b : 0; + return b != 0 ? static_cast(a) / b : 0.0f; } - - __host__ __device__ __forceinline__ Div() {} - __host__ __device__ __forceinline__ Div(const Div&) {} }; - template struct Div : binary_function + template struct DivOp : binary_function { __device__ __forceinline__ double operator ()(T a, T b) const { - return b != 0 ? static_cast(a) / b : 0; + return b != 0 ? static_cast(a) / b : 0.0; } - - __host__ __device__ __forceinline__ Div() {} - __host__ __device__ __forceinline__ Div(const Div&) {} }; - template struct DivScale : binary_function + template struct DivScaleOp : binary_function { S scale; - __host__ explicit DivScale(S scale_) : scale(scale_) {} - __device__ __forceinline__ D operator ()(T a, T b) const { return b != 0 ? saturate_cast(scale * a / b) : 0; } }; -} -namespace cv { namespace cuda { namespace device -{ - template <> struct TransformFunctorTraits : arithm::ArithmFuncTraits + template struct TransformPolicy : DefaultTransformPolicy { }; - - template struct TransformFunctorTraits< arithm::Div > : arithm::ArithmFuncTraits + template <> struct TransformPolicy : DefaultTransformPolicy { + enum { + shift = 1 + }; }; - template struct TransformFunctorTraits< arithm::DivScale > : arithm::ArithmFuncTraits - { - }; -}}} - -namespace arithm -{ - void divMat_8uc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) - { - device::transform(src1, src2, dst, Div_8uc4_32f(), WithOutMask(), stream); - } - - void divMat_16sc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) - { - device::transform(src1, src2, dst, Div_16sc4_32f(), WithOutMask(), stream); - } - template - void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream) + void divMatImpl(const GpuMat& src1, const GpuMat& src2, const GpuMat& dst, double scale, Stream& stream) { if (scale == 1) { - Div op; - device::transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); + DivOp op; + gridTransformBinary_< TransformPolicy >(globPtr(src1), globPtr(src2), globPtr(dst), op, stream); } else { - DivScale op(static_cast(scale)); - device::transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); + DivScaleOp op; + op.scale = static_cast(scale); + gridTransformBinary_< TransformPolicy >(globPtr(src1), globPtr(src2), globPtr(dst), op, stream); } } - - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - - //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - - //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - - //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - - //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - - //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); } -#endif // CUDA_DISABLER +void divMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& stream, int) +{ + typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, const GpuMat& dst, double scale, Stream& stream); + static const func_t funcs[7][7] = + { + { + divMatImpl, + divMatImpl, + divMatImpl, + divMatImpl, + divMatImpl, + divMatImpl, + divMatImpl + }, + { + divMatImpl, + divMatImpl, + divMatImpl, + divMatImpl, + divMatImpl, + divMatImpl, + divMatImpl + }, + { + 0 /*divMatImpl*/, + 0 /*divMatImpl*/, + divMatImpl, + divMatImpl, + divMatImpl, + divMatImpl, + divMatImpl + }, + { + 0 /*divMatImpl*/, + 0 /*divMatImpl*/, + divMatImpl, + divMatImpl, + divMatImpl, + divMatImpl, + divMatImpl + }, + { + 0 /*divMatImpl*/, + 0 /*divMatImpl*/, + 0 /*divMatImpl*/, + 0 /*divMatImpl*/, + divMatImpl, + divMatImpl, + divMatImpl + }, + { + 0 /*divMatImpl*/, + 0 /*divMatImpl*/, + 0 /*divMatImpl*/, + 0 /*divMatImpl*/, + 0 /*divMatImpl*/, + divMatImpl, + divMatImpl + }, + { + 0 /*divMatImpl*/, + 0 /*divMatImpl*/, + 0 /*divMatImpl*/, + 0 /*divMatImpl*/, + 0 /*divMatImpl*/, + 0 /*divMatImpl*/, + divMatImpl + } + }; + + const int sdepth = src1.depth(); + const int ddepth = dst.depth(); + + CV_DbgAssert( sdepth < 7 && ddepth < 7 ); + + GpuMat src1_ = src1.reshape(1); + GpuMat src2_ = src2.reshape(1); + GpuMat dst_ = dst.reshape(1); + + const func_t func = funcs[sdepth][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, src2_, dst_, scale, stream); +} + +namespace +{ + template + struct DivOpSpecial : binary_function + { + __device__ __forceinline__ T operator ()(const T& a, float b) const + { + typedef typename VecTraits::elem_type elem_type; + + T res = VecTraits::all(0); + + if (b != 0) + { + b = 1.0f / b; + res.x = saturate_cast(a.x * b); + res.y = saturate_cast(a.y * b); + res.z = saturate_cast(a.z * b); + res.w = saturate_cast(a.w * b); + } + + return res; + } + }; +} + +void divMat_8uc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) +{ + gridTransformBinary(globPtr(src1), globPtr(src2), globPtr(dst), DivOpSpecial(), stream); +} + +void divMat_16sc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) +{ + gridTransformBinary(globPtr(src1), globPtr(src2), globPtr(dst), DivOpSpecial(), stream); +} + +#endif diff --git a/modules/cudaarithm/src/cuda/div_scalar.cu b/modules/cudaarithm/src/cuda/div_scalar.cu index 9ee1727a75..186e1766e9 100644 --- a/modules/cudaarithm/src/cuda/div_scalar.cu +++ b/modules/cudaarithm/src/cuda/div_scalar.cu @@ -40,129 +40,225 @@ // //M*/ -#if !defined CUDA_DISABLER +#include "opencv2/opencv_modules.hpp" -#include "opencv2/core/cuda/common.hpp" -#include "opencv2/core/cuda/functional.hpp" -#include "opencv2/core/cuda/transform.hpp" -#include "opencv2/core/cuda/saturate_cast.hpp" -#include "opencv2/core/cuda/simd_functions.hpp" +#ifndef HAVE_OPENCV_CUDEV -#include "arithm_func_traits.hpp" +#error "opencv_cudev is required" -using namespace cv::cuda; -using namespace cv::cuda::device; +#else -namespace arithm +#include "opencv2/cudev.hpp" + +using namespace cv::cudev; + +void divScalar(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, double scale, Stream& stream, int); + +namespace { - template struct DivScalar : unary_function + template struct SafeDiv; + template <> struct SafeDiv<1> { - S val; - - __host__ explicit DivScalar(S val_) : val(val_) {} - - __device__ __forceinline__ D operator ()(T a) const + template + __device__ __forceinline__ static T op(T a, T b) { - return saturate_cast(a / val); + return b != 0 ? a / b : 0; + } + }; + template <> struct SafeDiv<2> + { + template + __device__ __forceinline__ static T op(const T& a, const T& b) + { + T res; + + res.x = b.x != 0 ? a.x / b.x : 0; + res.y = b.y != 0 ? a.y / b.y : 0; + + return res; + } + }; + template <> struct SafeDiv<3> + { + template + __device__ __forceinline__ static T op(const T& a, const T& b) + { + T res; + + res.x = b.x != 0 ? a.x / b.x : 0; + res.y = b.y != 0 ? a.y / b.y : 0; + res.z = b.z != 0 ? a.z / b.z : 0; + + return res; + } + }; + template <> struct SafeDiv<4> + { + template + __device__ __forceinline__ static T op(const T& a, const T& b) + { + T res; + + res.x = b.x != 0 ? a.x / b.x : 0; + res.y = b.y != 0 ? a.y / b.y : 0; + res.z = b.z != 0 ? a.z / b.z : 0; + res.w = b.w != 0 ? a.w / b.w : 0; + + return res; } }; - template struct DivScalarInv : unary_function + template struct DivScalarOp : unary_function { - S val; + ScalarType val; - explicit DivScalarInv(S val_) : val(val_) {} - - __device__ __forceinline__ D operator ()(T a) const + __device__ __forceinline__ DstType operator ()(SrcType a) const { - return a != 0 ? saturate_cast(val / a) : 0; + return saturate_cast(SafeDiv::cn>::op(saturate_cast(a), val)); } }; -} -namespace cv { namespace cuda { namespace device -{ - template struct TransformFunctorTraits< arithm::DivScalar > : arithm::ArithmFuncTraits + template struct DivScalarOpInv : unary_function { + ScalarType val; + + __device__ __forceinline__ DstType operator ()(SrcType a) const + { + return saturate_cast(SafeDiv::cn>::op(val, saturate_cast(a))); + } }; - template struct TransformFunctorTraits< arithm::DivScalarInv > : arithm::ArithmFuncTraits + template struct TransformPolicy : DefaultTransformPolicy { }; -}}} - -namespace arithm -{ - template - void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream) + template <> struct TransformPolicy : DefaultTransformPolicy { + enum { + shift = 1 + }; + }; + + template + void divScalarImpl(const GpuMat& src, cv::Scalar value, bool inv, GpuMat& dst, Stream& stream) + { + typedef typename MakeVec::cn>::type ScalarType; + + cv::Scalar_ value_ = value; + if (inv) { - DivScalarInv op(static_cast(val)); - device::transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); + DivScalarOpInv op; + op.val = VecTraits::make(value_.val); + + gridTransformUnary_< TransformPolicy >(globPtr(src), globPtr(dst), op, stream); } else { - DivScalar op(static_cast(val)); - device::transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); + DivScalarOp op; + op.val = VecTraits::make(value_.val); + + gridTransformUnary_< TransformPolicy >(globPtr(src), globPtr(dst), op, stream); } } - - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - - //template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - //template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - - //template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - //template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - - //template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - //template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - //template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - //template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - - //template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - //template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - //template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - //template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - //template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - - //template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - //template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - //template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - //template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - //template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - //template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - template void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); } -#endif // CUDA_DISABLER +void divScalar(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, const GpuMat&, double scale, Stream& stream, int) +{ + typedef void (*func_t)(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, Stream& stream); + static const func_t funcs[7][7][4] = + { + { + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl} + }, + { + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl} + }, + { + {0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/}, + {0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl} + }, + { + {0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/}, + {0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl} + }, + { + {0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/}, + {0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/}, + {0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/}, + {0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl} + }, + { + {0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/}, + {0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/}, + {0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/}, + {0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/}, + {0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl} + }, + { + {0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/}, + {0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/}, + {0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/}, + {0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/}, + {0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/}, + {0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/, 0 /*divScalarImpl*/}, + {divScalarImpl, divScalarImpl, divScalarImpl, divScalarImpl} + } + }; + + const int sdepth = src.depth(); + const int ddepth = dst.depth(); + const int cn = src.channels(); + + CV_DbgAssert( sdepth < 7 && ddepth < 7 && cn <= 4 ); + + if (inv) + { + val[0] *= scale; + val[1] *= scale; + val[2] *= scale; + val[3] *= scale; + } + else + { + val[0] /= scale; + val[1] /= scale; + val[2] /= scale; + val[3] /= scale; + } + + const func_t func = funcs[sdepth][ddepth][cn - 1]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src, val, inv, dst, stream); +} + +#endif diff --git a/modules/cudaarithm/src/element_operations.cpp b/modules/cudaarithm/src/element_operations.cpp index d170166a7a..b846520e86 100644 --- a/modules/cudaarithm/src/element_operations.cpp +++ b/modules/cudaarithm/src/element_operations.cpp @@ -401,229 +401,11 @@ void cv::cuda::multiply(InputArray _src1, InputArray _src2, OutputArray _dst, do //////////////////////////////////////////////////////////////////////// // divide -namespace arithm -{ - void divMat_8uc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream); +void divMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& stream, int); +void divMat_8uc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream); +void divMat_16sc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream); - void divMat_16sc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream); - - template - void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); -} - -static void divMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& _stream, int) -{ - typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - static const func_t funcs[7][7] = - { - { - arithm::divMat, - arithm::divMat, - arithm::divMat, - arithm::divMat, - arithm::divMat, - arithm::divMat, - arithm::divMat - }, - { - arithm::divMat, - arithm::divMat, - arithm::divMat, - arithm::divMat, - arithm::divMat, - arithm::divMat, - arithm::divMat - }, - { - 0 /*arithm::divMat*/, - 0 /*arithm::divMat*/, - arithm::divMat, - arithm::divMat, - arithm::divMat, - arithm::divMat, - arithm::divMat - }, - { - 0 /*arithm::divMat*/, - 0 /*arithm::divMat*/, - arithm::divMat, - arithm::divMat, - arithm::divMat, - arithm::divMat, - arithm::divMat - }, - { - 0 /*arithm::divMat*/, - 0 /*arithm::divMat*/, - 0 /*arithm::divMat*/, - 0 /*arithm::divMat*/, - arithm::divMat, - arithm::divMat, - arithm::divMat - }, - { - 0 /*arithm::divMat*/, - 0 /*arithm::divMat*/, - 0 /*arithm::divMat*/, - 0 /*arithm::divMat*/, - 0 /*arithm::divMat*/, - arithm::divMat, - arithm::divMat - }, - { - 0 /*arithm::divMat*/, - 0 /*arithm::divMat*/, - 0 /*arithm::divMat*/, - 0 /*arithm::divMat*/, - 0 /*arithm::divMat*/, - 0 /*arithm::divMat*/, - arithm::divMat - } - }; - - const int sdepth = src1.depth(); - const int ddepth = dst.depth(); - const int cn = src1.channels(); - - cudaStream_t stream = StreamAccessor::getStream(_stream); - - PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); - PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); - PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); - - const func_t func = funcs[sdepth][ddepth]; - - if (!func) - CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); - - func(src1_, src2_, dst_, scale, stream); -} - -namespace arithm -{ - template - void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); -} - -static void divScalar(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat&, double scale, Stream& _stream, int) -{ - typedef void (*func_t)(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); - static const func_t funcs[7][7] = - { - { - arithm::divScalar, - arithm::divScalar, - arithm::divScalar, - arithm::divScalar, - arithm::divScalar, - arithm::divScalar, - arithm::divScalar - }, - { - arithm::divScalar, - arithm::divScalar, - arithm::divScalar, - arithm::divScalar, - arithm::divScalar, - arithm::divScalar, - arithm::divScalar - }, - { - 0 /*arithm::divScalar*/, - 0 /*arithm::divScalar*/, - arithm::divScalar, - arithm::divScalar, - arithm::divScalar, - arithm::divScalar, - arithm::divScalar - }, - { - 0 /*arithm::divScalar*/, - 0 /*arithm::divScalar*/, - arithm::divScalar, - arithm::divScalar, - arithm::divScalar, - arithm::divScalar, - arithm::divScalar - }, - { - 0 /*arithm::divScalar*/, - 0 /*arithm::divScalar*/, - 0 /*arithm::divScalar*/, - 0 /*arithm::divScalar*/, - arithm::divScalar, - arithm::divScalar, - arithm::divScalar - }, - { - 0 /*arithm::divScalar*/, - 0 /*arithm::divScalar*/, - 0 /*arithm::divScalar*/, - 0 /*arithm::divScalar*/, - 0 /*arithm::divScalar*/, - arithm::divScalar, - arithm::divScalar - }, - { - 0 /*arithm::divScalar*/, - 0 /*arithm::divScalar*/, - 0 /*arithm::divScalar*/, - 0 /*arithm::divScalar*/, - 0 /*arithm::divScalar*/, - 0 /*arithm::divScalar*/, - arithm::divScalar - } - }; - - typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream); - static const npp_func_t npp_funcs[7][4] = - { - {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, - {0 , 0, 0 , 0 }, - {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, - {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, - {NppArithmScalar::call, 0, NppArithmScalar::call, 0 }, - {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, - {0 , 0, 0 , 0 } - }; - - const int sdepth = src.depth(); - const int ddepth = dst.depth(); - const int cn = src.channels(); - - cudaStream_t stream = StreamAccessor::getStream(_stream); - - if (inv) - { - val[0] *= scale; - val[1] *= scale; - val[2] *= scale; - val[3] *= scale; - } - else - { - val[0] /= scale; - val[1] /= scale; - val[2] /= scale; - val[3] /= scale; - } - - const npp_func_t npp_func = npp_funcs[sdepth][cn - 1]; - if (ddepth == sdepth && cn > 1 && npp_func != 0 && !inv) - { - npp_func(src, val, dst, stream); - return; - } - - CV_Assert( cn == 1 ); - - const func_t func = funcs[sdepth][ddepth]; - - if (!func) - CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); - - func(src, val[0], inv, dst, stream); -} +void divScalar(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, double scale, Stream& stream, int); void cv::cuda::divide(InputArray _src1, InputArray _src2, OutputArray _dst, double scale, int dtype, Stream& stream) { @@ -637,7 +419,7 @@ void cv::cuda::divide(InputArray _src1, InputArray _src2, OutputArray _dst, doub _dst.create(src1.size(), src1.type()); GpuMat dst = _dst.getGpuMat(); - arithm::divMat_8uc4_32f(src1, src2, dst, StreamAccessor::getStream(stream)); + divMat_8uc4_32f(src1, src2, dst, stream); } else if (_src1.type() == CV_16SC4 && _src2.type() == CV_32FC1) { @@ -649,7 +431,7 @@ void cv::cuda::divide(InputArray _src1, InputArray _src2, OutputArray _dst, doub _dst.create(src1.size(), src1.type()); GpuMat dst = _dst.getGpuMat(); - arithm::divMat_16sc4_32f(src1, src2, dst, StreamAccessor::getStream(stream)); + divMat_16sc4_32f(src1, src2, dst, stream); } else {