diff --git a/modules/cudaarithm/src/cuda/sub_mat.cu b/modules/cudaarithm/src/cuda/sub_mat.cu index 873b73c46a..ec8d229229 100644 --- a/modules/cudaarithm/src/cuda/sub_mat.cu +++ b/modules/cudaarithm/src/cuda/sub_mat.cu @@ -40,146 +40,186 @@ // //M*/ -#if !defined CUDA_DISABLER +#include "opencv2/opencv_modules.hpp" -#include "opencv2/core/cuda/common.hpp" -#include "opencv2/core/cuda/functional.hpp" -#include "opencv2/core/cuda/transform.hpp" -#include "opencv2/core/cuda/saturate_cast.hpp" -#include "opencv2/core/cuda/simd_functions.hpp" +#ifndef HAVE_OPENCV_CUDEV -#include "arithm_func_traits.hpp" +#error "opencv_cudev is required" -using namespace cv::cuda; -using namespace cv::cuda::device; +#else -namespace arithm +#include "opencv2/cudev.hpp" + +using namespace cv::cudev; + +void subMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int); + +namespace { - struct VSub4 : binary_function - { - __device__ __forceinline__ uint operator ()(uint a, uint b) const - { - return vsub4(a, b); - } - - __host__ __device__ __forceinline__ VSub4() {} - __host__ __device__ __forceinline__ VSub4(const VSub4&) {} - }; - - struct VSub2 : binary_function - { - __device__ __forceinline__ uint operator ()(uint a, uint b) const - { - return vsub2(a, b); - } - - __host__ __device__ __forceinline__ VSub2() {} - __host__ __device__ __forceinline__ VSub2(const VSub2&) {} - }; - - template struct SubMat : binary_function + template struct SubOp1 : binary_function { __device__ __forceinline__ D operator ()(T a, T b) const { return saturate_cast(a - b); } - - __host__ __device__ __forceinline__ SubMat() {} - __host__ __device__ __forceinline__ SubMat(const SubMat&) {} }; -} - -namespace cv { namespace cuda { namespace device -{ - template <> struct TransformFunctorTraits< arithm::VSub4 > : arithm::ArithmFuncTraits - { - }; - - template <> struct TransformFunctorTraits< arithm::VSub2 > : arithm::ArithmFuncTraits - { - }; - - template struct TransformFunctorTraits< arithm::SubMat > : arithm::ArithmFuncTraits - { - }; -}}} - -namespace arithm -{ - void subMat_v4(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream) - { - device::transform(src1, src2, dst, VSub4(), WithOutMask(), stream); - } - - void subMat_v2(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream) - { - device::transform(src1, src2, dst, VSub2(), WithOutMask(), stream); - } template - void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) + void subMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream) { if (mask.data) - device::transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, SubMat(), mask, stream); + gridTransformBinary(globPtr(src1), globPtr(src2), globPtr(dst), SubOp1(), globPtr(mask), stream); else - device::transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, SubMat(), WithOutMask(), stream); + gridTransformBinary(globPtr(src1), globPtr(src2), globPtr(dst), SubOp1(), stream); } - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb 
src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + struct SubOp2 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + return vsub2(a, b); + } + }; - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + void subMat_v2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) + { + const int vcols = src1.cols >> 1; - //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + GlobPtrSz src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols); + GlobPtrSz src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols); + GlobPtrSz dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols); - //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + gridTransformBinary(src1_, src2_, dst_, SubOp2(), stream); + } - //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subMat(PtrStepSzb src1, PtrStepSzb src2, 
PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + struct SubOp4 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + return vsub4(a, b); + } + }; - //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + void subMat_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) + { + const int vcols = src1.cols >> 2; - //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + GlobPtrSz src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols); + GlobPtrSz src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols); + GlobPtrSz dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols); + + gridTransformBinary(src1_, src2_, dst_, SubOp4(), stream); + } } -#endif // CUDA_DISABLER +void subMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int) +{ + typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream); + static const func_t funcs[7][7] = + { + { + subMat_v1, + subMat_v1, + subMat_v1, + subMat_v1, + subMat_v1, + subMat_v1, + subMat_v1 + }, + { + subMat_v1, + subMat_v1, + subMat_v1, + subMat_v1, + subMat_v1, + subMat_v1, + subMat_v1 + }, + { + 0 /*subMat_v1*/, + 0 /*subMat_v1*/, + subMat_v1, + subMat_v1, + subMat_v1, + subMat_v1, + subMat_v1 + }, + { + 0 /*subMat_v1*/, + 0 /*subMat_v1*/, + subMat_v1, + subMat_v1, + subMat_v1, + subMat_v1, + subMat_v1 + }, + { + 0 /*subMat_v1*/, + 0 /*subMat_v1*/, + 0 /*subMat_v1*/, + 0 /*subMat_v1*/, + subMat_v1, + subMat_v1, + subMat_v1 + }, + { + 0 /*subMat_v1*/, + 0 /*subMat_v1*/, + 0 /*subMat_v1*/, + 0 /*subMat_v1*/, + 0 /*subMat_v1*/, + subMat_v1, + subMat_v1 + }, + { + 0 /*subMat_v1*/, + 0 /*subMat_v1*/, + 0 /*subMat_v1*/, + 0 
/*subMat_v1*/, + 0 /*subMat_v1*/, + 0 /*subMat_v1*/, + subMat_v1 + } + }; + + const int sdepth = src1.depth(); + const int ddepth = dst.depth(); + + CV_DbgAssert( sdepth < 7 && ddepth < 7 ); + + GpuMat src1_ = src1.reshape(1); + GpuMat src2_ = src2.reshape(1); + GpuMat dst_ = dst.reshape(1); + + if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth) + { + const intptr_t src1ptr = reinterpret_cast(src1_.data); + const intptr_t src2ptr = reinterpret_cast(src2_.data); + const intptr_t dstptr = reinterpret_cast(dst_.data); + + const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; + + if (isAllAligned) + { + if (sdepth == CV_8U && (src1_.cols & 3) == 0) + { + subMat_v4(src1_, src2_, dst_, stream); + return; + } + else if (sdepth == CV_16U && (src1_.cols & 1) == 0) + { + subMat_v2(src1_, src2_, dst_, stream); + return; + } + } + } + + const func_t func = funcs[sdepth][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, src2_, dst_, mask, stream); +} + +#endif diff --git a/modules/cudaarithm/src/cuda/sub_scalar.cu b/modules/cudaarithm/src/cuda/sub_scalar.cu index 5f4ef66a9c..35cea8cbe3 100644 --- a/modules/cudaarithm/src/cuda/sub_scalar.cu +++ b/modules/cudaarithm/src/cuda/sub_scalar.cu @@ -40,110 +40,164 @@ // //M*/ -#if !defined CUDA_DISABLER +#include "opencv2/opencv_modules.hpp" -#include "opencv2/core/cuda/common.hpp" -#include "opencv2/core/cuda/functional.hpp" -#include "opencv2/core/cuda/transform.hpp" -#include "opencv2/core/cuda/saturate_cast.hpp" -#include "opencv2/core/cuda/simd_functions.hpp" +#ifndef HAVE_OPENCV_CUDEV -#include "arithm_func_traits.hpp" +#error "opencv_cudev is required" -using namespace cv::cuda; -using namespace cv::cuda::device; +#else -namespace arithm +#include "opencv2/cudev.hpp" + +using namespace cv::cudev; + +void subScalar(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int); + +namespace { - template struct SubScalar : unary_function + template struct SubScalarOp : unary_function { - S val; - int scale; + ScalarType val; - __host__ SubScalar(S val_, int scale_) : val(val_), scale(scale_) {} - - __device__ __forceinline__ D operator ()(T a) const + __device__ __forceinline__ DstType operator ()(SrcType a) const { - return saturate_cast(scale * (a - val)); + return saturate_cast(saturate_cast(a) - val); } }; -} -namespace cv { namespace cuda { namespace device -{ - template struct TransformFunctorTraits< arithm::SubScalar > : arithm::ArithmFuncTraits + template struct SubScalarOpInv : unary_function + { + ScalarType val; + + __device__ __forceinline__ DstType operator ()(SrcType a) const + { + return saturate_cast(val - saturate_cast(a)); + } + }; + + template struct TransformPolicy : DefaultTransformPolicy { }; -}}} - -namespace arithm -{ - template - void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) + template <> struct TransformPolicy : DefaultTransformPolicy { - SubScalar op(static_cast(val), inv ? 
-1 : 1); + enum { + shift = 1 + }; + }; - if (mask.data) - device::transform((PtrStepSz) src1, (PtrStepSz) dst, op, mask, stream); + template + void subScalarImpl(const GpuMat& src, cv::Scalar value, bool inv, GpuMat& dst, const GpuMat& mask, Stream& stream) + { + typedef typename MakeVec::cn>::type ScalarType; + + cv::Scalar_ value_ = value; + + if (inv) + { + SubScalarOpInv op; + op.val = VecTraits::make(value_.val); + + if (mask.data) + gridTransformUnary_< TransformPolicy >(globPtr(src), globPtr(dst), op, globPtr(mask), stream); + else + gridTransformUnary_< TransformPolicy >(globPtr(src), globPtr(dst), op, stream); + } else - device::transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); + { + SubScalarOp op; + op.val = VecTraits::make(value_.val); + + if (mask.data) + gridTransformUnary_< TransformPolicy >(globPtr(src), globPtr(dst), op, globPtr(mask), stream); + else + gridTransformUnary_< TransformPolicy >(globPtr(src), globPtr(dst), op, stream); + } } - - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - - //template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - - //template void subScalar(PtrStepSzb src1, double val, bool inv, 
PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - - //template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - - //template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - - //template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - template void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); } -#endif // CUDA_DISABLER +void subScalar(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int) +{ + typedef void (*func_t)(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, Stream& stream); + 
static const func_t funcs[7][7][4] = + { + { + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl} + }, + { + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl} + }, + { + {0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/}, + {0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl} + }, + { + {0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/}, + {0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl} + }, + { + {0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/}, + {0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/}, + {0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/}, + {0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl} + }, + { + {0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/}, + {0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/}, + {0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/}, + {0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/}, + {0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl} + }, + { + {0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/}, + {0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/}, + {0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/}, + {0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/}, + {0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/}, + {0 
/*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/, 0 /*subScalarImpl*/}, + {subScalarImpl, subScalarImpl, subScalarImpl, subScalarImpl} + } + }; + + const int sdepth = src.depth(); + const int ddepth = dst.depth(); + const int cn = src.channels(); + + CV_DbgAssert( sdepth < 7 && ddepth < 7 && cn <= 4 ); + + const func_t func = funcs[sdepth][ddepth][cn - 1]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src, val, inv, dst, mask, stream); +} + +#endif diff --git a/modules/cudaarithm/src/element_operations.cpp b/modules/cudaarithm/src/element_operations.cpp index b3711dcc1d..071c79368f 100644 --- a/modules/cudaarithm/src/element_operations.cpp +++ b/modules/cudaarithm/src/element_operations.cpp @@ -348,248 +348,9 @@ void cv::cuda::add(InputArray src1, InputArray src2, OutputArray dst, InputArray //////////////////////////////////////////////////////////////////////// // subtract -namespace arithm -{ - void subMat_v4(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); - void subMat_v2(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); +void subMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int); - template - void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); -} - -static void subMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int) -{ - typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - static const func_t funcs[7][7] = - { - { - arithm::subMat, - arithm::subMat, - arithm::subMat, - arithm::subMat, - arithm::subMat, - arithm::subMat, - arithm::subMat - }, - { - arithm::subMat, - arithm::subMat, - arithm::subMat, - arithm::subMat, - arithm::subMat, - arithm::subMat, - arithm::subMat - }, - { - 0 /*arithm::subMat*/, - 0 /*arithm::subMat*/, - arithm::subMat, - arithm::subMat, - arithm::subMat, - arithm::subMat, - arithm::subMat - }, - { - 0 /*arithm::subMat*/, - 0 /*arithm::subMat*/, - arithm::subMat, - arithm::subMat, - arithm::subMat, - arithm::subMat, - arithm::subMat - }, - { - 0 /*arithm::subMat*/, - 0 /*arithm::subMat*/, - 0 /*arithm::subMat*/, - 0 /*arithm::subMat*/, - arithm::subMat, - arithm::subMat, - arithm::subMat - }, - { - 0 /*arithm::subMat*/, - 0 /*arithm::subMat*/, - 0 /*arithm::subMat*/, - 0 /*arithm::subMat*/, - 0 /*arithm::subMat*/, - arithm::subMat, - arithm::subMat - }, - { - 0 /*arithm::subMat*/, - 0 /*arithm::subMat*/, - 0 /*arithm::subMat*/, - 0 /*arithm::subMat*/, - 0 /*arithm::subMat*/, - 0 /*arithm::subMat*/, - arithm::subMat - } - }; - - const int sdepth = src1.depth(); - const int ddepth = dst.depth(); - const int cn = src1.channels(); - - cudaStream_t stream = StreamAccessor::getStream(_stream); - - PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); - PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); - PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); - - if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth) - { - const intptr_t src1ptr = reinterpret_cast(src1_.data); - const intptr_t src2ptr = reinterpret_cast(src2_.data); - const intptr_t dstptr = reinterpret_cast(dst_.data); - - const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; - - if (isAllAligned) - { - if (sdepth == CV_8U && (src1_.cols & 3) == 0) - { - const 
int vcols = src1_.cols >> 2; - - arithm::subMat_v4(PtrStepSz(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step), - PtrStepSz(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step), - PtrStepSz(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step), - stream); - - return; - } - else if (sdepth == CV_16U && (src1_.cols & 1) == 0) - { - const int vcols = src1_.cols >> 1; - - arithm::subMat_v2(PtrStepSz(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step), - PtrStepSz(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step), - PtrStepSz(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step), - stream); - - return; - } - } - } - - const func_t func = funcs[sdepth][ddepth]; - - if (!func) - CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); - - func(src1_, src2_, dst_, mask, stream); -} - -namespace arithm -{ - template - void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); -} - -static void subScalar(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int) -{ - typedef void (*func_t)(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - static const func_t funcs[7][7] = - { - { - arithm::subScalar, - arithm::subScalar, - arithm::subScalar, - arithm::subScalar, - arithm::subScalar, - arithm::subScalar, - arithm::subScalar - }, - { - arithm::subScalar, - arithm::subScalar, - arithm::subScalar, - arithm::subScalar, - arithm::subScalar, - arithm::subScalar, - arithm::subScalar - }, - { - 0 /*arithm::subScalar*/, - 0 /*arithm::subScalar*/, - arithm::subScalar, - arithm::subScalar, - arithm::subScalar, - arithm::subScalar, - arithm::subScalar - }, - { - 0 /*arithm::subScalar*/, - 0 /*arithm::subScalar*/, - arithm::subScalar, - arithm::subScalar, - arithm::subScalar, - arithm::subScalar, - arithm::subScalar - }, - { - 0 /*arithm::subScalar*/, - 0 /*arithm::subScalar*/, - 0 /*arithm::subScalar*/, - 0 /*arithm::subScalar*/, - arithm::subScalar, - arithm::subScalar, - arithm::subScalar - }, - { - 0 /*arithm::subScalar*/, - 0 /*arithm::subScalar*/, - 0 /*arithm::subScalar*/, - 0 /*arithm::subScalar*/, - 0 /*arithm::subScalar*/, - arithm::subScalar, - arithm::subScalar - }, - { - 0 /*arithm::subScalar*/, - 0 /*arithm::subScalar*/, - 0 /*arithm::subScalar*/, - 0 /*arithm::subScalar*/, - 0 /*arithm::subScalar*/, - 0 /*arithm::subScalar*/, - arithm::subScalar - } - }; - - typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream); - static const npp_func_t npp_funcs[7][4] = - { - {NppArithmScalar::call, 0 , NppArithmScalar::call, NppArithmScalar::call}, - {0 , 0 , 0 , 0 }, - {NppArithmScalar::call, 0 , NppArithmScalar::call, NppArithmScalar::call}, - {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call}, - {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, 0 }, - {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call}, - {0 , 0 , 0 , 0 } - }; - - const int sdepth = src.depth(); - const int ddepth = dst.depth(); - const int cn = src.channels(); - - cudaStream_t stream = StreamAccessor::getStream(_stream); - - const npp_func_t npp_func = npp_funcs[sdepth][cn - 1]; - if (ddepth == sdepth && cn > 1 && npp_func != 0 && !inv) - { - npp_func(src, val, dst, stream); - return; - } - - CV_Assert( cn == 1 ); - - const func_t func = funcs[sdepth][ddepth]; - - if (!func) - 
        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
-
-    func(src, val[0], inv, dst, mask, stream);
-}
+void subScalar(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int);
 
 void cv::cuda::subtract(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, int dtype, Stream& stream)
 {
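
Both rewritten kernels reduce to the same cv::cudev idiom that the added lines above call directly: a small device functor plus gridTransformBinary over globPtr views of GpuMat data. For readers unfamiliar with that module, here is a minimal stand-alone sketch of the pattern for one fixed type pair (CV_8U inputs, CV_16S output). The helper name subU8ToS16 and the fixed types are illustrative assumptions, not part of this patch, and the sketch must be built as a .cu translation unit with the opencv_cudev module available.

// Sketch only: mirrors the SubOp1 / gridTransformBinary pattern used in sub_mat.cu.
#include "opencv2/core/cuda.hpp"
#include "opencv2/cudev.hpp"

namespace
{
    // Saturating per-element subtraction, same shape as SubOp1 above.
    struct SubU8ToS16 : cv::cudev::binary_function<uchar, uchar, short>
    {
        __device__ __forceinline__ short operator ()(uchar a, uchar b) const
        {
            return cv::cudev::saturate_cast<short>(a - b);
        }
    };
}

// Subtract two CV_8UC1 images into a CV_16SC1 result on the given stream.
void subU8ToS16(const cv::cuda::GpuMat& src1, const cv::cuda::GpuMat& src2,
                cv::cuda::GpuMat& dst, cv::cuda::Stream& stream)
{
    CV_Assert( src1.type() == CV_8UC1 && src2.type() == CV_8UC1 && src1.size() == src2.size() );

    dst.create(src1.size(), CV_16SC1);

    cv::cudev::gridTransformBinary(cv::cudev::globPtr<uchar>(src1),
                                   cv::cudev::globPtr<uchar>(src2),
                                   cv::cudev::globPtr<short>(dst),
                                   SubU8ToS16(),
                                   stream);
}

At the public level the same path is reached through cv::cuda::subtract(src1, src2, dst, mask, dtype, stream), whose definition opens at the end of the hunk above and forwards to the subMat/subScalar dispatchers introduced in this patch.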