diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp index 9e12f6729a..d1fd1501ee 100644 --- a/modules/core/include/opencv2/core/gpumat.hpp +++ b/modules/core/include/opencv2/core/gpumat.hpp @@ -448,7 +448,7 @@ namespace cv { namespace gpu { int area = rows * cols; if (!m.isContinuous() || m.type() != type || m.size().area() != area) - m.create(1, area, type); + ensureSizeIsEnough(1, area, type, m); m = m.reshape(0, rows); } diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu index 59ea3e04c4..b4d72f0cea 100644 --- a/modules/gpu/src/cuda/element_operations.cu +++ b/modules/gpu/src/cuda/element_operations.cu @@ -1058,12 +1058,12 @@ namespace cv { namespace gpu { namespace device ::cv::gpu::device::transform((DevMem2D_)src1, (DevMem2D_)src2, (DevMem2D_)dst, Absdiff(), stream); } - //template void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); + template void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); template void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); template void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); template void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); - //template void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); - //template void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); + template void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); + template void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); template void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); template struct AbsdiffScalar : unary_function diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp index 4d44957a38..3081303ef2 100644 --- a/modules/gpu/src/element_operations.cpp +++ b/modules/gpu/src/element_operations.cpp @@ -159,7 +159,13 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu cudaStream_t stream = StreamAccessor::getStream(s); - if (mask.empty() && dst.type() == src1.type() && (src1.depth() == CV_8U || src1.depth() == CV_32S || src1.depth() == CV_32F)) + bool useNpp = + mask.empty() && + dst.type() == src1.type() && + (src1.depth() == CV_8U || src1.depth() == CV_32S || src1.depth() == CV_32F) && + (isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16)); + + if (useNpp) { nppArithmCaller(src1, src2, dst, nppiAdd_8u_C1RSfs, nppiAdd_8u_C4RSfs, nppiAdd_32s_C1R, nppiAdd_32f_C1R, stream); return; @@ -271,7 +277,13 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons cudaStream_t stream = StreamAccessor::getStream(s); - if (mask.empty() && dst.type() == src1.type() && (src1.depth() == CV_8U || src1.depth() == CV_32S || src1.depth() == CV_32F)) + bool useNpp = + mask.empty() && + dst.type() == src1.type() && + (src1.depth() == CV_8U || src1.depth() == CV_32S || src1.depth() == CV_32F) && + (isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16)); + + if (useNpp) { nppArithmCaller(src2, src1, dst, nppiSub_8u_C1RSfs, nppiSub_8u_C4RSfs, nppiSub_32s_C1R, nppiSub_32f_C1R, stream); return; @@ -403,8 +415,13 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub dst.create(src1.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src1.channels())); + bool useNpp = + scale == 1 && + dst.type() == src1.type() && + (src1.depth() == CV_8U || src1.depth() == CV_32S || src1.depth() == CV_32F) && + (isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16)); - if (scale == 1 && dst.type() == src1.type() && (src1.depth() == CV_8U || src1.depth() == CV_32S || src1.depth() == CV_32F)) + if (useNpp) { nppArithmCaller(src2, src1, dst, nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, nppiMul_32s_C1R, nppiMul_32f_C1R, stream); return; @@ -528,8 +545,13 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double dst.create(src1.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src1.channels())); + bool useNpp = + scale == 1 && + dst.type() == src1.type() && + (src1.depth() == CV_8U || src1.depth() == CV_32S || src1.depth() == CV_32F) && + (isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16)); - if (scale == 1 && dst.type() == src1.type() && (src1.depth() == CV_8U || src1.depth() == CV_32S || src1.depth() == CV_32F)) + if (useNpp) { nppArithmCaller(src2, src1, dst, nppiDiv_8u_C1RSfs, nppiDiv_8u_C4RSfs, nppiDiv_32s_C1R, nppiDiv_32f_C1R, stream); return; @@ -643,7 +665,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea static const func_t funcs[] = { - 0/*absdiff_gpu*/, absdiff_gpu, absdiff_gpu, absdiff_gpu, 0/*absdiff_gpu*/, 0/*absdiff_gpu*/, absdiff_gpu + absdiff_gpu, absdiff_gpu, absdiff_gpu, absdiff_gpu, absdiff_gpu, absdiff_gpu, absdiff_gpu }; CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); @@ -656,7 +678,9 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea sz.width = src1.cols * src1.channels(); sz.height = src1.rows; - if (src1.depth() == CV_8U && (src1.cols * src1.channels()) % 4 == 0) + bool aligned = isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16); + + if (aligned && src1.depth() == CV_8U && (src1.cols * src1.channels()) % 4 == 0) { NppStreamHandler h(stream); @@ -668,7 +692,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } - else if (src1.depth() == CV_8U) + else if (aligned && src1.depth() == CV_8U) { NppStreamHandler h(stream); @@ -678,7 +702,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } - else if (src1.depth() == CV_32S) + else if (aligned && src1.depth() == CV_32S) { NppStreamHandler h(stream); @@ -688,7 +712,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } - else if (src1.depth() == CV_32F) + else if (aligned && src1.depth() == CV_32F) { NppStreamHandler h(stream); diff --git a/modules/gpu/src/opencv2/gpu/device/common.hpp b/modules/gpu/src/opencv2/gpu/device/common.hpp index 9db7afa18f..a513bccb5c 100644 --- a/modules/gpu/src/opencv2/gpu/device/common.hpp +++ b/modules/gpu/src/opencv2/gpu/device/common.hpp @@ -67,6 +67,11 @@ namespace cv { namespace gpu { void error(const char *error_string, const char *file, const int line, const char *func); + + template static inline bool isAligned(const T* ptr, size_t size) + { + return reinterpret_cast(ptr) % size == 0; + } }} static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") diff --git a/modules/gpu/src/opencv2/gpu/device/detail/transform_detail.hpp b/modules/gpu/src/opencv2/gpu/device/detail/transform_detail.hpp index 1c499b9788..e053cb6bf0 100644 --- a/modules/gpu/src/opencv2/gpu/device/detail/transform_detail.hpp +++ b/modules/gpu/src/opencv2/gpu/device/detail/transform_detail.hpp @@ -309,7 +309,7 @@ namespace cv { namespace gpu { namespace device template<> struct TransformDispatcher { template - static void call(const DevMem2D_& src, const DevMem2D_& dst, const UnOp& op, const Mask& mask, cudaStream_t stream) + static void call(DevMem2D_ src, DevMem2D_ dst, UnOp op, Mask mask, cudaStream_t stream) { typedef TransformFunctorTraits ft; @@ -324,7 +324,7 @@ namespace cv { namespace gpu { namespace device } template - static void call(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, const BinOp& op, const Mask& mask, cudaStream_t stream) + static void call(DevMem2D_ src1, DevMem2D_ src2, DevMem2D_ dst, BinOp op, Mask mask, cudaStream_t stream) { typedef TransformFunctorTraits ft; @@ -341,12 +341,18 @@ namespace cv { namespace gpu { namespace device template<> struct TransformDispatcher { template - static void call(const DevMem2D_& src, const DevMem2D_& dst, const UnOp& op, const Mask& mask, cudaStream_t stream) + static void call(DevMem2D_ src, DevMem2D_ dst, UnOp op, Mask mask, cudaStream_t stream) { typedef TransformFunctorTraits ft; StaticAssert::check(); + if (!isAligned(src.data, ft::smart_shift * sizeof(T)) || !isAligned(dst.data, ft::smart_shift * sizeof(D))) + { + TransformDispatcher::call(src, dst, op, mask, stream); + return; + } + const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1); const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1); @@ -358,12 +364,18 @@ namespace cv { namespace gpu { namespace device } template - static void call(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, const BinOp& op, const Mask& mask, cudaStream_t stream) + static void call(DevMem2D_ src1, DevMem2D_ src2, DevMem2D_ dst, BinOp op, Mask mask, cudaStream_t stream) { typedef TransformFunctorTraits ft; StaticAssert::check(); + if (!isAligned(src1.data, ft::smart_shift * sizeof(T1)) || !isAligned(src2.data, ft::smart_shift * sizeof(T2)) || !isAligned(dst.data, ft::smart_shift * sizeof(D))) + { + TransformDispatcher::call(src1, src2, dst, op, mask, stream); + return; + } + const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1); const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1); @@ -376,14 +388,14 @@ namespace cv { namespace gpu { namespace device }; template - static void transform_caller(const DevMem2D_& src, const DevMem2D_& dst, const UnOp& op, const Mask& mask, cudaStream_t stream) + static inline void transform_caller(DevMem2D_ src, DevMem2D_ dst, UnOp op, Mask mask, cudaStream_t stream) { typedef TransformFunctorTraits ft; TransformDispatcher::cn == 1 && VecTraits::cn == 1 && ft::smart_shift != 1>::call(src, dst, op, mask, stream); } template - static void transform_caller(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, const BinOp& op, const Mask& mask, cudaStream_t stream) + static inline void transform_caller(DevMem2D_ src1, DevMem2D_ src2, DevMem2D_ dst, BinOp op, Mask mask, cudaStream_t stream) { typedef TransformFunctorTraits ft; TransformDispatcher::cn == 1 && VecTraits::cn == 1 && VecTraits::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream); diff --git a/modules/gpu/src/opencv2/gpu/device/transform.hpp b/modules/gpu/src/opencv2/gpu/device/transform.hpp index f4ea1531e8..b7e1303158 100644 --- a/modules/gpu/src/opencv2/gpu/device/transform.hpp +++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp @@ -50,25 +50,25 @@ namespace cv { namespace gpu { namespace device { template - void transform(const DevMem2D_& src, const DevMem2D_& dst, const UnOp& op, cudaStream_t stream = 0) + static inline void transform(DevMem2D_ src, DevMem2D_ dst, UnOp op, cudaStream_t stream = 0) { transform_detail::transform_caller(src, dst, op, WithOutMask(), stream); } template - void transform(const DevMem2D_& src, const DevMem2D_& dst, const PtrStepb& mask, const UnOp& op, cudaStream_t stream = 0) + static inline void transform(DevMem2D_ src, DevMem2D_ dst, PtrStepb mask, UnOp op, cudaStream_t stream = 0) { transform_detail::transform_caller(src, dst, op, SingleMask(mask), stream); } template - void transform(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, const BinOp& op, cudaStream_t stream = 0) + static inline void transform(DevMem2D_ src1, DevMem2D_ src2, DevMem2D_ dst, BinOp op, cudaStream_t stream = 0) { transform_detail::transform_caller(src1, src2, dst, op, WithOutMask(), stream); } template - void transform(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, const PtrStepb& mask, const BinOp& op, cudaStream_t stream = 0) + static inline void transform(DevMem2D_ src1, DevMem2D_ src2, DevMem2D_ dst, PtrStepb mask, BinOp op, cudaStream_t stream = 0) { transform_detail::transform_caller(src1, src2, dst, op, SingleMask(mask), stream); }