Merge pull request #24647 from fengyuentau:cuda_sub

dnn cuda: support Sub #24647

Related: https://github.com/opencv/opencv/issues/24606#issuecomment-1837390257

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
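
A minimal way to exercise the new path (sketch only: the ONNX file name below is a placeholder for any graph containing `Sub` nodes, and OpenCV has to be built with `-DWITH_CUDA=ON -DOPENCV_DNN_CUDA=ON`):

```cpp
// Sketch: run a model that contains Sub nodes on the CUDA backend.
// "model_with_sub.onnx" is a placeholder path, not a file shipped with this PR.
#include <opencv2/dnn.hpp>
#include <opencv2/core.hpp>

int main()
{
    cv::dnn::Net net = cv::dnn::readNetFromONNX("model_with_sub.onnx");
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);

    cv::Mat img(cv::Size(224, 224), CV_32FC3, cv::Scalar::all(1.0));
    net.setInput(cv::dnn::blobFromImage(img));
    cv::Mat out = net.forward();  // Sub layers now run on CUDA instead of falling back to the CPU backend
    return 0;
}
```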
Yuantao Feng 2023-12-06 04:46:24 -06:00 committed by GitHub
parent f5ec92e4ca
commit a2edf4d929
5 changed files with 33 additions and 3 deletions


@@ -319,7 +319,13 @@ void eltwise_div_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x,
     eltwise_op<T, DivFunctor<T>>(stream, output, x, y);
 }
 
+template <class T>
+void eltwise_sub_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
+    eltwise_op<T, SubFunctor<T>>(stream, output, x, y);
+}
+
 #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
+template void eltwise_sub_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
 template void eltwise_div_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
 template void eltwise_prod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
 template void eltwise_sum_coeff_2(const Stream&, TensorSpan<__half>, __half, TensorView<__half>, __half, TensorView<__half>);
@@ -327,6 +333,7 @@ void eltwise_div_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x,
 template void eltwise_max_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
 template void eltwise_min_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
 #endif
+template void eltwise_sub_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
 template void eltwise_div_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
 template void eltwise_prod_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
 template void eltwise_sum_coeff_2(const Stream&, TensorSpan<float>, float, TensorView<float>, float, TensorView<float>);
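
The new `eltwise_sub_2` reuses the same functor-dispatch pattern as the existing kernels: the generic `eltwise_op` template is instantiated with a small binary functor that does the per-element work. A deliberately simplified, standalone CUDA sketch of that pattern (illustration only, not the cuda4dnn `eltwise_op`, which operates on csl tensor spans/views and a `Stream`):

```cpp
// Standalone illustration of functor-dispatched elementwise kernels.
#include <cstdio>
#include <cuda_runtime.h>

template <class T>
struct SubFunctor {
    __device__ T operator()(T x, T y) const { return x - y; }
};

// Generic kernel: applies any binary functor elementwise.
template <class T, class Functor>
__global__ void eltwise_kernel(T* out, const T* x, const T* y, int n) {
    Functor op;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = op(x[i], y[i]);
}

int main() {
    const int n = 4;
    float hx[n] = {5, 6, 7, 8}, hy[n] = {1, 2, 3, 4}, hout[n];
    float *dx, *dy, *dout;
    cudaMalloc(&dx, n * sizeof(float));
    cudaMalloc(&dy, n * sizeof(float));
    cudaMalloc(&dout, n * sizeof(float));
    cudaMemcpy(dx, hx, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dy, hy, n * sizeof(float), cudaMemcpyHostToDevice);

    eltwise_kernel<float, SubFunctor<float>><<<1, 64>>>(dout, dx, dy, n);

    cudaMemcpy(hout, dout, n * sizeof(float), cudaMemcpyDeviceToHost);  // implicit sync on the default stream
    std::printf("%g %g %g %g\n", hout[0], hout[1], hout[2], hout[3]);   // prints: 4 4 4 4
    cudaFree(dx); cudaFree(dy); cudaFree(dout);
    return 0;
}
```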


@@ -741,6 +741,18 @@ struct DivFunctor {
     CUDA4DNN_DEVICE T operator()(T x, T y) { return x / y; }
 };
 
+template <class T>
+struct SubFunctor {
+    struct Params {
+        CUDA4DNN_HOST_DEVICE Params() { }
+    };
+
+    CUDA4DNN_DEVICE SubFunctor() { }
+    CUDA4DNN_DEVICE SubFunctor(const Params& params) { }
+
+    CUDA4DNN_DEVICE T operator()(T x, T y) { return x - y; }
+};
+
 template <class T>
 struct SignFunctor {
     struct Params {


@@ -30,6 +30,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
 template <class T>
 void eltwise_div_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);
 
+template <class T>
+void eltwise_sub_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);
+
 }}}} /* namespace cv::dnn::cuda4dnn::kernels */
 
 #endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP */


@@ -27,6 +27,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
         PRODUCT,
         DIV,
         MIN,
+        SUB,
     };
 
     class EltwiseOpBase : public CUDABackendNode {
@@ -88,6 +89,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                 else
                     kernels::eltwise_sum_coeff_2<T>(stream, output, coeffs[0], input_x, coeffs[1], input_y);
                 break;
+            case EltwiseOpType::SUB: kernels::eltwise_sub_2<T>(stream, output, input_x, input_y); break;
             }
         }
         else
@@ -119,6 +121,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                     kernels::eltwise_sum_coeff_2<T>(stream, output, coeff_x, output, coeffs[i], input);
                 }
                 break;
+            case EltwiseOpType::SUB: kernels::eltwise_sub_2<T>(stream, output, output, input); break;
             }
         }
     }
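
In the multi-input branch, the new case passes the running `output` tensor as the left operand (`eltwise_sub_2<T>(stream, output, output, input)`), so subtraction over more than two inputs accumulates as a left fold, in the same pairwise order the neighbouring cases use. A small host-side illustration of that ordering (sketch, not OpenCV code):

```cpp
// Left-fold order for Sub over several inputs: result = ((x0 - x1) - x2) - ...
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    std::vector<float> inputs = {10.f, 3.f, 2.f};  // x0, x1, x2
    float result = std::accumulate(inputs.begin() + 1, inputs.end(), inputs.front(),
                                   [](float acc, float x) { return acc - x; });
    std::printf("%g\n", result);  // (10 - 3) - 2 = 5
    return 0;
}
```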


@@ -114,9 +114,11 @@
                 op == OPERATION::GREATER_EQUAL ||
                 op == OPERATION::LESS_EQUAL
         );
 
-        if (op == OPERATION::MAX || op == OPERATION::MIN || op == OPERATION::SUM ||
-            op == OPERATION::PROD || op == OPERATION::DIV || op == OPERATION::ADD)
-            return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
+        if (backendId == DNN_BACKEND_CUDA) {
+            return op == OPERATION::MAX || op == OPERATION::MIN || op == OPERATION::SUM ||
+                   op == OPERATION::PROD || op == OPERATION::DIV || op == OPERATION::ADD ||
+                   op == OPERATION::SUB;
+        }
         return backendId == DNN_BACKEND_OPENCV;
     }
@@ -828,6 +830,9 @@
             case OPERATION::ADD:
                 op_ = cuda4dnn::EltwiseOpType::SUM;
                 break;
+            case OPERATION::SUB:
+                op_ = cuda4dnn::EltwiseOpType::SUB;
+                break;
             default: return Ptr<BackendNode>(); // return empty cuda_node if the EltwiseOpType is unsupported type.
         };
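
The `default:` case still returns an empty `Ptr<BackendNode>`, so operations the CUDA backend does not map keep falling back to their existing implementations. On the application side, a hedged sketch (not part of this patch) that requests the CUDA backend only when a CUDA target is actually reported available:

```cpp
// Sketch: prefer CUDA when available, otherwise stay on the default CPU path.
#include <opencv2/dnn.hpp>
#include <algorithm>
#include <vector>

int main() {
    using namespace cv::dnn;
    Net net;  // stand-in; in practice a network loaded via readNet/readNetFromONNX

    std::vector<Target> targets = getAvailableTargets(DNN_BACKEND_CUDA);
    bool haveCuda = std::find(targets.begin(), targets.end(), DNN_TARGET_CUDA) != targets.end();
    if (haveCuda) {
        net.setPreferableBackend(DNN_BACKEND_CUDA);
        net.setPreferableTarget(DNN_TARGET_CUDA);
    } else {
        net.setPreferableBackend(DNN_BACKEND_OPENCV);  // CPU fallback
    }
    return 0;
}
```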