diff --git a/modules/dnn/src/cuda/eltwise_ops.cu b/modules/dnn/src/cuda/eltwise_ops.cu
index 8a861b3067..e2a7cc9a67 100644
--- a/modules/dnn/src/cuda/eltwise_ops.cu
+++ b/modules/dnn/src/cuda/eltwise_ops.cu
@@ -132,8 +132,23 @@ void eltwise_op(const Stream& stream, TensorSpan<T> output, TensorView<T> x, Ten
     }
     else
     {
-        CV_Assert(is_shape_compatible(output, x));
-        CV_Assert(is_shape_compatible(output, y));
+        auto inShape1 = x.shape_as_vector();
+        auto inShape2 = y.shape_as_vector();
+        auto outShape = output.shape_as_vector();
+
+        std::size_t x_ndims = inShape1.size(), y_ndims = inShape2.size();
+        if (x_ndims >= y_ndims) {
+            for (std::size_t i = 0; i < (x_ndims - y_ndims); i++) {
+                inShape2.insert(inShape2.begin(), 1);
+            }
+        } else {
+            for (std::size_t i = 0; i < (y_ndims - x_ndims); i++) {
+                inShape1.insert(inShape1.begin(), 1);
+            }
+        }
+
+        CV_Assert(is_shape_compatible1(outShape, inShape1));
+        CV_Assert(is_shape_compatible1(outShape, inShape2));
 
         /* matching singleton axes in both input tensors can be eliminated
          *
@@ -148,20 +163,21 @@ void eltwise_op(const Stream& stream, TensorSpan<T> output, TensorView<T> x, Ten
          * x: [1, 256, 32, 32] -> [256, 32, 32]
          * y: [1, 256, 1, 1] -> [256, 1, 1]
          */
-        for (int r = 0; r < output.rank(); r++)
-        {
-            while (x.rank() > r && y.rank() > r && x.get_axis_size(r) == 1 && y.get_axis_size(r) == 1) {
-                CV_Assert(output.get_axis_size(r) == 1);
-
-                x.squeeze(r);
-                y.squeeze(r);
-                output.squeeze(r);
+        int eliminate_times = 0;
+        for (std::size_t i = 0; i < outShape.size(); i++) {
+            if (inShape1[i] == 1 && inShape2[i] == 1 && outShape[i] == 1 && i != (outShape.size() - 1)) {
+                eliminate_times++;
+            } else {
+                break;
+            }
+        }
+        if (eliminate_times > 0) {
+            for (int i = 0; i < eliminate_times; i++) {
+                inShape1.erase(inShape1.begin());
+                inShape2.erase(inShape2.begin());
+                outShape.erase(outShape.begin());
             }
         }
-
-        auto inShape1 = x.shape_as_vector();
-        auto inShape2 = y.shape_as_vector();
-        auto outShape = output.shape_as_vector();
 
         /* contiguous axes that do not broadcast can be merged into one axis
          *
diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp
index 5a1286de99..8f495ac807 100644
--- a/modules/dnn/src/cuda4dnn/csl/tensor.hpp
+++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp
@@ -1187,6 +1187,23 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
         return true;
     }
 
+    template <class ShapeType>
+    bool is_shape_compatible1(const ShapeType &x_shape, const ShapeType &y_shape) noexcept {
+        const auto x_ndims = x_shape.size(), y_ndims = y_shape.size();
+
+        if (x_ndims != y_ndims) {
+            return false;
+        }
+
+        for (int i = 0; i < x_ndims; i++) {
+            if (x_shape[i] != y_shape[i] && x_shape[i] != 1 && y_shape[i] != 1) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
     /** returns the rank to which the given tensor can be squeezed to */
     template <class TensorType>
     std::size_t get_effective_rank(const TensorType& x) noexcept {
diff --git a/modules/dnn/src/layers/nary_eltwise_layers.cpp b/modules/dnn/src/layers/nary_eltwise_layers.cpp
index 661861cbe3..b22eb5bbf0 100644
--- a/modules/dnn/src/layers/nary_eltwise_layers.cpp
+++ b/modules/dnn/src/layers/nary_eltwise_layers.cpp
@@ -818,19 +818,6 @@ public:
     {
         auto context = reinterpret_cast<csl::CSLContext*>(context_);
 
-        auto input_0_shape = inputs[0].dynamicCast<CUDABackendWrapper>()->getShape();
-        for (int i = 1; i < inputs.size(); i++)
-        {
-            auto input_i_shape = inputs[i].dynamicCast<CUDABackendWrapper>()->getShape();
-            if (input_0_shape.size() != input_i_shape.size())
-                return Ptr<BackendNode>();
-            // check if the shape can be supported by `eltwise_ops.cu`, or return the default BackendNode
-            for (int j = 0; j < input_0_shape.size(); j++)
-                if (input_0_shape[j] != input_i_shape[j] &&
-                    input_0_shape[j] != 1 && input_i_shape[j] != 1)
-                    return Ptr<BackendNode>();
-        }
-
         cuda4dnn::EltwiseOpType op_ = cuda4dnn::EltwiseOpType::SUM;
         switch (op) {
             case OPERATION::MAX:
diff --git a/modules/dnn/src/net_impl_fuse.cpp b/modules/dnn/src/net_impl_fuse.cpp
index dfa542bd41..b81bf14acc 100644
--- a/modules/dnn/src/net_impl_fuse.cpp
+++ b/modules/dnn/src/net_impl_fuse.cpp
@@ -728,6 +728,10 @@ void Net::Impl::fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
                 if(inp_i_data->skip || inp_i_data->consumers.size() != 1)
                     break;
 #ifdef HAVE_CUDA
+                /* Risk: not every operation in "NaryEltwise" is supported by the CUDA backend. There is a chance
+                   that Concat's output is filled with data on both the host and the device, leading to missing data.
+                   See https://github.com/opencv/opencv/issues/24721 for more details.
+                */
                 if (preferableBackend == DNN_BACKEND_CUDA &&
                     (inp_i_data->layerInstance->supportBackend(DNN_BACKEND_CUDA) == false ||
                     (inp_i_data->layerInstance->type != "Convolution" &&
diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp
index f255ab87aa..591ec63515 100644
--- a/modules/dnn/test/test_backends.cpp
+++ b/modules/dnn/test/test_backends.cpp
@@ -102,6 +102,12 @@ public:
     Net net;
 };
 
+TEST_P(DNNTestNetwork, DISABLED_YOLOv8n) {
+    processNet("dnn/onnx/models/yolov8n.onnx", "", Size(640, 640), "output0");
+    expectNoFallbacksFromIE(net);
+    expectNoFallbacksFromCUDA(net);
+}
+
 TEST_P(DNNTestNetwork, AlexNet)
 {
     applyTestTag(CV_TEST_TAG_MEMORY_1GB);
@@ -1518,6 +1524,71 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, Eltwise, testing::Combine(
     dnnBackendsAndTargets()
 ));
 
+////////////////////////////////////////////////////////////////////////////////
+// Element-wise layers
+////////////////////////////////////////////////////////////////////////////////
+using NaryEltwiseConcat = TestWithParam<tuple<std::vector<int>, tuple<Backend, Target>>>;
+TEST_P(NaryEltwiseConcat, Accuracy) {
+    auto param = GetParam();
+    std::vector<int> input_shape = get<0>(param);
+    auto backend_id = get<0>(get<1>(param));
+    auto target_id = get<1>(get<1>(param));
+
+    /* Build the following net:
+
+                 <1x4x84>
+                /
+        [Input] -+-> Mul(B<1x84>) -> Concat(axis=1) -> [Output]
+                 |                        |
+                 +-> Sigmoid -------------+
+
+    */
+    Net net;
+
+    std::vector<int> mul_B_shape(input_shape.size() - 1, 1);
+    mul_B_shape.back() = input_shape.back();
+    Mat mul_B(mul_B_shape, CV_32FC1);
+    randn(mul_B, 0.f, 1.f);
+    LayerParams mul_B_lp;
+    mul_B_lp.name = "mul_B";
+    mul_B_lp.type = "Const";
+    mul_B_lp.blobs.push_back(mul_B);
+    int id_mul_B = net.addLayer(mul_B_lp.name, mul_B_lp.type, mul_B_lp);
+
+    LayerParams mul_lp;
+    mul_lp.name = "mul";
+    mul_lp.type = "NaryEltwise";
+    mul_lp.set("operation", "mul");
+    int id_mul = net.addLayer(mul_lp.name, mul_lp.type, mul_lp);
+    net.connect(0, 0, id_mul, 0);
+    net.connect(id_mul_B, 0, id_mul, 1);
+
+    LayerParams sigmoid_lp;
+    sigmoid_lp.name = "sigmoid";
+    sigmoid_lp.type = "Sigmoid";
+    int id_sigmoid = net.addLayer(sigmoid_lp.name, sigmoid_lp.type, sigmoid_lp);
+    net.connect(0, 0, id_sigmoid, 0);
+
+    LayerParams concat_lp;
+    concat_lp.name = "concat";
+    concat_lp.type = "Concat";
+    concat_lp.set("axis", 1);
+    int id_concat = net.addLayer(concat_lp.name, concat_lp.type, concat_lp);
+    net.connect(id_mul, 0, id_concat, 0);
+    net.connect(id_sigmoid, 0, id_concat, 1);
+
+    // Run test
+    Mat input(input_shape, CV_32FC1);
+    testLayer(input, net, backend_id, target_id, false);
+}
+
+INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, NaryEltwiseConcat, testing::Combine(
+    testing::Values(std::vector<int>{1, 4, 84}),
+    dnnBackendsAndTargets())
+);
+
+
+
 INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_layers_backends, dnnBackendsAndTargets());
 
 }} // namespace
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index 707ae51673..744128544b 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -2050,7 +2050,7 @@ private:
         net.setPreferableTarget(target);
 
         Mat re;
-        ASSERT_NO_THROW(re = net.forward()); // runtime error
+        re = net.forward();
         auto ptr_re = (float *) re.data;
         for (int i = 0; i < re.total(); i++)
             if (op == "sum"){
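
Editor's note: the broadcast rule the patch enforces in eltwise_ops.cu can be restated outside the diff as follows: the lower-rank input shape is padded with leading 1s up to the output rank, after which every axis must either match or be 1 (this is what is_shape_compatible1 checks). Below is a minimal standalone sketch of that logic using plain std::vector<int>; the helper names broadcast_compatible and align_ranks are illustrative only and are not part of OpenCV.

// Standalone sketch (not OpenCV code): mirrors is_shape_compatible1 and the
// leading-1 rank alignment performed in eltwise_op above.
#include <cstddef>
#include <iostream>
#include <vector>

// Shapes are compatible when they have the same rank and every axis either
// matches or is 1 on at least one side (ONNX-style broadcasting).
static bool broadcast_compatible(const std::vector<int>& a, const std::vector<int>& b)
{
    if (a.size() != b.size())
        return false;
    for (std::size_t i = 0; i < a.size(); i++)
        if (a[i] != b[i] && a[i] != 1 && b[i] != 1)
            return false;
    return true;
}

// Pad the lower-rank shape with leading 1s so both shapes have the same rank.
static void align_ranks(std::vector<int>& x, std::vector<int>& y)
{
    while (x.size() < y.size()) x.insert(x.begin(), 1);
    while (y.size() < x.size()) y.insert(y.begin(), 1);
}

int main()
{
    // Shapes taken from the Mul(B<1x84>) case in the new NaryEltwiseConcat test.
    std::vector<int> out{1, 4, 84}, x{1, 4, 84}, y{1, 84};
    align_ranks(x, y); // y becomes {1, 1, 84}
    std::cout << broadcast_compatible(out, x) << " "
              << broadcast_compatible(out, y) << std::endl; // prints "1 1"
    return 0;
}

Because the ranks are aligned before the check, the kernel dispatch only needs the same-rank comparison, which is why the patch asserts is_shape_compatible1 on the padded shapes instead of rejecting inputs whose raw ranks differ, as the removed code in nary_eltwise_layers.cpp did.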