mirror of
https://github.com/opencv/opencv.git
synced 2025-06-07 09:25:45 +08:00
Merge pull request #24834 from fengyuentau:cuda_naryeltwise_broadcast
dnn (cuda): support broadcasting if a.rank() != b.rank() #24834 Inspired by https://github.com/opencv/opencv/pull/24786. This PR keeps the fusion of `NaryEltwise` and `Concat` while addressing the data-missing problem by supporting broadcasting if a.rank() != b.rank(). Resolves https://github.com/opencv/opencv/issues/23977 Resolves https://github.com/opencv/opencv/issues/24606 Resolves https://github.com/opencv/opencv/issues/24635 Resolves https://github.com/opencv/opencv/issues/24721 ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
parent
be1373f01a
commit
e7ccff9805
@ -132,8 +132,23 @@ void eltwise_op(const Stream& stream, TensorSpan<T> output, TensorView<T> x, Ten
|
||||
}
|
||||
else
|
||||
{
|
||||
CV_Assert(is_shape_compatible(output, x));
|
||||
CV_Assert(is_shape_compatible(output, y));
|
||||
auto inShape1 = x.shape_as_vector();
|
||||
auto inShape2 = y.shape_as_vector();
|
||||
auto outShape = output.shape_as_vector();
|
||||
|
||||
std::size_t x_ndims = inShape1.size(), y_ndims = inShape2.size();
|
||||
if (x_ndims >= y_ndims) {
|
||||
for (std::size_t i = 0; i < (x_ndims - y_ndims); i++) {
|
||||
inShape2.insert(inShape2.begin(), 1);
|
||||
}
|
||||
} else {
|
||||
for (std::size_t i = 0; i < (y_ndims - x_ndims); i++) {
|
||||
inShape1.insert(inShape1.begin(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
CV_Assert(is_shape_compatible1(outShape, inShape1));
|
||||
CV_Assert(is_shape_compatible1(outShape, inShape2));
|
||||
|
||||
/* matching singleton axes in both input tensors can be eliminated
|
||||
*
|
||||
@ -148,20 +163,21 @@ void eltwise_op(const Stream& stream, TensorSpan<T> output, TensorView<T> x, Ten
|
||||
* x: [1, 256, 32, 32] -> [256, 32, 32]
|
||||
* y: [1, 256, 1, 1] -> [256, 1, 1]
|
||||
*/
|
||||
for (int r = 0; r < output.rank(); r++)
|
||||
{
|
||||
while (x.rank() > r && y.rank() > r && x.get_axis_size(r) == 1 && y.get_axis_size(r) == 1) {
|
||||
CV_Assert(output.get_axis_size(r) == 1);
|
||||
|
||||
x.squeeze(r);
|
||||
y.squeeze(r);
|
||||
output.squeeze(r);
|
||||
int eliminate_times = 0;
|
||||
for (std::size_t i = 0; i < outShape.size(); i++) {
|
||||
if (inShape1[i] == 1 && inShape2[i] == 1 && outShape[i] == 1 && i != (outShape.size() - 1)) {
|
||||
eliminate_times++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (eliminate_times > 0) {
|
||||
for (int i = 0; i < eliminate_times; i++) {
|
||||
inShape1.erase(inShape1.begin());
|
||||
inShape2.erase(inShape2.begin());
|
||||
outShape.erase(outShape.begin());
|
||||
}
|
||||
}
|
||||
|
||||
auto inShape1 = x.shape_as_vector();
|
||||
auto inShape2 = y.shape_as_vector();
|
||||
auto outShape = output.shape_as_vector();
|
||||
|
||||
/* contiguous axes that do not broadcast can be merged into one axis
|
||||
*
|
||||
|
@ -1187,6 +1187,23 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
|
||||
return true;
|
||||
}
|
||||
|
||||
/** @brief Checks whether two equal-rank shapes are broadcast-compatible.
 *
 * Two shapes are compatible when, for every axis, the sizes match or at
 * least one of them is 1 (numpy-style broadcasting). Shapes of different
 * ranks are rejected outright; the caller is expected to pad the shorter
 * shape with leading 1s beforehand (as done in eltwise_op).
 *
 * @param x_shape first shape; any container exposing size() and operator[]
 * @param y_shape second shape of the same container type
 * @return true if the two shapes can be broadcast against each other
 */
template <typename ShapeType>
bool is_shape_compatible1(const ShapeType &x_shape, const ShapeType &y_shape) noexcept {
    const auto x_ndims = x_shape.size(), y_ndims = y_shape.size();

    /* ranks must already match; cross-rank broadcasting is the caller's job */
    if (x_ndims != y_ndims) {
        return false;
    }

    /* std::size_t index avoids the signed/unsigned comparison against size() */
    for (std::size_t i = 0; i < x_ndims; i++) {
        if (x_shape[i] != y_shape[i] && x_shape[i] != 1 && y_shape[i] != 1) {
            return false;
        }
    }

    return true;
}
|
||||
|
||||
/** returns the rank to which the given tensor can be squeezed to */
|
||||
template <class TensorType>
|
||||
std::size_t get_effective_rank(const TensorType& x) noexcept {
|
||||
|
@ -818,19 +818,6 @@ public:
|
||||
{
|
||||
auto context = reinterpret_cast<csl::CSLContext*>(context_);
|
||||
|
||||
auto input_0_shape = inputs[0].dynamicCast<CUDABackendWrapper>()->getShape();
|
||||
for (int i = 1; i < inputs.size(); i++)
|
||||
{
|
||||
auto input_i_shape = inputs[i].dynamicCast<CUDABackendWrapper>()->getShape();
|
||||
if (input_0_shape.size() != input_i_shape.size())
|
||||
return Ptr<BackendNode>();
|
||||
// check if the shape can be supported by `eltwise_ops.cu`, or return the default BackendNode
|
||||
for (int j = 0; j < input_0_shape.size(); j++)
|
||||
if (input_0_shape[j] != input_i_shape[j] &&
|
||||
input_0_shape[j] != 1 && input_i_shape[j] != 1)
|
||||
return Ptr<BackendNode>();
|
||||
}
|
||||
|
||||
cuda4dnn::EltwiseOpType op_ = cuda4dnn::EltwiseOpType::SUM;
|
||||
switch (op) {
|
||||
case OPERATION::MAX:
|
||||
|
@ -728,6 +728,10 @@ void Net::Impl::fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
|
||||
if(inp_i_data->skip || inp_i_data->consumers.size() != 1)
|
||||
break;
|
||||
#ifdef HAVE_CUDA
|
||||
/* Risk: Not every operation in "NaryEltwise" is supported in the CUDA backend. There is a chance
|
||||
that Concat's output is filled with data in both host and device, leading to data missing.
|
||||
See https://github.com/opencv/opencv/issues/24721 for more details.
|
||||
*/
|
||||
if (preferableBackend == DNN_BACKEND_CUDA &&
|
||||
(inp_i_data->layerInstance->supportBackend(DNN_BACKEND_CUDA) == false ||
|
||||
(inp_i_data->layerInstance->type != "Convolution" &&
|
||||
|
@ -102,6 +102,12 @@ public:
|
||||
Net net;
|
||||
};
|
||||
|
||||
TEST_P(DNNTestNetwork, DISABLED_YOLOv8n) {
|
||||
processNet("dnn/onnx/models/yolov8n.onnx", "", Size(640, 640), "output0");
|
||||
expectNoFallbacksFromIE(net);
|
||||
expectNoFallbacksFromCUDA(net);
|
||||
}
|
||||
|
||||
TEST_P(DNNTestNetwork, AlexNet)
|
||||
{
|
||||
applyTestTag(CV_TEST_TAG_MEMORY_1GB);
|
||||
@ -1518,6 +1524,71 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, Eltwise, testing::Combine(
|
||||
dnnBackendsAndTargets()
|
||||
));
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Element-wise layers
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
using NaryEltwiseConcat = TestWithParam<tuple<std::vector<int>, tuple<Backend, Target>>>;
|
||||
TEST_P(NaryEltwiseConcat, Accuracy) {
|
||||
auto param = GetParam();
|
||||
std::vector<int> input_shape = get<0>(param);
|
||||
auto backend_id = get<0>(get<1>(param));
|
||||
auto target_id = get<1>(get<1>(param));
|
||||
|
||||
/* Build the following net:
|
||||
|
||||
<1x4x84>
|
||||
/
|
||||
[Input] -+-> Mul(B<1x84>) -> Concat(axis=1) -> [Output]
|
||||
| |
|
||||
+-> Sigmoid ----------+
|
||||
|
||||
*/
|
||||
Net net;
|
||||
|
||||
std::vector<int> mul_B_shape(input_shape.size() - 1, 1);
|
||||
mul_B_shape.back() = input_shape.back();
|
||||
Mat mul_B(mul_B_shape, CV_32FC1);
|
||||
randn(mul_B, 0.f, 1.f);
|
||||
LayerParams mul_B_lp;
|
||||
mul_B_lp.name = "mul_B";
|
||||
mul_B_lp.type = "Const";
|
||||
mul_B_lp.blobs.push_back(mul_B);
|
||||
int id_mul_B = net.addLayer(mul_B_lp.name, mul_B_lp.type, mul_B_lp);
|
||||
|
||||
LayerParams mul_lp;
|
||||
mul_lp.name = "mul";
|
||||
mul_lp.type = "NaryEltwise";
|
||||
mul_lp.set("operation", "mul");
|
||||
int id_mul = net.addLayer(mul_lp.name, mul_lp.type, mul_lp);
|
||||
net.connect(0, 0, id_mul, 0);
|
||||
net.connect(id_mul_B, 0, id_mul, 1);
|
||||
|
||||
LayerParams sigmoid_lp;
|
||||
sigmoid_lp.name = "sigmoid";
|
||||
sigmoid_lp.type = "Sigmoid";
|
||||
int id_sigmoid = net.addLayer(sigmoid_lp.name, sigmoid_lp.type, sigmoid_lp);
|
||||
net.connect(0, 0, id_sigmoid, 0);
|
||||
|
||||
LayerParams concat_lp;
|
||||
concat_lp.name = "concat";
|
||||
concat_lp.type = "Concat";
|
||||
concat_lp.set("axis", 1);
|
||||
int id_concat = net.addLayer(concat_lp.name, concat_lp.type, concat_lp);
|
||||
net.connect(id_mul, 0, id_concat, 0);
|
||||
net.connect(id_sigmoid, 0, id_concat, 1);
|
||||
|
||||
// Run test
|
||||
Mat input(input_shape, CV_32FC1);
|
||||
testLayer(input, net, backend_id, target_id, false);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, NaryEltwiseConcat, testing::Combine(
|
||||
testing::Values(std::vector<int>{1, 4, 84}),
|
||||
dnnBackendsAndTargets())
|
||||
);
|
||||
|
||||
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_layers_backends, dnnBackendsAndTargets());
|
||||
|
||||
}} // namespace
|
||||
|
@ -2050,7 +2050,7 @@ private:
|
||||
net.setPreferableTarget(target);
|
||||
|
||||
Mat re;
|
||||
ASSERT_NO_THROW(re = net.forward()); // runtime error
|
||||
re = net.forward();
|
||||
auto ptr_re = (float *) re.data;
|
||||
for (int i = 0; i < re.total(); i++)
|
||||
if (op == "sum"){
|
||||
|
Loading…
Reference in New Issue
Block a user