diff --git a/modules/dnn/src/cuda/eltwise_ops.cu b/modules/dnn/src/cuda/eltwise_ops.cu
index 8a861b3067..e2a7cc9a67 100644
--- a/modules/dnn/src/cuda/eltwise_ops.cu
+++ b/modules/dnn/src/cuda/eltwise_ops.cu
@@ -132,8 +132,23 @@ void eltwise_op(const Stream& stream, TensorSpan<T> output, TensorView<T> x, Ten
     }
     else
     {
-        CV_Assert(is_shape_compatible(output, x));
-        CV_Assert(is_shape_compatible(output, y));
+        auto inShape1 = x.shape_as_vector();
+        auto inShape2 = y.shape_as_vector();
+        auto outShape = output.shape_as_vector();
+
+        std::size_t x_ndims = inShape1.size(), y_ndims = inShape2.size();
+        if (x_ndims >= y_ndims) {
+            for (std::size_t i = 0; i < (x_ndims - y_ndims); i++) {
+                inShape2.insert(inShape2.begin(), 1);
+            }
+        } else {
+            for (std::size_t i = 0; i < (y_ndims - x_ndims); i++) {
+                inShape1.insert(inShape1.begin(), 1);
+            }
+        }
+
+        CV_Assert(is_shape_compatible1(outShape, inShape1));
+        CV_Assert(is_shape_compatible1(outShape, inShape2));
 
         /* matching singleton axes in both input tensors can be eliminated
          *
@@ -148,20 +163,21 @@ void eltwise_op(const Stream& stream, TensorSpan<T> output, TensorView<T> x, Ten
          * x: [1, 256, 32, 32] -> [256, 32, 32]
          * y: [1, 256, 1, 1] -> [256, 1, 1]
          */
-        for (int r = 0; r < output.rank(); r++)
-        {
-            while (x.rank() > r && y.rank() > r && x.get_axis_size(r) == 1 && y.get_axis_size(r) == 1) {
-                CV_Assert(output.get_axis_size(r) == 1);
-
-                x.squeeze(r);
-                y.squeeze(r);
-                output.squeeze(r);
+        int eliminate_times = 0;
+        for (std::size_t i = 0; i < outShape.size(); i++) {
+            if (inShape1[i] == 1 && inShape2[i] == 1 && outShape[i] == 1 && i != (outShape.size() - 1)) {
+                eliminate_times++;
+            } else {
+                break;
+            }
+        }
+        if (eliminate_times > 0) {
+            for (int i = 0; i < eliminate_times; i++) {
+                inShape1.erase(inShape1.begin());
+                inShape2.erase(inShape2.begin());
+                outShape.erase(outShape.begin());
             }
         }
-
-        auto inShape1 = x.shape_as_vector();
-        auto inShape2 = y.shape_as_vector();
-        auto outShape = output.shape_as_vector();
 
         /* contiguous axes that do not broadcast can be merged into one axis
          *
diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp
index 5a1286de99..8f495ac807 100644
--- a/modules/dnn/src/cuda4dnn/csl/tensor.hpp
+++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp
@@ -1187,6 +1187,23 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
         return true;
     }
 
+    template <class ShapeType>
+    bool is_shape_compatible1(const ShapeType &x_shape, const ShapeType &y_shape) noexcept {
+        const auto x_ndims = x_shape.size(), y_ndims = y_shape.size();
+
+        if (x_ndims != y_ndims) {
+            return false;
+        }
+
+        for (int i = 0; i < x_ndims; i++) {
+            if (x_shape[i] != y_shape[i] && x_shape[i] != 1 && y_shape[i] != 1) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
     /** returns the rank to which the given tensor can be squeezed to */
     template <class TensorType>
     std::size_t get_effective_rank(const TensorType& x) noexcept {
diff --git a/modules/dnn/src/layers/nary_eltwise_layers.cpp b/modules/dnn/src/layers/nary_eltwise_layers.cpp
index 661861cbe3..b22eb5bbf0 100644
--- a/modules/dnn/src/layers/nary_eltwise_layers.cpp
+++ b/modules/dnn/src/layers/nary_eltwise_layers.cpp
@@ -818,19 +818,6 @@ public:
     {
         auto context = reinterpret_cast<csl::CSLContext*>(context_);
 
-        auto input_0_shape = inputs[0].dynamicCast<CUDABackendWrapper>()->getShape();
-        for (int i = 1; i < inputs.size(); i++)
-        {
-            auto input_i_shape = inputs[i].dynamicCast<CUDABackendWrapper>()->getShape();
-            if (input_0_shape.size() != input_i_shape.size())
-                return Ptr<BackendNode>();
-            // check if the shape can be supported by `eltwise_ops.cu`, or return the default BackendNode
-            for (int j = 0; j < input_0_shape.size(); j++)
-                if (input_0_shape[j] != input_i_shape[j] &&
-                    input_0_shape[j] != 1 && input_i_shape[j] != 1)
-                    return Ptr<BackendNode>();
-        }
-
         cuda4dnn::EltwiseOpType op_ = cuda4dnn::EltwiseOpType::SUM;
         switch (op) {
             case OPERATION::MAX:
diff --git a/modules/dnn/src/net_impl_fuse.cpp b/modules/dnn/src/net_impl_fuse.cpp
index dfa542bd41..b81bf14acc 100644
--- a/modules/dnn/src/net_impl_fuse.cpp
+++ b/modules/dnn/src/net_impl_fuse.cpp
@@ -728,6 +728,10 @@ void Net::Impl::fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
                 if(inp_i_data->skip || inp_i_data->consumers.size() != 1)
                     break;
 #ifdef HAVE_CUDA
+                /* Risk: not every operation in "NaryEltwise" is supported by the CUDA backend. There is a chance
+                   that Concat's output is filled with data on both the host and the device, leading to missing data.
+                   See https://github.com/opencv/opencv/issues/24721 for more details.
+                */
                 if (preferableBackend == DNN_BACKEND_CUDA &&
                     (inp_i_data->layerInstance->supportBackend(DNN_BACKEND_CUDA) == false ||
                     (inp_i_data->layerInstance->type != "Convolution" &&
diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp
index f255ab87aa..591ec63515 100644
--- a/modules/dnn/test/test_backends.cpp
+++ b/modules/dnn/test/test_backends.cpp
@@ -102,6 +102,12 @@ public:
     Net net;
 };
 
+TEST_P(DNNTestNetwork, DISABLED_YOLOv8n) {
+    processNet("dnn/onnx/models/yolov8n.onnx", "", Size(640, 640), "output0");
+    expectNoFallbacksFromIE(net);
+    expectNoFallbacksFromCUDA(net);
+}
+
 TEST_P(DNNTestNetwork, AlexNet)
 {
     applyTestTag(CV_TEST_TAG_MEMORY_1GB);
@@ -1518,6 +1524,71 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, Eltwise, testing::Combine(
     dnnBackendsAndTargets()
 ));
 
+////////////////////////////////////////////////////////////////////////////////
+// Element-wise layers
+////////////////////////////////////////////////////////////////////////////////
+using NaryEltwiseConcat = TestWithParam<tuple<std::vector<int>, tuple<Backend, Target>>>;
+TEST_P(NaryEltwiseConcat, Accuracy) {
+    auto param = GetParam();
+    std::vector<int> input_shape = get<0>(param);
+    auto backend_id = get<0>(get<1>(param));
+    auto target_id = get<1>(get<1>(param));
+
+    /* Build the following net:
+
+                 <1x4x84>
+                /
+        [Input] -+-> Mul(B<1x84>) -> Concat(axis=1) -> [Output]
+                 |                        |
+                 +-> Sigmoid -------------+
+
+    */
+    Net net;
+
+    std::vector<int> mul_B_shape(input_shape.size() - 1, 1);
+    mul_B_shape.back() = input_shape.back();
+    Mat mul_B(mul_B_shape, CV_32FC1);
+    randn(mul_B, 0.f, 1.f);
+    LayerParams mul_B_lp;
+    mul_B_lp.name = "mul_B";
+    mul_B_lp.type = "Const";
+    mul_B_lp.blobs.push_back(mul_B);
+    int id_mul_B = net.addLayer(mul_B_lp.name, mul_B_lp.type, mul_B_lp);
+
+    LayerParams mul_lp;
+    mul_lp.name = "mul";
+    mul_lp.type = "NaryEltwise";
+    mul_lp.set("operation", "mul");
+    int id_mul = net.addLayer(mul_lp.name, mul_lp.type, mul_lp);
+    net.connect(0, 0, id_mul, 0);
+    net.connect(id_mul_B, 0, id_mul, 1);
+
+    LayerParams sigmoid_lp;
+    sigmoid_lp.name = "sigmoid";
+    sigmoid_lp.type = "Sigmoid";
+    int id_sigmoid = net.addLayer(sigmoid_lp.name, sigmoid_lp.type, sigmoid_lp);
+    net.connect(0, 0, id_sigmoid, 0);
+
+    LayerParams concat_lp;
+    concat_lp.name = "concat";
+    concat_lp.type = "Concat";
+    concat_lp.set("axis", 1);
+    int id_concat = net.addLayer(concat_lp.name, concat_lp.type, concat_lp);
+    net.connect(id_mul, 0, id_concat, 0);
+    net.connect(id_sigmoid, 0, id_concat, 1);
+
+    // Run test
+    Mat input(input_shape, CV_32FC1);
+    testLayer(input, net, backend_id, target_id, false);
+}
+
+INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, NaryEltwiseConcat, testing::Combine(
+    testing::Values(std::vector<int>{1, 4, 84}),
+    dnnBackendsAndTargets())
+);
+
+
+
 INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_layers_backends, dnnBackendsAndTargets());
 
 }} // namespace
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index 707ae51673..744128544b 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -2050,7 +2050,7 @@ private:
         net.setPreferableTarget(target);
 
         Mat re;
-        ASSERT_NO_THROW(re = net.forward()); // runtime error
+        re = net.forward();
         auto ptr_re = (float *) re.data;
         for (int i = 0; i < re.total(); i++)
             if (op == "sum"){
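
Editor's note: the broadcast rule the patch enforces in eltwise_ops.cu can be restated outside the diff as follows: the lower-rank input shape is padded with leading 1s up to the output rank, after which every axis must either match or be 1 (this is what is_shape_compatible1 checks). Below is a minimal standalone sketch of that logic using plain std::vector<int>; the helper names broadcast_compatible and align_ranks are illustrative only and are not part of OpenCV.

// Standalone sketch (not OpenCV code): mirrors is_shape_compatible1 and the
// leading-1 rank alignment performed in eltwise_op above.
#include <cstddef>
#include <iostream>
#include <vector>

// Shapes are compatible when they have the same rank and every axis either
// matches or is 1 on at least one side (ONNX-style broadcasting).
static bool broadcast_compatible(const std::vector<int>& a, const std::vector<int>& b)
{
    if (a.size() != b.size())
        return false;
    for (std::size_t i = 0; i < a.size(); i++)
        if (a[i] != b[i] && a[i] != 1 && b[i] != 1)
            return false;
    return true;
}

// Pad the lower-rank shape with leading 1s so both shapes have the same rank.
static void align_ranks(std::vector<int>& x, std::vector<int>& y)
{
    while (x.size() < y.size()) x.insert(x.begin(), 1);
    while (y.size() < x.size()) y.insert(y.begin(), 1);
}

int main()
{
    // Shapes taken from the Mul(B<1x84>) case in the new NaryEltwiseConcat test.
    std::vector<int> out{1, 4, 84}, x{1, 4, 84}, y{1, 84};
    align_ranks(x, y); // y becomes {1, 1, 84}
    std::cout << broadcast_compatible(out, x) << " "
              << broadcast_compatible(out, y) << std::endl; // prints "1 1"
    return 0;
}

Because the ranks are aligned before the check, the kernel dispatch only needs the same-rank comparison, which is why the patch asserts is_shape_compatible1 on the padded shapes instead of rejecting inputs whose raw ranks differ, as the removed code in nary_eltwise_layers.cpp did.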