Merge branch 4.x

2025-06-07 09:25:45 +08:00 · 2023-08-08 17:31:57 +03:00 · 2023-08-08 17:31:57 +03:00 · a6748df587
commit a6748df587
parent b47704eabc 7d59db4ec4
37 changed files with 575 additions and 210 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -74,6 +74,10 @@ if(POLICY CMP0077)
  cmake_policy(SET CMP0077 NEW)  # CMake 3.13+: option() honors normal variables.
 endif()

+if(POLICY CMP0146)
+  cmake_policy(SET CMP0146 OLD)  # CMake 3.27+: use CMake FindCUDA if available.
+endif()
+
 #
 # Configure OpenCV CMake hooks
 #
--- a/modules/dnn/perf/perf_layer.cpp
+++ b/modules/dnn/perf/perf_layer.cpp
@ -643,4 +643,69 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_ScatterND, testing::Values(std::make_tuple(D
 INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
 INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNormExpanded, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));

+
+typedef TestBaseWithParam<tuple<Vec4i, int, bool, tuple<Backend, Target> > > Layer_FullyConnected;
+PERF_TEST_P_(Layer_FullyConnected, fc)
+{
+    std::vector<int> inpShape;
+    inpShape.reserve(4);
+    for (int i = 0; i < 4; ++i) {
+        int dim = get<0>(GetParam())[i];
+        if (dim == 0)
+            break;
+        inpShape.push_back(dim);
+    }
+    Mat input(inpShape, CV_32F);
+    randn(input, 0, 1);
+
+    int axis = input.dims - 1;
+    int outDims = get<1>(GetParam());
+    bool isMatMul = get<2>(GetParam());
+    int backendId = get<0>(get<3>(GetParam()));
+    int targetId = get<1>(get<3>(GetParam()));
+
+    std::vector<int> weightShape;
+    if (isMatMul) {
+        weightShape = inpShape;
+        weightShape[weightShape.size() - 2] = outDims;
+    } else {
+        weightShape = {outDims, (int)input.total(axis, input.dims)};
+    }
+    Mat weights(weightShape, CV_32F);
+    randn(weights, 0, 1);
+
+    LayerParams lp;
+    lp.set("axis", input.dims - 1);
+    lp.set("is_matmul", weights.dims > 2);
+    lp.set("bias_term", false);
+    lp.set("transB", true);
+    lp.set("num_output", (int)weights.total(0, weights.dims - 1));
+    lp.blobs.resize(1, weights);
+
+    Net net;
+    net.addLayerToPrev("matmul", "InnerProduct", lp);
+
+    net.setInput(input);
+    net.setPreferableBackend(backendId);
+    net.setPreferableTarget(targetId);
+
+    // warmup
+    Mat output = net.forward();
+
+    TEST_CYCLE()
+    {
+        net.forward();
+    }
+    SANITY_CHECK_NOTHING();
+}
+INSTANTIATE_TEST_CASE_P(/**/, Layer_FullyConnected, Combine(
+    Values(                // input size
+        Vec4i(5, 512, 384),
+        Vec4i(5, 16, 512, 128)
+    ),
+    Values(256, 512, 1024),  // output dimension
+    testing::Bool(),         // is_matmul
+    dnnBackendsAndTargets()
+));
+
 } // namespace
--- a/modules/dnn/src/cuda/activations.cu
+++ b/modules/dnn/src/cuda/activations.cu
@ -248,6 +248,11 @@ void selu(const Stream& stream, Span<T> output, View<T> input, T alpha, T gamma)
    generic_op<T, SeluFunctor<T>>(stream, output, input, {alpha, gamma});
 }

+template <class T>
+void gelu(const Stream& stream, Span<T> output, View<T> input) {
+    generic_op<T, GeluFunctor<T>>(stream, output, input);
+}
+
 template <class T>
 void sign(const Stream& stream, Span<T> output, View<T> input) {
    generic_op<T, SignFunctor<T>>(stream, output, input);
@ -324,6 +329,7 @@ template void tan<__half>(const Stream&, Span<__half>, View<__half>);
 template void celu<__half>(const Stream&, Span<__half>, View<__half>, __half);
 template void hardsigmoid<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
 template void selu<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
+template void gelu<__half>(const Stream&, Span<__half>, View<__half>);
 template void thresholdedrelu<__half>(const Stream&, Span<__half>, View<__half>, __half);
 template void power<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half);
 template void exp<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
@ -366,6 +372,7 @@ template void tan<float>(const Stream&, Span<float>, View<float>);
 template void celu<float>(const Stream&, Span<float>, View<float>, float);
 template void hardsigmoid<float>(const Stream&, Span<float>, View<float>, float, float);
 template void selu<float>(const Stream&, Span<float>, View<float>, float, float);
+template void gelu<float>(const Stream&, Span<float>, View<float>);
 template void thresholdedrelu<float>(const Stream&, Span<float>, View<float>, float);
 template void power<float>(const Stream&, Span<float>, View<float>, float, float, float);
 template void exp<float>(const Stream&, Span<float>, View<float>, float, float);
--- a/modules/dnn/src/cuda/functors.hpp
+++ b/modules/dnn/src/cuda/functors.hpp
@ -588,6 +588,21 @@ struct SeluFunctor {
    T alpha, gamma;
 };

+template <class T>
+struct GeluFunctor {
+    struct Params {
+        CUDA4DNN_HOST_DEVICE Params() { }
+    };
+
+    CUDA4DNN_DEVICE GeluFunctor() { }
+    CUDA4DNN_DEVICE GeluFunctor(const Params& params) { }
+
+    CUDA4DNN_DEVICE T operator()(T value) {
+        using csl::device::erf;
+        return static_cast<T>(0.5f) * value * (static_cast<T>(1.f) + erf(value * static_cast<T>(M_SQRT1_2)));
+    }
+};
+
 template <class T>
 struct ThresholdedReluFunctor {
    struct Params {
--- a/modules/dnn/src/cuda4dnn/kernels/activations.hpp
+++ b/modules/dnn/src/cuda4dnn/kernels/activations.hpp
@ -114,6 +114,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
    template <class T>
    void selu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T alpha, T gamma);

+    template <class T>
+    void gelu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
+
    template <class T>
    void thresholdedrelu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T alpha);

--- a/modules/dnn/src/cuda4dnn/primitives/activation.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/activation.hpp
@ -537,6 +537,20 @@ namespace cv { namespace dnn { namespace cuda4dnn {
        const T alpha, gamma;
    };

+    template <class T>
+    class GeluOp final : public BaseOp<GeluOp, T> {
+    public:
+        GeluOp(csl::Stream stream_) : stream(std::move(stream_)) { }
+
+        void calculate(csl::TensorSpan<T> output, csl::TensorView<T> input) const
+        {
+            kernels::gelu<T>(stream, output, input);
+        }
+
+    private:
+        csl::Stream stream;
+    };
+
    template <class T>
    class ThresholdedReluOp final : public BaseOp<ThresholdedReluOp, T> {
    public:
--- a/modules/dnn/src/cuda4dnn/primitives/normalize_bbox.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/normalize_bbox.hpp
@ -111,7 +111,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
             * or there might be several weights
             * or we don't have to scale
             */
-            if (weight != 1.0)
+            if (weight != static_cast<T>(1.0f))
            {
                kernels::scale1_with_bias1<T>(stream, output, input, weight, 1.0);
            }
--- a/modules/dnn/src/cuda4dnn/primitives/region.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/region.hpp
@ -121,7 +121,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                new_coords
            );

-            if (nms_iou_threshold > 0) {
+            if (nms_iou_threshold > static_cast<T>(0.0f)) {
                auto output_mat = output_wrapper->getMutableHostMat();
                CV_Assert(output_mat.type() == CV_32F);
                for (int i = 0; i < input.get_axis_size(0); i++) {
--- a/modules/dnn/src/ie_ngraph.cpp
+++ b/modules/dnn/src/ie_ngraph.cpp
@ -446,66 +446,6 @@ void InfEngineNgraphNet::addOutput(const Ptr<InfEngineNgraphNode>& node)
    requestedOutputs.insert({name, node.get()});
 }

-void InfEngineNgraphNet::setNodePtr(std::shared_ptr<ngraph::Node>* ptr) {
-    all_nodes.emplace((*ptr)->get_friendly_name(), ptr);
-}
-
- void InfEngineNgraphNet::release()
- {
-     // FIXIT release should not be conditional, release ALL
-     for (auto& node : components.back()) {
-#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
-         if (!(ngraph::op::is_parameter(node) || ngraph::op::is_output(node) || ngraph::op::is_constant(node)) ) {
-#else
-         if (!(node->is_parameter() || node->is_output() || node->is_constant()) ) {
-#endif
-             auto it = all_nodes.find(node->get_friendly_name());
-             if (it != all_nodes.end()) {
-                 it->second->reset();
-                 all_nodes.erase(it);
-             }
-         }
-     }
- }
-
-void InfEngineNgraphNet::dfs(std::shared_ptr<ngraph::Node>& node,
-                             std::vector<std::shared_ptr<ngraph::Node>>& comp,
-                             std::unordered_map<std::string, bool>& used) {
-    used[node->get_friendly_name()] = true;
-    comp.push_back(node);
-    auto inputs = node->get_users();
-    for (size_t i = 0; i < node->get_input_size(); ++i) {
-        inputs.push_back(node->input_value(i).get_node()->shared_from_this());
-    }
-
-    for (auto& to : inputs) {
-        if (!used[to->get_friendly_name()]) {
-            dfs(to, comp, used);
-        }
-    }
-}
-
-int InfEngineNgraphNet::getNumComponents()
-{
-    if (!components.empty()) {
-        return components.size();
-    }
-    std::unordered_map<std::string, bool> used;
-    auto inputs = ngraph_function->get_ordered_ops();
-    for (auto& node : inputs) {
-        used.emplace(node->get_friendly_name(), false);
-    }
-
-    for (auto& node : inputs) {
-        if (!used[node->get_friendly_name()]) {
-            std::vector<std::shared_ptr<ngraph::Node>> current_comp;
-            dfs(node, current_comp, used);
-            components.push_back(current_comp);
-        }
-    }
-    return components.size();
-}
-
 void InfEngineNgraphNet::createNet(Target targetId) {
    if (!hasNetOwner)
    {
@ -524,46 +464,7 @@ void InfEngineNgraphNet::createNet(Target targetId) {
        }
        CV_Assert_N(!inputs_vec.empty(), !outs.empty());
        ngraph_function = std::make_shared<ngraph::Function>(outs, inputs_vec);
-
-        int num_comp = getNumComponents();
-        CV_LOG_DEBUG(NULL, "DNN/IE: number of subgraphs: " << num_comp);
-        if (num_comp > 1) {
-            for (int i = num_comp - 1; i >= 0; --i) {
-                ngraph::ResultVector outputs;
-                ngraph::ParameterVector inps;
-                for (auto& node : components.back()) {
-#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
-                    if (ngraph::op::is_parameter(node)) {
-#else
-                    if (node->is_parameter()) {
-#endif
-                        CV_LOG_DEBUG(NULL, "DNN/IE: subgraph[" << i << "]: +input[" << inps.size() << "] = '" << node->get_friendly_name() << "'");
-                        auto parameter = std::dynamic_pointer_cast<ngraph::op::Parameter>(node);
-                        inps.push_back(parameter);
-                    }
-#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
-                    else if (ngraph::op::is_output(node)) {
-#else
-                    else if (node->is_output()) {
-#endif
-                        CV_LOG_DEBUG(NULL, "DNN/IE: subgraph[" << i << "]: +output[" << outputs.size() << "] = '" << node->get_friendly_name() << "'");
-                        auto result = std::dynamic_pointer_cast<ngraph::op::Result>(node);
-                        outputs.push_back(result);
-                    }
-                }
-                CV_LOG_DEBUG(NULL, "DNN/IE: subgraph[" << i << ": nodes=" << components.back().size() << " inputs=" << inps.size() << " outputs=" << outputs.size());
-                isInit = false;
-                CV_Assert_N(!inps.empty(), !outputs.empty());
-                ngraph_function = std::make_shared<ngraph::Function>(outputs, inps);
-                release();
-                components.pop_back();
-                init(targetId);
-            }
-        } else {
-            release();
-            components.clear();
-            init(targetId);
-        }
+        init(targetId);
    }
 }

--- a/modules/dnn/src/ie_ngraph.hpp
+++ b/modules/dnn/src/ie_ngraph.hpp
@ -50,22 +50,14 @@ public:
    void addBlobs(const std::vector<cv::Ptr<BackendWrapper> >& ptrs);

    void createNet(Target targetId);
-    void setNodePtr(std::shared_ptr<ngraph::Node>* ptr);

    void reset();

 //private:
    detail::NetImplBase& netImpl_;

-    void release();
-    int getNumComponents();
-    void dfs(std::shared_ptr<ngraph::Node>& node, std::vector<std::shared_ptr<ngraph::Node>>& comp,
-             std::unordered_map<std::string, bool>& used);
-
    ngraph::ParameterVector inputs_vec;
    std::shared_ptr<ngraph::Function> ngraph_function;
-    std::vector<std::vector<std::shared_ptr<ngraph::Node>>> components;
-    std::unordered_map<std::string, std::shared_ptr<ngraph::Node>* > all_nodes;

    InferenceEngine::ExecutableNetwork netExec;
 #if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
--- a/modules/dnn/src/layers/detection_output_layer.cpp
+++ b/modules/dnn/src/layers/detection_output_layer.cpp
@ -221,7 +221,7 @@ public:
    {
        return backendId == DNN_BACKEND_OPENCV ||
               (backendId == DNN_BACKEND_CUDA && !_groupByClasses) ||
-               (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && !_locPredTransposed && _bboxesNormalized);
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -1006,9 +1006,30 @@ public:
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        CV_Assert(nodes.size() == 3);
-        auto& box_logits  = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        auto& class_preds = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
-        auto& proposals   = nodes[2].dynamicCast<InfEngineNgraphNode>()->node;
+        auto box_logits  = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+        auto class_preds = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
+        auto proposals   = nodes[2].dynamicCast<InfEngineNgraphNode>()->node;
+
+        if (_locPredTransposed) {
+            // Convert box predictions from yxYX to xyXY
+            box_logits = std::make_shared<ngraph::op::v1::Reshape>(box_logits,
+                std::make_shared<ngraph::op::Constant>(ngraph::element::i32, ngraph::Shape{3}, std::vector<int32_t>{0, -1, 2}),
+                true
+            );
+            int axis = 2;
+            box_logits = std::make_shared<ngraph::op::v1::Reverse>(box_logits,
+                std::make_shared<ngraph::op::Constant>(ngraph::element::i32, ngraph::Shape{1}, &axis),
+                ngraph::op::v1::Reverse::Mode::INDEX
+            );
+        }
+
+        auto shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i32, ngraph::Shape{2}, std::vector<int32_t>{0, -1});
+        box_logits = std::make_shared<ngraph::op::v1::Reshape>(box_logits, shape, true);
+        class_preds = std::make_shared<ngraph::op::v1::Reshape>(class_preds, shape, true);
+        proposals = std::make_shared<ngraph::op::v1::Reshape>(proposals,
+            std::make_shared<ngraph::op::Constant>(ngraph::element::i32, ngraph::Shape{3}, std::vector<int32_t>{0, _varianceEncodedInTarget ? 1 : 2, -1}),
+            true
+        );

        ngraph::op::DetectionOutputAttrs attrs;
        attrs.num_classes                = _numClasses;
--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@ -821,7 +821,7 @@ struct GeluFunctor : public BaseDefaultFunctor<GeluFunctor>

    bool supportBackend(int backendId, int)
    {
-        return backendId == DNN_BACKEND_OPENCV;
+        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
    }

    inline float calculate(float x) const
@ -829,6 +829,13 @@ struct GeluFunctor : public BaseDefaultFunctor<GeluFunctor>
        return 0.5f * x * (1.0f + erf(x * M_SQRT1_2));
    }

+#ifdef HAVE_CUDA
+    Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
+    {
+        return make_cuda_node<cuda4dnn::GeluOp>(target, stream);
+    }
+#endif
+
    int64 getFLOPSPerElement() const { return 100; }
 };

--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@ -180,15 +180,12 @@ public:
    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        bool tranAorB = transA || transB;
-#ifdef HAVE_INF_ENGINE
-        if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
-            return axis == 1 && !tranAorB;
-#endif
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               (backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1 && !tranAorB) ||
               (backendId == DNN_BACKEND_WEBNN && axis == 1 && !tranAorB) ||
               backendId == DNN_BACKEND_CANN ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ||
               (backendId == DNN_BACKEND_VKCOM && haveVulkan() && !tranAorB);
    }

@ -630,8 +627,10 @@ public:

            if(input_wrapper->getRank() == inp2Dim)
                return make_cuda_node<cuda4dnn::MatMulOp>(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), oriMat, biasMat_, transA, transB);
-            else
+            else {
+                CV_LOG_INFO(NULL, "DNN/CUDA: no implementation for MatMul with rank " << input_wrapper->getRank());
                return Ptr<BackendNode>();
+            }
        }

        auto flatten_start_axis = normalize_axis(axis, input_wrapper->getRank());
@ -800,17 +799,26 @@ public:
        if (nodes.size() == 2)
        {
            auto& inp2 = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
-            matmul = std::make_shared<ngraph::op::MatMul>(ieInpNode, inp2, false, false);
+            matmul = std::make_shared<ngraph::op::MatMul>(ieInpNode, inp2, transA, transB);
        }
        else
        {
-            std::vector<int64_t> data = {(int64_t)ieInpNode->get_shape()[0], (int64_t)blobs[0].size[1]};
-            auto new_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, data.data());
-            auto inp = std::make_shared<ngraph::op::v1::Reshape>(ieInpNode, new_shape, true);
+            std::vector<int> shape(1 + normalize_axis(axis, ieInpNode->get_shape().size()), 0);
+            shape[shape.size() - 1] = -1;
+            auto inp = std::make_shared<ngraph::op::v1::Reshape>(
+                ieInpNode,
+                std::make_shared<ngraph::op::Constant>(ngraph::element::i32, ngraph::Shape{shape.size()}, shape.data()),
+                true
+            );

-            std::vector<size_t> weight_shape{(size_t)blobs[0].size[0], (size_t)blobs[0].size[1]};
+            std::vector<size_t> weight_shape;
+            if (isMatMul) {
+                weight_shape = getShape<size_t>(oriMat);
+            } else {
+                weight_shape = {(size_t)blobs[0].size[0], (size_t)blobs[0].size[1]};
+            }
            auto ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, weight_shape, blobs[0].data);
-            matmul = std::make_shared<ngraph::op::MatMul>(inp, ieWeights, false, true);
+            matmul = std::make_shared<ngraph::op::MatMul>(inp, ieWeights, transA, transB);
        }

        if (bias) {
--- a/modules/dnn/src/layers/max_unpooling_layer.cpp
+++ b/modules/dnn/src/layers/max_unpooling_layer.cpp
@ -13,6 +13,7 @@ Implementation of Batch Normalization layer.
 #include "layers_common.hpp"
 #include "../op_cuda.hpp"
 #include "../op_halide.hpp"
+#include "../ie_ngraph.hpp"
 #include <opencv2/dnn/shape_utils.hpp>
 #include <opencv2/core/utils/logger.hpp>

@ -41,6 +42,7 @@ public:
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ||
               (backendId == DNN_BACKEND_HALIDE && haveHalide() && !poolPad.width && !poolPad.height);
    }

@ -181,6 +183,50 @@ public:
 #endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }
+
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        auto features = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+        auto indices = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
+
+        std::vector<MatShape> inpShapes(nodes.size());
+        std::vector<MatShape> outShapes, internals;
+        for (int i = 0; i < nodes.size(); ++i) {
+            std::vector<size_t> shape = nodes[i].dynamicCast<InfEngineNgraphNode>()->node->get_shape();
+            inpShapes[i] = std::vector<int>(shape.begin(), shape.end());
+        }
+        getMemoryShapes(inpShapes, 1, outShapes, internals);
+
+        Mat zeros = Mat::zeros(1, total(outShapes[0]), CV_32F);
+        auto zeroInp = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{zeros.total()}, zeros.data);
+
+        int newShape = -1;
+        features = std::make_shared<ngraph::op::v1::Reshape>(
+            features,
+            std::make_shared<ngraph::op::Constant>(ngraph::element::i32, ngraph::Shape{1}, &newShape),
+            true
+        );
+        indices = std::make_shared<ngraph::op::v1::Reshape>(
+            indices,
+            std::make_shared<ngraph::op::Constant>(ngraph::element::i32, ngraph::Shape{1}, &newShape),
+            true
+        );
+        if (indices->get_element_type() != ngraph::element::i32 && indices->get_element_type() != ngraph::element::i64) {
+            indices = std::make_shared<ngraph::op::Convert>(indices, ngraph::element::i64);
+        }
+
+        int axis = 0;
+        std::shared_ptr<ngraph::Node> unpool = std::make_shared<ngraph::op::ScatterElementsUpdate>(zeroInp, indices, features,
+            std::make_shared<ngraph::op::Constant>(ngraph::element::i32, ngraph::Shape{1}, &axis));
+
+        auto shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i32, ngraph::Shape{outShapes[0].size()}, outShapes[0].data());
+        unpool = std::make_shared<ngraph::op::v1::Reshape>(unpool, shape, true);
+
+        return Ptr<BackendNode>(new InfEngineNgraphNode(unpool));
+    }
+#endif  // HAVE_DNN_NGRAPH
 };

 Ptr<MaxUnpoolLayer> MaxUnpoolLayer::create(const LayerParams& params)
--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@ -209,7 +209,7 @@ public:
 #ifdef HAVE_INF_ENGINE
        if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
        {
-            return !computeMaxIdx && type != STOCHASTIC && kernel_size.size() > 1 && (kernel_size.size() != 3 || !isArmComputePlugin());
+            return type != STOCHASTIC && kernel_size.size() > 1 && (kernel_size.size() != 3 || !isArmComputePlugin());
        }
 #endif
        if (backendId == DNN_BACKEND_OPENCV)
@ -613,9 +613,17 @@ public:
            return Ptr<BackendNode>(new InfEngineNgraphNode(reduce_sum));
        }
        else if (type == MAX) {
-            auto max_pool = std::make_shared<ngraph::op::v1::MaxPool>(ieInpNode, ngraph::Strides(strides),
-                            ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size),
-                            rounding_type, pad_type);
+            std::shared_ptr<ngraph::Node> max_pool;
+            if (computeMaxIdx) {
+                std::vector<size_t> dilations(kernel_size.size(), 1);
+                max_pool = std::make_shared<ngraph::op::v8::MaxPool>(ieInpNode, ngraph::Strides(strides), ngraph::Strides(dilations),
+                                ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size),
+                                rounding_type, pad_type);
+            } else {
+                max_pool = std::make_shared<ngraph::op::v1::MaxPool>(ieInpNode, ngraph::Strides(strides),
+                                ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size),
+                                rounding_type, pad_type);
+            }
            return Ptr<BackendNode>(new InfEngineNgraphNode(max_pool));
        }
        else if (type == ROI) {
--- a/modules/dnn/src/layers/reduce_layer.cpp
+++ b/modules/dnn/src/layers/reduce_layer.cpp
@ -425,7 +425,7 @@ public:
            dtype* p_dst = dst.ptr<dtype>();

            size_t main_index = start / last_unreduced_dim;
-            size_t loop = start / last_unreduced_dim;
+            size_t loop = start % last_unreduced_dim;
            size_t origin = unprojected_steps[main_index] + loop * last_unreduced_step;
            for (int i = start; i < end; ++i) {
                Op accumulator(n_reduce, p_src[origin + projected_steps[0]]);
--- a/modules/dnn/src/layers/resize_layer.cpp
+++ b/modules/dnn/src/layers/resize_layer.cpp
@ -410,7 +410,10 @@ public:
        }
        attrs.shape_calculation_mode = ngraph::op::v4::Interpolate::ShapeCalcMode::SIZES;

-        if (alignCorners) {
+        CV_Assert(!halfPixelCenters || !alignCorners);
+        if (halfPixelCenters) {
+            attrs.coordinate_transformation_mode = ngraph::op::v4::Interpolate::CoordinateTransformMode::HALF_PIXEL;
+        } else if (alignCorners) {
            attrs.coordinate_transformation_mode = ngraph::op::v4::Interpolate::CoordinateTransformMode::ALIGN_CORNERS;
        }

@ -427,7 +430,10 @@ public:
        }
        attrs.shape_calculation_mode = ngraph::op::v4::Interpolate::ShapeCalcMode::sizes;

-        if (alignCorners) {
+        CV_Assert(!halfPixelCenters || !alignCorners);
+        if (halfPixelCenters) {
+            attrs.coordinate_transformation_mode = ngraph::op::v4::Interpolate::CoordinateTransformMode::half_pixel;
+        } else if (alignCorners) {
            attrs.coordinate_transformation_mode = ngraph::op::v4::Interpolate::CoordinateTransformMode::align_corners;
        }

--- a/modules/dnn/src/net_openvino.cpp
+++ b/modules/dnn/src/net_openvino.cpp
@ -476,13 +476,14 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
            {
                int lid = ld.inputBlobsId[i].lid;
                int oid = ld.inputBlobsId[i].oid;
-                if (oid == 0 || lid == 0)
-                    continue;

                auto ieInpNode = inputNodes[i].dynamicCast<InfEngineNgraphNode>();
                const auto& ngraph_input_node = ieInpNode->node;
                CV_LOG_DEBUG(NULL, "DNN/IE: bind output port " << lid << ":" << oid << " (" << ngraph_input_node->get_friendly_name() << ":" << ngraph_input_node->get_type_info().name << ")");

+                if ((oid == 0 && ngraph_input_node->get_output_size() == 1) || lid == 0)
+                    continue;
+
                // Handle parameters from other subnets. Output port is not used in this case
 #if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
                if ((ngraph::op::is_parameter(ngraph_input_node) || ngraph::op::is_constant(ngraph_input_node)) &&
@ -549,7 +550,6 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
                break;
            }
        }
-        ieNode->net->setNodePtr(&ieNode->node);

        net->addBlobs(ld.inputBlobsWrappers);
        net->addBlobs(ld.outputBlobsWrappers);
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@ -1385,13 +1385,19 @@ void ONNXImporter::parseSplit(LayerParams& layerParams, const opencv_onnx::NodeP
        CV_Assert(constBlobs.find(node_proto.input(1)) != constBlobs.end());
        Mat splitsBlob = getBlob(node_proto, 1);
        int splitSize = splitsBlob.total();
-
-        std::vector<int> slicePoints(splitSize - 1, splitsBlob.at<int>(0));
-        for (int i = 1; i < splitSize - 1; ++i)
+        if (splitSize == 1)
        {
-            slicePoints[i] = slicePoints[i - 1] + splitsBlob.at<int>(i);
+            layerParams.set("num_split", 1);
+        }
+        else
+        {
+            std::vector<int> slicePoints(splitSize - 1, splitsBlob.at<int>(0));
+            for (int i = 1; i < splitSize - 1; ++i)
+            {
+                slicePoints[i] = slicePoints[i - 1] + splitsBlob.at<int>(i);
+            }
+            layerParams.set("slice_point", DictValue::arrayInt(&slicePoints[0], slicePoints.size()));
        }
-        layerParams.set("slice_point", DictValue::arrayInt(&slicePoints[0], slicePoints.size()));
    }
    else
    {
@ -1965,9 +1971,11 @@ void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodePr
    }

    int transB = layerParams.get<int>("transB", 0);
+    int secondInpDims;
    if (constBlobs.find(node_proto.input(1)) != constBlobs.end())
    {
        Mat weights = getBlob(node_proto, 1);
+        secondInpDims = weights.dims;

        if (transA == 0) // optimized barnch, for now, we can only optimize the Gemm when transA = 0.
        {
@ -1993,7 +2001,10 @@ void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodePr
        }
    }
    else
+    {
        layerParams.set("transB", transB == 1);
+        secondInpDims = outShapes[node_proto.input(1)].size();
+    }

    if (node_proto.input_size() == 3)
    {
@ -2002,7 +2013,7 @@ void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodePr
    }

    layerParams.set("bias_term", node_proto.input_size() == 3);
-    layerParams.set("is_matmul", true);
+    layerParams.set("is_matmul", secondInpDims > 2);
    addLayer(layerParams, node_proto);
 }

@ -2045,7 +2056,7 @@ void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::Node
        layerParams.blobs.push_back(transBlob);
        int numOutput = layerParams.blobs[0].total(0, secondInpDims - 1);
        layerParams.set("num_output", numOutput);
-        layerParams.set("is_matmul", true);
+        layerParams.set("is_matmul", secondInpDims > 2);
    } else
        secondInpDims = outShapes[node_proto.input(1)].size();

--- a/modules/dnn/test/test_caffe_importer.cpp
+++ b/modules/dnn/test/test_caffe_importer.cpp
@ -731,21 +731,23 @@ TEST_P(Test_Caffe_nets, FasterRCNN_vgg16)
        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
 #endif

-    double scoreDiff = 0.0;
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
-    // Check 'backward_compatible_check || in_out_elements_equal' failed at core/src/op/reshape.cpp:427:
-    // While validating node 'v1::Reshape bbox_pred_reshape (bbox_pred[0]:f32{1,84}, Constant_265242[0]:i64{4}) -> (f32{?,?,?,?})' with friendly_name 'bbox_pred_reshape':
-    // Requested output shape {1,6300,4,1} is incompatible with input shape {1, 84}
+    double scoreDiff = 0.0, iouDiff = 0.0;
+#if defined(INF_ENGINE_RELEASE)
    if (target == DNN_TARGET_MYRIAD)
        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
-    if (target == DNN_TARGET_OPENCL_FP16)
-        scoreDiff = 0.02;
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
+        iouDiff = 0.02;
+        if (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16) {
+            scoreDiff = 0.04;
+            iouDiff = 0.06;
+        }
+    }
 #endif

    static Mat ref = (Mat_<float>(3, 7) << 0, 2, 0.949398, 99.2454, 210.141, 601.205, 462.849,
                                           0, 7, 0.997022, 481.841, 92.3218, 722.685, 175.953,
                                           0, 12, 0.993028, 133.221, 189.377, 350.994, 563.166);
-    testFaster("faster_rcnn_vgg16.prototxt", "VGG16_faster_rcnn_final.caffemodel", ref, scoreDiff);
+    testFaster("faster_rcnn_vgg16.prototxt", "VGG16_faster_rcnn_final.caffemodel", ref, scoreDiff, iouDiff);
 }

 TEST_P(Test_Caffe_nets, FasterRCNN_zf)
@ -766,9 +768,6 @@ TEST_P(Test_Caffe_nets, FasterRCNN_zf)
        );
 #endif

-    if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
-         backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_OPENCL_FP16)
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16);
    if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
         backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_MYRIAD)
        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
@ -779,7 +778,14 @@ TEST_P(Test_Caffe_nets, FasterRCNN_zf)
    static Mat ref = (Mat_<float>(3, 7) << 0, 2, 0.90121, 120.407, 115.83, 570.586, 528.395,
                                           0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762,
                                           0, 12, 0.967198, 138.588, 206.843, 329.766, 553.176);
-    testFaster("faster_rcnn_zf.prototxt", "ZF_faster_rcnn_final.caffemodel", ref);
+
+    double scoreDiff = 0.0, iouDiff = 0.0;
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
+        scoreDiff = 0.02;
+        iouDiff = 0.13;
+    }
+
+    testFaster("faster_rcnn_zf.prototxt", "ZF_faster_rcnn_final.caffemodel", ref, scoreDiff, iouDiff);
 }

 TEST_P(Test_Caffe_nets, RFCN)
@ -802,8 +808,8 @@ TEST_P(Test_Caffe_nets, RFCN)
        iouDiff = 0.12;
    }

-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
+#if defined(INF_ENGINE_RELEASE)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
    {
        scoreDiff = 0.1f;
        iouDiff = 0.2f;
--- a/modules/dnn/test/test_darknet_importer.cpp
+++ b/modules/dnn/test/test_darknet_importer.cpp
@ -102,11 +102,14 @@ TEST(Test_Darknet, read_yolo_voc_stream)
 class Test_Darknet_layers : public DNNTestLayer
 {
 public:
-    void testDarknetLayer(const std::string& name, bool hasWeights = false, bool testBatchProcessing = true)
+    void testDarknetLayer(const std::string& name, bool hasWeights = false, bool testBatchProcessing = true,
+                          double l1 = 0.0, double lInf = 0.0)
    {
        SCOPED_TRACE(name);
        Mat inp = blobFromNPY(findDataFile("dnn/darknet/" + name + "_in.npy"));
        Mat ref = blobFromNPY(findDataFile("dnn/darknet/" + name + "_out.npy"));
+        l1 = l1 ? l1 : default_l1;
+        lInf = lInf ? lInf : default_lInf;

        std::string cfg = findDataFile("dnn/darknet/" + name + ".cfg");
        std::string model = "";
@ -120,7 +123,7 @@ public:
        net.setPreferableTarget(target);
        net.setInput(inp);
        Mat out = net.forward();
-        normAssert(out, ref, "", default_l1, default_lInf);
+        normAssert(out, ref, "", l1, lInf);

        if (inp.size[0] == 1 && testBatchProcessing)  // test handling of batch size
        {
@ -166,8 +169,8 @@ public:
            }*/
            ASSERT_EQ(out2.dims, ref2.dims) << ref.dims;

-            normAssert(out2(ranges0), ref2, "", default_l1, default_lInf);
-            normAssert(out2(ranges1), ref2, "", default_l1, default_lInf);
+            normAssert(out2(ranges0), ref2, "", l1, lInf);
+            normAssert(out2(ranges1), ref2, "", l1, lInf);
        }
    }
 };
@ -1046,7 +1049,7 @@ TEST_P(Test_Darknet_layers, region)
       applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
 #endif

-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2022010000)
    // accuracy on CPU, OpenCL
    // Expected: (normL1) <= (l1), actual: 0.000358148 vs 1e-05
    //   |ref| = 1.207319974899292
@ -1116,7 +1119,12 @@ TEST_P(Test_Darknet_layers, connected)
        applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
-    testDarknetLayer("connected", true);
+    double l1 = 0.0;
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
+    {
+        l1 = 3e-5;
+    }
+    testDarknetLayer("connected", true, true, l1);
 }

 TEST_P(Test_Darknet_layers, relu)
--- a/modules/dnn/test/test_halide_layers.cpp
+++ b/modules/dnn/test/test_halide_layers.cpp
@ -361,22 +361,9 @@ TEST_P(MaxPooling, Accuracy)
    Backend backendId = get<0>(get<5>(GetParam()));
    Target targetId = get<1>(get<5>(GetParam()));

-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2018050000)
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD
-            && inSize == Size(7, 6) && kernel == Size(3, 2)
-            && (stride == Size(1, 1) || stride == Size(2, 2))
-            && (pad == Size(0, 1) || pad == Size(1, 1))
-    )
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
-#endif
-
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2018050000)
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD
-            && (kernel == Size(2, 2) || kernel == Size(3, 2))
-            && stride == Size(1, 1) && (pad == Size(0, 0) || pad == Size(0, 1))
-    )
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
-#endif
+    // https://github.com/openvinotoolkit/openvino/issues/18731
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && stride != Size(1, 1))
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);

 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD
@ -467,6 +454,11 @@ TEST_P(FullyConnected, Accuracy)
    {
        l1 = 0.01;
    }
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL)
+    {
+        l1 = 5e-3;
+        lInf = 7e-3;
+    }
 #endif
    if (targetId == DNN_TARGET_CUDA_FP16)
        l1 = 0.015;
--- a/modules/dnn/test/test_ie_models.cpp
+++ b/modules/dnn/test/test_ie_models.cpp
@ -465,8 +465,8 @@ TEST_P(DNNTestHighLevelAPI, predict)
    const std::string modelPath = getOpenVINOModel(modelName, isFP16);
    ASSERT_FALSE(modelPath.empty()) << modelName;

-    std::string xmlPath = findDataFile(modelPath + ".xml");
-    std::string binPath = findDataFile(modelPath + ".bin");
+    std::string xmlPath = findDataFile(modelPath + ".xml", false);
+    std::string binPath = findDataFile(modelPath + ".bin", false);

    Model model(xmlPath, binPath);
    Mat frame = imread(findDataFile("dnn/googlenet_1.png"));
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@ -215,7 +215,13 @@ TEST_P(Test_Caffe_layers, InnerProduct)
    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);

-    testLayerUsingCaffeModels("layer_inner_product", true);
+    double l1 = 0.0, lInf = 0.0;
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16))
+    {
+        l1 = 5e-3;
+        lInf = 2e-2;
+    }
+    testLayerUsingCaffeModels("layer_inner_product", true, true, l1, lInf);
 }

 TEST_P(Test_Caffe_layers, Pooling_max)
--- a/modules/dnn/test/test_model.cpp
+++ b/modules/dnn/test/test_model.cpp
@ -447,14 +447,17 @@ TEST_P(Test_Model, DetectionOutput)
    {
        if (backend == DNN_BACKEND_OPENCV)
            scoreDiff = 4e-3;
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2022010000)
-        else if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
-            scoreDiff = 4e-2;
-#endif
        else
            scoreDiff = 2e-2;
        iouDiff = 1.8e-1;
    }
+#if defined(INF_ENGINE_RELEASE)
+        if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        {
+            scoreDiff = 0.05;
+            iouDiff = 0.08;
+        }
+#endif

    testDetectModel(weights_file, config_file, img_path, refClassIds, refConfidences, refBoxes,
                    scoreDiff, iouDiff, confThreshold, nmsThreshold, size, mean);
--- a/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp
+++ b/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp
@ -579,9 +579,7 @@ CASE(test_dropout_default_mask_ratio)
 CASE(test_dropout_default_old)
    // no filter
 CASE(test_dropout_default_ratio)
-#if SKIP_SET_1
-    SKIP;
-#endif
+    // no filter
 CASE(test_dropout_random_old)
    // no filter
 CASE(test_dynamicquantizelinear)
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@ -52,7 +52,7 @@ public:
    }

    void testONNXModels(const String& basename, const Extension ext = npy,
-                        const double l1 = 0, const float lInf = 0, const bool useSoftmax = false,
+                        double l1 = 0, double lInf = 0, const bool useSoftmax = false,
                        bool checkNoFallbacks = true, int numInps = 1)
    {
        String onnxmodel = _tf("models/" + basename + ".onnx", required);
@ -102,7 +102,12 @@ public:
            netSoftmax.setInput(ref);
            ref = netSoftmax.forward();
        }
-        normAssert(ref, out, "", l1 ? l1 : default_l1, lInf ? lInf : default_lInf);
+        if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
+        {
+            l1 = std::max(l1, 1.4e-3);
+            lInf = std::max(lInf, 8e-3);
+        }
+        normAssert(ref, out, basename.c_str(), l1 ? l1 : default_l1, lInf ? lInf : default_lInf);
        if (checkNoFallbacks)
            expectNoFallbacksFromIE(net);
    }
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@ -1816,6 +1816,11 @@ TEST_P(Test_TensorFlow_nets, Mask_RCNN)

    double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.2 : 2e-5;
    double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.018 : default_lInf;
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+    {
+        scoreDiff = std::max(scoreDiff, 0.06);
+        iouDiff = std::max(iouDiff, 0.01);
+    }
    normAssertDetections(refDetections, outDetections, "", /*threshold for zero confidence*/1e-5, scoreDiff, iouDiff);

    // Output size of masks is NxCxHxW where
--- a/modules/dnn/test/test_tflite_importer.cpp
+++ b/modules/dnn/test/test_tflite_importer.cpp
@ -20,6 +20,14 @@ namespace opencv_test { namespace {
 using namespace cv;
 using namespace cv::dnn;

+class Test_TFLite : public DNNTestLayer {
+public:
+    void testModel(Net& net, const std::string& modelName, const Mat& input, double l1 = 0, double lInf = 0);
+    void testModel(const std::string& modelName, const Mat& input, double l1 = 0, double lInf = 0);
+    void testModel(const std::string& modelName, const Size& inpSize, double l1 = 0, double lInf = 0);
+    void testLayer(const std::string& modelName, double l1 = 0, double lInf = 0);
+};
+
 void testInputShapes(const Net& net, const std::vector<Mat>& inps) {
    std::vector<MatShape> inLayerShapes;
    std::vector<MatShape> outLayerShapes;
@ -31,8 +39,14 @@ void testInputShapes(const Net& net, const std::vector<Mat>& inps) {
    }
 }

-void testModel(Net& net, const std::string& modelName, const Mat& input, double l1 = 1e-5, double lInf = 1e-4)
+void Test_TFLite::testModel(Net& net, const std::string& modelName, const Mat& input, double l1, double lInf)
 {
+    l1 = l1 ? l1 : default_l1;
+    lInf = lInf ? lInf : default_lInf;
+
+    net.setPreferableBackend(backend);
+    net.setPreferableTarget(target);
+
    testInputShapes(net, {input});
    net.setInput(input);

@ -48,20 +62,20 @@ void testModel(Net& net, const std::string& modelName, const Mat& input, double
    }
 }

-void testModel(const std::string& modelName, const Mat& input, double l1 = 1e-5, double lInf = 1e-4)
+void Test_TFLite::testModel(const std::string& modelName, const Mat& input, double l1, double lInf)
 {
    Net net = readNet(findDataFile("dnn/tflite/" + modelName + ".tflite", false));
    testModel(net, modelName, input, l1, lInf);
 }

-void testModel(const std::string& modelName, const Size& inpSize, double l1 = 1e-5, double lInf = 1e-4)
+void Test_TFLite::testModel(const std::string& modelName, const Size& inpSize, double l1, double lInf)
 {
    Mat input = imread(findDataFile("cv/shared/lena.png"));
    input = blobFromImage(input, 1.0 / 255, inpSize, 0, true);
    testModel(modelName, input, l1, lInf);
 }

-void testLayer(const std::string& modelName, double l1 = 1e-5, double lInf = 1e-4)
+void Test_TFLite::testLayer(const std::string& modelName, double l1, double lInf)
 {
    Mat inp = blobFromNPY(findDataFile("dnn/tflite/" + modelName + "_inp.npy"));
    Net net = readNet(findDataFile("dnn/tflite/" + modelName + ".tflite"));
@ -69,29 +83,66 @@ void testLayer(const std::string& modelName, double l1 = 1e-5, double lInf = 1e-
 }

 // https://google.github.io/mediapipe/solutions/face_mesh
-TEST(Test_TFLite, face_landmark)
+TEST_P(Test_TFLite, face_landmark)
 {
-    testModel("face_landmark", Size(192, 192), 2e-5, 2e-4);
+    if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
+    double l1 = 2e-5, lInf = 2e-4;
+    if (target == DNN_TARGET_CPU_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD ||
+        (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL))
+    {
+        l1 = 0.15;
+        lInf = 0.82;
+    }
+    testModel("face_landmark", Size(192, 192), l1, lInf);
 }

 // https://google.github.io/mediapipe/solutions/face_detection
-TEST(Test_TFLite, face_detection_short_range)
+TEST_P(Test_TFLite, face_detection_short_range)
 {
-    testModel("face_detection_short_range", Size(128, 128));
+    double l1 = 0, lInf = 2e-4;
+    if (target == DNN_TARGET_CPU_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD ||
+        (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL))
+    {
+        l1 = 0.04;
+        lInf = 0.8;
+    }
+    testModel("face_detection_short_range", Size(128, 128), l1, lInf);
 }

 // https://google.github.io/mediapipe/solutions/selfie_segmentation
-TEST(Test_TFLite, selfie_segmentation)
+TEST_P(Test_TFLite, selfie_segmentation)
 {
-    testModel("selfie_segmentation", Size(256, 256));
+    double l1 = 0, lInf = 0;
+    if (target == DNN_TARGET_CPU_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD ||
+        (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL))
+    {
+        l1 = 0.01;
+        lInf = 0.48;
+    }
+    testModel("selfie_segmentation", Size(256, 256), l1, lInf);
 }

-TEST(Test_TFLite, max_unpooling)
+TEST_P(Test_TFLite, max_unpooling)
 {
+    if (backend == DNN_BACKEND_CUDA)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA);
+
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target != DNN_TARGET_CPU) {
+        if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+        if (target == DNN_TARGET_OPENCL)      applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+        if (target == DNN_TARGET_MYRIAD)      applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+    }
+
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+
    // Due Max Unpoling is a numerically unstable operation and small difference between frameworks
    // might lead to positional difference of maximal elements in the tensor, this test checks
    // behavior of Max Unpooling layer only.
    Net net = readNet(findDataFile("dnn/tflite/hair_segmentation.tflite", false));
+    net.setPreferableBackend(backend);
+    net.setPreferableTarget(target);

    Mat input = imread(findDataFile("cv/shared/lena.png"));
    cvtColor(input, input, COLOR_BGR2RGBA);
@ -101,7 +152,15 @@ TEST(Test_TFLite, max_unpooling)
    net.setInput(input);

    std::vector<std::vector<Mat> > outs;
-    net.forward(outs, {"p_re_lu_1", "max_pooling_with_argmax2d", "conv2d_86", "max_unpooling2d_2"});
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
+        // TODO: seems like a bug with a retrieving intermediate tensors
+        net.forward(outs, {"conv2d_transpose_4", "p_re_lu_1", "max_pooling_with_argmax2d", "conv2d_86", "max_unpooling2d_2"});
+        outs.erase(outs.begin());
+    }
+    else {
+        net.forward(outs, {"p_re_lu_1", "max_pooling_with_argmax2d", "conv2d_86", "max_unpooling2d_2"});
+    }
+
    ASSERT_EQ(outs.size(), 4);
    ASSERT_EQ(outs[0].size(), 1);
    ASSERT_EQ(outs[1].size(), 2);
@ -117,6 +176,8 @@ TEST(Test_TFLite, max_unpooling)
    ASSERT_EQ(poolOut.size, poolIds.size);
    ASSERT_EQ(poolOut.size, unpoolInp.size);

+    ASSERT_EQ(countNonZero(poolInp), poolInp.total());
+
    for (int c = 0; c < 32; ++c) {
        float *poolInpData = poolInp.ptr<float>(0, c);
        float *poolOutData = poolOut.ptr<float>(0, c);
@ -135,15 +196,19 @@ TEST(Test_TFLite, max_unpooling)
                    }
                }
                EXPECT_EQ(poolInpData[maxIdx], poolOutData[y * 64 + x]) << errMsg;
-                EXPECT_EQ(poolIdsData[y * 64 + x], (float)maxIdx) << errMsg;
+                if (backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
+                    EXPECT_EQ(poolIdsData[y * 64 + x], (float)maxIdx) << errMsg;
+                }
                EXPECT_EQ(unpoolOutData[maxIdx], unpoolInpData[y * 64 + x]) << errMsg;
            }
        }
    }
 }

-TEST(Test_TFLite, EfficientDet_int8) {
+TEST_P(Test_TFLite, EfficientDet_int8) {
    Net net = readNet(findDataFile("dnn/tflite/coco_efficientdet_lite0_v1_1.0_quant_2021_09_06.tflite", false));
+    net.setPreferableBackend(backend);
+    net.setPreferableTarget(target);

    Mat img = imread(findDataFile("dnn/dog416.png"));
    Mat blob = blobFromImage(img, 1.0, Size(320, 320));
@ -158,10 +223,18 @@ TEST(Test_TFLite, EfficientDet_int8) {
    normAssertDetections(ref, out, "", 0.5, 0.05, 0.1);
 }

-TEST(Test_TFLite, replicate_by_pack) {
-    testLayer("replicate_by_pack");
+TEST_P(Test_TFLite, replicate_by_pack) {
+    double l1 = 0, lInf = 0;
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
+    {
+        l1 = 4e-4;
+        lInf = 2e-3;
+    }
+    testLayer("replicate_by_pack", l1, lInf);
 }

+INSTANTIATE_TEST_CASE_P(/**/, Test_TFLite, dnnBackendsAndTargets());
+
 }}  // namespace

 #endif  // OPENCV_TEST_DNN_TFLITE
--- a/modules/gapi/include/opencv2/gapi/infer/bindings_onnx.hpp
+++ b/modules/gapi/include/opencv2/gapi/infer/bindings_onnx.hpp
@ -39,6 +39,12 @@ public:
    GAPI_WRAP
    PyParams& cfgAddExecutionProvider(ep::DirectML ep);

+    GAPI_WRAP
+    PyParams& cfgAddExecutionProvider(ep::CUDA ep);
+
+    GAPI_WRAP
+    PyParams& cfgAddExecutionProvider(ep::TensorRT ep);
+
    GAPI_WRAP
    PyParams& cfgDisableMemPattern();

--- a/modules/gapi/include/opencv2/gapi/infer/onnx.hpp
+++ b/modules/gapi/include/opencv2/gapi/infer/onnx.hpp
@ -32,6 +32,56 @@ namespace onnx {
 */
 namespace ep {

+/**
+ * @brief This structure provides functions
+ * that fill inference options for CUDA Execution Provider.
+ * Please follow https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#cuda-execution-provider
+ */
+struct GAPI_EXPORTS_W_SIMPLE CUDA {
+    // NB: Used from python.
+    /// @private -- Exclude this constructor from OpenCV documentation
+    GAPI_WRAP
+    CUDA() = default;
+
+    /** @brief Class constructor.
+
+    Constructs CUDA parameters based on device type information.
+
+    @param dev_id Target device id to use.
+    */
+    GAPI_WRAP
+    explicit CUDA(const int dev_id)
+        : device_id(dev_id) {
+    }
+
+    int device_id;
+};
+
+/**
+ * @brief This structure provides functions
+ * that fill inference options for TensorRT Execution Provider.
+ * Please follow https://onnxruntime.ai/docs/execution-providers/TensorRT-ExecutionProvider.html#tensorrt-execution-provider
+ */
+struct GAPI_EXPORTS_W_SIMPLE TensorRT {
+    // NB: Used from python.
+    /// @private -- Exclude this constructor from OpenCV documentation
+    GAPI_WRAP
+    TensorRT() = default;
+
+    /** @brief Class constructor.
+
+    Constructs TensorRT parameters based on device type information.
+
+    @param dev_id Target device id to use.
+    */
+    GAPI_WRAP
+    explicit TensorRT(const int dev_id)
+        : device_id(dev_id) {
+    }
+
+    int device_id;
+};
+
 /**
 * @brief This structure provides functions
 * that fill inference options for ONNX OpenVINO Execution Provider.
@ -143,7 +193,11 @@ public:
    DeviceDesc ddesc;
 };

-using EP = cv::util::variant<cv::util::monostate, OpenVINO, DirectML>;
+using EP = cv::util::variant< cv::util::monostate
+                            , OpenVINO
+                            , DirectML
+                            , CUDA
+                            , TensorRT>;

 } // namespace ep

@ -431,6 +485,34 @@ public:
        return *this;
    }

+    /** @brief Adds execution provider for runtime.
+
+    The function is used to add ONNX Runtime CUDA Execution Provider options.
+
+    @param ep CUDA Execution Provider options.
+    @see cv::gapi::onnx::ep::CUDA.
+
+    @return the reference on modified object.
+    */
+    Params<Net>& cfgAddExecutionProvider(ep::CUDA&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep));
+        return *this;
+    }
+
+    /** @brief Adds execution provider for runtime.
+
+    The function is used to add ONNX Runtime TensorRT Execution Provider options.
+
+    @param ep TensorRT Execution Provider options.
+    @see cv::gapi::onnx::ep::TensorRT.
+
+    @return the reference on modified object.
+    */
+    Params<Net>& cfgAddExecutionProvider(ep::TensorRT&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep));
+        return *this;
+    }
+
    /** @brief Disables the memory pattern optimization.

    @return the reference on modified object.
@ -491,6 +573,16 @@ public:
        desc.execution_providers.emplace_back(std::move(ep));
    }

+    /** @see onnx::Params::cfgAddExecutionProvider. */
+    void cfgAddExecutionProvider(ep::CUDA&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep));
+    }
+
+    /** @see onnx::Params::cfgAddExecutionProvider. */
+    void cfgAddExecutionProvider(ep::TensorRT&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep));
+    }
+
    /** @see onnx::Params::cfgDisableMemPattern. */
    void cfgDisableMemPattern() {
        desc.disable_mem_pattern = true;
--- a/modules/gapi/misc/python/pyopencv_gapi.hpp
+++ b/modules/gapi/misc/python/pyopencv_gapi.hpp
@ -31,6 +31,8 @@ using map_string_and_vector_float   = std::map<std::string, std::vector<float>>;
 using map_int_and_double            = std::map<int, double>;
 using ep_OpenVINO                   = cv::gapi::onnx::ep::OpenVINO;
 using ep_DirectML                   = cv::gapi::onnx::ep::DirectML;
+using ep_CUDA                       = cv::gapi::onnx::ep::CUDA;
+using ep_TensorRT                   = cv::gapi::onnx::ep::TensorRT;

 // NB: Python wrapper generate T_U for T<U>
 // This behavior is only observed for inputs
--- a/modules/gapi/src/backends/onnx/bindings_onnx.cpp
+++ b/modules/gapi/src/backends/onnx/bindings_onnx.cpp
@ -33,6 +33,18 @@ cv::gapi::onnx::PyParams::cfgAddExecutionProvider(cv::gapi::onnx::ep::DirectML e
    return *this;
 }

+cv::gapi::onnx::PyParams&
+cv::gapi::onnx::PyParams::cfgAddExecutionProvider(cv::gapi::onnx::ep::CUDA ep) {
+    m_priv->cfgAddExecutionProvider(std::move(ep));
+    return *this;
+}
+
+cv::gapi::onnx::PyParams&
+cv::gapi::onnx::PyParams::cfgAddExecutionProvider(cv::gapi::onnx::ep::TensorRT ep) {
+    m_priv->cfgAddExecutionProvider(std::move(ep));
+    return *this;
+}
+
 cv::gapi::onnx::PyParams&
 cv::gapi::onnx::PyParams::cfgDisableMemPattern() {
    m_priv->cfgDisableMemPattern();
--- a/modules/gapi/src/backends/onnx/gonnxbackend.cpp
+++ b/modules/gapi/src/backends/onnx/gonnxbackend.cpp
@ -145,9 +145,39 @@ public:
    void run();
 };

+static void addCUDAExecutionProvider(Ort::SessionOptions *session_options,
+                                     const cv::gapi::onnx::ep::CUDA &cuda_ep) {
+     OrtCUDAProviderOptions options{};
+     options.device_id = cuda_ep.device_id;
+
+     try {
+        session_options->AppendExecutionProvider_CUDA(options);
+     } catch (const std::exception &e) {
+         std::stringstream ss;
+         ss << "ONNX Backend: Failed to enable CUDA"
+            << " Execution Provider: " << e.what();
+         cv::util::throw_error(std::runtime_error(ss.str()));
+     }
+}
+
+static void addTensorRTExecutionProvider(Ort::SessionOptions *session_options,
+                                         const cv::gapi::onnx::ep::TensorRT &trt_ep) {
+     OrtTensorRTProviderOptions options{};
+     options.device_id = trt_ep.device_id;
+
+     try {
+        session_options->AppendExecutionProvider_TensorRT(options);
+     } catch (const std::exception &e) {
+         std::stringstream ss;
+         ss << "ONNX Backend: Failed to enable TensorRT"
+            << " Execution Provider: " << e.what();
+         cv::util::throw_error(std::runtime_error(ss.str()));
+     }
+}
+
 static void addOpenVINOExecutionProvider(Ort::SessionOptions *session_options,
                                         const cv::gapi::onnx::ep::OpenVINO &ov_ep) {
-     OrtOpenVINOProviderOptions options;
+     OrtOpenVINOProviderOptions options{};
     options.device_type = ov_ep.device_type.c_str();
     options.cache_dir = ov_ep.cache_dir.c_str();
     options.num_of_threads = ov_ep.num_of_threads;
@ -181,6 +211,18 @@ static void addExecutionProvider(Ort::SessionOptions          *session_options,
            addDMLExecutionProvider(session_options, dml_ep);
            break;
        }
+        case ep::EP::index_of<ep::CUDA>(): {
+            GAPI_LOG_INFO(NULL, "CUDA Execution Provider is added.");
+            const auto &cuda_ep = cv::util::get<ep::CUDA>(execution_provider);
+            addCUDAExecutionProvider(session_options, cuda_ep);
+            break;
+        }
+        case ep::EP::index_of<ep::TensorRT>(): {
+            GAPI_LOG_INFO(NULL, "TensorRT Execution Provider is added.");
+            const auto &trt_ep = cv::util::get<ep::TensorRT>(execution_provider);
+            addTensorRTExecutionProvider(session_options, trt_ep);
+            break;
+        }
        default:
            GAPI_LOG_INFO(NULL, "CPU Execution Provider is added.");
            break;
--- a/modules/videoio/src/cap_v4l.cpp
+++ b/modules/videoio/src/cap_v4l.cpp
@ -260,6 +260,10 @@ typedef uint32_t __u32;
 #define V4L2_CID_IRIS_ABSOLUTE (V4L2_CID_CAMERA_CLASS_BASE+17)
 #endif

+#ifndef v4l2_fourcc_be
+#define v4l2_fourcc_be(a, b, c, d) (v4l2_fourcc(a, b, c, d) | (1U << 31))
+#endif
+
 #ifndef V4L2_PIX_FMT_Y10
 #define V4L2_PIX_FMT_Y10 v4l2_fourcc('Y', '1', '0', ' ')
 #endif
--- a/modules/videoio/test/test_precomp.hpp
+++ b/modules/videoio/test/test_precomp.hpp
@ -65,7 +65,7 @@ inline std::string fourccToStringSafe(int fourcc)
 {
    std::string res = fourccToString(fourcc);
    // TODO: return hex values for invalid characters
-    std::transform(res.begin(), res.end(), res.begin(), [](uint8_t c) { return (c >= '0' && c <= 'z') ? c : (c == ' ' ? '_' : 'x'); });
+    std::transform(res.begin(), res.end(), res.begin(), [](char c) -> char { return (c >= '0' && c <= 'z') ? c : (c == ' ' ? '_' : 'x'); });
    return res;
 }

--- a/modules/videoio/test/test_v4l2.cpp
+++ b/modules/videoio/test/test_v4l2.cpp
@ -22,6 +22,9 @@
 #include <linux/videodev2.h>

 // workarounds for older versions
+#ifndef v4l2_fourcc_be
+#define v4l2_fourcc_be(a, b, c, d) (v4l2_fourcc(a, b, c, d) | (1U << 31))
+#endif
 #ifndef V4L2_PIX_FMT_Y10
 #define V4L2_PIX_FMT_Y10 v4l2_fourcc('Y', '1', '0', ' ')
 #endif