From b758897c2972e1b46601808e7b785e94a1def0bd Mon Sep 17 00:00:00 2001
From: Yuantao Feng
Date: Fri, 29 Mar 2024 22:35:23 +0800
Subject: [PATCH] Merge pull request #25271 from fengyuentau:matmul_bias

Merge with https://github.com/opencv/opencv_extra/pull/1158

Todo:

- [x] Fix Attention pattern recognition.
- [x] Handle other backends.

Benchmark: "VIT_B_32 OCV/CPU" on Apple M1, results in milliseconds.

| Model | 4.x | This PR |
| - | - | - |
| VIT_B_32 OCV/CPU | 87.66 | **83.83** |

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is an accuracy test, a performance test and test data in the opencv_extra repository, if applicable.
      The patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
---
 .../cuda4dnn/primitives/matmul_broadcast.hpp  |  26 ++-
 modules/dnn/src/layers/matmul_layer.cpp       | 202 ++++++++++++++++--
 .../dnn/src/onnx/onnx_graph_simplifier.cpp    | 136 ++++++++++--
 modules/dnn/src/onnx/onnx_importer.cpp        |  13 +-
 modules/dnn/test/test_graph_simplifier.cpp    |   7 +
 modules/dnn/test/test_onnx_importer.cpp       |   6 +
 6 files changed, 347 insertions(+), 43 deletions(-)

diff --git a/modules/dnn/src/cuda4dnn/primitives/matmul_broadcast.hpp b/modules/dnn/src/cuda4dnn/primitives/matmul_broadcast.hpp
index 824d917382..c99a1b5f3a 100644
--- a/modules/dnn/src/cuda4dnn/primitives/matmul_broadcast.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/matmul_broadcast.hpp
@@ -12,6 +12,8 @@
 #include "../csl/tensor.hpp"
 #include "../csl/tensor_ops.hpp"

+#include "../kernels/eltwise_ops.hpp" // for adding bias
+
 #include
 #include

@@ -23,7 +25,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
     public:
         using wrapper_type = GetCUDABackendWrapperType;

-        MatMulBroadcastOp(csl::Stream stream_, csl::cublas::Handle handle, const Mat &B, bool _transA, bool _transB,
+        MatMulBroadcastOp(csl::Stream stream_, csl::cublas::Handle handle, const Mat &B, const Mat &bias, bool _transA, bool _transB,
                           const std::vector &A_offsets_, const std::vector &B_offsets_, std::vector &C_offsets_, size_t batch_)
             : stream(std::move(stream_)), cublasHandle(std::move(handle)), A_offsets(A_offsets_), B_offsets(B_offsets_), C_offsets(C_offsets_), batch(batch_)
@@ -33,6 +35,11 @@
             csl::copyMatToTensor(B, input_B_tensor, stream);
         }

+        if (!bias.empty()) {
+            bias_tensor = csl::makeTensorHeader(bias);
+            csl::copyMatToTensor(bias, bias_tensor, stream);
+        }
+
         transA = _transA;
         transB = _transB;
     }
@@ -42,9 +49,6 @@
                  const std::vector>& outputs,
                  csl::Workspace& workspace) override
         {
-            CV_Assert(((inputs.size() == 2 && input_B_tensor.empty()) ||
-                       (inputs.size() == 1 && !input_B_tensor.empty())) && outputs.size() == 1);
-
             auto input_A_wrapper = inputs[0].dynamicCast();
             auto input_A = input_A_wrapper->getView();

@@ -60,12 +64,26 @@
             auto output = output_wrapper->getSpan();

             csl::tensor_ops::gemmBatched(cublasHandle, batch, 0.f, output, C_offsets, 1.f, transA, input_A, A_offsets, transB, input_B, B_offsets);
+
+            // add bias if it exists
+            if
(!bias_tensor.empty() || inputs.size() >= 3) { + csl::TensorView bias; + if (bias_tensor.empty()) { + auto bias_wrapper = inputs[2].dynamicCast(); + bias = bias_wrapper->getView(); + } else { + bias = csl::TensorView(bias_tensor); + } + + kernels::eltwise_sum_2(stream, output, output, bias); + } } private: csl::Stream stream; csl::cublas::Handle cublasHandle; csl::Tensor input_B_tensor; + csl::Tensor bias_tensor; bool transA, transB; std::vector A_offsets; diff --git a/modules/dnn/src/layers/matmul_layer.cpp b/modules/dnn/src/layers/matmul_layer.cpp index a571592dfb..448af27c18 100644 --- a/modules/dnn/src/layers/matmul_layer.cpp +++ b/modules/dnn/src/layers/matmul_layer.cpp @@ -26,6 +26,10 @@ using namespace cv::dnn::cuda4dnn; namespace cv { namespace dnn { class MatMulLayerImpl CV_FINAL : public MatMulLayer { +#ifdef HAVE_OPENCL + UMat weight_umat, bias_umat; +#endif + public: MatMulLayerImpl(const LayerParams& params) { setParamsFrom(params); @@ -34,6 +38,8 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { trans_b = params.get("transB", false); alpha = params.get("alpha", 1.f); beta = params.get("beta", 1.f); + + real_ndims_C = params.get("real_ndims_C", -1); } virtual bool supportBackend(int backendId) CV_OVERRIDE { @@ -48,8 +54,9 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { const int requiredOutputs, std::vector &outputs, std::vector &internals) const CV_OVERRIDE { - CV_CheckGE(inputs.size(), static_cast(1), "DNN/MatMul: one varible input at least"); - CV_CheckLE(inputs.size(), static_cast(2), "DNN/MatMul: two variable inputs at most"); + int num_inputs = inputs.size() + blobs.size(); + CV_CheckGE(num_inputs, 2, "DNN/MatMul: two inputs at least"); + CV_CheckLE(num_inputs, 3, "DNN/MatMul: three inputs at most"); const auto shape_A = inputs[0], shape_B = blobs.empty() ? inputs[1] : shape(blobs[0]); CV_CheckGE(shape_A.size(), static_cast(2), "DNN/MatMul: invalid shape of input A"); @@ -64,7 +71,7 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { int K_B = trans_b ? nB : mB; CV_CheckEQ(K_A, K_B, "DNN/MatMul: invalid dimension K"); - // Check legal broadcast. It is legal for sure if A and B are 2d, or one of them is 2d. + // Check if inputs are broadcastable. MatShape common_shape; if (shape_A.size() != 2 || shape_B.size() != 2) { const auto &shape_more_dims = shape_A.size() > shape_B.size() ? shape_A : shape_B; @@ -89,6 +96,24 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { common_shape[1] = N; } + // Check if bias is broadcastable + if (num_inputs == 3) { + const auto shape_C = blobs.empty() ? inputs.back() : shape(blobs.back()); + if (real_ndims_C == 1) { // (1) or (N) + CV_Check(shape_C[0], shape_C[0] == 1 || shape_C[0] == N, "DNN/MatMul: invalid dimension of C"); + } else if (real_ndims_C >= 2) { + const auto &shape_large = common_shape.size() > shape_C.size() ? common_shape : shape_C; + const auto &shape_small = common_shape.size() > shape_C.size() ? 
shape_C : common_shape;
+                size_t diff_dims = shape_large.size() - shape_small.size();
+                for (size_t i = 0; i < shape_small.size(); i++) {
+                    const auto dl = shape_small[i], dm = shape_large[i + diff_dims];
+                    if (dl != 1 && dm != 1 && dl != dm) {
+                        CV_Error(Error::StsBadSize, "DNN/MatMul: invalid shape of C");
+                    }
+                }
+            }
+        }
+
         outputs.assign(1, common_shape);
         return false;
     }
@@ -109,6 +134,44 @@
             fastGemmPackB(blobs[0], packed_input_B, trans_b, opt);
             helper.updatePackedBOffsets(packed_input_B.size());
         }
+
+        // broadcast bias if needed
+        if ((inputs.size() + blobs.size()) >= 3 && blobs.size() >= 2) {
+            const auto bias_mat = blobs.back();
+            const auto bias_shape = shape(bias_mat);
+            bool is_broadcast_needed = real_ndims_C == 0 || real_ndims_C == 1 || (total(bias_shape) != total(C_shape) || bias_shape.size() != C_shape.size());
+
+            if (is_broadcast_needed) {
+                broadcast_bias = Mat(C_shape, CV_32F);
+                auto *broadcast_bias_ptr = broadcast_bias.ptr();
+
+                const auto *bias = bias_mat.ptr();
+                if (bias_mat.total() == 1) { // [], [1], [1, ...]
+                    float b = (*bias) * beta;
+                    for (size_t i = 0; i < broadcast_bias.total(); i++) {
+                        broadcast_bias_ptr[i] = b;
+                    }
+                } else if (real_ndims_C == 1) { // [n]
+                    size_t inner_size = C_shape.back(),
+                           loops = total(C_shape) / inner_size;
+                    for (size_t i = 0; i < loops; i++) {
+                        size_t step = i * inner_size;
+                        for (size_t j = 0; j < inner_size; j++) {
+                            broadcast_bias_ptr[step + j] = beta * bias[j];
+                        }
+                    }
+                } else {
+                    broadcast(bias_mat, C_shape, broadcast_bias);
+                }
+            } else {
+                broadcast_bias = blobs.back();
+            }
+        }
+
+#ifdef HAVE_OPENCL
+        weight_umat.release();
+        bias_umat.release();
+#endif
     }

     // works like Y = numpy.matmul(A, B)
@@ -134,7 +197,38 @@
         const auto *a = A.ptr();
         auto *y = Y.ptr();

-        std::memset(y, 0, Y.total() * sizeof(float));
+        // add bias if it exists
+        if ((inputs.size() + blobs.size()) >= 3) {
+            const auto &shape_Y = shape(Y);
+            if (blobs.empty()) { // bias from input
+                const auto &bias_mat = inputs.back();
+                const auto *bias = bias_mat.ptr();
+                if (bias_mat.total() == 1) { // [], [1], [1, ...]
+ float b = (*bias) * beta; + for (size_t i = 0; i < Y.total(); i++) { + y[i] = b; + } + } else if (real_ndims_C == 1) { // [n] + const size_t inner_size = shape_Y.back(), + batches = total(Y) / inner_size; + parallel_for_(Range(0, batches), [&] (const Range &r) { + for (int i = r.start; i < r.end; i++) { + const size_t output_offset = i * inner_size; + for (size_t j = 0; j < inner_size; j++) { + y[output_offset + j] = beta * bias[j]; + } + } + }, double(batches * inner_size * (1 / 1024.0))); + } else { + broadcast(bias_mat, shape_Y, Y); + } + } else { // bias from constant + const auto *bias = broadcast_bias.ptr(); + std::memcpy(y, bias, total(shape_Y) * sizeof(float)); + } + } else { + std::memset(y, 0, Y.total() * sizeof(float)); + } if (blobs.empty()) { const auto &B = inputs[1]; @@ -158,14 +252,36 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { inputs_arr.getUMatVector(inputs); outputs_arr.getUMatVector(outputs); - const auto &input_A = inputs[0]; - UMat input_B; - if (blobs.empty()) { - input_B = inputs[1]; - } else { - blobs[0].copyTo(input_B); + // does not support bias as input + if (inputs.size() >= 3) { + return false; } + + const auto &input_A = inputs[0]; auto &output = outputs[0]; + const auto output_shape = shape(output); + + if (blobs.empty()) { + weight_umat = inputs[1]; + if ((inputs.size() + blobs.size() >= 3)) { + bias_umat = UMat::zeros(output_shape.size(), output_shape.data(), CV_32F); + } + } else { + if (weight_umat.empty()) { + blobs.front().copyTo(weight_umat); + } + if ((inputs.size() + blobs.size() >= 3)) { + if (bias_umat.empty()) { + broadcast_bias.copyTo(bias_umat); + } + } else { + if (bias_umat.empty()) { + bias_umat = UMat::zeros(output_shape.size(), output_shape.data(), CV_32F); + } + } + } + + auto &input_B = weight_umat; int M = static_cast(helper.M), N = static_cast(helper.N), @@ -181,7 +297,7 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { UMat A, B, C, A_fp32, B_fp32, C_fp32; for (int i = 0; i < batch; i++) { A = input_A_2d.row(helper.A_rows[i]).reshape(1, trans_a ? K : M); - B = input_B_2d.row(helper.B_rows[i]).reshape(1, trans_b ? K : N); + B = input_B_2d.row(helper.B_rows[i]).reshape(1, trans_b ? 
N : K); C = output_2d.row(helper.C_rows[i]).reshape(1, M); if (trans_a) { @@ -200,7 +316,6 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { B_fp32 = B; C_fp32 = C; } - cv::gemm(A_fp32, B_fp32, 1.f, noArray(), 0.f, C_fp32); if (use_half) { A_fp32.convertTo(A, CV_16F); @@ -208,6 +323,12 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { C_fp32.convertTo(C, CV_16F); } } + + // add bias + if (!bias_umat.empty()) { + cv::add(output, bias_umat, output); + } + return true; } #endif // HAVE_OPENCL @@ -216,18 +337,28 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { virtual Ptr initNgraph(const std::vector >& inputs, const std::vector >& nodes) CV_OVERRIDE { auto& input_A_node = nodes[0].dynamicCast()->node; - std::shared_ptr matmul; + std::shared_ptr result; + ov::Output bias; - if (nodes.size() == 2) { + if (blobs.empty()) { auto &input_B_node = nodes[1].dynamicCast()->node; - matmul = std::make_shared(input_A_node, input_B_node, trans_a, trans_b); + result = std::make_shared(input_A_node, input_B_node, trans_a, trans_b); + if (nodes.size() >= 3) { + bias = nodes[2].dynamicCast()->node; + result = std::make_shared(result, bias); + } } else { auto input_B_shape = getShape(blobs[0]); auto input_B_node = std::make_shared(ov::element::f32, input_B_shape, blobs[0].data); - matmul = std::make_shared(input_A_node, input_B_node, trans_a, trans_b); + result = std::make_shared(input_A_node, input_B_node, trans_a, trans_b); + if ((nodes.size() + blobs.size()) >= 3) { + const auto bias_shape = shape(broadcast_bias); + bias = std::make_shared(ov::element::f32, std::vector(bias_shape.begin(), bias_shape.end()), broadcast_bias.data); + result = std::make_shared(result, bias); + } } - return Ptr(new InfEngineNgraphNode(matmul)); + return Ptr(new InfEngineNgraphNode(result)); } #endif // HAVE_DNN_NGRAPH @@ -239,7 +370,7 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { const auto input_A_shape = shape(*input_A_wrapper->getMat()); const auto output_shape = shape(*output_wrapper->getMat()); - if (output_shape.size() != 2) { + if ((inputs.size() + blobs.size()) >= 3 || output_shape.size() != 2) { return Ptr(); } @@ -259,11 +390,17 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { const std::vector>& inputs, const std::vector>& outputs) override { auto context = reinterpret_cast(context_); - auto input_B = blobs.empty() ? 
Mat() : blobs[0]; + auto input_B = Mat(), bias = Mat(); + if (!blobs.empty()) { + input_B = blobs.front(); + if (blobs.size() >= 2) { + bias = broadcast_bias; + } + } CV_CheckFalse(helper.empty(), "DNN/MatMul/CUDA: MatMulHelper is not initialized"); - return make_cuda_node(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), input_B, trans_a, trans_b, helper.A_offsets, helper.B_offsets, helper.C_offsets, helper.batch); + return make_cuda_node(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), input_B, bias, trans_a, trans_b, helper.A_offsets, helper.B_offsets, helper.C_offsets, helper.batch); } #endif // HAVE_CUDA @@ -275,7 +412,7 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { auto input_A_desc = input_A_wrapper->getTensorDesc(); auto input_A_node = nodes[0].dynamicCast()->getOp(); - auto op = std::make_shared(name); + auto op = std::make_shared(name); // set attributes op->set_attr_adj_x1(trans_a); @@ -292,11 +429,31 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { auto input_B_node = nodes[1].dynamicCast()->getOp(); op->set_input_x2_by_name(*input_B_node, "y"); op->update_input_desc_x2(*input_B_desc); + if (inputs.size() >= 3) { + auto input_bias_wrapper = inputs[2].dynamicCast(); + auto input_bias_desc = input_bias_wrapper->getTensorDesc(); + auto input_bias_node = nodes[2].dynamicCast()->getOp(); + op->set_input_bias_by_name(*input_bias_node, "y"); + op->update_input_desc_bias(*input_bias_desc); + } } else { // constant input B auto B = blobs[0]; auto const_B_node = std::make_shared(B.data, B.type(), shape(B), cv::format("%s_B", name.c_str())); op->set_input_x2_by_name(*(const_B_node->getOp()), "y"); op->update_input_desc_x2(*(const_B_node->getTensorDesc())); + if ((inputs.size() + blobs.size()) >= 3) { // does not support broadcast bias + auto bias_mat = blobs.back(); + auto bias_shape = shape(bias_mat); + + // reshape if 1d + if (real_ndims_C == 1 && bias_shape.front() != 1) { + bias_shape = std::vector{bias_shape.front()}; + } + + auto const_bias_node = std::make_shared(bias_mat.data, bias_mat.type(), bias_shape, cv::format("%s_bias", name.c_str())); + op->set_input_bias_by_name(*(const_bias_node->getOp()), "y"); + op->update_input_desc_bias(*(const_bias_node->getTensorDesc())); + } } // set outputs @@ -312,7 +469,10 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { float alpha; float beta; + int real_ndims_C; + std::vector packed_input_B; + Mat broadcast_bias; FastGemmOpt opt; MatMulHelper helper; diff --git a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp index 7b8dd483c7..a87910a4c4 100644 --- a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp +++ b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp @@ -242,6 +242,115 @@ class AdjustSliceAllOptionalInputsSubgraph : public Subgraph { size_t num_inputs_; }; +/* Fusion for biased MatMul. 
+ + Graph before fusion: [Input] -> MatMul -> Add -> [Output] + + Graph after fusion: [Input] -> MatMul -> [Output] + \ + bias +*/ + +class BiasedMatmulSubgraph : public Subgraph { + public: + BiasedMatmulSubgraph() { + int input = addNodeToMatch(""); + matmul_id = addNodeToMatch("MatMul", input, addNodeToMatch("")); + add_id = addNodeToMatch("Add", addNodeToMatch(""), matmul_id); + + setFusedNode("MatMul", input); + } + + virtual bool match(const Ptr& net, int nodeId, + std::vector& matchedNodesIds) CV_OVERRIDE { + if (Subgraph::match(net, nodeId, matchedNodesIds)) { + auto onnx_net = net.dynamicCast(); + + // get input weight from MatMul + { + // make sure that input A is not Constant + if (onnx_net->getInputInitializerId(matchedNodesIds[matmul_id], 0) >= 0) { + return false; + } else { + const Ptr node = net->getNode(matchedNodesIds[matmul_id]); + + int constant_id = Subgraph::getInputNodeId(net, node, 0); + auto constant_node = net->getNode(constant_id); + if (constant_node->getType() == "Constant") { + return false; + } + } + + bool is_weight_const = false; + int initializer_id = onnx_net->getInputInitializerId(matchedNodesIds[matmul_id], 1); + if (initializer_id != -1) { // Initializer + weight_name = onnx_net->getNameOfInitializer(initializer_id); + is_weight_const = true; + } else { // Constant layer + const Ptr node = net->getNode(matchedNodesIds[matmul_id]); + + int constant_id = Subgraph::getInputNodeId(net, node, 1); + auto constant_node = net->getNode(constant_id); + if (constant_node->getType() == "Constant") { + weight_name = node->getInputName(1); + is_weight_const = true; + } + } + + if (!is_weight_const) { + return false; + } + } + + // get input bias from Add + { + bool is_bias_const = false; + int initializer_id = std::max(onnx_net->getInputInitializerId(matchedNodesIds[add_id], 0), + onnx_net->getInputInitializerId(matchedNodesIds[add_id], 1)); + if (initializer_id != -1) { + bias_name = onnx_net->getNameOfInitializer(initializer_id); + is_bias_const = true; + } else { // Constant layer + const Ptr node = net->getNode(matchedNodesIds[add_id]); + + int constant_id = Subgraph::getInputNodeId(net, node, 0); + auto constant_node = net->getNode(constant_id); + if (constant_node->getType() == "Constant") { + bias_name = node->getInputName(0); + is_bias_const = true; + } else { + constant_id = Subgraph::getInputNodeId(net, node, 1); + constant_node = net->getNode(constant_id); + if (constant_node->getType() == "Constant") { + bias_name = node->getInputName(1); + is_bias_const = true; + } + } + } + if (!is_bias_const) { + return false; + } + } + + return true; + } + return false; + } + + virtual void finalize(const Ptr& net, + const Ptr& fusedNode, + std::vector >&) CV_OVERRIDE { + opencv_onnx::NodeProto* node = fusedNode.dynamicCast()->node; + // add inputs + node->add_input(weight_name); + node->add_input(bias_name); + } + + private: + int matmul_id, add_id; + std::string weight_name, bias_name; +}; + /* The fusion for the multi-head attention from vision transformer. 
 Abbreviations:
@@ -322,22 +431,21 @@ class AttentionSubGraph : public Subgraph {
     AttentionSubGraph() {
         int input = addNodeToMatch("");
         int transpose = addNodeToMatch("Transpose", input); // transpose does not make any difference to the accuracy here in this subgraph
-        att_matmul = addNodeToMatch("MatMul", transpose, addNodeToMatch(""));
-        att_add = addNodeToMatch("Add", addNodeToMatch(""), att_matmul);
+        att_matmul = addNodeToMatch("MatMul", transpose, addNodeToMatch(""), addNodeToMatch("")); // Add is fused into MatMul via BiasedMatmulSubgraph

         // v_path
-        slice_v = addNodeToMatch("Slice", std::vector{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        slice_v = addNodeToMatch("Slice", std::vector{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
         int reshape_v = addNodeToMatch("Reshape", slice_v, addNodeToMatch(""));
         int transpose_v = addNodeToMatch("Transpose", reshape_v);

         // q_path
-        slice_q = addNodeToMatch("Slice", std::vector{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        slice_q = addNodeToMatch("Slice", std::vector{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
         reshape_q = addNodeToMatch("Reshape", slice_q, addNodeToMatch(""));
         int transpose_q = addNodeToMatch("Transpose", reshape_q);
         div_q = addNodeToMatch("Div", transpose_q, addNodeToMatch(""));

         // k_path
-        slice_k = addNodeToMatch("Slice", std::vector{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        slice_k = addNodeToMatch("Slice", std::vector{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
         int reshape_k = addNodeToMatch("Reshape", slice_k, addNodeToMatch(""));
         int transpose_k = addNodeToMatch("Transpose", reshape_k);

@@ -380,7 +488,7 @@
         // get names
         weight_name = getInputName(net, matchedNodesIds[att_matmul], 1);
-        bias_name = getInputName(net, matchedNodesIds[att_add], 0);
+        bias_name = getInputName(net, matchedNodesIds[att_matmul], 2);
         return true;
     }
     return false;
@@ -414,7 +522,7 @@
     }

 private:
-    int att_matmul, att_add;
+    int att_matmul;
     int slice_q, slice_k, slice_v;
     int reshape_q, div_q, last_reshape;

@@ -436,20 +544,19 @@ class AttentionSingleHeadSubGraph : public Subgraph {
     AttentionSingleHeadSubGraph() {
         int input = addNodeToMatch("");
         int transpose = addNodeToMatch("Transpose", input); // transpose does not make any difference to the accuracy here in this subgraph
-        att_matmul = addNodeToMatch("MatMul", transpose, addNodeToMatch(""));
-        att_add = addNodeToMatch("Add", addNodeToMatch(""), att_matmul);
+        att_matmul = addNodeToMatch("MatMul", transpose, addNodeToMatch(""), addNodeToMatch("")); // Add is fused into MatMul via BiasedMatmulSubgraph

         // v_path
-        slice_v = addNodeToMatch("Slice", std::vector{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        slice_v = addNodeToMatch("Slice", std::vector{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
         int transpose_v = addNodeToMatch("Transpose", slice_v);

         // q_path
-        slice_q = addNodeToMatch("Slice", std::vector{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        slice_q = addNodeToMatch("Slice", std::vector{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
         int transpose_q = addNodeToMatch("Transpose", slice_q);
         div_q = addNodeToMatch("Div", transpose_q, addNodeToMatch(""));

         // k_path
-        slice_k = addNodeToMatch("Slice", std::vector{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        slice_k = addNodeToMatch("Slice", std::vector{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
         int transpose_k = addNodeToMatch("Transpose", slice_k);

         // qk
@@ -491,7 +598,7 @@
         // get names
         weight_name = getInputName(net, matchedNodesIds[att_matmul], 1);
-        bias_name = getInputName(net, matchedNodesIds[att_add], 0);
+        bias_name = getInputName(net, matchedNodesIds[att_matmul], 2);
         return true;
     }
     return false;
@@ -525,7 +632,7 @@
     }

 protected:
-    int att_matmul, att_add;
+    int att_matmul;
     int slice_q, slice_k, slice_v;
     int div_q, last_reshape;

@@ -1558,6 +1665,7 @@ public:
 void simplifySubgraphs(opencv_onnx::GraphProto& net)
 {
     std::vector > subgraphs;
+    subgraphs.push_back(makePtr());
     subgraphs.push_back(makePtr(3));
     subgraphs.push_back(makePtr(4));
     subgraphs.push_back(makePtr());
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index 997914c4cf..c8fb026d8d 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -1961,7 +1961,8 @@ void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodePr
 void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
 {
     auto node_proto = node_proto_;
-    CV_CheckEQ(node_proto.input_size(), 2, "ONNXImporter/MatMul: two inputs required");
+    CV_CheckGE(node_proto.input_size(), 2, "ONNXImporter/MatMul: two inputs required at least");
+    CV_CheckLE(node_proto.input_size(), 3, "ONNXImporter/MatMul: three inputs required at most");

     for (int i = 0; i < node_proto.input_size(); i++) {
         if (constBlobs.find(node_proto.input(i)) == constBlobs.end()) {
@@ -1970,9 +1971,7 @@
         Mat blob = getBlob(node_proto, i);

-        if (i == 1) {
-            layerParams.blobs.push_back(blob);
-        } else {
+        if (i == 0) {
             LayerParams const_params;
             const_params.name = node_proto.input(i);
             const_params.type = "Const";
@@ -1983,6 +1982,12 @@
             addLayer(const_params, const_node_proto);

             node_proto.set_input(i, const_params.name);
+        } else {
+            layerParams.blobs.push_back(blob);
+        }
+
+        if (i == 2 && constBlobsExtraInfo.find(node_proto.input(2)) != constBlobsExtraInfo.end()) {
+            layerParams.set("real_ndims_C", getBlobExtraInfo(node_proto, 2).real_ndims);
+        }
     }
diff --git a/modules/dnn/test/test_graph_simplifier.cpp b/modules/dnn/test/test_graph_simplifier.cpp
index 91b4e271f5..24da7e65b0 100644
--- a/modules/dnn/test/test_graph_simplifier.cpp
+++ b/modules/dnn/test/test_graph_simplifier.cpp
@@ -143,4 +143,11 @@ TEST_F(Test_Graph_Simplifier, AttentionSubgraph) {
     test("attention_single_head", "Attention");
 }

+TEST_F(Test_Graph_Simplifier, BiasedMatMulSubgraph) {
+    /* Test for 1 subgraph:
+        - BiasedMatmulSubgraph
+    */
+    test("biased_matmul", "MatMul");
+}
+
 }}
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 4d475857e5..4b9229a11e 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -3090,6 +3090,12 @@
TEST_P(Test_ONNX_layers, LayerNormNoFusion) { testONNXModels("layer_norm_no_fusion"); } +TEST_P(Test_ONNX_layers, MatMulAddFusion) { + double l1 = (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) ? 0.0018 : default_l1; + double lInf = (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) ? 0.011 : default_lInf; + testONNXModels("biased_matmul", npy, l1, lInf); +} + INSTANTIATE_TEST_CASE_P(/**/, Test_ONNX_nets, dnnBackendsAndTargets()); }} // namespace
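
Note for reviewers: below is a minimal sketch of the graph pattern this PR targets, useful for reproducing the fusion locally. It is not the opencv_extra generator script; the model, file and tensor names are illustrative only, and it assumes the `onnx`, `numpy` and `cv2` Python packages are installed.

```python
# Illustrative only: build [Input] -> MatMul(W) -> Add(B) -> [Output], the
# pattern that BiasedMatmulSubgraph rewrites into a single MatMul with a
# constant weight and bias. Shapes and names here are hypothetical.
import numpy as np
import onnx
from onnx import TensorProto, helper

W = np.random.rand(16, 8).astype(np.float32)  # constant weight -> MatMul input B
B = np.random.rand(8).astype(np.float32)      # constant bias   -> fused input C

graph = helper.make_graph(
    [
        helper.make_node("MatMul", ["x", "W"], ["mm"]),
        helper.make_node("Add", ["mm", "B"], ["y"]),
    ],
    "biased_matmul",
    [helper.make_tensor_value_info("x", TensorProto.FLOAT, [1, 16])],
    [helper.make_tensor_value_info("y", TensorProto.FLOAT, [1, 8])],
    initializer=[
        helper.make_tensor("W", TensorProto.FLOAT, W.shape, W.flatten().tolist()),
        helper.make_tensor("B", TensorProto.FLOAT, B.shape, B.flatten().tolist()),
    ],
)
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
onnx.save(model, "biased_matmul.onnx")

# Importing through OpenCV runs the graph simplifier, so MatMul + Add should
# arrive as one biased MatMul layer.
import cv2
net = cv2.dnn.readNetFromONNX("biased_matmul.onnx")
net.setInput(np.random.rand(1, 16).astype(np.float32))
out = net.forward()
```

If the fusion applies, the imported network contains a single "MatMul" layer carrying both the weight and the bias blobs, which is what the new `Test_Graph_Simplifier.BiasedMatMulSubgraph` test checks via `test("biased_matmul", "MatMul")`.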