From b758897c2972e1b46601808e7b785e94a1def0bd Mon Sep 17 00:00:00 2001
From: Yuantao Feng
Date: Fri, 29 Mar 2024 22:35:23 +0800
Subject: [PATCH] Merge pull request #25271 from fengyuentau:matmul_bias

Merge with https://github.com/opencv/opencv_extra/pull/1158

Todo:

- [x] Fix Attention pattern recognition.
- [x] Handle other backends.

Benchmark: "VIT_B_32 OCV/CPU" on Apple M1, results in milliseconds.

| Model | 4.x | This PR |
| - | - | - |
| VIT_B_32 OCV/CPU | 87.66 | **83.83** |

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is an accuracy test, a performance test and test data in the opencv_extra repository, if applicable.
      The patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
---
 .../cuda4dnn/primitives/matmul_broadcast.hpp  |  26 ++-
 modules/dnn/src/layers/matmul_layer.cpp       | 202 ++++++++++++++++--
 .../dnn/src/onnx/onnx_graph_simplifier.cpp    | 136 ++++++++++--
 modules/dnn/src/onnx/onnx_importer.cpp        |  13 +-
 modules/dnn/test/test_graph_simplifier.cpp    |   7 +
 modules/dnn/test/test_onnx_importer.cpp       |   6 +
 6 files changed, 347 insertions(+), 43 deletions(-)

diff --git a/modules/dnn/src/cuda4dnn/primitives/matmul_broadcast.hpp b/modules/dnn/src/cuda4dnn/primitives/matmul_broadcast.hpp
index 824d917382..c99a1b5f3a 100644
--- a/modules/dnn/src/cuda4dnn/primitives/matmul_broadcast.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/matmul_broadcast.hpp
@@ -12,6 +12,8 @@
 #include "../csl/tensor.hpp"
 #include "../csl/tensor_ops.hpp"

+#include "../kernels/eltwise_ops.hpp" // for adding bias
+
 #include
 #include

@@ -23,7 +25,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
     public:
         using wrapper_type = GetCUDABackendWrapperType;

-        MatMulBroadcastOp(csl::Stream stream_, csl::cublas::Handle handle, const Mat &B, bool _transA, bool _transB,
+        MatMulBroadcastOp(csl::Stream stream_, csl::cublas::Handle handle, const Mat &B, const Mat &bias, bool _transA, bool _transB,
                           const std::vector &A_offsets_, const std::vector &B_offsets_, std::vector &C_offsets_, size_t batch_)
             : stream(std::move(stream_)), cublasHandle(std::move(handle)), A_offsets(A_offsets_), B_offsets(B_offsets_), C_offsets(C_offsets_), batch(batch_)
@@ -33,6 +35,11 @@
             csl::copyMatToTensor(B, input_B_tensor, stream);
         }

+        if (!bias.empty()) {
+            bias_tensor = csl::makeTensorHeader(bias);
+            csl::copyMatToTensor(bias, bias_tensor, stream);
+        }
+
         transA = _transA;
         transB = _transB;
     }
@@ -42,9 +49,6 @@
                  const std::vector>& outputs,
                  csl::Workspace& workspace) override
         {
-            CV_Assert(((inputs.size() == 2 && input_B_tensor.empty()) ||
-                       (inputs.size() == 1 && !input_B_tensor.empty())) && outputs.size() == 1);
-
             auto input_A_wrapper = inputs[0].dynamicCast();
             auto input_A = input_A_wrapper->getView();

@@ -60,12 +64,26 @@
             auto output = output_wrapper->getSpan();

             csl::tensor_ops::gemmBatched(cublasHandle, batch, 0.f, output, C_offsets, 1.f, transA, input_A, A_offsets, transB, input_B, B_offsets);
+
+            // add bias if it exists
+            if
(!bias_tensor.empty() || inputs.size() >= 3) { + csl::TensorView bias; + if (bias_tensor.empty()) { + auto bias_wrapper = inputs[2].dynamicCast(); + bias = bias_wrapper->getView(); + } else { + bias = csl::TensorView(bias_tensor); + } + + kernels::eltwise_sum_2(stream, output, output, bias); + } } private: csl::Stream stream; csl::cublas::Handle cublasHandle; csl::Tensor input_B_tensor; + csl::Tensor bias_tensor; bool transA, transB; std::vector A_offsets; diff --git a/modules/dnn/src/layers/matmul_layer.cpp b/modules/dnn/src/layers/matmul_layer.cpp index a571592dfb..448af27c18 100644 --- a/modules/dnn/src/layers/matmul_layer.cpp +++ b/modules/dnn/src/layers/matmul_layer.cpp @@ -26,6 +26,10 @@ using namespace cv::dnn::cuda4dnn; namespace cv { namespace dnn { class MatMulLayerImpl CV_FINAL : public MatMulLayer { +#ifdef HAVE_OPENCL + UMat weight_umat, bias_umat; +#endif + public: MatMulLayerImpl(const LayerParams& params) { setParamsFrom(params); @@ -34,6 +38,8 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { trans_b = params.get("transB", false); alpha = params.get("alpha", 1.f); beta = params.get("beta", 1.f); + + real_ndims_C = params.get("real_ndims_C", -1); } virtual bool supportBackend(int backendId) CV_OVERRIDE { @@ -48,8 +54,9 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { const int requiredOutputs, std::vector &outputs, std::vector &internals) const CV_OVERRIDE { - CV_CheckGE(inputs.size(), static_cast(1), "DNN/MatMul: one varible input at least"); - CV_CheckLE(inputs.size(), static_cast(2), "DNN/MatMul: two variable inputs at most"); + int num_inputs = inputs.size() + blobs.size(); + CV_CheckGE(num_inputs, 2, "DNN/MatMul: two inputs at least"); + CV_CheckLE(num_inputs, 3, "DNN/MatMul: three inputs at most"); const auto shape_A = inputs[0], shape_B = blobs.empty() ? inputs[1] : shape(blobs[0]); CV_CheckGE(shape_A.size(), static_cast(2), "DNN/MatMul: invalid shape of input A"); @@ -64,7 +71,7 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { int K_B = trans_b ? nB : mB; CV_CheckEQ(K_A, K_B, "DNN/MatMul: invalid dimension K"); - // Check legal broadcast. It is legal for sure if A and B are 2d, or one of them is 2d. + // Check if inputs are broadcastable. MatShape common_shape; if (shape_A.size() != 2 || shape_B.size() != 2) { const auto &shape_more_dims = shape_A.size() > shape_B.size() ? shape_A : shape_B; @@ -89,6 +96,24 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { common_shape[1] = N; } + // Check if bias is broadcastable + if (num_inputs == 3) { + const auto shape_C = blobs.empty() ? inputs.back() : shape(blobs.back()); + if (real_ndims_C == 1) { // (1) or (N) + CV_Check(shape_C[0], shape_C[0] == 1 || shape_C[0] == N, "DNN/MatMul: invalid dimension of C"); + } else if (real_ndims_C >= 2) { + const auto &shape_large = common_shape.size() > shape_C.size() ? common_shape : shape_C; + const auto &shape_small = common_shape.size() > shape_C.size() ? 
shape_C : common_shape;
+                size_t diff_dims = shape_large.size() - shape_small.size();
+                for (size_t i = 0; i < shape_small.size(); i++) {
+                    const auto dl = shape_small[i], dm = shape_large[i + diff_dims];
+                    if (dl != 1 && dm != 1 && dl != dm) {
+                        CV_Error(Error::StsBadSize, "DNN/MatMul: invalid shape of C");
+                    }
+                }
+            }
+        }
+
         outputs.assign(1, common_shape);
         return false;
     }
@@ -109,6 +134,44 @@
             fastGemmPackB(blobs[0], packed_input_B, trans_b, opt);
             helper.updatePackedBOffsets(packed_input_B.size());
         }
+
+        // broadcast bias if needed
+        if ((inputs.size() + blobs.size()) >= 3 && blobs.size() >= 2) {
+            const auto bias_mat = blobs.back();
+            const auto bias_shape = shape(bias_mat);
+            bool is_broadcast_needed = real_ndims_C == 0 || real_ndims_C == 1 || (total(bias_shape) != total(C_shape) || bias_shape.size() != C_shape.size());
+
+            if (is_broadcast_needed) {
+                broadcast_bias = Mat(C_shape, CV_32F);
+                auto *broadcast_bias_ptr = broadcast_bias.ptr();
+
+                const auto *bias = bias_mat.ptr();
+                if (bias_mat.total() == 1) { // [], [1], [1, ...]
+                    float b = (*bias) * beta;
+                    for (size_t i = 0; i < broadcast_bias.total(); i++) {
+                        broadcast_bias_ptr[i] = b;
+                    }
+                } else if (real_ndims_C == 1) { // [n]
+                    size_t inner_size = C_shape.back(),
+                           loops = total(C_shape) / inner_size;
+                    for (size_t i = 0; i < loops; i++) {
+                        size_t step = i * inner_size;
+                        for (size_t j = 0; j < inner_size; j++) {
+                            broadcast_bias_ptr[step + j] = beta * bias[j];
+                        }
+                    }
+                } else {
+                    broadcast(bias_mat, C_shape, broadcast_bias);
+                }
+            } else {
+                broadcast_bias = blobs.back();
+            }
+        }
+
+#ifdef HAVE_OPENCL
+        weight_umat.release();
+        bias_umat.release();
+#endif
     }

     // works like Y = numpy.matmul(A, B)
@@ -134,7 +197,38 @@
         const auto *a = A.ptr();
         auto *y = Y.ptr();

-        std::memset(y, 0, Y.total() * sizeof(float));
+        // add bias if it exists
+        if ((inputs.size() + blobs.size()) >= 3) {
+            const auto &shape_Y = shape(Y);
+            if (blobs.empty()) { // bias from input
+                const auto &bias_mat = inputs.back();
+                const auto *bias = bias_mat.ptr();
+                if (bias_mat.total() == 1) { // [], [1], [1, ...]
+ float b = (*bias) * beta; + for (size_t i = 0; i < Y.total(); i++) { + y[i] = b; + } + } else if (real_ndims_C == 1) { // [n] + const size_t inner_size = shape_Y.back(), + batches = total(Y) / inner_size; + parallel_for_(Range(0, batches), [&] (const Range &r) { + for (int i = r.start; i < r.end; i++) { + const size_t output_offset = i * inner_size; + for (size_t j = 0; j < inner_size; j++) { + y[output_offset + j] = beta * bias[j]; + } + } + }, double(batches * inner_size * (1 / 1024.0))); + } else { + broadcast(bias_mat, shape_Y, Y); + } + } else { // bias from constant + const auto *bias = broadcast_bias.ptr(); + std::memcpy(y, bias, total(shape_Y) * sizeof(float)); + } + } else { + std::memset(y, 0, Y.total() * sizeof(float)); + } if (blobs.empty()) { const auto &B = inputs[1]; @@ -158,14 +252,36 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { inputs_arr.getUMatVector(inputs); outputs_arr.getUMatVector(outputs); - const auto &input_A = inputs[0]; - UMat input_B; - if (blobs.empty()) { - input_B = inputs[1]; - } else { - blobs[0].copyTo(input_B); + // does not support bias as input + if (inputs.size() >= 3) { + return false; } + + const auto &input_A = inputs[0]; auto &output = outputs[0]; + const auto output_shape = shape(output); + + if (blobs.empty()) { + weight_umat = inputs[1]; + if ((inputs.size() + blobs.size() >= 3)) { + bias_umat = UMat::zeros(output_shape.size(), output_shape.data(), CV_32F); + } + } else { + if (weight_umat.empty()) { + blobs.front().copyTo(weight_umat); + } + if ((inputs.size() + blobs.size() >= 3)) { + if (bias_umat.empty()) { + broadcast_bias.copyTo(bias_umat); + } + } else { + if (bias_umat.empty()) { + bias_umat = UMat::zeros(output_shape.size(), output_shape.data(), CV_32F); + } + } + } + + auto &input_B = weight_umat; int M = static_cast(helper.M), N = static_cast(helper.N), @@ -181,7 +297,7 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { UMat A, B, C, A_fp32, B_fp32, C_fp32; for (int i = 0; i < batch; i++) { A = input_A_2d.row(helper.A_rows[i]).reshape(1, trans_a ? K : M); - B = input_B_2d.row(helper.B_rows[i]).reshape(1, trans_b ? K : N); + B = input_B_2d.row(helper.B_rows[i]).reshape(1, trans_b ? 
N : K); C = output_2d.row(helper.C_rows[i]).reshape(1, M); if (trans_a) { @@ -200,7 +316,6 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { B_fp32 = B; C_fp32 = C; } - cv::gemm(A_fp32, B_fp32, 1.f, noArray(), 0.f, C_fp32); if (use_half) { A_fp32.convertTo(A, CV_16F); @@ -208,6 +323,12 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { C_fp32.convertTo(C, CV_16F); } } + + // add bias + if (!bias_umat.empty()) { + cv::add(output, bias_umat, output); + } + return true; } #endif // HAVE_OPENCL @@ -216,18 +337,28 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { virtual Ptr initNgraph(const std::vector >& inputs, const std::vector >& nodes) CV_OVERRIDE { auto& input_A_node = nodes[0].dynamicCast()->node; - std::shared_ptr matmul; + std::shared_ptr result; + ov::Output bias; - if (nodes.size() == 2) { + if (blobs.empty()) { auto &input_B_node = nodes[1].dynamicCast()->node; - matmul = std::make_shared(input_A_node, input_B_node, trans_a, trans_b); + result = std::make_shared(input_A_node, input_B_node, trans_a, trans_b); + if (nodes.size() >= 3) { + bias = nodes[2].dynamicCast()->node; + result = std::make_shared(result, bias); + } } else { auto input_B_shape = getShape(blobs[0]); auto input_B_node = std::make_shared(ov::element::f32, input_B_shape, blobs[0].data); - matmul = std::make_shared(input_A_node, input_B_node, trans_a, trans_b); + result = std::make_shared(input_A_node, input_B_node, trans_a, trans_b); + if ((nodes.size() + blobs.size()) >= 3) { + const auto bias_shape = shape(broadcast_bias); + bias = std::make_shared(ov::element::f32, std::vector(bias_shape.begin(), bias_shape.end()), broadcast_bias.data); + result = std::make_shared(result, bias); + } } - return Ptr(new InfEngineNgraphNode(matmul)); + return Ptr(new InfEngineNgraphNode(result)); } #endif // HAVE_DNN_NGRAPH @@ -239,7 +370,7 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { const auto input_A_shape = shape(*input_A_wrapper->getMat()); const auto output_shape = shape(*output_wrapper->getMat()); - if (output_shape.size() != 2) { + if ((inputs.size() + blobs.size()) >= 3 || output_shape.size() != 2) { return Ptr(); } @@ -259,11 +390,17 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { const std::vector>& inputs, const std::vector>& outputs) override { auto context = reinterpret_cast(context_); - auto input_B = blobs.empty() ? 
Mat() : blobs[0]; + auto input_B = Mat(), bias = Mat(); + if (!blobs.empty()) { + input_B = blobs.front(); + if (blobs.size() >= 2) { + bias = broadcast_bias; + } + } CV_CheckFalse(helper.empty(), "DNN/MatMul/CUDA: MatMulHelper is not initialized"); - return make_cuda_node(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), input_B, trans_a, trans_b, helper.A_offsets, helper.B_offsets, helper.C_offsets, helper.batch); + return make_cuda_node(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), input_B, bias, trans_a, trans_b, helper.A_offsets, helper.B_offsets, helper.C_offsets, helper.batch); } #endif // HAVE_CUDA @@ -275,7 +412,7 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { auto input_A_desc = input_A_wrapper->getTensorDesc(); auto input_A_node = nodes[0].dynamicCast()->getOp(); - auto op = std::make_shared(name); + auto op = std::make_shared(name); // set attributes op->set_attr_adj_x1(trans_a); @@ -292,11 +429,31 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { auto input_B_node = nodes[1].dynamicCast()->getOp(); op->set_input_x2_by_name(*input_B_node, "y"); op->update_input_desc_x2(*input_B_desc); + if (inputs.size() >= 3) { + auto input_bias_wrapper = inputs[2].dynamicCast(); + auto input_bias_desc = input_bias_wrapper->getTensorDesc(); + auto input_bias_node = nodes[2].dynamicCast()->getOp(); + op->set_input_bias_by_name(*input_bias_node, "y"); + op->update_input_desc_bias(*input_bias_desc); + } } else { // constant input B auto B = blobs[0]; auto const_B_node = std::make_shared(B.data, B.type(), shape(B), cv::format("%s_B", name.c_str())); op->set_input_x2_by_name(*(const_B_node->getOp()), "y"); op->update_input_desc_x2(*(const_B_node->getTensorDesc())); + if ((inputs.size() + blobs.size()) >= 3) { // does not support broadcast bias + auto bias_mat = blobs.back(); + auto bias_shape = shape(bias_mat); + + // reshape if 1d + if (real_ndims_C == 1 && bias_shape.front() != 1) { + bias_shape = std::vector{bias_shape.front()}; + } + + auto const_bias_node = std::make_shared(bias_mat.data, bias_mat.type(), bias_shape, cv::format("%s_bias", name.c_str())); + op->set_input_bias_by_name(*(const_bias_node->getOp()), "y"); + op->update_input_desc_bias(*(const_bias_node->getTensorDesc())); + } } // set outputs @@ -312,7 +469,10 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer { float alpha; float beta; + int real_ndims_C; + std::vector packed_input_B; + Mat broadcast_bias; FastGemmOpt opt; MatMulHelper helper; diff --git a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp index 7b8dd483c7..a87910a4c4 100644 --- a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp +++ b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp @@ -242,6 +242,115 @@ class AdjustSliceAllOptionalInputsSubgraph : public Subgraph { size_t num_inputs_; }; +/* Fusion for biased MatMul. 
+ + Graph before fusion: [Input] -> MatMul -> Add -> [Output] + + Graph after fusion: [Input] -> MatMul -> [Output] + \ + bias +*/ + +class BiasedMatmulSubgraph : public Subgraph { + public: + BiasedMatmulSubgraph() { + int input = addNodeToMatch(""); + matmul_id = addNodeToMatch("MatMul", input, addNodeToMatch("")); + add_id = addNodeToMatch("Add", addNodeToMatch(""), matmul_id); + + setFusedNode("MatMul", input); + } + + virtual bool match(const Ptr& net, int nodeId, + std::vector& matchedNodesIds) CV_OVERRIDE { + if (Subgraph::match(net, nodeId, matchedNodesIds)) { + auto onnx_net = net.dynamicCast(); + + // get input weight from MatMul + { + // make sure that input A is not Constant + if (onnx_net->getInputInitializerId(matchedNodesIds[matmul_id], 0) >= 0) { + return false; + } else { + const Ptr node = net->getNode(matchedNodesIds[matmul_id]); + + int constant_id = Subgraph::getInputNodeId(net, node, 0); + auto constant_node = net->getNode(constant_id); + if (constant_node->getType() == "Constant") { + return false; + } + } + + bool is_weight_const = false; + int initializer_id = onnx_net->getInputInitializerId(matchedNodesIds[matmul_id], 1); + if (initializer_id != -1) { // Initializer + weight_name = onnx_net->getNameOfInitializer(initializer_id); + is_weight_const = true; + } else { // Constant layer + const Ptr node = net->getNode(matchedNodesIds[matmul_id]); + + int constant_id = Subgraph::getInputNodeId(net, node, 1); + auto constant_node = net->getNode(constant_id); + if (constant_node->getType() == "Constant") { + weight_name = node->getInputName(1); + is_weight_const = true; + } + } + + if (!is_weight_const) { + return false; + } + } + + // get input bias from Add + { + bool is_bias_const = false; + int initializer_id = std::max(onnx_net->getInputInitializerId(matchedNodesIds[add_id], 0), + onnx_net->getInputInitializerId(matchedNodesIds[add_id], 1)); + if (initializer_id != -1) { + bias_name = onnx_net->getNameOfInitializer(initializer_id); + is_bias_const = true; + } else { // Constant layer + const Ptr node = net->getNode(matchedNodesIds[add_id]); + + int constant_id = Subgraph::getInputNodeId(net, node, 0); + auto constant_node = net->getNode(constant_id); + if (constant_node->getType() == "Constant") { + bias_name = node->getInputName(0); + is_bias_const = true; + } else { + constant_id = Subgraph::getInputNodeId(net, node, 1); + constant_node = net->getNode(constant_id); + if (constant_node->getType() == "Constant") { + bias_name = node->getInputName(1); + is_bias_const = true; + } + } + } + if (!is_bias_const) { + return false; + } + } + + return true; + } + return false; + } + + virtual void finalize(const Ptr& net, + const Ptr& fusedNode, + std::vector >&) CV_OVERRIDE { + opencv_onnx::NodeProto* node = fusedNode.dynamicCast()->node; + // add inputs + node->add_input(weight_name); + node->add_input(bias_name); + } + + private: + int matmul_id, add_id; + std::string weight_name, bias_name; +}; + /* The fusion for the multi-head attention from vision transformer. 
 Abbreviations:
@@ -322,22 +431,21 @@ class AttentionSubGraph : public Subgraph {
     AttentionSubGraph() {
         int input = addNodeToMatch("");
         int transpose = addNodeToMatch("Transpose", input); // transpose does not make any difference to the accuracy here in this subgraph
-        att_matmul = addNodeToMatch("MatMul", transpose, addNodeToMatch(""));
-        att_add = addNodeToMatch("Add", addNodeToMatch(""), att_matmul);
+        att_matmul = addNodeToMatch("MatMul", transpose, addNodeToMatch(""), addNodeToMatch("")); // Add is fused into MatMul via BiasedMatmulSubgraph

         // v_path
-        slice_v = addNodeToMatch("Slice", std::vector{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        slice_v = addNodeToMatch("Slice", std::vector{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
         int reshape_v = addNodeToMatch("Reshape", slice_v, addNodeToMatch(""));
         int transpose_v = addNodeToMatch("Transpose", reshape_v);

         // q_path
-        slice_q = addNodeToMatch("Slice", std::vector{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        slice_q = addNodeToMatch("Slice", std::vector{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
         reshape_q = addNodeToMatch("Reshape", slice_q, addNodeToMatch(""));
         int transpose_q = addNodeToMatch("Transpose", reshape_q);
         div_q = addNodeToMatch("Div", transpose_q, addNodeToMatch(""));

         // k_path
-        slice_k = addNodeToMatch("Slice", std::vector{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        slice_k = addNodeToMatch("Slice", std::vector{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
         int reshape_k = addNodeToMatch("Reshape", slice_k, addNodeToMatch(""));
         int transpose_k = addNodeToMatch("Transpose", reshape_k);

@@ -380,7 +488,7 @@
         // get names
         weight_name = getInputName(net, matchedNodesIds[att_matmul], 1);
-        bias_name = getInputName(net, matchedNodesIds[att_add], 0);
+        bias_name = getInputName(net, matchedNodesIds[att_matmul], 2);
         return true;
     }
     return false;
@@ -414,7 +522,7 @@
     }

 private:
-    int att_matmul, att_add;
+    int att_matmul;
     int slice_q, slice_k, slice_v;
     int reshape_q, div_q, last_reshape;

@@ -436,20 +544,19 @@ class AttentionSingleHeadSubGraph : public Subgraph {
     AttentionSingleHeadSubGraph() {
         int input = addNodeToMatch("");
         int transpose = addNodeToMatch("Transpose", input); // transpose does not make any difference to the accuracy here in this subgraph
-        att_matmul = addNodeToMatch("MatMul", transpose, addNodeToMatch(""));
-        att_add = addNodeToMatch("Add", addNodeToMatch(""), att_matmul);
+        att_matmul = addNodeToMatch("MatMul", transpose, addNodeToMatch(""), addNodeToMatch("")); // Add is fused into MatMul via BiasedMatmulSubgraph

         // v_path
-        slice_v = addNodeToMatch("Slice", std::vector{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        slice_v = addNodeToMatch("Slice", std::vector{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
         int transpose_v = addNodeToMatch("Transpose", slice_v);

         // q_path
-        slice_q = addNodeToMatch("Slice", std::vector{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        slice_q = addNodeToMatch("Slice", std::vector{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
         int transpose_q = addNodeToMatch("Transpose", slice_q);
         div_q = addNodeToMatch("Div", transpose_q, addNodeToMatch(""));

         // k_path
-        slice_k = addNodeToMatch("Slice", std::vector{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        slice_k = addNodeToMatch("Slice", std::vector{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
         int transpose_k = addNodeToMatch("Transpose", slice_k);

         // qk
@@ -491,7 +598,7 @@
         // get names
         weight_name = getInputName(net, matchedNodesIds[att_matmul], 1);
-        bias_name = getInputName(net, matchedNodesIds[att_add], 0);
+        bias_name = getInputName(net, matchedNodesIds[att_matmul], 2);
         return true;
     }
     return false;
@@ -525,7 +632,7 @@
     }

 protected:
-    int att_matmul, att_add;
+    int att_matmul;
     int slice_q, slice_k, slice_v;
     int div_q, last_reshape;

@@ -1558,6 +1665,7 @@ public:
 void simplifySubgraphs(opencv_onnx::GraphProto& net)
 {
     std::vector > subgraphs;
+    subgraphs.push_back(makePtr());
     subgraphs.push_back(makePtr(3));
     subgraphs.push_back(makePtr(4));
     subgraphs.push_back(makePtr());
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index 997914c4cf..c8fb026d8d 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -1961,7 +1961,8 @@ void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodePr
 void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
 {
     auto node_proto = node_proto_;
-    CV_CheckEQ(node_proto.input_size(), 2, "ONNXImporter/MatMul: two inputs required");
+    CV_CheckGE(node_proto.input_size(), 2, "ONNXImporter/MatMul: two inputs required at least");
+    CV_CheckLE(node_proto.input_size(), 3, "ONNXImporter/MatMul: three inputs required at most");

     for (int i = 0; i < node_proto.input_size(); i++) {
         if (constBlobs.find(node_proto.input(i)) == constBlobs.end()) {
@@ -1970,9 +1971,7 @@
         Mat blob = getBlob(node_proto, i);

-        if (i == 1) {
-            layerParams.blobs.push_back(blob);
-        } else {
+        if (i == 0) {
             LayerParams const_params;
             const_params.name = node_proto.input(i);
             const_params.type = "Const";
@@ -1983,6 +1982,12 @@
             addLayer(const_params, const_node_proto);

             node_proto.set_input(i, const_params.name);
+        } else {
+            layerParams.blobs.push_back(blob);
+        }
+
+        if (i == 2 && constBlobsExtraInfo.find(node_proto.input(2)) != constBlobsExtraInfo.end()) {
+            layerParams.set("real_ndims_C", getBlobExtraInfo(node_proto, 2).real_ndims);
+        }
     }
diff --git a/modules/dnn/test/test_graph_simplifier.cpp b/modules/dnn/test/test_graph_simplifier.cpp
index 91b4e271f5..24da7e65b0 100644
--- a/modules/dnn/test/test_graph_simplifier.cpp
+++ b/modules/dnn/test/test_graph_simplifier.cpp
@@ -143,4 +143,11 @@ TEST_F(Test_Graph_Simplifier, AttentionSubgraph) {
     test("attention_single_head", "Attention");
 }

+TEST_F(Test_Graph_Simplifier, BiasedMatMulSubgraph) {
+    /* Test for 1 subgraph:
+        - BiasedMatmulSubgraph
+    */
+    test("biased_matmul", "MatMul");
+}
+
 }}
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 4d475857e5..4b9229a11e 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -3090,6 +3090,12 @@
TEST_P(Test_ONNX_layers, LayerNormNoFusion) { testONNXModels("layer_norm_no_fusion"); } +TEST_P(Test_ONNX_layers, MatMulAddFusion) { + double l1 = (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) ? 0.0018 : default_l1; + double lInf = (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) ? 0.011 : default_lInf; + testONNXModels("biased_matmul", npy, l1, lInf); +} + INSTANTIATE_TEST_CASE_P(/**/, Test_ONNX_nets, dnnBackendsAndTargets()); }} // namespace
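
Note for reviewers: below is a minimal sketch of the graph pattern this PR targets, useful for reproducing the fusion locally. It is not the opencv_extra generator script; the model, file and tensor names are illustrative only, and it assumes the `onnx`, `numpy` and `cv2` Python packages are installed.

```python
# Illustrative only: build [Input] -> MatMul(W) -> Add(B) -> [Output], the
# pattern that BiasedMatmulSubgraph rewrites into a single MatMul with a
# constant weight and bias. Shapes and names here are hypothetical.
import numpy as np
import onnx
from onnx import TensorProto, helper

W = np.random.rand(16, 8).astype(np.float32)  # constant weight -> MatMul input B
B = np.random.rand(8).astype(np.float32)      # constant bias   -> fused input C

graph = helper.make_graph(
    [
        helper.make_node("MatMul", ["x", "W"], ["mm"]),
        helper.make_node("Add", ["mm", "B"], ["y"]),
    ],
    "biased_matmul",
    [helper.make_tensor_value_info("x", TensorProto.FLOAT, [1, 16])],
    [helper.make_tensor_value_info("y", TensorProto.FLOAT, [1, 8])],
    initializer=[
        helper.make_tensor("W", TensorProto.FLOAT, W.shape, W.flatten().tolist()),
        helper.make_tensor("B", TensorProto.FLOAT, B.shape, B.flatten().tolist()),
    ],
)
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
onnx.save(model, "biased_matmul.onnx")

# Importing through OpenCV runs the graph simplifier, so MatMul + Add should
# arrive as one biased MatMul layer.
import cv2
net = cv2.dnn.readNetFromONNX("biased_matmul.onnx")
net.setInput(np.random.rand(1, 16).astype(np.float32))
out = net.forward()
```

If the fusion applies, the imported network contains a single "MatMul" layer carrying both the weight and the bias blobs, which is what the new `Test_Graph_Simplifier.BiasedMatMulSubgraph` test checks via `test("biased_matmul", "MatMul")`.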