Merge pull request #24509 from Abdurrahheem:ash/dev_einsum_fast_gemm

Fast gemm for einsum #24509

## This PR adds performance tests for Einsum Layer with FastGemm.

See below for the results of the performance tests on different inputs.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
2025-06-07 09:25:45 +08:00 · 2023-11-16 17:20:17 +04:00 · 2023-11-16 17:20:17 +04:00 · 8c10545d3c
commit 8c10545d3c
parent 83d70b0f36
2 changed files with 141 additions and 145 deletions
--- a/modules/dnn/perf/perf_einsum.cpp
+++ b/modules/dnn/perf/perf_einsum.cpp
@ -11,19 +11,16 @@ struct EinsumParams {
    int outputSize;
    std::string equation;
    std::vector<MatShape> einsumInpShapes;
-    EinsumParams(std::string equation_, int inputSize_, int outputSize_,  std::vector<MatShape> einsumInpShapes_ = std::vector<MatShape>())
+    EinsumParams(std::string equation_, std::vector<MatShape> einsumInpShapes_ = std::vector<MatShape>())
    {
-        inputSize = inputSize_;
-        outputSize = outputSize_;
+        inputSize = einsumInpShapes_.size();
        equation = equation_;
        einsumInpShapes = einsumInpShapes_;
    }
 };

 static inline void PrintTo(const EinsumParams& params, ::std::ostream* os) {
-     (*os) << "Eqiation=" << params.equation << ", "
-        << "InputSize=" << params.inputSize << ", "
-        << "OutputSize=" << params.outputSize << ", ";
+     (*os) << "Equation=" << params.equation << " ";

        (*os) << "InputShape={";
        for(int i = 0; i < params.einsumInpShapes.size(); i++)
@ -41,22 +38,22 @@ static inline void PrintTo(const EinsumParams& params, ::std::ostream* os) {
 // test cases
 static const EinsumParams testEinsumConfigs[] = {
    // TODO: Add tests with one input after ellips merge
-    {"ij, jk -> ik", 2, 1,  {{2, 3}, {3, 2}}},
-    {"ij, jk -> ik", 2, 1,  {{20, 30}, {30, 20}}},
-    {"ij, jk -> ik", 2, 1,  {{113, 127}, {127, 113}}},
+    {"ij, jk -> ik", {{2, 3}, {3, 2}}},
+    {"ij, jk -> ik", {{20, 30}, {30, 20}}},
+    {"ij, jk -> ik", {{113, 127}, {127, 113}}},

-    {"imkj, injs -> imnks", 2, 1,  {{1, 4, 7, 9}, {1, 5, 9, 8}}},
-    {"imkj, injs -> imnks", 2, 1,  {{1, 4, 70, 90}, {1, 5, 90, 80}}},
-    {"imkj, injs -> imnks", 2, 1,  {{1, 4, 73, 91}, {1, 5, 91, 57}}},
+    {"imkj, injs -> imnks", {{1, 4, 7, 9}, {1, 5, 9, 8}}},
+    {"imkj, injs -> imnks", {{1, 4, 70, 90}, {1, 5, 90, 80}}},
+    {"imkj, injs -> imnks", {{1, 4, 73, 91}, {1, 5, 91, 57}}},

-    {"ij -> i", 1, 1, {{30, 40}}},
-    {"ij -> i", 1, 1, {{113, 374}}},
+    {"ij -> i",  {{30, 40}}},
+    {"ij -> i",  {{113, 374}}},

-    {"...ij -> ...i", 1, 1, {{30, 40}}},
-    {"...ij -> ...i", 1, 1, {{113, 374}}},
+    {"...ij -> ...i", {{30, 40}}},
+    {"...ij -> ...i", {{113, 374}}},

-    {"...ij, ...jk -> ...ik", 2, 1, {{40, 50}, {50, 80}}},
-    {"...ij, ...jk -> ...ik", 2, 1, {{47, 51}, {51, 83}}},
+    {"...ij, ...jk -> ...ik",  {{40, 50}, {50, 80}}},
+    {"...ij, ...jk -> ...ik",  {{47, 51}, {51, 83}}},
 };

 class Layer_Einsum: public TestBaseWithParam<EinsumParams> {};
@ -68,7 +65,7 @@ PERF_TEST_P_(Layer_Einsum, einsum) {
    lp.name = "testEinsum";
    lp.set("equation", params.equation);
    lp.set("inputSize", params.inputSize);
-    lp.set("outputSize", params.outputSize);
+    lp.set("outputSize", 1);

    CV_CheckFalse(params.einsumInpShapes.empty(), "ERROR no inputs shapes provided");

@ -79,38 +76,27 @@ PERF_TEST_P_(Layer_Einsum, einsum) {
    Net net;
    std::vector<Mat> inputs;
    std::vector<std::string> input_names;
-    if (params.inputSize == 1){
+    int id = net.addLayer(lp.name, lp.type, lp);

+    for (int i = 0; i < params.inputSize; ++i) {
        // create inputs
-        inputs.emplace_back(Mat(params.einsumInpShapes[0].size(), params.einsumInpShapes[0].data(), CV_32FC1));
+        inputs.emplace_back(Mat(params.einsumInpShapes[i].size(), params.einsumInpShapes[i].data(), CV_32FC1));

-        int id = net.addLayerToPrev(lp.name, lp.type, lp);
-        net.connect(0, 0, id, 0);
+        // connect each input to the layer
+        net.connect(0, i, id, i);

-        input_names.emplace_back("input1");
-
-    } else {
-
-        // create inputs
-        inputs.emplace_back(Mat(params.einsumInpShapes[0].size(), params.einsumInpShapes[0].data(), CV_32FC1));
-        inputs.emplace_back(Mat(params.einsumInpShapes[1].size(), params.einsumInpShapes[1].data(), CV_32FC1));
-
-        int id = net.addLayerToPrev(lp.name, lp.type, lp);
-        net.connect(0, 0, id, 0);
-        net.connect(0, 1, id, 1);
-
-        input_names.emplace_back("input1");
-        input_names.emplace_back("input2");
+        // create input names dynamically, assuming input naming follows a consistent pattern
+        input_names.emplace_back("input" + std::to_string(i + 1));
    }

    //warm up
+    std::vector<Mat> outputs;
    net.setInputsNames(input_names);
    for (int i = 0; i < input_names.size(); i++){
        net.setInput(inputs[i], input_names[i]);
    }
-    Mat out = net.forward();
+    net.forward(outputs, "testEinsum");

-    std::vector<Mat> outputs;
    TEST_CYCLE()
    {
        net.forward(outputs, "testEinsum");
--- a/modules/dnn/src/layers/einsum_layer.cpp
+++ b/modules/dnn/src/layers/einsum_layer.cpp
@ -6,6 +6,7 @@
 #include <opencv2/dnn/shape_utils.hpp>
 #include "../precomp.hpp"
 #include "layers_common.hpp"
+#include "cpu_kernels/fast_gemm.hpp"

 namespace cv
 {
@ -32,111 +33,6 @@ static bool IsTransposeReshapeForEinsum(const std::vector<size_t>& perm,
    return true;
 }

-static Mat batchwiseMatMul(
-    const Mat& input1,
-    const MatShape& input1ShapeOverride,
-    const Mat& input2,
-    const MatShape& input2ShapeOverride)
-{
-    // Sanity checks before the actual MatMul
-    CV_CheckType(input1.type(), input2.type(), "Data types of the inputs must match for MatMul");
-    CV_CheckEQ(input1ShapeOverride.size(), (size_t) 3, "Only 1 batch dimension is allowed for MatMul");
-    CV_CheckEQ(input2ShapeOverride.size(), (size_t) 3, "Only 1 batch dimension is allowed for MatMul");
-    CV_CheckEQ((size_t) input1ShapeOverride[0], (size_t) input2ShapeOverride[0], "Batch dimension should match for MatMul;");
-    CV_CheckEQ((size_t) input1ShapeOverride[2], (size_t) input2ShapeOverride[1], "Incompatible matrix dimensions for matMul");
-
-    size_t batches = input1ShapeOverride[0];
-    size_t M = input1ShapeOverride[1];
-    size_t K = input1ShapeOverride[2];
-    size_t N = input2ShapeOverride[2];
-
-    std::vector<Mat> output;
-    if (batches > 1)
-    {
-        Mat reshapedInput1 = input1;
-        Mat reshapedInput2 = input2;
-
-        // input1 should of size MxK
-        // check if input1 needs reshape, if need reshape
-        if (input1.size[0] != M || input1.size[1] != K)
-        {
-            int shape[] = {static_cast<int>(batches), static_cast<int>(M), static_cast<int>(K)};
-            reshapedInput1 = input1.reshape(1, 3, shape);
-        }
-
-        // input2 should be of size KxN
-        // check if input2 needs reshape, if needs reshape
-        if (input2.size[0] != K || input2.size[1] != N)
-        {
-            int shape[] = {static_cast<int>(batches), static_cast<int>(K), static_cast<int>(N)};
-            reshapedInput2 = input2.reshape(1, 3, shape);
-        }
-
-        for (size_t i=0; i < batches; i++)
-        {
-            std::vector<Range> ranges1 = {cv::Range(i, i+1)};
-            for (int j = 1; j < reshapedInput1.dims; j++)
-                ranges1.emplace_back(cv::Range::all());
-
-            Mat part1 = reshapedInput1(ranges1);
-            int shape[] = {static_cast<int>(M), static_cast<int>(K)};
-            part1 = part1.reshape(1, sizeof(shape)/sizeof(shape[0]), shape);
-
-            std::vector<Range> ranges2 = {cv::Range(i, i+1)};
-            for (int j = 1; j < reshapedInput2.dims; j++)
-                ranges2.emplace_back(cv::Range::all());
-
-            Mat part2 = reshapedInput2(ranges2);
-            int shape2[] = {static_cast<int>(K), static_cast<int>(N)};
-            part2 = part2.reshape(1, sizeof(shape2)/sizeof(shape2[0]), shape2);
-
-            Mat tmp_output;
-            cv::gemm(part1, part2, 1.0, cv::Mat(), 1.0, tmp_output);
-            int newShape[] = {1, static_cast<int>(M), static_cast<int>(N)};
-            tmp_output = tmp_output.reshape(1, sizeof(newShape)/sizeof(newShape[0]), newShape);
-
-            output.emplace_back(tmp_output);
-        }
-
-    } else {
-
-        Mat reshapedInput1 = input1;
-        Mat reshapedInput2 = input2;
-
-        // input1 should of size MxK
-        // check if input1 needs reshape, if need reshape
-        if (input1.dims > 2 || input1.size[0] != M || input1.size[1] != K)
-        {
-            int shape[] = {static_cast<int>(M), static_cast<int>(K)};
-            reshapedInput1 = input1.reshape(1, 2, shape);
-        }
-
-        // input2 should be of size KxN
-        // check if input2 needs reshape, if needs reshape
-        if (input2.dims > 2 || input2.size[0] != K || input2.size[1] != N)
-        {
-            int shape2[] = {static_cast<int>(K), static_cast<int>(N)};
-            reshapedInput2 = input2.reshape(1, 2, shape2);
-        }
-
-        Mat tmp_output;
-        cv::gemm(reshapedInput1, reshapedInput2, 1.0, cv::Mat(), 1.0, tmp_output);
-
-        int newShape[] = {1, static_cast<int>(M), static_cast<int>(N)};
-        tmp_output = tmp_output.reshape(1, sizeof(newShape)/sizeof(newShape[0]), newShape);
-        output.emplace_back(tmp_output);
-
-    }
-
-    int outputDim[] = {static_cast<int>(output.size()), static_cast<int>(M), static_cast<int>(N)};
-    Mat output_buffer = Mat::zeros(3, outputDim, CV_32F);
-
-    for (size_t i = 0; i < output.size(); i++) {
-        Mat output_slice = output_buffer.row(i);
-        output[i].copyTo(output_slice);
-    }
-    return output_buffer;
-};

 static Mat Transpose(
    const Mat& input,
@ -452,6 +348,8 @@ public:
    // The number of dimensions that are encompassed by an "ellipsis" - "...".
    size_t numOfEllipsisDims = 0;

+    // Backend for fastgemm
+    FastGemmOpt opt;

    void parseEquation(String equation);
    void processEquation(const std::vector<MatShape>& inputs);
@ -469,7 +367,12 @@ public:
        const MatShape& reduceDims,
        bool isFinalPair
    );
-
+    Mat batchwiseMatMul(
+        const Mat& input1,
+        const MatShape& input1ShapeOverride,
+        const Mat& input2,
+        const MatShape& input2ShapeOverride
+    );

    // constructor
    LayerEinsumImpl(const LayerParams& params)
@ -491,6 +394,7 @@ public:
            einsumInpShapes.emplace_back(shape);
        }

+        opt.init();

        // Maintains a mapping between input indices and their corresponding subscript labels for each input
        inputSubscriptIndices.reserve(numInputs);
@ -1389,6 +1293,112 @@ Mat LayerEinsumImpl::pairwiseOperandProcess(
    return output;
 };

+Mat LayerEinsumImpl::batchwiseMatMul(
+    const Mat& input1,
+    const MatShape& input1ShapeOverride,
+    const Mat& input2,
+    const MatShape& input2ShapeOverride)
+{
+    // Multiplies input1 by input2 batch by batch via fastGemm; the shape overrides must be {batches, M, K} and {batches, K, N}; returns a {number of products, M, N} CV_32F Mat.
+    // Sanity checks before the actual MatMul
+    CV_CheckType(input1.type(), input2.type(), "Data types of the inputs must match for MatMul");
+    CV_CheckEQ(input1ShapeOverride.size(), (size_t) 3, "Only 1 batch dimension is allowed for MatMul");
+    CV_CheckEQ(input2ShapeOverride.size(), (size_t) 3, "Only 1 batch dimension is allowed for MatMul");
+    CV_CheckEQ((size_t) input1ShapeOverride[0], (size_t) input2ShapeOverride[0], "Batch dimension should match for MatMul;");
+    CV_CheckEQ((size_t) input1ShapeOverride[2], (size_t) input2ShapeOverride[1], "Incompatible matrix dimensions for matMul");
+
+    int batches = input1ShapeOverride[0];  // batch count, checked above to be equal for both inputs
+    int M = input1ShapeOverride[1];        // rows of each input1 matrix
+    int K = input1ShapeOverride[2];        // shared inner dimension (checked above against input2ShapeOverride[1])
+    int N = input2ShapeOverride[2];        // columns of each input2 matrix
+
+    std::vector<Mat> output;  // collects one {1, M, N} product per multiplication
+    if (batches > 1)
+    {
+        Mat reshapedInput1 = input1;
+        Mat reshapedInput2 = input2;
+
+        // input1 should be of size MxK
+        // if its first two dimensions are not M and K, view it as {batches, M, K}
+        if (input1.size[0] != M || input1.size[1] != K)
+        {
+            int shape[] = {batches, M, K};
+            reshapedInput1 = input1.reshape(1, 3, shape);
+        }
+
+        // input2 should be of size KxN
+        // if its first two dimensions are not K and N, view it as {batches, K, N}
+        if (input2.size[0] != K || input2.size[1] != N)
+        {
+            int shape[] = {batches, K, N};
+            reshapedInput2 = input2.reshape(1, 3, shape);
+        }
+
+        for (size_t i=0; i < batches; i++)  // NOTE(review): size_t index compared against int batches (signed/unsigned mix) - confirm intended
+        {
+            std::vector<Range> ranges1 = {cv::Range(i, i+1)};  // select batch i along the first dimension ...
+            for (int j = 1; j < reshapedInput1.dims; j++)
+                ranges1.emplace_back(cv::Range::all());  // ... and everything along the remaining dimensions
+
+            Mat part1 = reshapedInput1(ranges1);
+            int shape[] = {M, K};
+            part1 = part1.reshape(1, sizeof(shape)/sizeof(shape[0]), shape);  // batch slice viewed as a 2D MxK matrix
+
+            std::vector<Range> ranges2 = {cv::Range(i, i+1)};  // same slicing for input2
+            for (int j = 1; j < reshapedInput2.dims; j++)
+                ranges2.emplace_back(cv::Range::all());
+
+            Mat part2 = reshapedInput2(ranges2);
+            int shape2[] = {K, N};
+            part2 = part2.reshape(1, sizeof(shape2)/sizeof(shape2[0]), shape2);  // batch slice viewed as a 2D KxN matrix
+
+            Mat tmp_output(M, N, part1.type());  // destination is allocated up front with the inputs' type
+            fastGemm(false, false, 1.0, part1, part2, 0.0, tmp_output, opt);  // presumably no transposes, alpha = 1, beta = 0 - confirm against cpu_kernels/fast_gemm.hpp
+            int newShape[] = {1, M, N};
+            tmp_output = tmp_output.reshape(1, sizeof(newShape)/sizeof(newShape[0]), newShape);
+
+            output.emplace_back(tmp_output);
+        }
+
+    } else {
+
+        Mat reshapedInput1 = input1;
+        Mat reshapedInput2 = input2;
+
+        // input1 should be of size MxK
+        // if it has more than two dimensions or is not already MxK, view it as {M, K}
+        if (input1.dims > 2 || input1.size[0] != M || input1.size[1] != K)
+        {
+            int shape[] = {M, K};
+            reshapedInput1 = input1.reshape(1, 2, shape);
+        }
+
+        // input2 should be of size KxN
+        // if it has more than two dimensions or is not already KxN, view it as {K, N}
+        if (input2.dims > 2 || input2.size[0] != K || input2.size[1] != N)
+        {
+            int shape2[] = {K, N};
+            reshapedInput2 = input2.reshape(1, 2, shape2);
+        }
+
+        Mat tmp_output(M, N, reshapedInput1.type());  // destination is allocated up front with the inputs' type
+        fastGemm(false, false, 1.0, reshapedInput1, reshapedInput2, 0.0, tmp_output, opt);  // presumably no transposes, alpha = 1, beta = 0 - confirm against cpu_kernels/fast_gemm.hpp
+
+        int newShape[] = {1, M, N};
+        tmp_output = tmp_output.reshape(1, sizeof(newShape)/sizeof(newShape[0]), newShape);
+        output.emplace_back(tmp_output);
+
+    }
+
+    int outputDim[] = {static_cast<int>(output.size()), M, N};
+    Mat output_buffer = Mat::zeros(3, outputDim, CV_32F);  // NOTE(review): always CV_32F, while the per-product Mats above use the inputs' type - confirm only float inputs reach here
+
+    for (size_t i = 0; i < output.size(); i++) {
+        Mat output_slice = output_buffer.row(i);  // slot i of the result buffer
+        output[i].copyTo(output_slice);  // copy product i into its slot
+    }
+    return output_buffer;
+};
 Ptr<EinsumLayer> EinsumLayer::create(const LayerParams& params)
 {
    return makePtr<LayerEinsumImpl>(params);