diff --git a/modules/dnn/perf/perf_einsum.cpp b/modules/dnn/perf/perf_einsum.cpp
index c3706d3153..bad9d956be 100644
--- a/modules/dnn/perf/perf_einsum.cpp
+++ b/modules/dnn/perf/perf_einsum.cpp
@@ -11,19 +11,16 @@ struct EinsumParams {
     int outputSize;
     std::string equation;
     std::vector<std::vector<int>> einsumInpShapes;
-    EinsumParams(std::string equation_, int inputSize_, int outputSize_, std::vector<std::vector<int>> einsumInpShapes_ = std::vector<std::vector<int>>())
+    EinsumParams(std::string equation_, std::vector<std::vector<int>> einsumInpShapes_ = std::vector<std::vector<int>>())
     {
-        inputSize = inputSize_;
-        outputSize = outputSize_;
+        inputSize = einsumInpShapes_.size();
         equation = equation_;
         einsumInpShapes = einsumInpShapes_;
     }
 };
 
 static inline void PrintTo(const EinsumParams& params, ::std::ostream* os) {
-    (*os) << "Eqiation=" << params.equation << ", "
-          << "InputSize=" << params.inputSize << ", "
-          << "OutputSize=" << params.outputSize << ", ";
+    (*os) << "Equation=" << params.equation << " ";
 
     (*os) << "InputShape={";
     for(int i = 0; i < params.einsumInpShapes.size(); i++)
@@ -41,22 +38,22 @@ static inline void PrintTo(const EinsumParams& params, ::std::ostream* os) {
 // test cases
 static const EinsumParams testEinsumConfigs[] = {
     // TODO: Add tests with one input after ellips merge
-    {"ij, jk -> ik", 2, 1, {{2, 3}, {3, 2}}},
-    {"ij, jk -> ik", 2, 1, {{20, 30}, {30, 20}}},
-    {"ij, jk -> ik", 2, 1, {{113, 127}, {127, 113}}},
+    {"ij, jk -> ik", {{2, 3}, {3, 2}}},
+    {"ij, jk -> ik", {{20, 30}, {30, 20}}},
+    {"ij, jk -> ik", {{113, 127}, {127, 113}}},
 
-    {"imkj, injs -> imnks", 2, 1, {{1, 4, 7, 9}, {1, 5, 9, 8}}},
-    {"imkj, injs -> imnks", 2, 1, {{1, 4, 70, 90}, {1, 5, 90, 80}}},
-    {"imkj, injs -> imnks", 2, 1, {{1, 4, 73, 91}, {1, 5, 91, 57}}},
+    {"imkj, injs -> imnks", {{1, 4, 7, 9}, {1, 5, 9, 8}}},
+    {"imkj, injs -> imnks", {{1, 4, 70, 90}, {1, 5, 90, 80}}},
+    {"imkj, injs -> imnks", {{1, 4, 73, 91}, {1, 5, 91, 57}}},
 
-    {"ij -> i", 1, 1, {{30, 40}}},
-    {"ij -> i", 1, 1, {{113, 374}}},
+    {"ij -> i", {{30, 40}}},
+    {"ij -> i", {{113, 374}}},
 
-    {"...ij -> ...i", 1, 1, {{30, 40}}},
-    {"...ij -> ...i", 1, 1, {{113, 374}}},
+    {"...ij -> ...i", {{30, 40}}},
+    {"...ij -> ...i", {{113, 374}}},
 
-    {"...ij, ...jk -> ...ik", 2, 1, {{40, 50}, {50, 80}}},
-    {"...ij, ...jk -> ...ik", 2, 1, {{47, 51}, {51, 83}}},
+    {"...ij, ...jk -> ...ik", {{40, 50}, {50, 80}}},
+    {"...ij, ...jk -> ...ik", {{47, 51}, {51, 83}}},
 };
 
 class Layer_Einsum: public TestBaseWithParam<EinsumParams> {};
@@ -68,7 +65,7 @@ PERF_TEST_P_(Layer_Einsum, einsum) {
     lp.name = "testEinsum";
     lp.set("equation", params.equation);
     lp.set("inputSize", params.inputSize);
-    lp.set("outputSize", params.outputSize);
+    lp.set("outputSize", 1);
 
     CV_CheckFalse(params.einsumInpShapes.empty(), "ERROR no inputs shapes provided");
 
@@ -79,38 +76,27 @@ PERF_TEST_P_(Layer_Einsum, einsum) {
     Net net;
     std::vector<Mat> inputs;
     std::vector<String> input_names;
-    if (params.inputSize == 1){
+    int id = net.addLayer(lp.name, lp.type, lp);
+    for (int i = 0; i < params.inputSize; ++i) {
 
         // create inputs
-        inputs.emplace_back(Mat(params.einsumInpShapes[0].size(), params.einsumInpShapes[0].data(), CV_32FC1));
+        inputs.emplace_back(Mat(params.einsumInpShapes[i].size(), params.einsumInpShapes[i].data(), CV_32FC1));
 
-        int id = net.addLayerToPrev(lp.name, lp.type, lp);
-        net.connect(0, 0, id, 0);
+        // connect each input to the layer
+        net.connect(0, i, id, i);
 
-        input_names.emplace_back("input1");
-
-    } else {
-
-        // create inputs
-        inputs.emplace_back(Mat(params.einsumInpShapes[0].size(), params.einsumInpShapes[0].data(), CV_32FC1));
-        inputs.emplace_back(Mat(params.einsumInpShapes[1].size(), params.einsumInpShapes[1].data(), CV_32FC1));
-
-        int id = net.addLayerToPrev(lp.name, lp.type, lp);
-        net.connect(0, 0, id, 0);
-        net.connect(0, 1, id, 1);
-
-        input_names.emplace_back("input1");
-        input_names.emplace_back("input2");
+        // create input names dynamically, assuming input naming follows a consistent pattern
+        input_names.emplace_back("input" + std::to_string(i + 1));
     }
 
     //warm up
+    std::vector<Mat> outputs;
     net.setInputsNames(input_names);
     for (int i = 0; i < input_names.size(); i++){
         net.setInput(inputs[i], input_names[i]);
     }
-    Mat out = net.forward();
+    net.forward(outputs, "testEinsum");
 
-    std::vector<Mat> outputs;
     TEST_CYCLE()
     {
         net.forward(outputs, "testEinsum");
diff --git a/modules/dnn/src/layers/einsum_layer.cpp b/modules/dnn/src/layers/einsum_layer.cpp
index baf4297c0e..c7f9aaca06 100644
--- a/modules/dnn/src/layers/einsum_layer.cpp
+++ b/modules/dnn/src/layers/einsum_layer.cpp
@@ -6,6 +6,7 @@
 #include <opencv2/dnn/shape_utils.hpp>
 #include "../precomp.hpp"
 #include "layers_common.hpp"
+#include "cpu_kernels/fast_gemm.hpp"
 
 namespace cv
 {
@@ -32,111 +33,6 @@ static bool IsTransposeReshapeForEinsum(const std::vector<size_t>& perm,
     return true;
 }
 
-static Mat batchwiseMatMul(
-    const Mat& input1,
-    const MatShape& input1ShapeOverride,
-    const Mat& input2,
-    const MatShape& input2ShapeOverride)
-{
-    // Sanity checks before the actual MatMul
-    CV_CheckType(input1.type(), input2.type(), "Data types of the inputs must match for MatMul");
-    CV_CheckEQ(input1ShapeOverride.size(), (size_t) 3, "Only 1 batch dimension is allowed for MatMul");
-    CV_CheckEQ(input2ShapeOverride.size(), (size_t) 3, "Only 1 batch dimension is allowed for MatMul");
-    CV_CheckEQ((size_t) input1ShapeOverride[0], (size_t) input2ShapeOverride[0], "Batch dimension should match for MatMul;");
-    CV_CheckEQ((size_t) input1ShapeOverride[2], (size_t) input2ShapeOverride[1], "Incompatible matrix dimensions for matMul");
-
-    size_t batches = input1ShapeOverride[0];
-    size_t M = input1ShapeOverride[1];
-    size_t K = input1ShapeOverride[2];
-    size_t N = input2ShapeOverride[2];
-
-    std::vector<Mat> output;
-    if (batches > 1)
-    {
-        Mat reshapedInput1 = input1;
-        Mat reshapedInput2 = input2;
-
-        // input1 should of size MxK
-        // check if input1 needs reshape, if need reshape
-        if (input1.size[0] != M || input1.size[1] != K)
-        {
-            int shape[] = {static_cast<int>(batches), static_cast<int>(M), static_cast<int>(K)};
-            reshapedInput1 = input1.reshape(1, 3, shape);
-        }
-
-        // input2 should be of size KxN
-        // check if input2 needs reshape, if needs reshape
-        if (input2.size[0] != K || input2.size[1] != N)
-        {
-            int shape[] = {static_cast<int>(batches), static_cast<int>(K), static_cast<int>(N)};
-            reshapedInput2 = input2.reshape(1, 3, shape);
-        }
-
-        for (size_t i=0; i < batches; i++)
-        {
-            std::vector<cv::Range> ranges1 = {cv::Range(i, i+1)};
-            for (int j = 1; j < reshapedInput1.dims; j++)
-                ranges1.emplace_back(cv::Range::all());
-
-            Mat part1 = reshapedInput1(ranges1);
-            int shape[] = {static_cast<int>(M), static_cast<int>(K)};
-            part1 = part1.reshape(1, sizeof(shape)/sizeof(shape[0]), shape);
-
-            std::vector<cv::Range> ranges2 = {cv::Range(i, i+1)};
-            for (int j = 1; j < reshapedInput2.dims; j++)
-                ranges2.emplace_back(cv::Range::all());
-
-            Mat part2 = reshapedInput2(ranges2);
-            int shape2[] = {static_cast<int>(K), static_cast<int>(N)};
-            part2 = part2.reshape(1, sizeof(shape2)/sizeof(shape2[0]), shape2);
-
-            Mat tmp_output;
-            cv::gemm(part1, part2, 1.0, cv::Mat(), 1.0, tmp_output);
-            int newShape[] = {1, static_cast<int>(M), static_cast<int>(N)};
-            tmp_output = tmp_output.reshape(1, sizeof(newShape)/sizeof(newShape[0]), newShape);
-
-            output.emplace_back(tmp_output);
-        }
-
-    } else {
-
-        Mat reshapedInput1 = input1;
-        Mat reshapedInput2 = input2;
-
-        // input1 should of size MxK
-        // check if input1 needs reshape, if need reshape
-        if (input1.dims > 2 || input1.size[0] != M || input1.size[1] != K)
-        {
-            int shape[] = {static_cast<int>(M), static_cast<int>(K)};
-            reshapedInput1 = input1.reshape(1, 2, shape);
-        }
-
-        // input2 should be of size KxN
-        // check if input2 needs reshape, if needs reshape
-        if (input2.dims > 2 || input2.size[0] != K || input2.size[1] != N)
-        {
-            int shape2[] = {static_cast<int>(K), static_cast<int>(N)};
-            reshapedInput2 = input2.reshape(1, 2, shape2);
-        }
-
-        Mat tmp_output;
-        cv::gemm(reshapedInput1, reshapedInput2, 1.0, cv::Mat(), 1.0, tmp_output);
-
-        int newShape[] = {1, static_cast<int>(M), static_cast<int>(N)};
-        tmp_output = tmp_output.reshape(1, sizeof(newShape)/sizeof(newShape[0]), newShape);
-        output.emplace_back(tmp_output);
-
-    }
-
-    int outputDim[] = {static_cast<int>(output.size()), static_cast<int>(M), static_cast<int>(N)};
-    Mat output_buffer = Mat::zeros(3, outputDim, CV_32F);
-
-    for (size_t i = 0; i < output.size(); i++) {
-        Mat output_slice = output_buffer.row(i);
-        output[i].copyTo(output_slice);
-    }
-    return output_buffer;
-};
 
 static Mat Transpose(
     const Mat& input,
@@ -452,6 +348,8 @@ public:
 
     // The number of dimensions that are encompassed by an "ellipsis" - "...".
     size_t numOfEllipsisDims = 0;
+    // Backend for fastgemm
+    FastGemmOpt opt;
 
     void parseEquation(String equation);
     void processEquation(const std::vector<Mat>& inputs);
@@ -469,7 +367,12 @@ public:
         const MatShape& reduceDims,
        bool isFinalPair
     );
-
+    Mat batchwiseMatMul(
+        const Mat& input1,
+        const MatShape& input1ShapeOverride,
+        const Mat& input2,
+        const MatShape& input2ShapeOverride
+    );
 
     // constructor
     LayerEinsumImpl(const LayerParams& params)
@@ -491,6 +394,7 @@ public:
             einsumInpShapes.emplace_back(shape);
         }
 
+        opt.init();
         // Maintains a mapping between input indices and their corresponding subscript labels for each input
         inputSubscriptIndices.reserve(numInputs);
@@ -1389,6 +1293,112 @@ Mat LayerEinsumImpl::pairwiseOperandProcess(
     return output;
 };
 
+Mat LayerEinsumImpl::batchwiseMatMul(
+    const Mat& input1,
+    const MatShape& input1ShapeOverride,
+    const Mat& input2,
+    const MatShape& input2ShapeOverride)
+{
+
+    // Sanity checks before the actual MatMul
+    CV_CheckType(input1.type(), input2.type(), "Data types of the inputs must match for MatMul");
+    CV_CheckEQ(input1ShapeOverride.size(), (size_t) 3, "Only 1 batch dimension is allowed for MatMul");
+    CV_CheckEQ(input2ShapeOverride.size(), (size_t) 3, "Only 1 batch dimension is allowed for MatMul");
+    CV_CheckEQ((size_t) input1ShapeOverride[0], (size_t) input2ShapeOverride[0], "Batch dimension should match for MatMul;");
+    CV_CheckEQ((size_t) input1ShapeOverride[2], (size_t) input2ShapeOverride[1], "Incompatible matrix dimensions for matMul");
+
+    int batches = input1ShapeOverride[0];
+    int M = input1ShapeOverride[1];
+    int K = input1ShapeOverride[2];
+    int N = input2ShapeOverride[2];
+
+    std::vector<Mat> output;
+    if (batches > 1)
+    {
+        Mat reshapedInput1 = input1;
+        Mat reshapedInput2 = input2;
+
+        // input1 should of size MxK
+        // check if input1 needs reshape, if need reshape
+        if (input1.size[0] != M || input1.size[1] != K)
+        {
+            int shape[] = {batches, M, K};
+            reshapedInput1 = input1.reshape(1, 3, shape);
+        }
+
+        // input2 should be of size KxN
+        // check if input2 needs reshape, if needs reshape
+        if (input2.size[0] != K || input2.size[1] != N)
+        {
+            int shape[] = {batches, K, N};
+            reshapedInput2 = input2.reshape(1, 3, shape);
+        }
+
+        for (size_t i=0; i < batches; i++)
+        {
+            std::vector<cv::Range> ranges1 = {cv::Range(i, i+1)};
+            for (int j = 1; j < reshapedInput1.dims; j++)
+                ranges1.emplace_back(cv::Range::all());
+
+            Mat part1 = reshapedInput1(ranges1);
+            int shape[] = {M, K};
+            part1 = part1.reshape(1, sizeof(shape)/sizeof(shape[0]), shape);
+
+            std::vector<cv::Range> ranges2 = {cv::Range(i, i+1)};
+            for (int j = 1; j < reshapedInput2.dims; j++)
+                ranges2.emplace_back(cv::Range::all());
+
+            Mat part2 = reshapedInput2(ranges2);
+            int shape2[] = {K, N};
+            part2 = part2.reshape(1, sizeof(shape2)/sizeof(shape2[0]), shape2);
+
+            Mat tmp_output(M, N, part1.type());
+            fastGemm(false, false, 1.0, part1, part2, 0.0, tmp_output, opt);
+            int newShape[] = {1, M, N};
+            tmp_output = tmp_output.reshape(1, sizeof(newShape)/sizeof(newShape[0]), newShape);
+
+            output.emplace_back(tmp_output);
+        }
+
+    } else {
+
+        Mat reshapedInput1 = input1;
+        Mat reshapedInput2 = input2;
+
+        // input1 should of size MxK
+        // check if input1 needs reshape, if need reshape
+        if (input1.dims > 2 || input1.size[0] != M || input1.size[1] != K)
+        {
+            int shape[] = {M, K};
+            reshapedInput1 = input1.reshape(1, 2, shape);
+        }
+
+        // input2 should be of size KxN
+        // check if input2 needs reshape, if needs reshape
+        if (input2.dims > 2 || input2.size[0] != K || input2.size[1] != N)
+        {
+            int shape2[] = {K, N};
+            reshapedInput2 = input2.reshape(1, 2, shape2);
+        }
+
+        Mat tmp_output(M, N, reshapedInput1.type());
+        fastGemm(false, false, 1.0, reshapedInput1, reshapedInput2, 0.0, tmp_output, opt);
+
+        int newShape[] = {1, M, N};
+        tmp_output = tmp_output.reshape(1, sizeof(newShape)/sizeof(newShape[0]), newShape);
+        output.emplace_back(tmp_output);
+
+    }
+
+    int outputDim[] = {static_cast<int>(output.size()), M, N};
+    Mat output_buffer = Mat::zeros(3, outputDim, CV_32F);
+
+    for (size_t i = 0; i < output.size(); i++) {
+        Mat output_slice = output_buffer.row(i);
+        output[i].copyTo(output_slice);
+    }
+    return output_buffer;
+};
 
 Ptr<EinsumLayer> EinsumLayer::create(const LayerParams& params)
 {
     return makePtr<LayerEinsumImpl>(params);
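
Note (not part of the patch): the batched product that batchwiseMatMul computes can be reproduced with the public OpenCV API. The standalone sketch below is illustrative only; since fast_gemm.hpp is a private cpu_kernels header, cv::gemm stands in for fastGemm, and all names in it (A, B, C, shapeA, etc.) are local to the example rather than taken from the patch.

    // Sketch: (batches x M x K) * (batches x K x N) -> (batches x M x N),
    // the per-batch contraction batchwiseMatMul performs. cv::gemm is used
    // here in place of the internal fastGemm kernel (assumed equivalent math).
    #include <opencv2/core.hpp>
    #include <iostream>

    int main()
    {
        const int batches = 2, M = 3, K = 4, N = 5;
        int shapeA[] = {batches, M, K};
        int shapeB[] = {batches, K, N};
        int shapeC[] = {batches, M, N};
        cv::Mat A(3, shapeA, CV_32F), B(3, shapeB, CV_32F);
        cv::Mat C = cv::Mat::zeros(3, shapeC, CV_32F);
        A.setTo(cv::Scalar::all(1.0f));  // simple fill so the result is easy to verify
        B.setTo(cv::Scalar::all(0.5f));

        for (int i = 0; i < batches; i++)
        {
            // 2D headers over the i-th batch slice of each tensor; no data is copied
            cv::Mat a(M, K, CV_32F, A.ptr<float>(i));
            cv::Mat b(K, N, CV_32F, B.ptr<float>(i));
            cv::Mat c(M, N, CV_32F, C.ptr<float>(i));
            cv::gemm(a, b, 1.0, cv::Mat(), 0.0, c);  // c = a * b
        }

        // every element of C equals K * 1.0f * 0.5f = 2
        std::cout << "C(0,0,0) = " << C.at<float>(0, 0, 0) << std::endl;
        return 0;
    }

The per-slice loop mirrors the batches > 1 branch of the patch; the patch additionally routes each 2D product through fastGemm with a reusable FastGemmOpt backend (opt.init() in the constructor) instead of allocating through cv::gemm.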