Merge pull request #23614 from Abdurrahheem:lstm_layout_attribute
LSTM ONNX Layout Attribute Support #23614

### Explanation

This PR contains the changes necessary to support the `layout` attribute. The attribute is present in [ONNX](https://github.com/onnx/onnx/blob/main/docs/Operators.md#lstm) and in [Torch](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#lstm) (where it is named `batch_first=True`). When `layout = 1`, the input to the LSTM layer is expected to have the batch dimension first, `[batch_size, sequence_length, features]`; with the default `layout = 0` it is `[sequence_length, batch_size, features]` (see the usage sketch below).

### Test Data

Test data and the data generator for this PR are located in [#1063](https://github.com/opencv/opencv_extra/pull/1063).

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable. The patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
This commit is contained in: parent `d2618bfe11`, commit `d2143bcd44`
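As an illustration of the new behaviour, here is a minimal sketch of feeding a batch-first input to a network whose LSTM carries `layout = 1`. The model file name follows the test data naming (`lstm_layout_1`), but the exact path is an assumption:

```cpp
// Minimal sketch (assumed model path): batch-first I/O with an LSTM
// exported with layout = 1.
#include <opencv2/dnn.hpp>

int main() {
    cv::dnn::Net net = cv::dnn::readNetFromONNX("lstm_layout_1.onnx"); // assumed path

    // layout = 1: input is [batch_size, sequence_length, features]
    int sz[3] = {2, 5, 10};
    cv::Mat x(3, sz, CV_32F, cv::Scalar(0.f));

    net.setInput(x);
    cv::Mat y = net.forward();  // per this PR, Y is transposed back to batch-first as well
    return 0;
}
```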
```diff
@@ -113,12 +113,19 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
     MatShape outTailShape;  //shape of single output sample
     MatShape outTsShape;    //shape of N output samples

+    enum layout_t : int {
+        SEQ_BATCH_HID = 0,
+        BATCH_SEQ_HID = 1
+    };
+
     bool useTimestampDim;
     bool produceCellOutput;
     float forgetBias, cellClip;
     bool useCellClip, usePeephole;
     bool reverse;        // If true, go in negative direction along the time axis
     bool bidirectional;  // If true, produces both forward and reversed directions along time axis
+    layout_t layout;     // If layout == BATCH_SEQ_HID, uses batch_size x seq_length x num_hidden for input and output
+                         // else uses seq_length x batch_size x num_hidden

     ActivationFunction f_activation;
     ActivationFunction g_activation;
```
```diff
@@ -198,6 +205,7 @@ public:
                 }
             }
         }
+        layout = (layout_t) params.get<int>("layout", SEQ_BATCH_HID);
         useTimestampDim = params.get<bool>("use_timestamp_dim", true);
         produceCellOutput = params.get<bool>("produce_cell_output", false);
         forgetBias = params.get<float>("forget_bias", 0.0f);
```
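Because the attribute is read with a default of `SEQ_BATCH_HID`, models that never set `layout` keep the previous seq-first behaviour. A hypothetical sketch of how the parameter reaches the layer through `LayerParams`:

```cpp
// Hypothetical sketch: forwarding the `layout` attribute via LayerParams.
// In the PR itself this is done by the ONNX importer (see parseLSTM below).
#include <opencv2/dnn.hpp>

int main() {
    cv::dnn::LayerParams lp;
    lp.type = "LSTM";
    lp.set("hidden_size", 16);
    lp.set("layout", 1);  // request batch-first ([batch, seq, features]) I/O
    // lp would then be passed to Net::addLayer(...) together with the
    // weight blobs the LSTM layer requires (omitted here).
    return 0;
}
```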
```diff
@@ -291,8 +299,13 @@ public:
         if (useTimestampDim)
         {
             CV_Assert(inp0.size() >= 2 && total(inp0, 2) == _numInp);
-            _numSamples = inp0[1];
-            outResShape.push_back(inp0[0]);
+            if (layout == SEQ_BATCH_HID) {
+                _numSamples = inp0[1];
+                outResShape.push_back(inp0[0]);
+            } else {
+                _numSamples = inp0[0];
+                outResShape.push_back(inp0[1]);
+            }
         }
         else
         {
```
```diff
@@ -349,8 +362,13 @@ public:
         if (useTimestampDim)
         {
             CV_Assert(inp0.dims >= 2 && (int)inp0.total(2) == numInp);
-            numTimeStamps = inp0.size[0];
-            numSamples = inp0.size[1];
+            if (layout == SEQ_BATCH_HID){
+                numTimeStamps = inp0.size[0];
+                numSamples = inp0.size[1];
+            }else{
+                numTimeStamps = inp0.size[1];
+                numSamples = inp0.size[0];
+            }
         }
         else
         {
```
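The same seq/batch selection is applied in both places where the layer reads its input geometry, so shape inference and the forward computation stay consistent for either layout.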
```diff
@@ -383,6 +401,21 @@ public:
         outputs_arr.getMatVector(output);
         internals_arr.getMatVector(internals);

+        if (layout == BATCH_SEQ_HID){
+            //swap axis 0 and 1 of input x
+            cv::Mat tmp;
+            // Since the python input is 4-dimensional and the C++ input 3-dimensional,
+            // we need to process each differently
+            if (input[0].dims == 4){
+                CV_Assert(input[0].size[3] == 1);
+                cv::transposeND(input[0], {1, 0, 2, 3}, tmp); //back to seq_len, batch_size, hidden_size format
+            }else{
+                cv::transposeND(input[0], {1, 0, 2}, tmp); //back to seq_len, batch_size, hidden_size format
+            }
+            input[0] = tmp;
+        }
+
         Mat cOut = produceCellOutput ? output[0].clone() : Mat();
         const bool needYcTransform = !originalBlobs.empty(); // if the producer is onnx
         const int numDirs = 1 + static_cast<int>(bidirectional);
```
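The axis swap relies on `cv::transposeND`, which permutes the axes of an n-dimensional `Mat` according to the given order. A standalone illustration with assumed sizes:

```cpp
// Standalone illustration of the axis swap performed above (sizes assumed).
#include <opencv2/core.hpp>

int main() {
    int sz[3] = {2, 5, 10};  // batch-first: [batch, seq, features]
    cv::Mat x(3, sz, CV_32F, cv::Scalar(0.f));

    cv::Mat xSeqFirst;
    cv::transposeND(x, {1, 0, 2}, xSeqFirst);  // -> [seq, batch, features]

    CV_Assert(xSeqFirst.size[0] == 5 && xSeqFirst.size[1] == 2);
    return 0;
}
```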
```diff
@@ -599,7 +632,12 @@ public:
                 cInternal.copyTo(cOutTs.rowRange(curRowRange));
             }
         }
-
+        // transpose to match batch first output
+        if (layout == BATCH_SEQ_HID){
+            cv::Mat tmp;
+            cv::transposeND(output[0], {1, 0, 2}, tmp);
+            output[0] = tmp;
+        }
         if (needYcTransform && produceCellOutput)
         {
             fixCellState(cOut, numDirs);
```
```diff
@@ -618,7 +656,13 @@ public:

         // permute to {0, 2, 1, 3};
         cv::Mat newCellState;
-        cv::transposeND(cOut, {0, 2, 1, 3}, newCellState);
+        // transpose to match batch first output
+        if (layout == BATCH_SEQ_HID){
+            cv::transposeND(cOut, {2, 0, 1, 3}, newCellState);
+        }
+        else{
+            cv::transposeND(cOut, {0, 2, 1, 3}, newCellState);
+        }
         cOut = newCellState;

         if (numDirs == 1)
```
```diff
@@ -1637,8 +1637,16 @@ void ONNXImporter::parseLSTM(LayerParams& layerParams, const opencv_onnx::NodePr
     CV_Assert(shapeIt != outShapes.end());
     const MatShape x_shape = shapeIt->second;

-    const int seq_length = x_shape[0];
-    const int batch_size = x_shape[1];
+    //if layout is 1, change batch and sequence dims
+    const int layout = layerParams.get<int>("layout", 0);
+    int batch_size, seq_length;
+    if (layout == 1){
+        batch_size = x_shape[0];
+        seq_length = x_shape[1];
+    }else{
+        seq_length = x_shape[0];
+        batch_size = x_shape[1];
+    }
     const int input_size = x_shape[2];
     const int hidden_size = layerParams.get<int>("hidden_size");
     const int num_directions = constBlobs[lstm_proto.input(1)].size[0];
```
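For example, given `x_shape = [2, 5, 10]`: with `layout == 1` the importer reads `batch_size = 2` and `seq_length = 5`, while the default layout gives `seq_length = 2` and `batch_size = 5`; `input_size` comes from `x_shape[2]` either way.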
```diff
@@ -1393,6 +1393,20 @@ TEST_P(Test_ONNX_layers, LSTM_init_h0_c0)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA);
     testONNXModels("lstm_init_h0_c0", npy, 0, 0, false, false, 3);
 }
+// epsilon is larger because onnx does not match with torch/opencv exactly
+TEST_P(Test_ONNX_layers, LSTM_layout_seq)
+{
+    if(backend == DNN_BACKEND_CUDA)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA);
+    testONNXModels("lstm_layout_0", npy, 0.005, 0.005, false, false, 3);
+}
+// epsilon is larger because onnx does not match with torch/opencv exactly
+TEST_P(Test_ONNX_layers, LSTM_layout_batch)
+{
+    if(backend == DNN_BACKEND_CUDA)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA);
+    testONNXModels("lstm_layout_1", npy, 0.005, 0.005, false, false, 3);
+}

 TEST_P(Test_ONNX_layers, Pad2d_Unfused)
 {
```
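The new cases can be run in isolation with the usual GoogleTest filter, e.g. `opencv_test_dnn --gtest_filter='*LSTM_layout*'`; they expect the `lstm_layout_0` / `lstm_layout_1` test data from the opencv_extra branch referenced above.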