diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp
index 8c3b5810bf..dc973816ef 100644
--- a/modules/dnn/src/layers/recurrent_layers.cpp
+++ b/modules/dnn/src/layers/recurrent_layers.cpp
@@ -113,12 +113,19 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
     MatShape outTailShape;  //shape of single output sample
     MatShape outTsShape;    //shape of N output samples
 
+    enum layout_t : int {
+        SEQ_BATCH_HID = 0,
+        BATCH_SEQ_HID = 1
+    };
+
     bool useTimestampDim;
     bool produceCellOutput;
     float forgetBias, cellClip;
     bool useCellClip, usePeephole;
     bool reverse;  // If true, go in negative direction along the time axis
     bool bidirectional;  // If true, produces both forward and reversed directions along time axis
+    layout_t layout;  // If layout == BATCH_SEQ_HID, uses batch_size x seq_length x num_hidden for input and output,
+                      // else uses seq_length x batch_size x num_hidden
 
     ActivationFunction f_activation;
     ActivationFunction g_activation;
@@ -198,6 +205,7 @@ public:
                 }
             }
         }
+        layout = (layout_t) params.get<int>("layout", SEQ_BATCH_HID);
         useTimestampDim = params.get<bool>("use_timestamp_dim", true);
         produceCellOutput = params.get<bool>("produce_cell_output", false);
         forgetBias = params.get<float>("forget_bias", 0.0f);
@@ -291,8 +299,13 @@ public:
         if (useTimestampDim)
         {
             CV_Assert(inp0.size() >= 2 && total(inp0, 2) == _numInp);
-            _numSamples = inp0[1];
-            outResShape.push_back(inp0[0]);
+            if (layout == SEQ_BATCH_HID) {
+                _numSamples = inp0[1];
+                outResShape.push_back(inp0[0]);
+            } else {
+                _numSamples = inp0[0];
+                outResShape.push_back(inp0[1]);
+            }
         }
         else
         {
@@ -349,8 +362,13 @@ public:
         if (useTimestampDim)
         {
             CV_Assert(inp0.dims >= 2 && (int)inp0.total(2) == numInp);
-            numTimeStamps = inp0.size[0];
-            numSamples = inp0.size[1];
+            if (layout == SEQ_BATCH_HID) {
+                numTimeStamps = inp0.size[0];
+                numSamples = inp0.size[1];
+            } else {
+                numTimeStamps = inp0.size[1];
+                numSamples = inp0.size[0];
+            }
         }
         else
         {
@@ -383,6 +401,21 @@ public:
         outputs_arr.getMatVector(output);
         internals_arr.getMatVector(internals);
 
+        if (layout == BATCH_SEQ_HID) {
+            // swap axes 0 and 1 of input x
+            cv::Mat tmp;
+            // Since the Python-exported input can be 4-dimensional while the C++ input is 3-dimensional,
+            // we need to process each case differently
+            if (input[0].dims == 4) {
+                // in the 4-D case the trailing dimension must be 1
+                CV_Assert(input[0].size[3] == 1);
+                cv::transposeND(input[0], {1, 0, 2, 3}, tmp);  // back to the seq-first (SEQ_BATCH_HID) layout
+            } else {
+                cv::transposeND(input[0], {1, 0, 2}, tmp);  // back to the seq-first (SEQ_BATCH_HID) layout
+            }
+            input[0] = tmp;
+        }
+
         Mat cOut = produceCellOutput ? output[0].clone() : Mat();
         const bool needYcTransform = !originalBlobs.empty(); // if the producer is onnx
         const int numDirs = 1 + static_cast<int>(bidirectional);
@@ -599,7 +632,12 @@ public:
                 cInternal.copyTo(cOutTs.rowRange(curRowRange));
             }
         }
-
+        // transpose to match the batch-first output
+        if (layout == BATCH_SEQ_HID) {
+            cv::Mat tmp;
+            cv::transposeND(output[0], {1, 0, 2}, tmp);
+            output[0] = tmp;
+        }
         if (needYcTransform && produceCellOutput)
         {
             fixCellState(cOut, numDirs);
@@ -618,7 +656,13 @@
 
         // permute to {0, 2, 1, 3};
         cv::Mat newCellState;
-        cv::transposeND(cOut, {0, 2, 1, 3}, newCellState);
+        // transpose to match the batch-first output
+        if (layout == BATCH_SEQ_HID) {
+            cv::transposeND(cOut, {2, 0, 1, 3}, newCellState);
+        }
+        else {
+            cv::transposeND(cOut, {0, 2, 1, 3}, newCellState);
+        }
         cOut = newCellState;
 
         if (numDirs == 1)
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index 3c3e8787be..196928b3cd 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -1637,8 +1637,16 @@ void ONNXImporter::parseLSTM(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
     CV_Assert(shapeIt != outShapes.end());
     const MatShape x_shape = shapeIt->second;
 
-    const int seq_length = x_shape[0];
-    const int batch_size = x_shape[1];
+    // if layout is 1 (batch-first), the batch and sequence dims are swapped
+    const int layout = layerParams.get<int>("layout", 0);
+    int batch_size, seq_length;
+    if (layout == 1) {
+        batch_size = x_shape[0];
+        seq_length = x_shape[1];
+    } else {
+        seq_length = x_shape[0];
+        batch_size = x_shape[1];
+    }
     const int input_size = x_shape[2];
     const int hidden_size = layerParams.get<int>("hidden_size");
     const int num_directions = constBlobs[lstm_proto.input(1)].size[0];
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 12a5ad1957..ec98b87dd2 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -1393,6 +1393,20 @@ TEST_P(Test_ONNX_layers, LSTM_init_h0_c0)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA);
     testONNXModels("lstm_init_h0_c0", npy, 0, 0, false, false, 3);
 }
+// epsilon is larger because ONNX does not match torch/OpenCV exactly
+TEST_P(Test_ONNX_layers, LSTM_layout_seq)
+{
+    if (backend == DNN_BACKEND_CUDA)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA);
+    testONNXModels("lstm_layout_0", npy, 0.005, 0.005, false, false, 3);
+}
+// epsilon is larger because ONNX does not match torch/OpenCV exactly
+TEST_P(Test_ONNX_layers, LSTM_layout_batch)
+{
+    if (backend == DNN_BACKEND_CUDA)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA);
+    testONNXModels("lstm_layout_1", npy, 0.005, 0.005, false, false, 3);
+}
 
 TEST_P(Test_ONNX_layers, Pad2d_Unfused)
 {
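
Note (not part of the patch): the essence of the change is a single axis swap around the recurrence. When layout == BATCH_SEQ_HID, the input is transposed from batch-first to the internal seq-first layout before the time loop, and the output is transposed back afterwards. The standalone sketch below illustrates that cv::transposeND axis swap on a 3-D blob; the sizes and variable names are illustrative only, and it assumes OpenCV >= 4.6, where transposeND is available.

#include <opencv2/core.hpp>
#include <iostream>
#include <numeric>

int main()
{
    // Illustrative sizes only.
    const int batch = 2, seq = 5, feat = 3;

    // Batch-first blob: batch_size x seq_length x feature_size.
    int sz[] = {batch, seq, feat};
    cv::Mat batchFirst(3, sz, CV_32F);
    std::iota(batchFirst.ptr<float>(), batchFirst.ptr<float>() + batchFirst.total(), 0.f);

    // Swap axes 0 and 1 -> seq_length x batch_size x feature_size,
    // the layout the LSTM recurrence is computed in.
    cv::Mat seqFirst;
    cv::transposeND(batchFirst, {1, 0, 2}, seqFirst);

    // ... the recurrence would run on the seq-first blob here ...

    // The same permutation is its own inverse, so it also restores batch-first.
    cv::Mat backToBatchFirst;
    cv::transposeND(seqFirst, {1, 0, 2}, backToBatchFirst);

    // Element (b, s, f) of the batch-first blob equals element (s, b, f) of the seq-first one.
    const int b = 1, s = 3, f = 2;
    std::cout << batchFirst.at<float>(b, s, f) << " == "
              << seqFirst.at<float>(s, b, f) << std::endl;
    return 0;
}

In the 4-D case handled by the patch the trailing dimension is required to be 1, so the {1, 0, 2, 3} order performs the same batch/sequence swap.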