diff --git a/modules/dnn/perf/perf_recurrent.cpp b/modules/dnn/perf/perf_recurrent.cpp
new file mode 100644
index 0000000000..fe2a51886c
--- /dev/null
+++ b/modules/dnn/perf/perf_recurrent.cpp
@@ -0,0 +1,90 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "perf_precomp.hpp"
+
+namespace opencv_test {
+
+struct LstmParams {
+    // Batch size
+    int nrSamples;
+
+    // Size of the input vector
+    int inputSize;
+
+    // Size of the internal state vector
+    int hiddenSize;
+
+    // Number of timesteps for the LSTM
+    int nrSteps;
+};
+
+static inline void PrintTo(const LstmParams& params, ::std::ostream* os) {
+    (*os) << "BATCH=" << params.nrSamples
+          << ", IN=" << params.inputSize
+          << ", HIDDEN=" << params.hiddenSize
+          << ", TS=" << params.nrSteps;
+}
+
+static const LstmParams testLstmConfigs[] = {
+    {1, 192, 192, 100},
+    {1, 1024, 192, 100},
+    {1, 64, 192, 100},
+    {1, 192, 512, 100},
+    {64, 192, 192, 2},
+    {64, 1024, 192, 2},
+    {64, 64, 192, 2},
+    {64, 192, 512, 2},
+    {128, 192, 192, 2},
+    {128, 1024, 192, 2},
+    {128, 64, 192, 2},
+    {128, 192, 512, 2}
+};
+
+class Layer_LSTM : public TestBaseWithParam<LstmParams> {};
+
+PERF_TEST_P_(Layer_LSTM, lstm) {
+    const LstmParams& params = GetParam();
+    LayerParams lp;
+    lp.type = "LSTM";
+    lp.name = "testLstm";
+    lp.set("produce_cell_output", false);
+    lp.set("use_timestamp_dim", true);
+
+    Mat weightH(params.hiddenSize * 4, params.hiddenSize, CV_32FC1, cv::Scalar(0));
+    Mat weightX(params.hiddenSize * 4, params.inputSize, CV_32FC1, cv::Scalar(0));
+    Mat bias(params.hiddenSize * 4, 1, CV_32FC1, cv::Scalar(0));
+    Mat hInternal(params.nrSteps, params.hiddenSize, CV_32FC1, cv::Scalar(0));
+    Mat cInternal(params.nrSteps, params.hiddenSize, CV_32FC1, cv::Scalar(0));
+    lp.blobs.push_back(weightH);
+    lp.blobs.push_back(weightX);
+    lp.blobs.push_back(bias);
+    lp.blobs.push_back(hInternal);
+    lp.blobs.push_back(cInternal);
+
+    std::vector<int> inputDims;
+    inputDims.push_back(params.nrSamples);
+    inputDims.push_back(params.nrSteps);
+    inputDims.push_back(params.inputSize);
+    Mat input(inputDims.size(), inputDims.data(), CV_32FC1);
+    input = cv::Scalar(0);
+
+    Net net;
+    net.addLayerToPrev(lp.name, lp.type, lp);
+    net.setInput(input);
+
+    // Warm up
+    std::vector<Mat> outputs(2);
+    net.forward(outputs, "testLstm");
+
+    TEST_CYCLE()
+    {
+        net.forward(outputs, "testLstm");
+    }
+    SANITY_CHECK_NOTHING();
+}
+
+INSTANTIATE_TEST_CASE_P(/**/, Layer_LSTM, testing::ValuesIn(testLstmConfigs));
+
+} // namespace
diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp
index e25ca5a68f..5acce939f1 100644
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@@ -222,17 +222,17 @@ public:
 
         #if CV_TRY_AVX512_SKX
             if( useAVX512 )
-                opt_AVX512_SKX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
+                opt_AVX512_SKX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize_aligned);
             else
         #endif
         #if CV_TRY_AVX2
             if( useAVX2 )
-                opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
+                opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize_aligned);
             else
         #endif
         #if CV_TRY_AVX
            if( useAVX )
-                opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
+                opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize_aligned);
            else
        #endif
            {
diff --git a/modules/dnn/src/layers/layers_common.simd.hpp b/modules/dnn/src/layers/layers_common.simd.hpp
index 706695a7b2..accc644676 100644
--- a/modules/dnn/src/layers/layers_common.simd.hpp
+++ b/modules/dnn/src/layers/layers_common.simd.hpp
@@ -550,13 +550,24 @@ void fastDepthwiseConv( const float* wptr,
     _mm256_zeroupper();
 }
 
+// Used to generate the mask used when calculating tails
+static const uint32_t tailMaskArray[15] = {
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL
+};
+
 // dst = vec * weights^t + bias
+// Requires that vecsize is at least 8 or equal to 0 to avoid memory access problems. Does not require alignment.
 void fastGEMM1T( const float* vec, const float* weights,
                  size_t wstep, const float* bias,
                  float* dst, int nvecs, int vecsize )
 {
     int i = 0;
 
+    CV_Assert(vecsize >= 8 || vecsize == 0);
+
+    __m256 tailMask = _mm256_loadu_ps(reinterpret_cast<const float*>(tailMaskArray) + (vecsize % 8));
+
     for( ; i <= nvecs - 8; i += 8 )
     {
         const float* wptr = weights + i*wstep;
@@ -565,18 +576,36 @@ void fastGEMM1T( const float* vec, const float* weights,
             vs4 = _mm256_setzero_ps(), vs5 = _mm256_setzero_ps(),
             vs6 = _mm256_setzero_ps(), vs7 = _mm256_setzero_ps();
 
-        for( int k = 0; k < vecsize; k += 8, wptr += 8 )
+        int k = 0;
+        for( ; k <= vecsize-8; k += 8, wptr += 8 )
         {
-            __m256 v = _mm256_load_ps(vec + k);
+            __m256 v = _mm256_loadu_ps(vec + k);
 
-            vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
-            vs1 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep), v, vs1);
-            vs2 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*2), v, vs2);
-            vs3 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*3), v, vs3);
-            vs4 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*4), v, vs4);
-            vs5 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*5), v, vs5);
-            vs6 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*6), v, vs6);
-            vs7 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*7), v, vs7);
+            vs0 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr), v, vs0);
+            vs1 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep), v, vs1);
+            vs2 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep*2), v, vs2);
+            vs3 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep*3), v, vs3);
+            vs4 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep*4), v, vs4);
+            vs5 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep*5), v, vs5);
+            vs6 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep*6), v, vs6);
+            vs7 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep*7), v, vs7);
+        }
+
+        if (k != vecsize) {
+            // Tail
+            k = vecsize - 8;
+            wptr = weights + i * wstep + k;
+            __m256 v = _mm256_loadu_ps(vec + k);
+            v = _mm256_and_ps(v, tailMask);
+
+            vs0 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr), tailMask), v, vs0);
+            vs1 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep), tailMask), v, vs1);
+            vs2 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep * 2), tailMask), v, vs2);
+            vs3 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep * 3), tailMask), v, vs3);
+            vs4 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep * 4), tailMask), v, vs4);
+            vs5 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep * 5), tailMask), v, vs5);
+            vs6 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep * 6), tailMask), v, vs6);
+            vs7 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep * 7), tailMask), v, vs7);
         }
 
         __m256 s0 = _mm256_hadd_ps(_mm256_hadd_ps(vs0, vs1), _mm256_hadd_ps(vs2, vs3));
@@ -598,10 +627,20 @@ void fastGEMM1T( const float* vec, const float* weights,
         const float* wptr = weights + i*wstep;
         __m256 vs0 = _mm256_setzero_ps();
 
-        for( int k = 0; k < vecsize; k += 8, wptr += 8 )
+        int k = 0;
+        for( ; k <= vecsize-8; k += 8, wptr += 8 )
        {
-            __m256 v = _mm256_load_ps(vec + k);
-            vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
+            __m256 v = _mm256_loadu_ps(vec + k);
+            vs0 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr), v, vs0);
+        }
+
+        if (k != vecsize) {
+            // Tail
+            k = vecsize - 8;
+            wptr = weights + i * wstep + k;
+            __m256 v = _mm256_loadu_ps(vec + k);
+            v = _mm256_and_ps(v, tailMask);
+            vs0 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr), tailMask), v, vs0);
        }
 
        __m256 s0 = _mm256_hadd_ps(_mm256_hadd_ps(vs0, vs0), vs0);
diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp
index 9088c13390..21dafa142d 100644
--- a/modules/dnn/src/layers/recurrent_layers.cpp
+++ b/modules/dnn/src/layers/recurrent_layers.cpp
@@ -46,6 +46,8 @@
 #include
 #include
 
+#include "layers_common.hpp"
+
 namespace cv
 {
 namespace dnn
@@ -118,10 +120,23 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
     ActivationFunction g_activation;
     ActivationFunction h_activation;
 
+#if CV_TRY_AVX
+    bool useAVX;
+#endif
+#if CV_TRY_AVX2
+    bool useAVX2;
+#endif
+
 public:
 
     LSTMLayerImpl(const LayerParams& params)
         : numTimeStamps(0), numSamples(0)
+#if CV_TRY_AVX
+        , useAVX(checkHardwareSupport(CPU_AVX))
+#endif
+#if CV_TRY_AVX2
+        , useAVX2(checkHardwareSupport(CPU_AVX2))
+#endif
     {
         setParamsFrom(params);
 
@@ -343,6 +358,15 @@ public:
         hOutTs = hOutTs.colRange(i * hOutTs.cols / numDirs, (i + 1) * hOutTs.cols / numDirs);
         Mat cOutTs = produceCellOutput ? output[1].reshape(1, numSamplesTotal) : Mat();
 
+#if CV_TRY_AVX2 || CV_TRY_AVX
+        bool canUseAvx = gates.isContinuous() && bias.isContinuous()
+                         && Wx.depth() == CV_32F && gates.depth() == CV_32F
+                         && bias.depth() == CV_32F && Wx.cols >= 8;
+        bool canUseAvx_hInternal = hInternal.isContinuous() && gates.isContinuous() && bias.isContinuous()
+                         && Wh.depth() == CV_32F && hInternal.depth() == CV_32F && gates.depth() == CV_32F
+                         && Wh.cols >= 8;
+#endif
+
         int tsStart, tsEnd, tsInc;
         if (reverse || i == 1) {
             tsStart = numTimeStamps - 1;
@@ -359,9 +383,82 @@
 
            Range curRowRange(ts*numSamples, (ts + 1)*numSamples);
            Mat xCurr = xTs.rowRange(curRowRange);
 
-            gemm(xCurr, Wx, 1, gates, 0, gates, GEMM_2_T); // Wx * x_t
-            gemm(hInternal, Wh, 1, gates, 1, gates, GEMM_2_T); //+Wh * h_{t-1}
-            gemm(dummyOnes, bias, 1, gates, 1, gates); //+b
+#if CV_TRY_AVX2
+            if (useAVX2 && canUseAvx && xCurr.isContinuous())
+            {
+                for (int n = 0; n < xCurr.rows; n++) {
+                    opt_AVX2::fastGEMM1T(
+                        xCurr.ptr<float>(n),
+                        Wx.ptr<float>(),
+                        Wx.step1(),
+                        bias.ptr<float>(),
+                        gates.ptr<float>(n),
+                        Wx.rows,
+                        Wx.cols
+                    );
+                }
+            }
+            else
+#endif
+#if CV_TRY_AVX
+            if (useAVX && canUseAvx && xCurr.isContinuous())
+            {
+                for (int n = 0; n < xCurr.rows; n++) {
+                    opt_AVX::fastGEMM1T(
+                        xCurr.ptr<float>(n),
+                        Wx.ptr<float>(),
+                        Wx.step1(),
+                        bias.ptr<float>(),
+                        gates.ptr<float>(n),
+                        Wx.rows,
+                        Wx.cols
+                    );
+                }
+            }
+            else
+#endif
+            {
+                gemm(xCurr, Wx, 1, gates, 0, gates, GEMM_2_T); // Wx * x_t
+                gemm(dummyOnes, bias, 1, gates, 1, gates); //+b
+            }
+
+#if CV_TRY_AVX2
+            if (useAVX2 && canUseAvx_hInternal)
+            {
+                for (int n = 0; n < hInternal.rows; n++) {
+                    opt_AVX2::fastGEMM1T(
+                        hInternal.ptr<float>(n),
+                        Wh.ptr<float>(),
+                        Wh.step1(),
+                        gates.ptr<float>(n),
+                        gates.ptr<float>(n),
+                        Wh.rows,
+                        Wh.cols
+                    );
+                }
+            }
+            else
+#endif
+#if CV_TRY_AVX
+            if (useAVX && canUseAvx_hInternal)
+            {
+                for (int n = 0; n < hInternal.rows; n++) {
+                    opt_AVX::fastGEMM1T(
+                        hInternal.ptr<float>(n),
+                        Wh.ptr<float>(),
+                        Wh.step1(),
+                        gates.ptr<float>(n),
+                        gates.ptr<float>(n),
+                        Wh.rows,
+                        Wh.cols
+                    );
+                }
+            }
+            else
+#endif
+            {
+                gemm(hInternal, Wh, 1, gates, 1, gates, GEMM_2_T); //+Wh * h_{t-1}
+            }
 
            Mat gateI = gates.colRange(0*numOut, 1*numOut);
            Mat gateF = gates.colRange(1*numOut, 2*numOut);
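Note on the new tail handling in fastGEMM1T (not part of the patch): the main loop consumes whole blocks of 8 floats, and when vecsize is not a multiple of 8 the tail reloads the last 8 elements starting at vecsize - 8 and masks off the leading ones that the main loop already accumulated. tailMaskArray is laid out so that loading 8 entries at offset (vecsize % 8) yields exactly (8 - vecsize % 8) zero words followed by (vecsize % 8) all-ones words. The scalar sketch below models that behaviour; slowGEMM1T is an illustrative name, not an OpenCV function.

#include <cassert>
#include <cstddef>

// Scalar model of fastGEMM1T: dst = vec * weights^T + bias, where row i of
// 'weights' starts at weights + i*wstep and has 'vecsize' valid elements.
static void slowGEMM1T(const float* vec, const float* weights, size_t wstep,
                       const float* bias, float* dst, int nvecs, int vecsize)
{
    assert(vecsize >= 8 || vecsize == 0); // mirrors the CV_Assert added by the patch
    for (int i = 0; i < nvecs; i++)
    {
        const float* wptr = weights + i * wstep;
        float s = bias[i];

        // Main loop: whole blocks of 8, exactly like the vectorized version.
        int k = 0;
        for (; k <= vecsize - 8; k += 8)
            for (int j = 0; j < 8; j++)
                s += vec[k + j] * wptr[k + j];

        // Tail: re-read the last 8 elements and keep only the trailing
        // (vecsize % 8) of them; the leading (8 - vecsize % 8) were already
        // accumulated above, which is exactly what the AVX mask zeroes out.
        if (k != vecsize)
        {
            int r = vecsize % 8;
            for (int j = 8 - r; j < 8; j++)
                s += vec[vecsize - 8 + j] * wptr[vecsize - 8 + j];
        }
        dst[i] = s;
    }
}

Loading the mask from an offset into a static zeros/ones table keeps the tail branch-free apart from the single if, and the unaligned loads are what let the caller drop the previous alignment requirement.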
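The recurrent_layers.cpp change also leans on the fact that fastGEMM1T adds the bias argument exactly once per output element and reads it before writing dst, so the partially accumulated gates row can be passed as both bias and destination. Below is a minimal sketch of how the two per-row calls compose into the original gates = x*Wx^T + b followed by gates += h*Wh^T; it reuses the slowGEMM1T model above, and all sizes and values are made up for illustration.

#include <cstdio>
#include <vector>

int main()
{
    // Hypothetical sizes: 10 inputs, 8 hidden units -> 32 gate outputs (4 * hidden).
    const int inputSize = 10, hiddenSize = 8, gateCount = 4 * hiddenSize;
    std::vector<float> x(inputSize, 0.5f), h(hiddenSize, 0.25f);
    std::vector<float> Wx(gateCount * inputSize, 0.1f);   // row-major, one row per gate
    std::vector<float> Wh(gateCount * hiddenSize, 0.2f);
    std::vector<float> bias(gateCount, 0.01f), gates(gateCount, 0.0f);

    // Pass 1: gates = x * Wx^T + bias (the bias argument is the real bias).
    slowGEMM1T(x.data(), Wx.data(), inputSize, bias.data(), gates.data(),
               gateCount, inputSize);
    // Pass 2: gates = h * Wh^T + gates (the accumulated gates row is fed back
    // through the bias parameter and aliases the destination, which is safe
    // because each element is read before it is written).
    slowGEMM1T(h.data(), Wh.data(), hiddenSize, gates.data(), gates.data(),
               gateCount, hiddenSize);

    std::printf("gate[0] = %f\n", gates[0]); // 10*0.5*0.1 + 8*0.25*0.2 + 0.01 = 0.91
    return 0;
}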