diff --git a/modules/dnn/perf/perf_recurrent.cpp b/modules/dnn/perf/perf_recurrent.cpp
new file mode 100644
index 0000000000..fe2a51886c
--- /dev/null
+++ b/modules/dnn/perf/perf_recurrent.cpp
@@ -0,0 +1,90 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "perf_precomp.hpp"
+
+namespace opencv_test {
+
+struct LstmParams {
+    // Batch size
+    int nrSamples;
+
+    // Size of the input vector
+    int inputSize;
+
+    // Size of the internal state vector
+    int hiddenSize;
+
+    // Number of timesteps for the LSTM
+    int nrSteps;
+};
+
+static inline void PrintTo(const LstmParams& params, ::std::ostream* os) {
+    (*os) << "BATCH=" << params.nrSamples
+          << ", IN=" << params.inputSize
+          << ", HIDDEN=" << params.hiddenSize
+          << ", TS=" << params.nrSteps;
+}
+
+static const LstmParams testLstmConfigs[] = {
+    {1, 192, 192, 100},
+    {1, 1024, 192, 100},
+    {1, 64, 192, 100},
+    {1, 192, 512, 100},
+    {64, 192, 192, 2},
+    {64, 1024, 192, 2},
+    {64, 64, 192, 2},
+    {64, 192, 512, 2},
+    {128, 192, 192, 2},
+    {128, 1024, 192, 2},
+    {128, 64, 192, 2},
+    {128, 192, 512, 2}
+};
+
+class Layer_LSTM : public TestBaseWithParam<LstmParams> {};
+
+PERF_TEST_P_(Layer_LSTM, lstm) {
+    const LstmParams& params = GetParam();
+    LayerParams lp;
+    lp.type = "LSTM";
+    lp.name = "testLstm";
+    lp.set("produce_cell_output", false);
+    lp.set("use_timestamp_dim", true);
+
+    Mat weightH(params.hiddenSize * 4, params.hiddenSize, CV_32FC1, cv::Scalar(0));
+    Mat weightX(params.hiddenSize * 4, params.inputSize, CV_32FC1, cv::Scalar(0));
+    Mat bias(params.hiddenSize * 4, 1, CV_32FC1, cv::Scalar(0));
+    Mat hInternal(params.nrSteps, params.hiddenSize, CV_32FC1, cv::Scalar(0));
+    Mat cInternal(params.nrSteps, params.hiddenSize, CV_32FC1, cv::Scalar(0));
+    lp.blobs.push_back(weightH);
+    lp.blobs.push_back(weightX);
+    lp.blobs.push_back(bias);
+    lp.blobs.push_back(hInternal);
+    lp.blobs.push_back(cInternal);
+
+    std::vector<int> inputDims;
+    inputDims.push_back(params.nrSamples);
+    inputDims.push_back(params.nrSteps);
+    inputDims.push_back(params.inputSize);
+    Mat input(inputDims.size(), inputDims.data(), CV_32FC1);
+    input = cv::Scalar(0);
+
+    Net net;
+    net.addLayerToPrev(lp.name, lp.type, lp);
+    net.setInput(input);
+
+    // Warm up
+    std::vector<Mat> outputs(2);
+    net.forward(outputs, "testLstm");
+
+    TEST_CYCLE()
+    {
+        net.forward(outputs, "testLstm");
+    }
+    SANITY_CHECK_NOTHING();
+}
+
+INSTANTIATE_TEST_CASE_P(/**/, Layer_LSTM, testing::ValuesIn(testLstmConfigs));
+
+} // namespace
diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp
index e25ca5a68f..5acce939f1 100644
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@@ -222,17 +222,17 @@ public:
 
         #if CV_TRY_AVX512_SKX
             if( useAVX512 )
-                opt_AVX512_SKX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
+                opt_AVX512_SKX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize_aligned);
             else
         #endif
         #if CV_TRY_AVX2
             if( useAVX2 )
-                opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
+                opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize_aligned);
             else
         #endif
         #if CV_TRY_AVX
            if( useAVX )
-                opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
+                opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize_aligned);
            else
        #endif
            {
diff --git a/modules/dnn/src/layers/layers_common.simd.hpp b/modules/dnn/src/layers/layers_common.simd.hpp
index 706695a7b2..accc644676 100644
--- a/modules/dnn/src/layers/layers_common.simd.hpp
+++ b/modules/dnn/src/layers/layers_common.simd.hpp
@@ -550,13 +550,24 @@ void fastDepthwiseConv( const float* wptr,
     _mm256_zeroupper();
 }
 
+// Used to generate the mask used when calculating tails
+static const uint32_t tailMaskArray[15] = {
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL
+};
+
 // dst = vec * weights^t + bias
+// Requires that vecsize is at least 8 or equal to 0 to avoid memory access problems. Does not require alignment.
 void fastGEMM1T( const float* vec, const float* weights,
                  size_t wstep, const float* bias,
                  float* dst, int nvecs, int vecsize )
 {
     int i = 0;
 
+    CV_Assert(vecsize >= 8 || vecsize == 0);
+
+    __m256 tailMask = _mm256_loadu_ps(reinterpret_cast<const float*>(tailMaskArray) + (vecsize % 8));
+
     for( ; i <= nvecs - 8; i += 8 )
     {
         const float* wptr = weights + i*wstep;
@@ -565,18 +576,36 @@ void fastGEMM1T( const float* vec, const float* weights,
             vs4 = _mm256_setzero_ps(), vs5 = _mm256_setzero_ps(),
             vs6 = _mm256_setzero_ps(), vs7 = _mm256_setzero_ps();
 
-        for( int k = 0; k < vecsize; k += 8, wptr += 8 )
+        int k = 0;
+        for( ; k <= vecsize-8; k += 8, wptr += 8 )
         {
-            __m256 v = _mm256_load_ps(vec + k);
+            __m256 v = _mm256_loadu_ps(vec + k);
 
-            vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
-            vs1 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep), v, vs1);
-            vs2 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*2), v, vs2);
-            vs3 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*3), v, vs3);
-            vs4 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*4), v, vs4);
-            vs5 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*5), v, vs5);
-            vs6 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*6), v, vs6);
-            vs7 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*7), v, vs7);
+            vs0 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr), v, vs0);
+            vs1 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep), v, vs1);
+            vs2 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep*2), v, vs2);
+            vs3 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep*3), v, vs3);
+            vs4 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep*4), v, vs4);
+            vs5 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep*5), v, vs5);
+            vs6 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep*6), v, vs6);
+            vs7 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep*7), v, vs7);
+        }
+
+        if (k != vecsize) {
+            // Tail
+            k = vecsize - 8;
+            wptr = weights + i * wstep + k;
+            __m256 v = _mm256_loadu_ps(vec + k);
+            v = _mm256_and_ps(v, tailMask);
+
+            vs0 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr), tailMask), v, vs0);
+            vs1 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep), tailMask), v, vs1);
+            vs2 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep * 2), tailMask), v, vs2);
+            vs3 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep * 3), tailMask), v, vs3);
+            vs4 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep * 4), tailMask), v, vs4);
+            vs5 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep * 5), tailMask), v, vs5);
+            vs6 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep * 6), tailMask), v, vs6);
+            vs7 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep * 7), tailMask), v, vs7);
         }
 
         __m256 s0 = _mm256_hadd_ps(_mm256_hadd_ps(vs0, vs1), _mm256_hadd_ps(vs2, vs3));
@@ -598,10 +627,20 @@ void fastGEMM1T( const float* vec, const float* weights,
         const float* wptr = weights + i*wstep;
         __m256 vs0 = _mm256_setzero_ps();
 
-        for( int k = 0; k < vecsize; k += 8, wptr += 8 )
+        int k = 0;
+        for( ; k <= vecsize-8; k += 8, wptr += 8 )
        {
-            __m256 v = _mm256_load_ps(vec + k);
-            vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
+            __m256 v = _mm256_loadu_ps(vec + k);
+            vs0 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr), v, vs0);
+        }
+
+        if (k != vecsize) {
+            // Tail
+            k = vecsize - 8;
+            wptr = weights + i * wstep + k;
+            __m256 v = _mm256_loadu_ps(vec + k);
+            v = _mm256_and_ps(v, tailMask);
+            vs0 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr), tailMask), v, vs0);
        }
 
        __m256 s0 = _mm256_hadd_ps(_mm256_hadd_ps(vs0, vs0), vs0);
diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp
index 9088c13390..21dafa142d 100644
--- a/modules/dnn/src/layers/recurrent_layers.cpp
+++ b/modules/dnn/src/layers/recurrent_layers.cpp
@@ -46,6 +46,8 @@
 #include
 #include
 
+#include "layers_common.hpp"
+
 namespace cv
 {
 namespace dnn
@@ -118,10 +120,23 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
     ActivationFunction g_activation;
     ActivationFunction h_activation;
 
+#if CV_TRY_AVX
+    bool useAVX;
+#endif
+#if CV_TRY_AVX2
+    bool useAVX2;
+#endif
+
 public:
 
     LSTMLayerImpl(const LayerParams& params)
         : numTimeStamps(0), numSamples(0)
+#if CV_TRY_AVX
+        , useAVX(checkHardwareSupport(CPU_AVX))
+#endif
+#if CV_TRY_AVX2
+        , useAVX2(checkHardwareSupport(CPU_AVX2))
+#endif
     {
         setParamsFrom(params);
 
@@ -343,6 +358,15 @@ public:
         hOutTs = hOutTs.colRange(i * hOutTs.cols / numDirs, (i + 1) * hOutTs.cols / numDirs);
         Mat cOutTs = produceCellOutput ? output[1].reshape(1, numSamplesTotal) : Mat();
 
+#if CV_TRY_AVX2 || CV_TRY_AVX
+        bool canUseAvx = gates.isContinuous() && bias.isContinuous()
+                         && Wx.depth() == CV_32F && gates.depth() == CV_32F
+                         && bias.depth() == CV_32F && Wx.cols >= 8;
+        bool canUseAvx_hInternal = hInternal.isContinuous() && gates.isContinuous() && bias.isContinuous()
+                         && Wh.depth() == CV_32F && hInternal.depth() == CV_32F && gates.depth() == CV_32F
+                         && Wh.cols >= 8;
+#endif
+
         int tsStart, tsEnd, tsInc;
         if (reverse || i == 1) {
             tsStart = numTimeStamps - 1;
@@ -359,9 +383,82 @@
 
            Range curRowRange(ts*numSamples, (ts + 1)*numSamples);
            Mat xCurr = xTs.rowRange(curRowRange);
 
-            gemm(xCurr, Wx, 1, gates, 0, gates, GEMM_2_T); // Wx * x_t
-            gemm(hInternal, Wh, 1, gates, 1, gates, GEMM_2_T); //+Wh * h_{t-1}
-            gemm(dummyOnes, bias, 1, gates, 1, gates); //+b
+#if CV_TRY_AVX2
+            if (useAVX2 && canUseAvx && xCurr.isContinuous())
+            {
+                for (int n = 0; n < xCurr.rows; n++) {
+                    opt_AVX2::fastGEMM1T(
+                        xCurr.ptr<float>(n),
+                        Wx.ptr<float>(),
+                        Wx.step1(),
+                        bias.ptr<float>(),
+                        gates.ptr<float>(n),
+                        Wx.rows,
+                        Wx.cols
+                    );
+                }
+            }
+            else
+#endif
+#if CV_TRY_AVX
+            if (useAVX && canUseAvx && xCurr.isContinuous())
+            {
+                for (int n = 0; n < xCurr.rows; n++) {
+                    opt_AVX::fastGEMM1T(
+                        xCurr.ptr<float>(n),
+                        Wx.ptr<float>(),
+                        Wx.step1(),
+                        bias.ptr<float>(),
+                        gates.ptr<float>(n),
+                        Wx.rows,
+                        Wx.cols
+                    );
+                }
+            }
+            else
+#endif
+            {
+                gemm(xCurr, Wx, 1, gates, 0, gates, GEMM_2_T); // Wx * x_t
+                gemm(dummyOnes, bias, 1, gates, 1, gates); //+b
+            }
+
+#if CV_TRY_AVX2
+            if (useAVX2 && canUseAvx_hInternal)
+            {
+                for (int n = 0; n < hInternal.rows; n++) {
+                    opt_AVX2::fastGEMM1T(
+                        hInternal.ptr<float>(n),
+                        Wh.ptr<float>(),
+                        Wh.step1(),
+                        gates.ptr<float>(n),
+                        gates.ptr<float>(n),
+                        Wh.rows,
+                        Wh.cols
+                    );
+                }
+            }
+            else
+#endif
+#if CV_TRY_AVX
+            if (useAVX && canUseAvx_hInternal)
+            {
+                for (int n = 0; n < hInternal.rows; n++) {
+                    opt_AVX::fastGEMM1T(
+                        hInternal.ptr<float>(n),
+                        Wh.ptr<float>(),
+                        Wh.step1(),
+                        gates.ptr<float>(n),
+                        gates.ptr<float>(n),
+                        Wh.rows,
+                        Wh.cols
+                    );
+                }
+            }
+            else
+#endif
+            {
+                gemm(hInternal, Wh, 1, gates, 1, gates, GEMM_2_T); //+Wh * h_{t-1}
+            }
 
            Mat gateI = gates.colRange(0*numOut, 1*numOut);
            Mat gateF = gates.colRange(1*numOut, 2*numOut);
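Note on the new tail handling in fastGEMM1T (not part of the patch): the main loop consumes whole blocks of 8 floats, and when vecsize is not a multiple of 8 the tail reloads the last 8 elements starting at vecsize - 8 and masks off the leading ones that the main loop already accumulated. tailMaskArray is laid out so that loading 8 entries at offset (vecsize % 8) yields exactly (8 - vecsize % 8) zero words followed by (vecsize % 8) all-ones words. The scalar sketch below models that behaviour; slowGEMM1T is an illustrative name, not an OpenCV function.

#include <cassert>
#include <cstddef>

// Scalar model of fastGEMM1T: dst = vec * weights^T + bias, where row i of
// 'weights' starts at weights + i*wstep and has 'vecsize' valid elements.
static void slowGEMM1T(const float* vec, const float* weights, size_t wstep,
                       const float* bias, float* dst, int nvecs, int vecsize)
{
    assert(vecsize >= 8 || vecsize == 0); // mirrors the CV_Assert added by the patch
    for (int i = 0; i < nvecs; i++)
    {
        const float* wptr = weights + i * wstep;
        float s = bias[i];

        // Main loop: whole blocks of 8, exactly like the vectorized version.
        int k = 0;
        for (; k <= vecsize - 8; k += 8)
            for (int j = 0; j < 8; j++)
                s += vec[k + j] * wptr[k + j];

        // Tail: re-read the last 8 elements and keep only the trailing
        // (vecsize % 8) of them; the leading (8 - vecsize % 8) were already
        // accumulated above, which is exactly what the AVX mask zeroes out.
        if (k != vecsize)
        {
            int r = vecsize % 8;
            for (int j = 8 - r; j < 8; j++)
                s += vec[vecsize - 8 + j] * wptr[vecsize - 8 + j];
        }
        dst[i] = s;
    }
}

Loading the mask from an offset into a static zeros/ones table keeps the tail branch-free apart from the single if, and the unaligned loads are what let the caller drop the previous alignment requirement.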
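The recurrent_layers.cpp change also leans on the fact that fastGEMM1T adds the bias argument exactly once per output element and reads it before writing dst, so the partially accumulated gates row can be passed as both bias and destination. Below is a minimal sketch of how the two per-row calls compose into the original gates = x*Wx^T + b followed by gates += h*Wh^T; it reuses the slowGEMM1T model above, and all sizes and values are made up for illustration.

#include <cstdio>
#include <vector>

int main()
{
    // Hypothetical sizes: 10 inputs, 8 hidden units -> 32 gate outputs (4 * hidden).
    const int inputSize = 10, hiddenSize = 8, gateCount = 4 * hiddenSize;
    std::vector<float> x(inputSize, 0.5f), h(hiddenSize, 0.25f);
    std::vector<float> Wx(gateCount * inputSize, 0.1f);   // row-major, one row per gate
    std::vector<float> Wh(gateCount * hiddenSize, 0.2f);
    std::vector<float> bias(gateCount, 0.01f), gates(gateCount, 0.0f);

    // Pass 1: gates = x * Wx^T + bias (the bias argument is the real bias).
    slowGEMM1T(x.data(), Wx.data(), inputSize, bias.data(), gates.data(),
               gateCount, inputSize);
    // Pass 2: gates = h * Wh^T + gates (the accumulated gates row is fed back
    // through the bias parameter and aliases the destination, which is safe
    // because each element is read before it is written).
    slowGEMM1T(h.data(), Wh.data(), hiddenSize, gates.data(), gates.data(),
               gateCount, hiddenSize);

    std::printf("gate[0] = %f\n", gates[0]); // 10*0.5*0.1 + 8*0.25*0.2 + 0.01 = 0.91
    return 0;
}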