diff --git a/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp b/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
index 4e5adf25d1..a527523a32 100644
--- a/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
+++ b/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
@@ -20,6 +20,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weights,
                            int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
 {
 #if CV_SIMD128
+    const int VEC_NLANES = 4;
     v_float32x4 vminval = v_setall_f32(minval), vmaxval = v_setall_f32(maxval);
 
     v_float32x4 w0 = v_setall_f32(
@@ -110,7 +111,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weights,
     {
         if (dy0 == 3)
         {
-            for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+            for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
             {
                 int xi_ = x0 * stride_x - pad_left;
                 const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -186,7 +187,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weights,
         }
         else
         {
-            for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+            for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
             {
                 int xi_ = x0 * stride_x - pad_left;
                 const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -211,7 +212,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weights,
     }
     else
     {
-        for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+        for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
        {
             int xi_ = x0 * stride_x - pad_left, k = 0;
             const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -314,7 +315,12 @@ void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv,
     int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom;
     int pad_left = conv->pad_left, pad_right = conv->pad_right;
 
-    int ksize = Hk * Wk, padded_ksize = ((ksize + FAST_VEC_NLANES - 1) / FAST_VEC_NLANES) * FAST_VEC_NLANES;
+    int VEC_NLANES = 4;
+#if CV_TRY_AVX2
+    if (conv->useAVX2)
+        VEC_NLANES = 8;
+#endif
+    int ksize = Hk * Wk, padded_ksize = ((ksize + VEC_NLANES - 1) / VEC_NLANES) * VEC_NLANES;
 
     const float *inp = input.ptr<float>();
     float *out = output.ptr<float>();
diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp
index 604e45e628..aa10d40bee 100644
--- a/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp
+++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp
@@ -78,6 +78,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights,
                          int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop,
                          int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
 {
+    const int VEC_NLANES = 8;
     __m256 vminval = _mm256_set1_ps(minval);
     __m256 vmaxval = _mm256_set1_ps(maxval);
 
@@ -174,7 +175,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights,
     {
         if (dy0 == 3)
         {
-            for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+            for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
             {
                 int xi_ = x0 * stride_x - pad_left;
                 const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -250,7 +251,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights,
         }
         else
         {
-            for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+            for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
             {
                 int xi_ = x0 * stride_x - pad_left;
                 const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -276,7 +277,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights,
     }
     else
     {
-        for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+        for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
         {
             int xi_ = x0 * stride_x - pad_left, k = 0;
             const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -701,7 +702,6 @@ void _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep,
         z50 = _mm256_add_ps(vbias, z50);
     }
 
-    // TODO make sure the lenght of bpptr is 8.
     if (bpptr)
     {
         z00 = _mm256_add_ps(z00, _mm256_loadu_ps(bpptr));
diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
index 6e6b6c0ead..8002f396f1 100644
--- a/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
+++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
@@ -49,6 +49,15 @@ Ptr<FastConv2d> initFastConv2d(
                       useWinograd && ((conv->useSIMD128 || conv->useAVX2 || conv->useNEON) && Hk == 3 && Wk == 3 &&
                       dilation_y == 1 && dilation_x == 1 && stride_y == 1 && stride_x == 1) ? _FX_CONV_TYPE_WINOGRAD3X3 : _FX_CONV_TYPE_GENERIC;
+
+    int VEC_NLANES = 4;
+#if CV_TRY_AVX2
+    if (!conv->useAVX2 && conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // convert Winograd to generic conv.
+        conv->conv_type = _FX_CONV_TYPE_GENERIC;
+    if (conv->useAVX2)
+        VEC_NLANES = 8;
+#endif
+
 
     Mat weightsMat = _weightsMat.getMat();
     auto wShape = shape(weightsMat);
     const size_t wstep = weightsMat.step1();
@@ -61,7 +70,7 @@ Ptr<FastConv2d> initFastConv2d(
     int ksize = Hk*Wk;
 
     // this code aims to let memory fit with vector size.
-    int padded_ksize = ((ksize + FAST_VEC_NLANES-1) / FAST_VEC_NLANES) * FAST_VEC_NLANES;
+    int padded_ksize = ((ksize + VEC_NLANES-1) / VEC_NLANES) * VEC_NLANES;
     int nweights = C*padded_ksize;
     conv->weightsBuf.reserve(nweights + VEC_ALIGN);
     conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN);
@@ -265,7 +274,8 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv,
     else if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3 && inputShape[2] >= 12 && inputShape[3] >= 12) // winograd
     {
         CV_Assert(conv->weightsWinoBufPtr);
-        return runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct);
+        if (runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct))
+            return;
     }
 
     int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3];  // [N, C, H, W]
diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp
index 62c4170a3a..01f5edee48 100644
--- a/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp
+++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp
@@ -12,35 +12,25 @@
 #if CV_NEON && CV_NEON_AARCH64  // 32 registers.
 #define CONV_MR 4
 #define CONV_NR 28
-enum { FAST_VEC_NLANES=4 };
 #elif CV_NEON              // 16 registers.
 #define CONV_MR 4
 #define CONV_NR 12
-enum { FAST_VEC_NLANES=4 };
 #else // SIMD 128, AVX or AVX2
 #define CONV_MR 4
 #define CONV_NR 24
-
-#if CV_TRY_AVX2
-enum { FAST_VEC_NLANES=8 }; // AVX2
-#else
-enum { FAST_VEC_NLANES=4 }; // SIMD 128
-#endif
-
-#endif
 #endif
 
+// Winograd Params
 enum {
     _FX_WINO_STEP=6,
     _FX_WINO_KSIZE=3,
     _FX_WINO_SIZE=_FX_WINO_STEP+_FX_WINO_KSIZE-1,
     _FX_WINO_AREA=_FX_WINO_SIZE*_FX_WINO_SIZE,
-#if CV_TRY_AVX2 || (CV_NEON && CV_NEON_AARCH64)
     _FX_WINO_KBLOCK = 4,
+#if (CV_NEON && CV_NEON_AARCH64) || CV_TRY_AVX2
     _FX_WINO_IBLOCK = 6,
 #else
-    _FX_WINO_KBLOCK = 4,
     _FX_WINO_IBLOCK = 3,
 #endif
 
@@ -52,8 +42,8 @@ enum {
     _FX_WINO_NATOMS_F32 = _FX_WINO_AREA / _FX_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16.
 };
 
-
 enum { _FX_CONV_TYPE_GENERIC=0, _FX_CONV_TYPE_DEPTHWISE=1, _FX_CONV_TYPE_WINOGRAD3X3=2 };
+#endif
 
 namespace cv {
 namespace dnn {
@@ -77,8 +67,18 @@ struct FastConv2d
 #else
     bool useSIMD128 = false;
 #endif
+
+#if CV_TRY_AVX2
     bool useAVX2 = checkHardwareSupport(CPU_AVX2);
+#else
+    bool useAVX2 = false;
+#endif
+
+#if CV_NEON
     bool useNEON = checkHardwareSupport(CPU_NEON);
+#else
+    bool useNEON = false;
+#endif
 };
 
 // return a FastConv2d instance.
@@ -99,7 +99,7 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv,
 void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv,
                   float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct);
 
-void runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv, int ntasks,
+int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv, int ntasks,
                   float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct);
 
 } // namespace dnn
diff --git a/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp b/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp
index bc44f73a22..10b55f3604 100644
--- a/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp
+++ b/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp
@@ -13,6 +13,8 @@
 #include "fast_convolution.hpp"
 
 namespace cv { namespace dnn {
+
+#if CV_NEON || CV_SIMD128 || CV_TRY_AVX2
 enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment.
 
 static void
@@ -141,7 +143,7 @@ _fx_winograd_accum_f32(const float* inwptr, const float* wptr,
             vst1q_f32(outbuf + 20*64, s32);
         }
     }
-#elif CV_SIMD
+#elif CV_SIMD128
     CV_Assert(_FX_WINO_IBLOCK == 3 && _FX_WINO_KBLOCK == 4);
     for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++, outbuf += _FX_WINO_ATOM_F32)
     {
@@ -183,15 +185,15 @@ _fx_winograd_accum_f32(const float* inwptr, const float* wptr,
             v_store(outbuf, s00);
             v_store(outbuf + 1*64, s01);
             v_store(outbuf + 2*64, s02);
-            v_store(outbuf + 6*64, s10);
-            v_store(outbuf + 7*64, s11);
-            v_store(outbuf + 8*64, s12);
-            v_store(outbuf + 12*64, s20);
-            v_store(outbuf + 13*64, s21);
-            v_store(outbuf + 14*64, s22);
-            v_store(outbuf + 18*64, s30);
-            v_store(outbuf + 19*64, s31);
-            v_store(outbuf + 20*64, s32);
+            v_store(outbuf + 3*64, s10);
+            v_store(outbuf + 4*64, s11);
+            v_store(outbuf + 5*64, s12);
+            v_store(outbuf + 6*64, s20);
+            v_store(outbuf + 7*64, s21);
+            v_store(outbuf + 8*64, s22);
+            v_store(outbuf + 9*64, s30);
+            v_store(outbuf + 10*64, s31);
+            v_store(outbuf + 11*64, s32);
         }
 #else
     for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32;
@@ -406,7 +408,7 @@ _fx_winograd_BtXB_8x8_f32(const float* inptr, int inpstep,
     vst1q_f32(outptr + outstep*13, z61);
     vst1q_f32(outptr + outstep*14, z70);
     vst1q_f32(outptr + outstep*15, z71);
-#elif CV_SIMD
+#elif CV_SIMD128
     v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
     v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
     v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
@@ -750,8 +752,7 @@ _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep,
     vst1_f32(outptr + outstep*4 + 4, vget_low_f32(z41));
     vst1q_f32(outptr + outstep*5, z50);
     vst1_f32(outptr + outstep*5 + 4, vget_low_f32(z51));
-//#elif CV_AVX2
-#elif CV_SIMD
+#elif CV_SIMD128
     v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
     v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
     v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
@@ -919,7 +920,7 @@ _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep,
 #endif
 }
 
-void runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv,
+int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv,
                    int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
 {
     Mat input = _input.getMat();
@@ -1138,5 +1139,15 @@ void runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output,
                 }
             }
         }});
+    return 1;
 }
+
+#else
+
+int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv,
+                  int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
+{
+    return 0;
+}
+#endif
 }} // namespace cv::dnn