fixed bug at winograd of SIMD128 and more robust code.

2025-06-11 20:09:23 +08:00 · 2022-10-21 19:14:54 +08:00 · 2022-10-21 19:14:54 +08:00 · cee8c86b6e
commit cee8c86b6e
parent 5d292826b2
5 changed files with 65 additions and 38 deletions
--- a/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
+++ b/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
@ -20,6 +20,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
                           int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
 {
 #if CV_SIMD128
    const int VEC_NLANES = 4;
    v_float32x4 vminval = v_setall_f32(minval), vmaxval = v_setall_f32(maxval);
    v_float32x4 w0 = v_setall_f32(
@ -110,7 +111,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
                {
                    if (dy0 == 3)
                    {
-                        for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+                        for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
                        {
                            int xi_ = x0 * stride_x - pad_left;
                            const float *inptr_xi = inptr + Wi * yi_ + xi_;
@ -186,7 +187,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
                    }
                    else
                    {
-                        for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+                        for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
                        {
                            int xi_ = x0 * stride_x - pad_left;
                            const float *inptr_xi = inptr + Wi * yi_ + xi_;
@ -211,7 +212,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
                }
                else
                {
-                    for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+                    for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
                    {
                        int xi_ = x0 * stride_x - pad_left, k = 0;
                        const float *inptr_xi = inptr + Wi * yi_ + xi_;
@ -314,7 +315,12 @@ void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>&
    int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom;
    int pad_left = conv->pad_left, pad_right = conv->pad_right;
-    int ksize = Hk * Wk, padded_ksize = ((ksize + FAST_VEC_NLANES - 1) / FAST_VEC_NLANES) * FAST_VEC_NLANES;
+    int VEC_NLANES = 4;
 #if CV_TRY_AVX2
    if (conv->useAVX2)
        VEC_NLANES = 8;
 #endif
    int ksize = Hk * Wk, padded_ksize = ((ksize + VEC_NLANES - 1) / VEC_NLANES) * VEC_NLANES;
    const float *inp = input.ptr<float>();
    float *out = output.ptr<float>();
--- a/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp
+++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp
@ -78,6 +78,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights
                    int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop,
                    int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
 {
    const int VEC_NLANES = 8;
    __m256 vminval = _mm256_set1_ps(minval);
    __m256 vmaxval = _mm256_set1_ps(maxval);
@ -174,7 +175,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights
                {
                    if (dy0 == 3)
                    {
-                        for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+                        for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
                        {
                            int xi_ = x0 * stride_x - pad_left;
                            const float *inptr_xi = inptr + Wi * yi_ + xi_;
@ -250,7 +251,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights
                    }
                    else
                    {
-                        for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+                        for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
                        {
                            int xi_ = x0 * stride_x - pad_left;
                            const float *inptr_xi = inptr + Wi * yi_ + xi_;
@ -276,7 +277,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights
                }
                else
                {
-                    for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+                    for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
                    {
                        int xi_ = x0 * stride_x - pad_left, k = 0;
                        const float *inptr_xi = inptr + Wi * yi_ + xi_;
@ -701,7 +702,6 @@ void _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep,
        z50 = _mm256_add_ps(vbias, z50);
    }
    // TODO make sure the lenght of bpptr is 8.
    if (bpptr)
    {
        z00 = _mm256_add_ps(z00, _mm256_loadu_ps(bpptr));
--- a/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
+++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
@ -49,6 +49,15 @@ Ptr<FastConv2d> initFastConv2d(
            useWinograd && ((conv->useSIMD128 || conv->useAVX2 || conv->useNEON) && Hk == 3 && Wk == 3 &&
            dilation_y == 1 && dilation_x == 1 && stride_y == 1 && stride_x == 1) ? _FX_CONV_TYPE_WINOGRAD3X3 :
            _FX_CONV_TYPE_GENERIC;
    int VEC_NLANES = 4;
 #if CV_TRY_AVX2
    if (!conv->useAVX2 && conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // convert Winograd to generic conv.
        conv->conv_type = _FX_CONV_TYPE_GENERIC;
    if (conv->useAVX2)
       VEC_NLANES = 8;
 #endif
    Mat weightsMat = _weightsMat.getMat();
    auto wShape = shape(weightsMat);
    const size_t wstep = weightsMat.step1();
@ -61,7 +70,7 @@ Ptr<FastConv2d> initFastConv2d(
        int ksize = Hk*Wk;
        // this code aims to let memory fit with vector size.
-        int padded_ksize = ((ksize + FAST_VEC_NLANES-1) / FAST_VEC_NLANES) * FAST_VEC_NLANES;
+        int padded_ksize = ((ksize + VEC_NLANES-1) / VEC_NLANES) * VEC_NLANES;
        int nweights = C*padded_ksize;
        conv->weightsBuf.reserve(nweights + VEC_ALIGN);
        conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN);
@ -265,7 +274,8 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr<FastConv2d>
    else if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3 && inputShape[2] >= 12 && inputShape[3] >= 12) // winograd
    {
        CV_Assert(conv->weightsWinoBufPtr);
-        return runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct);
+        if (runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct))
            return;
    }
    int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3];  // [N, C, H, W]
--- a/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp
+++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp
@ -12,35 +12,25 @@
 #if CV_NEON && CV_NEON_AARCH64  // 32 registers.
 #define CONV_MR 4
 #define CONV_NR 28
 enum { FAST_VEC_NLANES=4 };
 #elif CV_NEON              // 16 registers.
 #define CONV_MR 4
 #define CONV_NR 12
 enum { FAST_VEC_NLANES=4 };
 #else // SIMD 128, AVX or AVX2
 #define CONV_MR 4
 #define CONV_NR 24
 #if CV_TRY_AVX2
 enum { FAST_VEC_NLANES=8 }; // AVX2
 #else
 enum { FAST_VEC_NLANES=4 }; // SIMD 128
 #endif
 #endif
 #endif
 // Winograd Params
 enum {
    _FX_WINO_STEP=6,
    _FX_WINO_KSIZE=3,
    _FX_WINO_SIZE=_FX_WINO_STEP+_FX_WINO_KSIZE-1,
    _FX_WINO_AREA=_FX_WINO_SIZE*_FX_WINO_SIZE,
 #if CV_TRY_AVX2 || (CV_NEON && CV_NEON_AARCH64)
    _FX_WINO_KBLOCK = 4,
 #if (CV_NEON && CV_NEON_AARCH64) || CV_TRY_AVX2
    _FX_WINO_IBLOCK = 6,
 #else
    _FX_WINO_KBLOCK = 4,
    _FX_WINO_IBLOCK = 3,
 #endif
@ -52,8 +42,8 @@ enum {
    _FX_WINO_NATOMS_F32 = _FX_WINO_AREA / _FX_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16.
 };
 enum { _FX_CONV_TYPE_GENERIC=0, _FX_CONV_TYPE_DEPTHWISE=1, _FX_CONV_TYPE_WINOGRAD3X3=2 };
 #endif
 namespace cv {
 namespace dnn {
@ -77,8 +67,18 @@ struct FastConv2d
 #else
    bool useSIMD128 = false;
 #endif
 #if CV_TRY_AVX2
    bool useAVX2 = checkHardwareSupport(CPU_AVX2);
 #else
    bool useAVX2 = false;
 #endif
 #if CV_NEON
    bool useNEON = checkHardwareSupport(CPU_NEON);
 #else
    bool useNEON = false;
 #endif
 };
 // return a FastConv2d instance.
@ -99,7 +99,7 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr<FastConv2d>
 void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv, float minval, float maxval,
        ActivationLayer* activ, bool ifMinMaxAct);
-void runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv, int ntasks,
+int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv, int ntasks,
                  float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct);
 } // namespace dnn
--- a/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp
+++ b/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp
@ -13,6 +13,8 @@
 #include "fast_convolution.hpp"
 namespace cv { namespace dnn {
 #if CV_NEON || CV_SIMD128 || CV_TRY_AVX2
 enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment.
 static void
@ -141,7 +143,7 @@ _fx_winograd_accum_f32(const float* inwptr, const float* wptr,
            vst1q_f32(outbuf + 20*64, s32);
        }
    }
-#elif CV_SIMD
+#elif CV_SIMD128
    CV_Assert(_FX_WINO_IBLOCK == 3 && _FX_WINO_KBLOCK == 4);
    for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++,
            outbuf += _FX_WINO_ATOM_F32)
@ -183,15 +185,15 @@ _fx_winograd_accum_f32(const float* inwptr, const float* wptr,
        v_store(outbuf, s00);
        v_store(outbuf + 1*64, s01);
        v_store(outbuf + 2*64, s02);
-        v_store(outbuf + 6*64, s10);
+        v_store(outbuf + 3*64, s10);
-        v_store(outbuf + 7*64, s11);
+        v_store(outbuf + 4*64, s11);
-        v_store(outbuf + 8*64, s12);
+        v_store(outbuf + 5*64, s12);
-        v_store(outbuf + 12*64, s20);
+        v_store(outbuf + 6*64, s20);
-        v_store(outbuf + 13*64, s21);
+        v_store(outbuf + 7*64, s21);
-        v_store(outbuf + 14*64, s22);
+        v_store(outbuf + 8*64, s22);
-        v_store(outbuf + 18*64, s30);
+        v_store(outbuf + 9*64, s30);
-        v_store(outbuf + 19*64, s31);
+        v_store(outbuf + 10*64, s31);
-        v_store(outbuf + 20*64, s32);
+        v_store(outbuf + 11*64, s32);
    }
 #else
    for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32;
@ -406,7 +408,7 @@ _fx_winograd_BtXB_8x8_f32(const float* inptr, int inpstep,
    vst1q_f32(outptr + outstep*13, z61);
    vst1q_f32(outptr + outstep*14, z70);
    vst1q_f32(outptr + outstep*15, z71);
-#elif CV_SIMD
+#elif CV_SIMD128
    v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
    v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
    v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
@ -750,8 +752,7 @@ _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep,
    vst1_f32(outptr + outstep*4 + 4, vget_low_f32(z41));
    vst1q_f32(outptr + outstep*5, z50);
    vst1_f32(outptr + outstep*5 + 4, vget_low_f32(z51));
-//#elif CV_AVX2
+#elif CV_SIMD128
 #elif CV_SIMD
    v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
    v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
    v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
@ -919,7 +920,7 @@ _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep,
 #endif
 }
-void runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv,
+int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv,
                  int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
 {
    Mat input = _input.getMat();
@ -1138,5 +1139,15 @@ void runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outp
            }
        }
    }});
    return 1;
 }
 #else
 int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv,
                  int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
 {
    return 0;
 }
 #endif
 }} // namespace cv::dnn