Merge pull request #23763 from zihaomu:add_runtime_check

DNN: fix bug for X86 Winograd #23763 Address https://github.com/opencv/opencv/issues/23760 The patch aims to add a runtime check for X86 platform without AVX(2). ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
2024-11-24 19:20:28 +08:00 · 2023-06-09 14:18:12 +08:00 · 2023-06-09 14:18:12 +08:00 · eec8a20c33
commit eec8a20c33
parent 5d913f4d72
3 changed files with 35 additions and 24 deletions
--- a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp
+++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp
@ -31,7 +31,6 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
 void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep,
                          float bias, float minval, float maxval, bool ifMinMaxAct);

-
 int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv>& conv,
                  int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
 {
@ -51,6 +50,23 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
    int pad_left = conv->pad_left;

    int ngroups = conv->ngroups, Cg = C/ngroups, Kg = K/ngroups;
+
+    const int CONV_WINO_KBLOCK = 4;
+#if (CV_NEON && CV_NEON_AARCH64)
+    const int CONV_WINO_IBLOCK = 6;
+#elif  CV_TRY_AVX || CV_TRY_AVX2
+    const int CONV_WINO_IBLOCK = (conv->useAVX || conv->useAVX2) ? 6 : 3;
+#else
+    const int CONV_WINO_IBLOCK = 3;
+#endif
+
+#if CV_TRY_AVX || CV_TRY_AVX2
+    const int CONV_WINO_ATOM_F32 = (conv->useAVX || conv->useAVX2) ? 8 : 4;
+#else
+    const int CONV_WINO_ATOM_F32 = 4;
+#endif
+    const int CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32; // for AVX2, it is 8, otherwise, it's 16.
+
    int Kg_nblocks = (Kg + CONV_WINO_KBLOCK - 1)/CONV_WINO_KBLOCK;
    const size_t inp_planesize = (size_t)Hi*Wi;
    const size_t out_planesize = (size_t)H0*W0;
@ -398,7 +414,7 @@ void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, i
 void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
                          float* outptr, int Cg, const int winoIblock, const int winoAtomF32)
 {
-    CV_Assert(CONV_WINO_IBLOCK == 3 && CONV_WINO_KBLOCK == 4 && CONV_WINO_ATOM_F32 == 4);
+    CV_Assert(winoIblock == 3 && winoAtomF32 == 4);
    v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
    v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
    v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
@ -573,7 +589,6 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
                          float* bpptr, int bpstep, float* outptr, int outstep,
                          float bias, float minval, float maxval, bool ifMinMaxAct)
 {
-    CV_Assert(CONV_WINO_IBLOCK == 3 && CONV_WINO_KBLOCK == 4 && CONV_WINO_ATOM_F32 == 4);
    v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
    v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
    v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
--- a/modules/dnn/src/layers/cpu_kernels/convolution.cpp
+++ b/modules/dnn/src/layers/cpu_kernels/convolution.cpp
@ -181,6 +181,21 @@ Ptr<FastConv> initFastConv(
                {0.0f, 0.0f, 1.0f}
        };

+        const int CONV_WINO_KBLOCK = 4;
+
+#if CV_TRY_AVX || CV_TRY_AVX2
+        const int CONV_WINO_ATOM_F32 = (conv->useAVX || conv->useAVX2) ? 8 : 4;
+#else
+        const int CONV_WINO_ATOM_F32 = 4;
+#endif
+        const int CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32; // for AVX2, it is 8, otherwise, it's 16.
+
+#ifdef CONV_ARM_FP16
+        // FP 16
+        const int CONV_WINO_ATOM_F16 = CONV_WINO_ATOM_F32 * 2;
+        const int CONV_WINO_NATOMS_F16 = CONV_WINO_AREA / CONV_WINO_ATOM_F16;
+#endif
+
        // the weights are packed as 6-dim tensor:
        // ngroups * ceil((K/ngroups)/KBLOCK) * (W*W/ATOM_SIZE) * (C/ngroups) * KBLOCK * ATOM_SIZE,
        // where W is the size of Winograd-transformed kernel (8x8),
--- a/modules/dnn/src/layers/cpu_kernels/convolution.hpp
+++ b/modules/dnn/src/layers/cpu_kernels/convolution.hpp
@ -33,36 +33,17 @@ typedef __fp16 float16_t; // Fix conflict between float16_t in arm_neon.h and fl
 #define CONV_NR_FP32 24
 #endif

-// Winograd Params
 enum {
    CONV_WINO_STEP=6,
    CONV_WINO_KSIZE=3,
    CONV_WINO_SIZE=CONV_WINO_STEP+CONV_WINO_KSIZE - 1, // 8
    CONV_WINO_AREA=CONV_WINO_SIZE*CONV_WINO_SIZE,
-
-    CONV_WINO_KBLOCK = 4,
-#if (CV_NEON && CV_NEON_AARCH64) || CV_TRY_AVX || CV_TRY_AVX2
-    CONV_WINO_IBLOCK = 6,
-#else
-    CONV_WINO_IBLOCK = 3,
-#endif
-
-#if CV_TRY_AVX || CV_TRY_AVX2
-    CONV_WINO_ATOM_F32 = 8,
-#else
-    CONV_WINO_ATOM_F32 = 4,
-#endif
-
-    CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16.
-
-    // FP 16
-    CONV_WINO_ATOM_F16 = CONV_WINO_ATOM_F32 * 2,
-    CONV_WINO_NATOMS_F16 = CONV_WINO_AREA / CONV_WINO_ATOM_F16,
 };

 // NOTE that: CONV_TYPE_DEPTHWISE is for 3x3 depthwise conv, and others depthwise will be set as CONV_TYPE_DEPTHWISE_REMAIN.
 enum { CONV_TYPE_GENERIC=0, CONV_TYPE_DEPTHWISE=1, CONV_TYPE_WINOGRAD3X3=2, CONV_TYPE_DEPTHWISE_REMAIN=3 };
 enum { CONV_1D = 0, CONV_2D = 1, CONV_3D = 2 };
+
 #endif

 namespace cv {