diff --git a/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp b/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
index 4e5adf25d1..a527523a32 100644
--- a/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
+++ b/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
@@ -20,6 +20,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weights,
                            int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
 {
 #if CV_SIMD128
+    const int VEC_NLANES = 4;
     v_float32x4 vminval = v_setall_f32(minval), vmaxval = v_setall_f32(maxval);
 
     v_float32x4 w0 = v_setall_f32(
@@ -110,7 +111,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weights,
     {
         if (dy0 == 3)
         {
-            for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+            for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
             {
                 int xi_ = x0 * stride_x - pad_left;
                 const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -186,7 +187,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weights,
         }
         else
         {
-            for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+            for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
             {
                 int xi_ = x0 * stride_x - pad_left;
                 const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -211,7 +212,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weights,
     }
     else
     {
-        for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+        for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
        {
             int xi_ = x0 * stride_x - pad_left, k = 0;
             const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -314,7 +315,12 @@ void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv,
     int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom;
     int pad_left = conv->pad_left, pad_right = conv->pad_right;
 
-    int ksize = Hk * Wk, padded_ksize = ((ksize + FAST_VEC_NLANES - 1) / FAST_VEC_NLANES) * FAST_VEC_NLANES;
+    int VEC_NLANES = 4;
+#if CV_TRY_AVX2
+    if (conv->useAVX2)
+        VEC_NLANES = 8;
+#endif
+    int ksize = Hk * Wk, padded_ksize = ((ksize + VEC_NLANES - 1) / VEC_NLANES) * VEC_NLANES;
 
     const float *inp = input.ptr<float>();
     float *out = output.ptr<float>();
diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp
index 604e45e628..aa10d40bee 100644
--- a/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp
+++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp
@@ -78,6 +78,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights,
                          int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop,
                          int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
 {
+    const int VEC_NLANES = 8;
     __m256 vminval = _mm256_set1_ps(minval);
     __m256 vmaxval = _mm256_set1_ps(maxval);
 
@@ -174,7 +175,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights,
     {
         if (dy0 == 3)
         {
-            for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+            for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
             {
                 int xi_ = x0 * stride_x - pad_left;
                 const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -250,7 +251,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights,
         }
         else
         {
-            for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+            for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
             {
                 int xi_ = x0 * stride_x - pad_left;
                 const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -276,7 +277,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights,
     }
     else
     {
-        for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+        for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
         {
             int xi_ = x0 * stride_x - pad_left, k = 0;
             const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -701,7 +702,6 @@ void _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep,
         z50 = _mm256_add_ps(vbias, z50);
     }
 
-    // TODO make sure the lenght of bpptr is 8.
     if (bpptr)
     {
         z00 = _mm256_add_ps(z00, _mm256_loadu_ps(bpptr));
diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
index 6e6b6c0ead..8002f396f1 100644
--- a/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
+++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
@@ -49,6 +49,15 @@ Ptr<FastConv2d> initFastConv2d(
                       useWinograd && ((conv->useSIMD128 || conv->useAVX2 || conv->useNEON) && Hk == 3 && Wk == 3 &&
                       dilation_y == 1 && dilation_x == 1 && stride_y == 1 && stride_x == 1) ? _FX_CONV_TYPE_WINOGRAD3X3 : _FX_CONV_TYPE_GENERIC;
+
+    int VEC_NLANES = 4;
+#if CV_TRY_AVX2
+    if (!conv->useAVX2 && conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // convert Winograd to generic conv.
+        conv->conv_type = _FX_CONV_TYPE_GENERIC;
+    if (conv->useAVX2)
+        VEC_NLANES = 8;
+#endif
+
 
     Mat weightsMat = _weightsMat.getMat();
     auto wShape = shape(weightsMat);
     const size_t wstep = weightsMat.step1();
@@ -61,7 +70,7 @@ Ptr<FastConv2d> initFastConv2d(
     int ksize = Hk*Wk;
 
     // this code aims to let memory fit with vector size.
-    int padded_ksize = ((ksize + FAST_VEC_NLANES-1) / FAST_VEC_NLANES) * FAST_VEC_NLANES;
+    int padded_ksize = ((ksize + VEC_NLANES-1) / VEC_NLANES) * VEC_NLANES;
     int nweights = C*padded_ksize;
     conv->weightsBuf.reserve(nweights + VEC_ALIGN);
     conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN);
@@ -265,7 +274,8 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv,
     else if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3 && inputShape[2] >= 12 && inputShape[3] >= 12) // winograd
     {
         CV_Assert(conv->weightsWinoBufPtr);
-        return runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct);
+        if (runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct))
+            return;
     }
 
     int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3];  // [N, C, H, W]
diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp
index 62c4170a3a..01f5edee48 100644
--- a/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp
+++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp
@@ -12,35 +12,25 @@
 #if CV_NEON && CV_NEON_AARCH64  // 32 registers.
 #define CONV_MR 4
 #define CONV_NR 28
-enum { FAST_VEC_NLANES=4 };
 #elif CV_NEON              // 16 registers.
 #define CONV_MR 4
 #define CONV_NR 12
-enum { FAST_VEC_NLANES=4 };
 #else // SIMD 128, AVX or AVX2
 #define CONV_MR 4
 #define CONV_NR 24
-
-#if CV_TRY_AVX2
-enum { FAST_VEC_NLANES=8 }; // AVX2
-#else
-enum { FAST_VEC_NLANES=4 }; // SIMD 128
-#endif
-
-#endif
 #endif
 
+// Winograd Params
 enum {
     _FX_WINO_STEP=6,
     _FX_WINO_KSIZE=3,
     _FX_WINO_SIZE=_FX_WINO_STEP+_FX_WINO_KSIZE-1,
     _FX_WINO_AREA=_FX_WINO_SIZE*_FX_WINO_SIZE,
-#if CV_TRY_AVX2 || (CV_NEON && CV_NEON_AARCH64)
     _FX_WINO_KBLOCK = 4,
+#if (CV_NEON && CV_NEON_AARCH64) || CV_TRY_AVX2
     _FX_WINO_IBLOCK = 6,
 #else
-    _FX_WINO_KBLOCK = 4,
     _FX_WINO_IBLOCK = 3,
 #endif
 
@@ -52,8 +42,8 @@ enum {
     _FX_WINO_NATOMS_F32 = _FX_WINO_AREA / _FX_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16.
 };
 
-
 enum { _FX_CONV_TYPE_GENERIC=0, _FX_CONV_TYPE_DEPTHWISE=1, _FX_CONV_TYPE_WINOGRAD3X3=2 };
+#endif
 
 namespace cv {
 namespace dnn {
@@ -77,8 +67,18 @@ struct FastConv2d
 #else
     bool useSIMD128 = false;
 #endif
+
+#if CV_TRY_AVX2
     bool useAVX2 = checkHardwareSupport(CPU_AVX2);
+#else
+    bool useAVX2 = false;
+#endif
+
+#if CV_NEON
     bool useNEON = checkHardwareSupport(CPU_NEON);
+#else
+    bool useNEON = false;
+#endif
 };
 
 // return a FastConv2d instance.
@@ -99,7 +99,7 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv,
 void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv,
                   float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct);
 
-void runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv, int ntasks,
+int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv, int ntasks,
                   float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct);
 
 } // namespace dnn
diff --git a/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp b/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp
index bc44f73a22..10b55f3604 100644
--- a/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp
+++ b/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp
@@ -13,6 +13,8 @@
 #include "fast_convolution.hpp"
 
 namespace cv { namespace dnn {
+
+#if CV_NEON || CV_SIMD128 || CV_TRY_AVX2
 enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment.
 
 static void
@@ -141,7 +143,7 @@ _fx_winograd_accum_f32(const float* inwptr, const float* wptr,
             vst1q_f32(outbuf + 20*64, s32);
         }
     }
-#elif CV_SIMD
+#elif CV_SIMD128
     CV_Assert(_FX_WINO_IBLOCK == 3 && _FX_WINO_KBLOCK == 4);
     for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++, outbuf += _FX_WINO_ATOM_F32)
     {
@@ -183,15 +185,15 @@ _fx_winograd_accum_f32(const float* inwptr, const float* wptr,
             v_store(outbuf, s00);
             v_store(outbuf + 1*64, s01);
             v_store(outbuf + 2*64, s02);
-            v_store(outbuf + 6*64, s10);
-            v_store(outbuf + 7*64, s11);
-            v_store(outbuf + 8*64, s12);
-            v_store(outbuf + 12*64, s20);
-            v_store(outbuf + 13*64, s21);
-            v_store(outbuf + 14*64, s22);
-            v_store(outbuf + 18*64, s30);
-            v_store(outbuf + 19*64, s31);
-            v_store(outbuf + 20*64, s32);
+            v_store(outbuf + 3*64, s10);
+            v_store(outbuf + 4*64, s11);
+            v_store(outbuf + 5*64, s12);
+            v_store(outbuf + 6*64, s20);
+            v_store(outbuf + 7*64, s21);
+            v_store(outbuf + 8*64, s22);
+            v_store(outbuf + 9*64, s30);
+            v_store(outbuf + 10*64, s31);
+            v_store(outbuf + 11*64, s32);
         }
 #else
     for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32;
@@ -406,7 +408,7 @@ _fx_winograd_BtXB_8x8_f32(const float* inptr, int inpstep,
     vst1q_f32(outptr + outstep*13, z61);
     vst1q_f32(outptr + outstep*14, z70);
     vst1q_f32(outptr + outstep*15, z71);
-#elif CV_SIMD
+#elif CV_SIMD128
     v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
     v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
     v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
@@ -750,8 +752,7 @@ _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep,
     vst1_f32(outptr + outstep*4 + 4, vget_low_f32(z41));
     vst1q_f32(outptr + outstep*5, z50);
     vst1_f32(outptr + outstep*5 + 4, vget_low_f32(z51));
-//#elif CV_AVX2
-#elif CV_SIMD
+#elif CV_SIMD128
     v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
     v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
     v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
@@ -919,7 +920,7 @@ _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep,
 #endif
 }
 
-void runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv,
+int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv,
                    int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
 {
     Mat input = _input.getMat();
@@ -1138,5 +1139,15 @@ void runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output,
                 }
             }
         }});
+    return 1;
 }
+
+#else
+
+int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv,
+                  int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
+{
+    return 0;
+}
+#endif
 }} // namespace cv::dnn