Merge branch 4.x

2025-08-03 20:56:30 +08:00 · 2024-11-22 02:32:17 +00:00 · 2024-11-22 02:32:17 +00:00 · 7808d50412
commit 7808d50412
parent 5a3e18973b 7be5181bff
20 changed files with 1381 additions and 1615 deletions
--- a/3rdparty/kleidicv/CMakeLists.txt
+++ b/3rdparty/kleidicv/CMakeLists.txt
@ -1,24 +1,7 @@
 project(kleidicv_hal)

-set(KLEIDICV_SOURCE_PATH "" CACHE PATH "Directory containing KleidiCV sources")
-ocv_update(KLEIDICV_SRC_COMMIT "0.2.0")
-ocv_update(KLEIDICV_SRC_HASH "dabe522e8f55ac342d07a787391dab80")
-
-if(KLEIDICV_SOURCE_PATH)
-  set(THE_ROOT "${KLEIDICV_SOURCE_PATH}")
-else()
-  ocv_download(FILENAME "kleidicv-${KLEIDICV_SRC_COMMIT}.tar.gz"
-                HASH ${KLEIDICV_SRC_HASH}
-                URL
-                  "${OPENCV_KLEIDICV_URL}"
-                  "$ENV{OPENCV_KLEIDICV_URL}"
-                  "https://gitlab.arm.com/kleidi/kleidicv/-/archive/${KLEIDICV_SRC_COMMIT}/"
-                DESTINATION_DIR "${OpenCV_BINARY_DIR}/3rdparty/kleidicv/"
-                ID KLEIDICV
-                STATUS res
-                UNPACK RELATIVE_URL)
-  set(THE_ROOT "${OpenCV_BINARY_DIR}/3rdparty/kleidicv/kleidicv-${KLEIDICV_SRC_COMMIT}")
+if(HAVE_KLEIDICV)
+  option(KLEIDICV_ENABLE_SME2 "" OFF) # off by default: SME2 is not compatible with some Clang versions in the NDK
+  include("${KLEIDICV_SOURCE_PATH}/adapters/opencv/CMakeLists.txt") # OpenCV adapter script from the KleidiCV source tree
 endif()

-option(KLEIDICV_ENABLE_SME2 "" OFF) # not compatible with some CLang versions in NDK
-include("${THE_ROOT}/adapters/opencv/CMakeLists.txt")
--- a/3rdparty/kleidicv/kleidicv.cmake
+++ b/3rdparty/kleidicv/kleidicv.cmake
@ -0,0 +1,21 @@
+function(download_kleidicv root_var)
+  set(${root_var} "" PARENT_SCOPE)  # caller sees an empty path unless the download succeeds
+
+  ocv_update(KLEIDICV_SRC_COMMIT "0.2.0")
+  ocv_update(KLEIDICV_SRC_HASH "dabe522e8f55ac342d07a787391dab80")
+
+  set(kleidicv_dst "${OpenCV_BINARY_DIR}/3rdparty/kleidicv")
+  ocv_download(
+    FILENAME "kleidicv-${KLEIDICV_SRC_COMMIT}.tar.gz"
+    HASH ${KLEIDICV_SRC_HASH}
+    URL "${OPENCV_KLEIDICV_URL}"
+        "$ENV{OPENCV_KLEIDICV_URL}"
+        "https://gitlab.arm.com/kleidi/kleidicv/-/archive/${KLEIDICV_SRC_COMMIT}/"
+    DESTINATION_DIR ${kleidicv_dst}
+    ID KLEIDICV
+    STATUS download_ok
+    UNPACK RELATIVE_URL)
+  if(download_ok)  # report the unpacked source directory back through root_var
+    set(${root_var} "${kleidicv_dst}/kleidicv-${KLEIDICV_SRC_COMMIT}" PARENT_SCOPE)
+  endif()
+endfunction()
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -861,7 +861,7 @@ if(NOT DEFINED OpenCV_HAL)
  set(OpenCV_HAL "OpenCV_HAL")
 endif()

-if(WITH_KLEIDICV)
+if(HAVE_KLEIDICV)
  ocv_debug_message(STATUS "Enable KleidiCV acceleration")
  if(NOT ";${OpenCV_HAL};" MATCHES ";kleidicv;")
    set(OpenCV_HAL "kleidicv;${OpenCV_HAL}")
--- a/cmake/OpenCVFindLibsPerf.cmake
+++ b/cmake/OpenCVFindLibsPerf.cmake
@ -161,3 +161,19 @@ if(WITH_CLP)
    endif()
  endif()
 endif(WITH_CLP)
+
+# --- ARM KleidiCV: prefer a user-supplied source tree, otherwise try to download one
+if(WITH_KLEIDICV)
+  if(KLEIDICV_SOURCE_PATH AND EXISTS "${KLEIDICV_SOURCE_PATH}/adapters/opencv/CMakeLists.txt")
+    set(HAVE_KLEIDICV ON)  # external sources found: must stay ON, nothing to download
+  endif()
+  if(NOT HAVE_KLEIDICV)
+    include("${OpenCV_SOURCE_DIR}/3rdparty/kleidicv/kleidicv.cmake")
+    download_kleidicv(KLEIDICV_SOURCE_PATH)
+    if(KLEIDICV_SOURCE_PATH)
+      set(HAVE_KLEIDICV ON)
+    else()
+      set(HAVE_KLEIDICV OFF)  # download left the path empty: keep the KleidiCV HAL disabled
+    endif()
+  endif()
+endif(WITH_KLEIDICV)
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@ -613,7 +613,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,

 typedef int (*ScalarFunc)(const uchar* src, size_t step_src,
                          uchar* dst, size_t step_dst, int width, int height,
-                          void* scalar, bool scalarIsFirst);
+                          void* scalar, bool scalarIsFirst, int nChannels);

 typedef int (*ExtendedTypeFunc)(const uchar* src1, size_t step1,
                                const uchar* src2, size_t step2,
@ -887,7 +887,6 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
            for( size_t j = 0; j < total; j += blocksize )
            {
                int bsz = (int)MIN(total - j, blocksize);
-                Size bszn(bsz*cn, 1);
                const uchar *sptr1 = ptrs[0];
                const uchar* sptr2 = buf2;
                uchar* dptr = ptrs[1];
@ -900,17 +899,17 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                // try to perform operation in 1 call, fallback to classic way if fail
                uchar* opconverted = haveMask ? maskbuf : dptr;
                if (!scalarFunc || src2.total() != 1 ||
-                    scalarFunc(extSptr1, 1, opconverted, 1, bszn.width, bszn.height, (void*)extSptr2, swapped12) != 0)
+                    scalarFunc(extSptr1, 1, opconverted, 1, bsz, 1, (void*)extSptr2, swapped12, cn) != 0)
                {
                    // try to perform operation with conversion in one call
                    // if fail, use converter functions

                    if (!extendedFunc || extendedFunc(extSptr1, 1, extSptr2, 1, opconverted, 1,
-                                                    bszn.width, bszn.height, usrdata) != 0)
+                                                      bsz*cn, 1, usrdata) != 0)
                    {
                        if( cvtsrc1 )
                        {
-                            cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
+                            cvtsrc1( sptr1, 1, 0, 1, buf1, 1, Size(bsz*cn, 1), 0 );
                            sptr1 = buf1;
                        }

@ -918,12 +917,12 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                            std::swap(sptr1, sptr2);

                        uchar* fdst = ( haveMask || cvtdst ) ? wbuf : dptr;
-                        func( sptr1, 1, sptr2, 1, fdst, 1, bszn.width, bszn.height, usrdata );
+                        func( sptr1, 1, sptr2, 1, fdst, 1, bsz*cn, 1, usrdata );

                        if (cvtdst)
                        {
                            uchar* cdst = haveMask ? maskbuf : dptr;
-                            cvtdst(wbuf, 1, 0, 1, cdst, 1, bszn, 0);
+                            cvtdst(wbuf, 1, 0, 1, cdst, 1, Size(bsz*cn, 1), 0);
                        }
                        opconverted = cvtdst ? maskbuf : wbuf;
                    }
@ -965,9 +964,9 @@ static BinaryFuncC* getAddTab()
 }

 static int addScalar32f32fWrapper(const uchar* src, size_t step_src, uchar* dst, size_t step_dst, int width, int height,
-                                  void* scalar, bool /*scalarIsFirst*/)
+                                  void* scalar, bool /*scalarIsFirst*/, int nChannels)
 {
-    int res = cv_hal_addScalar32f32f((const float*)src, step_src, (float *)dst, step_dst, width, height, (const float*)scalar);
+    int res = cv_hal_addScalar32f32f((const float*)src, step_src, (float *)dst, step_dst, width, height, (const float*)scalar, nChannels);
    if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
        return res;
    else
@ -978,9 +977,9 @@ static int addScalar32f32fWrapper(const uchar* src, size_t step_src, uchar* dst,
 }

 static int addScalar16s16sWrapper(const uchar* src, size_t step_src, uchar* dst, size_t step_dst, int width, int height,
-                                  void* scalar, bool /*scalarIsFirst*/)
+                                  void* scalar, bool /*scalarIsFirst*/, int nChannels)
 {
-    int res = cv_hal_addScalar16s16s((const int16_t*)src, step_src, (int16_t *)dst, step_dst, width, height, (const int16_t*)scalar);
+    int res = cv_hal_addScalar16s16s((const int16_t*)src, step_src, (int16_t *)dst, step_dst, width, height, (const int16_t*)scalar, nChannels);
    if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
        return res;
    else
@ -1094,6 +1093,67 @@ static BinaryFuncC* getAbsDiffTab()
    return absDiffTab;
 }

+
+static int absDiffScalar32f32fWrapper(const uchar* src, size_t step_src, uchar* dst, size_t step_dst, int width, int height,
+                                      void* scalar, bool /*scalarIsFirst*/, int nChannels)
+{   // ScalarFunc adapter: casts the untyped buffers to float and forwards to cv_hal_absDiffScalar32f32f
+    int res = cv_hal_absDiffScalar32f32f((const float*)src, step_src, (float *)dst, step_dst, width, height, (const float*)scalar, nChannels);
+    if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
+        return res; // both statuses are handed back to the caller unchanged
+    else // any other status is raised as an internal error naming the hook that actually failed
+    {
+        CV_Error_(cv::Error::StsInternal, ("HAL implementation absDiffScalar32f32f ==> " CVAUX_STR(cv_hal_absDiffScalar32f32f)
+                                           " returned %d (0x%08x)", res, res));
+    }
+}
+
+static int absDiffScalar32s32uWrapper(const uchar* src, size_t step_src, uchar* dst, size_t step_dst, int width, int height,
+                                      void* scalar, bool /*scalarIsFirst*/, int nChannels)
+{   // ScalarFunc adapter: int source/scalar, uint32_t destination, forwarded to cv_hal_absDiffScalar32s32u
+    int res = cv_hal_absDiffScalar32s32u((const int*)src, step_src, (uint32_t*)dst, step_dst, width, height, (const int*)scalar, nChannels);
+    if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
+        return res; // both statuses are handed back to the caller unchanged
+    else // any other status is raised as an internal error naming the hook that actually failed
+    {
+        CV_Error_(cv::Error::StsInternal, ("HAL implementation absDiffScalar32s32u ==> " CVAUX_STR(cv_hal_absDiffScalar32s32u)
+                                           " returned %d (0x%08x)", res, res));
+    }
+}
+
+static int absDiffScalar8u8uWrapper(const uchar* src, size_t step_src, uchar* dst, size_t step_dst, int width, int height,
+                                      void* scalar, bool /*scalarIsFirst*/, int nChannels)
+{   // ScalarFunc adapter: uchar source, scalar and destination, forwarded to cv_hal_absDiffScalar8u8u
+    int res = cv_hal_absDiffScalar8u8u((const uchar*)src, step_src, (uchar*)dst, step_dst, width, height, (const uchar*)scalar, nChannels);
+    if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
+        return res; // both statuses are handed back to the caller unchanged
+    else // any other status is raised as an internal error naming the hook that actually failed
+    {
+        CV_Error_(cv::Error::StsInternal, ("HAL implementation absDiffScalar8u8u ==> " CVAUX_STR(cv_hal_absDiffScalar8u8u)
+                                           " returned %d (0x%08x)", res, res));
+    }
+}
+
+// Picks the HAL-backed absdiff-with-scalar wrapper for the given depths, or nullptr when none fits.
+static ScalarFunc getAbsDiffScalarFunc(int srcType, int dstType)
+{
+    // Every supported combination has identical source and destination depths.
+    if (srcType != dstType)
+        return nullptr;
+
+    switch (srcType)
+    {
+    case CV_32F:
+        return absDiffScalar32f32fWrapper;
+    case CV_32S:
+        // the 32S wrapper writes its result through a uint32_t pointer
+        return absDiffScalar32s32uWrapper;
+    case CV_8U:
+        return absDiffScalar8u8uWrapper;
+    default:
+        return nullptr;
+    }
+}
+
 }

 void cv::add( InputArray src1, InputArray src2, OutputArray dst,
@ -1108,7 +1168,17 @@ void cv::add( InputArray src1, InputArray src2, OutputArray dst,
        return;
    }

-    ScalarFunc scalarFunc = getAddScalarFunc(src1.depth(), dtype < 0 ? dst.depth() : dtype);
+    int sdepth = src1.depth();
+    if (checkScalar(src1, src1.type(), src1.kind(), _InputArray::MATX))
+    {
+        sdepth = src2.depth();
+    }
+    if (checkScalar(src2, src2.type(), src2.kind(), _InputArray::MATX))
+    {
+        sdepth = src1.depth();
+    }
+
+    ScalarFunc scalarFunc = getAddScalarFunc(sdepth, dtype < 0 ? dst.depth() : dtype);
    arithm_op(src1, src2, dst, mask, dtype, getAddTab(), false, 0, OCL_OP_ADD, nullptr,
              /* scalarFunc */ scalarFunc );
 }
@ -1141,7 +1211,18 @@ void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst )
        return;
    }

-    arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF);
+    int sdepth = src1.depth();
+    if (checkScalar(src1, src1.type(), src1.kind(), _InputArray::MATX))
+    {
+        sdepth = src2.depth();
+    }
+    if (checkScalar(src2, src2.type(), src2.kind(), _InputArray::MATX))
+    {
+        sdepth = src1.depth();
+    }
+    ScalarFunc scalarFunc = getAbsDiffScalarFunc(sdepth, dst.depth());
+    arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF,
+              /* extendedFunc */ nullptr, scalarFunc);
 }

 void cv::copyTo(InputArray _src, OutputArray _dst, InputArray _mask)
--- a/modules/core/src/hal_replacement.hpp
+++ b/modules/core/src/hal_replacement.hpp
@ -119,9 +119,10 @@ Add scalar: _dst[i] = src[i] + scalar
@param width width of the images
@param height height of the images
@param scalar_data pointer to scalar value
+@param nChannels number of channels per element
 */
-inline int hal_ni_addScalar32f32f(const float *src_data, size_t src_step, float *dst_data, size_t dst_step, int width, int height, const float* scalar_data) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_addScalar16s16s(const int16_t *src_data, size_t src_step, int16_t *dst_data, size_t dst_step, int width, int height, const int16_t* scalar_data) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_addScalar32f32f(const float*   src_data, size_t src_step, float*   dst_data, size_t dst_step, int width, int height, const float*   scalar_data, int nChannels) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_addScalar16s16s(const int16_t* src_data, size_t src_step, int16_t* dst_data, size_t dst_step, int width, int height, const int16_t* scalar_data, int nChannels) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 //! @}

 /**
@ -190,6 +191,23 @@ inline int hal_ni_absdiff64u(const uint64 *src1_data, size_t src1_step, const ui
 inline int hal_ni_absdiff64s(const int64 *src1_data, size_t src1_step, const int64 *src2_data, size_t src2_step, int64 *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 inline int hal_ni_absdiff16f(const cv_hal_f16 *src1_data, size_t src1_step, const cv_hal_f16 *src2_data, size_t src2_step, cv_hal_f16 *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 inline int hal_ni_absdiff16bf(const cv_hal_bf16 *src1_data, size_t src1_step, const cv_hal_bf16 *src2_data, size_t src2_step, cv_hal_bf16 *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+/**
+Absolute difference with scalar: _dst[i] = | src[i] - scalar |_
+
+@param src_data source image data
+@param src_step source image step
+@param dst_data destination image data
+@param dst_step destination image step
+@param width width of the images
+@param height height of the images
+@param scalar_data pointer to scalar value
+@param nChannels number of channels per element
+*/
+inline int hal_ni_absDiffScalar32f32f(const float* src_data, size_t src_step, float*    dst_data, size_t dst_step, int width, int height, const float* scalar_data, int nChannels) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_absDiffScalar32s32u(const int*   src_data, size_t src_step, uint32_t* dst_data, size_t dst_step, int width, int height, const int*   scalar_data, int nChannels) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_absDiffScalar8u8u  (const uchar* src_data, size_t src_step, uchar*    dst_data, size_t dst_step, int width, int height, const uchar* scalar_data, int nChannels) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
 //! @}

 /**
@ -279,6 +297,9 @@ inline int hal_ni_not8u(const uchar *src_data, size_t src_step, uchar *dst_data,
 #define cv_hal_absdiff64f hal_ni_absdiff64f
 #define cv_hal_absdiff16f hal_ni_absdiff16f
 #define cv_hal_absdiff16bf hal_ni_absdiff16bf
+#define cv_hal_absDiffScalar32f32f hal_ni_absDiffScalar32f32f
+#define cv_hal_absDiffScalar32s32u hal_ni_absDiffScalar32s32u
+#define cv_hal_absDiffScalar8u8u   hal_ni_absDiffScalar8u8u
 #define cv_hal_and8u hal_ni_and8u
 #define cv_hal_or8u hal_ni_or8u
 #define cv_hal_xor8u hal_ni_xor8u
--- a/modules/dnn/CMakeLists.txt
+++ b/modules/dnn/CMakeLists.txt
@ -8,7 +8,7 @@ ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV
 ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX RVV LASX)
 ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_block" AVX AVX2 NEON NEON_FP16)
 ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_depthwise" AVX AVX2 RVV LASX)
-ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_winograd_f63" AVX AVX2 NEON_FP16)
+ocv_add_dispatched_file("layers/cpu_kernels/conv_winograd_f63" AVX AVX2 NEON NEON_FP16)
 ocv_add_dispatched_file_force_all("layers/cpu_kernels/fast_gemm_kernels" AVX AVX2 NEON LASX)

 ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js)
--- a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp
+++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp
@ -12,28 +12,21 @@
 #include "../../precomp.hpp"
 #include "convolution.hpp"

-#include "conv_winograd_f63.simd.hpp"
-#include "layers/cpu_kernels/conv_winograd_f63.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
-
 namespace cv { namespace dnn {

-#if CV_NEON || CV_SIMD128 || CV_TRY_AVX2
 enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment.

-void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
-                            const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32);
-
-/*Input transform*/
-void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep,
-                          float* outptr, int Cg, const int winoIblock, const int winoAtomF32);
-
-/*Output transform*/
-void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep,
-                          float bias, float minval, float maxval, bool ifMinMaxAct);
-
 int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv>& conv,
                  int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
 {
+    const cv::dnn::Winofunc func =
+        conv->useFP16 ? cv::dnn::getWinofunc_F16()
+        : (conv->useAVX || conv->useAVX2 || conv->useNEON || conv->useRVV || conv->useSIMD128) ? cv::dnn::getWinofunc_F32()
+        : cv::dnn::Winofunc::empty();
+
+    if (!func.isGood())
+        return 0;
+
    Mat input = _input.getMat();
    Mat output = _output.getMat();
    Mat fusedAddMat = _fusedAddMat.getMat();
@ -52,42 +45,10 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
    int ngroups = conv->ngroups, Cg = C/ngroups, Kg = K/ngroups;

    const int CONV_WINO_KBLOCK = 4;
-#if (CV_NEON && CV_NEON_AARCH64)
-    const int CONV_WINO_IBLOCK = 6;
-#elif  CV_TRY_AVX || CV_TRY_AVX2
-    const int CONV_WINO_IBLOCK = (conv->useAVX || conv->useAVX2) ? 6 : 3;
-#else
-    const int CONV_WINO_IBLOCK = 3;
-#endif
-
-#if CV_TRY_AVX || CV_TRY_AVX2
-    const int CONV_WINO_ATOM_F32 = (conv->useAVX || conv->useAVX2) ? 8 : 4;
-#else
-    const int CONV_WINO_ATOM_F32 = 4;
-#endif
-    const int CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32; // for AVX2, it is 8, otherwise, it's 16.
-
-    int CONV_WINO_ATOM = CONV_WINO_ATOM_F32;
-    int CONV_WINO_NATOMS = CONV_WINO_NATOMS_F32;
-
-#ifdef CONV_ARM_FP16
-    // FP 16
-    const int CONV_WINO_ATOM_F16 = CONV_WINO_ATOM_F32 * 2;
-    const int CONV_WINO_NATOMS_F16 = CONV_WINO_AREA / CONV_WINO_ATOM_F16;
-#endif
-
-    int esz = sizeof(float );
-
-#ifdef CONV_ARM_FP16
-    const bool useFP16 = conv->useFP16;
-    if (useFP16)
-    {
-        // works at FP 16.
-        CONV_WINO_ATOM = CONV_WINO_ATOM_F16;
-        CONV_WINO_NATOMS = CONV_WINO_NATOMS_F16;
-        esz = sizeof(__fp16);
-    }
-#endif
+    const int CONV_WINO_IBLOCK = func.iblock;
+    const int CONV_WINO_ATOM = func.natom;
+    const int CONV_WINO_NATOMS = CONV_WINO_AREA / CONV_WINO_ATOM;
+    const int esz = func.esz;

    int Kg_nblocks = (Kg + CONV_WINO_KBLOCK - 1)/CONV_WINO_KBLOCK;
    const size_t inp_planesize = (size_t)Hi*Wi;
@ -175,35 +136,7 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
                            inptr = inpbuf;
                            inpstep = CONV_WINO_SIZE;
                        }
-
-#if CV_TRY_AVX2
-                        if (conv->useAVX2)
-                            opt_AVX2::winofunc_BtXB_8x8_F32(inptr, inpstep, (float *)inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM);
-                        else
-#endif
-#if CV_TRY_AVX
-                        if (conv->useAVX)
-                            opt_AVX::winofunc_BtXB_8x8_F32(inptr, inpstep, (float *)inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM);
-                        else
-#endif
-#if CV_NEON && CV_NEON_AARCH64
-                        if (conv->useNEON)
-                        {
-#ifdef CONV_ARM_FP16
-                            if (useFP16)
-                            {
-                                opt_NEON_FP16::winofunc_BtXB_8x8_F16(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK,
-                                                                CONV_WINO_ATOM);
-                            }
-                            else
-#endif
-                            opt_NEON::winofunc_BtXB_8x8_F32(inptr, inpstep, (float *)inwptr, Cg, CONV_WINO_IBLOCK,
-                                                            CONV_WINO_ATOM);
-                        }
-                        else
-#endif
-                        winofunc_BtXB_8x8_F32(inptr, inpstep, (float *)inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM);
-
+                        func.BtXB_8x8(inptr, inpstep, (uchar*)inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM);
                    }
                    else
                    {
@ -219,18 +152,20 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
    // apply inverse Winograd transforms to the sums,
    // add bias, apply activation function if any and store the results.
    char* wptr0 = nullptr;
-#ifdef CONV_ARM_FP16
-    if (useFP16)
+    if (esz == 2)
    {
        CV_Assert(!conv->weightsWinoBuf_FP16.empty());
        wptr0 = (char *)conv->getWeightsWinoFP16();
    }
-    else
-#endif
+    else if (esz == 4)
    {
        CV_Assert(!conv->weightsWinoBuf.empty());
        wptr0 = (char *)conv->getWeightsWino();
    }
+    else
+    {
+        CV_Error(Error::StsError, "Impossible configuration");
+    }

    parallel_for_(Range(0, ntasks), [&](const Range& r0) {
    for (int task_id = r0.start; task_id < r0.end; task_id++)
@ -271,36 +206,9 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
                char* inwptr = wbuf_all + inwofs * esz;
                char* wptr = wptr0 + wofs * esz;

-#if CV_TRY_AVX2
-                if (conv->useAVX2)
-                    opt_AVX2::winofunc_accum_F32((float *)inwptr, (float *)wptr, (float *)out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
-                                       CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
-                else
-#endif
-#if CV_TRY_AVX
-                if (conv->useAVX)
-                    opt_AVX::winofunc_accum_F32((float *)inwptr, (float *)wptr, (float *)out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
-                                       CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
-                else
-#endif
-#if CV_NEON && CV_NEON_AARCH64
-                if (conv->useNEON)
-                {
-#ifdef CONV_ARM_FP16
-                    if (useFP16)
-                    {
-                        opt_NEON_FP16::winofunc_accum_F16(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
-                                                     CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
-                    }
-                    else
-#endif
-                    opt_NEON::winofunc_accum_F32((float *)inwptr, (float *)wptr, (float *)out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
-                                                 CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
-                }
-                else
-#endif
-                winofunc_accum_F32((float *)inwptr, (float *)wptr, (float *)out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
-                                       CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
+                func.accum((uchar*)inwptr, (uchar*)wptr, (uchar*)out_wbuf, Cg,
+                           block_id1 - block_id0, CONV_WINO_IBLOCK,
+                           CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);

                for (int k = k0; k < k1; k++)
                {
@ -336,37 +244,10 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
                                           dx1*sizeof(pbptr0[0]));
                            }
                        }
-#if CV_TRY_AVX2
-                        if (conv->useAVX2)
-                            opt_AVX2::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
-                                                                bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
-                        else
-#endif
-#if CV_TRY_AVX
-                        if (conv->useAVX)
-                            opt_AVX::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
-                                                                bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
-                        else
-#endif
-#if CV_NEON && CV_NEON_AARCH64
-                        // NEON optimization is only for ARMv8 device, and for ARMv7 device, we use the Universal intrinsics.
-                        if (conv->useNEON)
-                        {
-#ifdef CONV_ARM_FP16
-                            if (useFP16)
-                            {
-                                opt_NEON_FP16::winofunc_AtXA_8x8_F16(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA * esz, CONV_WINO_SIZE,
-                                                                bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
-                            }
-                            else
-#endif
-                            opt_NEON::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
-                                                            bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
-                        }
-                        else
-#endif
-                        winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
-                                                  bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
+
+                        const int count = ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA;
+                        func.AtXA_8x8((uchar*)out_wbuf + count * esz, CONV_WINO_SIZE,
+                                      bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);

                        if (partial)
                        {
@ -383,441 +264,4 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
    return 1;
 }

-/****************************************************************************************\
-                                    SIMD for winograd function
-\****************************************************************************************/
-
-#if CV_SIMD128
-/* Winograd accumulation step, universal-intrinsics version.
-   For each of the winoNatomF32 atoms, sums over the Cg channels the lane-wise
-   products of 4 weight vectors (read from wptr) with 3 input vectors (read from
-   inwptr) and writes the 12 sums to outbuf, 64 floats apart.
-   inwptr and wptr keep advancing across atoms (they are never rewound);
-   outbuf advances by winoAtomF32 floats per atom.
-   The iblock argument is not read by this version.
-   Requires winoIblock == 3, winoKblock == 4 and winoAtomF32 == 4 (asserted). */
-void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
-                            const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32)
-{
-#if 1
-    CV_Assert(winoIblock == 3 && winoKblock == 4 && winoAtomF32 == 4);
-    for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
-            outbuf += winoAtomF32)
-    {
-        v_float32x4 s00 = v_setzero_f32(), s01 = s00, s02 = s00;
-        v_float32x4 s10 = v_setzero_f32(), s11 = s00, s12 = s00;
-        v_float32x4 s20 = v_setzero_f32(), s21 = s00, s22 = s00;
-        v_float32x4 s30 = v_setzero_f32(), s31 = s00, s32 = s00;
-        // Per channel: 3 input vectors x 4 weight vectors -> 12 fused multiply-adds.
-        for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
-                                     wptr += winoKblock*winoAtomF32)
-        {
-            v_float32x4 x0, x1, x2;
-            x0 = v_load(inwptr);
-            x1 = v_load(inwptr + 4);
-            x2 = v_load(inwptr + 8);
-
-            v_float32x4 w0 = v_load(wptr);
-            s00 = v_fma(w0, x0, s00);
-            s01 = v_fma(w0, x1, s01);
-            s02 = v_fma(w0, x2, s02);
-
-            w0 = v_load(wptr + 4);
-            s10 = v_fma(w0, x0, s10);
-            s11 = v_fma(w0, x1, s11);
-            s12 = v_fma(w0, x2, s12);
-
-            w0 = v_load(wptr + 8);
-            s20 = v_fma(w0, x0, s20);
-            s21 = v_fma(w0, x1, s21);
-            s22 = v_fma(w0, x2, s22);
-
-            w0 = v_load(wptr + 12);
-            s30 = v_fma(w0, x0, s30);
-            s31 = v_fma(w0, x1, s31);
-            s32 = v_fma(w0, x2, s32);
-        }
-        // Accumulator sRC (weight vector R, input vector C) is stored at outbuf + (R*3 + C)*64.
-        v_store(outbuf, s00);
-        v_store(outbuf + 1*64, s01);
-        v_store(outbuf + 2*64, s02);
-        v_store(outbuf + 3*64, s10);
-        v_store(outbuf + 4*64, s11);
-        v_store(outbuf + 5*64, s12);
-        v_store(outbuf + 6*64, s20);
-        v_store(outbuf + 7*64, s21);
-        v_store(outbuf + 8*64, s22);
-        v_store(outbuf + 9*64, s30);
-        v_store(outbuf + 10*64, s31);
-        v_store(outbuf + 11*64, s32);
-    }
-#else
-    // Naive scalar C++ reference; excluded from the build by the #if 1 above, so it never runs.
-    for (int atom_id = 0; atom_id < winoNatomF32;
-                atom_id++, outbuf += winoAtomF32)
-    {
-        float sumbuf[winoIblock*winoKblock*winoAtomF32];
-        memset(sumbuf, 0, sizeof(sumbuf));
-        for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
-                                     wptr += winoKblock*winoAtomF32)
-        {
-            for (int i = 0; i < winoKblock; i++)
-            {
-                for (int j = 0; j < winoIblock; j++)
-                {
-                    int i_ = i*winoAtomF32;
-                    int j_ = j*winoAtomF32;
-                    int ij_ = i_*winoIblock + j_;
-                    float s0 = inwptr[j_ + 0]*wptr[i_ + 0];
-                    float s1 = inwptr[j_ + 1]*wptr[i_ + 1];
-                    float s2 = inwptr[j_ + 2]*wptr[i_ + 2];
-                    float s3 = inwptr[j_ + 3]*wptr[i_ + 3];
-                    sumbuf[ij_ + 0] += s0;
-                    sumbuf[ij_ + 1] += s1;
-                    sumbuf[ij_ + 2] += s2;
-                    sumbuf[ij_ + 3] += s3;
-                }
-            }
-        }
-        for (int ij = 0; ij < winoKblock*winoIblock; ij++)
-        {
-            int ij_ = ij*winoAtomF32;
-            int ij_out = ij*CONV_WINO_AREA;
-            outbuf[ij_out + 0] = sumbuf[ij_ + 0];
-            outbuf[ij_out + 1] = sumbuf[ij_ + 1];
-            outbuf[ij_out + 2] = sumbuf[ij_ + 2];
-            outbuf[ij_out + 3] = sumbuf[ij_ + 3];
-        }
-    }
-#endif
-}
-
-/* Input transform for one 8x8 FP32 tile.
-   Loads 8 rows of 8 floats (rows are inpstep floats apart), applies the 8-point
-   transform listed in the Y[i] comments below, transposes the intermediate 8x8
-   matrix, applies the same transform again (Z[i]) and stores the 16 resulting
-   4-float vectors to outptr, winoIblock*winoAtomF32*Cg floats apart.
-   Requires winoIblock == 3 and winoAtomF32 == 4 (asserted). */
-void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep,
-                          float* outptr, int Cg, const int winoIblock, const int winoAtomF32)
-{
-    CV_Assert(winoIblock == 3 && winoAtomF32 == 4);
-    v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
-    v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
-    v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
-    v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4);
-    v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4);
-    v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4);
-    v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4);
-    v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4);
-
-    v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71;
-
-    {
-        /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */
-        /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */
-        v_float32x4 q5_25 = v_setall_f32(5.25f), t00, t01, t10, t11;
-        t00 = v_sub(x40, x20);
-        t01 = v_sub(x41, x21);
-        t10 = v_sub(x30, x50);
-        t11 = v_sub(x31, x51);
-        v_float32x4 y00 = v_fma(t00, q5_25, v_sub(x00, x60));
-        v_float32x4 y01 = v_fma(t01, q5_25, v_sub(x01, x61));
-        v_float32x4 y70 = v_fma(t10, q5_25, v_sub(x70, x10));
-        v_float32x4 y71 = v_fma(t11, q5_25, v_sub(x71, x11));
-
-        /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */
-        /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */
-        v_float32x4 qm4_25 = v_setall_f32(-4.25f);
-        t00 = v_fma(x30, qm4_25, v_add(x10, x50));
-        t01 = v_fma(x31, qm4_25, v_add(x11, x51));
-        t10 = v_fma(x40, qm4_25, v_add(x20, x60));
-        t11 = v_fma(x41, qm4_25, v_add(x21, x61));
-
-        v_float32x4 y10 = v_add(t00, t10), y11 = v_add(t01, t11);
-        v_float32x4 y20 = v_sub(t10, t00), y21 = v_sub(t11, t01);
-
-        /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */
-        /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */
-        v_float32x4 q0_5 = v_setall_f32(0.5f), q0_25 = v_setall_f32(0.25f);
-        v_float32x4 qm2_5 = v_setall_f32(-2.5f), qm1_25 = v_setall_f32(-1.25f);
-        t00 = v_fma(x10, q0_5, v_add(x50, x50));
-        t01 = v_fma(x11, q0_5, v_add(x51, x51));
-        t10 = v_fma(x20, q0_25, x60);
-        t11 = v_fma(x21, q0_25, x61);
-        t00 = v_fma(x30, qm2_5, t00);
-        t01 = v_fma(x31, qm2_5, t01);
-        t10 = v_fma(x40, qm1_25, t10);
-        t11 = v_fma(x41, qm1_25, t11);
-
-        v_float32x4 y30 = v_add(t00, t10), y31 = v_add(t01, t11);
-        v_float32x4 y40 = v_sub(t10, t00), y41 = v_sub(t11, t01);
-
-        /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */
-        /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */
-        v_float32x4 q4 = v_setall_f32(4.f), qm5 = v_setall_f32(-5.f);
-        t00 = v_fma(x50, q0_5, v_add(x10, x10));
-        t01 = v_fma(x51, q0_5, v_add(x11, x11));
-        t10 = v_fma(x20, q4   , x60);
-        t11 = v_fma(x21, q4   , x61);
-        t00 = v_fma(x30, qm2_5, t00);
-        t01 = v_fma(x31, qm2_5, t01);
-        t10 = v_fma(x40, qm5  , t10);
-        t11 = v_fma(x41, qm5  , t11);
-
-        v_float32x4 y50 = v_add(t00, t10), y51 = v_add(t01, t11);
-        v_float32x4 y60 = v_sub(t10, t00), y61 = v_sub(t11, t01);
-
-        /* transpose 8x8 matrix with v_transpose4x4 */
-
-        v_float32x4 y000, y100, y200, y300, y010, y110, y210, y310, y400, y500, y600, y700, y410, y510, y610, y710;
-        v_transpose4x4(y00, y10, y20, y30, y000, y100, y200, y300);
-        v_transpose4x4(y01, y11, y21, y31, y010, y110, y210, y310);
-        v_transpose4x4(y40, y50, y60, y70, y400, y500, y600, y700);
-        v_transpose4x4(y41, y51, y61, y71, y410, y510, y610, y710);
-
-        /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */
-        /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */
-        t00 = v_sub(y010, y200);
-        t01 = v_sub(y410, y600);
-        t10 = v_sub(y300, y110);
-        t11 = v_sub(y700, y510);
-        z00 = v_fma(t00, q5_25, v_sub(y000, y210));
-        z01 = v_fma(t01, q5_25, v_sub(y400, y610));
-        z70 = v_fma(t10, q5_25, v_sub(y310, y100));
-        z71 = v_fma(t11, q5_25, v_sub(y710, y500));
-
-        /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */
-        /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */
-        t00 = v_fma(y300, qm4_25, v_add(y100, y110));
-        t01 = v_fma(y700, qm4_25, v_add(y500, y510));
-        t10 = v_fma(y010, qm4_25, v_add(y200, y210));
-        t11 = v_fma(y410, qm4_25, v_add(y600, y610));
-
-        z10 = v_add(t00, t10); z11 = v_add(t01, t11);
-        z20 = v_sub(t10, t00); z21 = v_sub(t11, t01);
-
-        /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */
-        /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */
-        t00 = v_fma(y100, q0_5, v_add(y110, y110));
-        t01 = v_fma(y500, q0_5, v_add(y510, y510));
-        t10 = v_fma(y200, q0_25, y210);
-        t11 = v_fma(y600, q0_25, y610);
-        t00 = v_fma(y300, qm2_5, t00);
-        t01 = v_fma(y700, qm2_5, t01);
-        t10 = v_fma(y010, qm1_25, t10);
-        t11 = v_fma(y410, qm1_25, t11);
-
-        z30 = v_add(t00, t10); z31 = v_add(t01, t11);
-        z40 = v_sub(t10, t00); z41 = v_sub(t11, t01);
-
-        /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */
-        /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */
-        t00 = v_fma(y110, q0_5, v_add(y100, y100));
-        t01 = v_fma(y510, q0_5, v_add(y500, y500));
-        t10 = v_fma(y200, q4, y210);
-        t11 = v_fma(y600, q4, y610);
-        t00 = v_fma(y300, qm2_5, t00);
-        t01 = v_fma(y700, qm2_5, t01);
-        t10 = v_fma(y010, qm5, t10);
-        t11 = v_fma(y410, qm5, t11);
-
-        z50 = v_add(t00, t10); z51 = v_add(t01, t11);
-        z60 = v_sub(t10, t00); z61 = v_sub(t11, t01);
-    }
-
-    const int outstep = winoIblock*winoAtomF32*Cg;
-    // The 16 result vectors go out in the order z00, z01, z10, z11, ..., z70, z71, one per outstep floats.
-    v_store(outptr, z00);
-    v_store(outptr + outstep, z01);
-    v_store(outptr + outstep*2, z10);
-    v_store(outptr + outstep*3, z11);
-    v_store(outptr + outstep*4, z20);
-    v_store(outptr + outstep*5, z21);
-    v_store(outptr + outstep*6, z30);
-    v_store(outptr + outstep*7, z31);
-    v_store(outptr + outstep*8, z40);
-    v_store(outptr + outstep*9, z41);
-    v_store(outptr + outstep*10, z50);
-    v_store(outptr + outstep*11, z51);
-    v_store(outptr + outstep*12, z60);
-    v_store(outptr + outstep*13, z61);
-    v_store(outptr + outstep*14, z70);
-    v_store(outptr + outstep*15, z71);
-}
-
-/*Output transform*/
-/*  Inverse Winograd 8x8 transform:
-    out = (A'*inp*A)', where
-    inp is the input 8x8 FP32 matrix (rows are inpstep floats apart),
-    A' is the 6x8 matrix
-    [1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f,
-     0.f, 1.f, -1.f, 2.f, -2.f, 0.5f, -0.5f, 0.f,
-     0.f, 1.f, 1.f, 4.f, 4.f, 0.25f, 0.25f, 0.f,
-     0.f, 1.f, -1.f, 8.f, -8.f, 0.125f, -0.125f, 0.f,
-     0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f,
-     0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f]
-
-    inp is pre-loaded into xij registers,
-    out will be stored in zij, where (0<=i<=7 for x, 0<=i<=5 for z), 0<=j<=1.
-
-    After the inverse transform is done, we add bias,
-    optionally add results from the earlier tensors (by-pass; read from bpptr,
-    rows bpstep floats apart, only when bpptr is non-null),
-    optionally apply the activation function (clamp to [minval, maxval],
-    only when ifMinMaxAct is set) and then
-    store the final results as 6 rows, outstep floats apart, in outptr.
-
-    That is, after both forward and then inverse transformation,
-    we get non-transposed result.
-    Of course, for Winograd-based convolution to work correctly,
-    the Winograd-transformed weights should also be transposed.
-    init_conv() (see OpConv.fx) takes care of that.
-*/
-void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep,
-                          float* bpptr, int bpstep, float* outptr, int outstep,
-                          float bias, float minval, float maxval, bool ifMinMaxAct)
-{
-    v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
-    v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
-    v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
-    v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4);
-    v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4);
-    v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4);
-    v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4);
-    v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4);
-    v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51;
-
-    {
-        v_float32x4 s12_0, s12_1, s34_0, s34_1, s56_0, s56_1;
-        s12_0 = v_add(x10, x20); s12_1 = v_add(x11, x21);
-        s34_0 = v_add(x30, x40); s34_1 = v_add(x31, x41);
-        s56_0 = v_add(x50, x60); s56_1 = v_add(x51, x61);
-
-        v_float32x4 y00 = v_add(v_add(v_add(x00, s12_0), s34_0), s56_0);
-        v_float32x4 y01 = v_add(v_add(v_add(x01, s12_1), s34_1), s56_1);
-
-        v_float32x4 a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f);
-        v_float32x4 y20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
-        v_float32x4 y21 = v_fma(s56_1, a0 ,v_fma(s34_1, a1, s12_1) );
-
-        a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f);
-        v_float32x4 y40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
-        v_float32x4 y41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
-
-        s12_0 = v_sub(x10, x20); s12_1 = v_sub(x11, x21);
-        s34_0 = v_sub(x30, x40); s34_1 = v_sub(x31, x41);
-        s56_0 = v_sub(x50, x60); s56_1 = v_sub(x51, x61);
-
-        a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.f);
-        v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, v_add(x70, s12_0)));
-        v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, v_add(x71, s12_1)));
-
-        a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.f);
-        v_float32x4 y10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
-        v_float32x4 y11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
-
-        a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.f);
-        v_float32x4 y30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
-        v_float32x4 y31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
-
-        v_float32x4 y60 = v_setall_f32(0.f), y61 = y60, y70 = y60, y71 = y60;
-
-        /* transpose 8x8 matrix with v_transpose4x4 */
-
-        v_float32x4 y000, y100, y200, y300, y010, y110, y210, y310, y400, y500, y600, y700, y410, y510, y610, y710;
-        v_transpose4x4(y00, y10, y20, y30, y000, y100, y200, y300);
-        v_transpose4x4(y01, y11, y21, y31, y010, y110, y210, y310);
-        v_transpose4x4(y40, y50, y60, y70, y400, y500, y600, y700);
-        v_transpose4x4(y41, y51, y61, y71, y410, y510, y610, y710);
-
-        s12_0 = v_add(y100, y200); s12_1 = v_add(y500, y600);
-        s34_0 = v_add(y300, y010); s34_1 = v_add(y700, y410);
-        s56_0 = v_add(y110, y210); s56_1 = v_add(y510, y610);
-
-        z00 = v_add(v_add(v_add(y000, s12_0), s34_0), s56_0);
-        z01 = v_add(v_add(v_add(y400, s12_1), s34_1), s56_1);
-
-        a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f);
-        z20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
-        z21 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
-
-        a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f);
-        z40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
-        z41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
-
-        s12_0 = v_sub(y100, y200); s12_1 = v_sub(y500, y600);
-        s34_0 = v_sub(y300, y010); s34_1 = v_sub(y700, y410);
-        s56_0 = v_sub(y110, y210); s56_1 = v_sub(y510, y610);
-
-        a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.0f);
-        z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, v_add(y310, s12_0)));
-        z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, v_add(y710, s12_1)));
-        a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.0f);
-        z10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
-        z11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
-
-        a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.0f);
-        z30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
-        z31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
-
-        v_float32x4 vbias = v_setall_f32(bias);
-        z00 = v_add(z00, vbias);
-        z01 = v_add(z01, vbias);
-        z10 = v_add(z10, vbias);
-        z11 = v_add(z11, vbias);
-        z20 = v_add(z20, vbias);
-        z21 = v_add(z21, vbias);
-        z30 = v_add(z30, vbias);
-        z31 = v_add(z31, vbias);
-        z40 = v_add(z40, vbias);
-        z41 = v_add(z41, vbias);
-        z50 = v_add(z50, vbias);
-        z51 = v_add(z51, vbias);
-    }
-    // Optional by-pass input: add 6 rows read from bpptr, bpstep floats apart.
-    if (bpptr)
-    {
-        z00 = v_add(z00, v_load(bpptr));
-        z01 = v_add(z01, v_load_low(bpptr + 4));
-        z10 = v_add(z10, v_load(bpptr + bpstep));
-        z11 = v_add(z11, v_load_low(bpptr + bpstep + 4));
-        z20 = v_add(z20, v_load(bpptr + bpstep * 2));
-        z21 = v_add(z21, v_load_low(bpptr + bpstep * 2 + 4));
-        z30 = v_add(z30, v_load(bpptr + bpstep * 3));
-        z31 = v_add(z31, v_load_low(bpptr + bpstep * 3 + 4));
-        z40 = v_add(z40, v_load(bpptr + bpstep * 4));
-        z41 = v_add(z41, v_load_low(bpptr + bpstep * 4 + 4));
-        z50 = v_add(z50, v_load(bpptr + bpstep * 5));
-        z51 = v_add(z51, v_load_low(bpptr + bpstep * 5 + 4));
-    }
-    // Optional activation: clamp every output vector to [minval, maxval].
-    if (ifMinMaxAct)
-    {
-        v_float32x4 vmax = v_setall_f32(maxval);
-        v_float32x4 vmin = v_setall_f32(minval);
-
-        z00 = v_min(v_max(z00, vmin), vmax);
-        z01 = v_min(v_max(z01, vmin), vmax);
-        z10 = v_min(v_max(z10, vmin), vmax);
-        z11 = v_min(v_max(z11, vmin), vmax);
-        z20 = v_min(v_max(z20, vmin), vmax);
-        z21 = v_min(v_max(z21, vmin), vmax);
-        z30 = v_min(v_max(z30, vmin), vmax);
-        z31 = v_min(v_max(z31, vmin), vmax);
-        z40 = v_min(v_max(z40, vmin), vmax);
-        z41 = v_min(v_max(z41, vmin), vmax);
-        z50 = v_min(v_max(z50, vmin), vmax);
-        z51 = v_min(v_max(z51, vmin), vmax);
-    }
-    // Each row: one full vector, then v_store_low of the second (presumably its low 2 lanes, i.e. 6 floats per row - confirm).
-    v_store(outptr, z00);
-    v_store_low(outptr + 4, z01);
-    v_store(outptr + outstep, z10);
-    v_store_low(outptr + outstep + 4, z11);
-    v_store(outptr + outstep*2, z20);
-    v_store_low(outptr + outstep*2 + 4, z21);
-    v_store(outptr + outstep*3, z30);
-    v_store_low(outptr + outstep*3 + 4, z31);
-    v_store(outptr + outstep*4, z40);
-    v_store_low(outptr + outstep*4 + 4, z41);
-    v_store(outptr + outstep*5, z50);
-    v_store_low(outptr + outstep*5 + 4, z51);
-}
-#endif
-
-#else
-int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv>& conv,
-                  int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
-{
-    return 0; // stub for the #else configuration: ignores every argument and always returns 0
-}
-#endif
-
 }} // namespace cv::dnn
--- a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.dispatch.cpp
+++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.dispatch.cpp
@ -0,0 +1,22 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "convolution.hpp"
+#include "conv_winograd_f63.simd.hpp"
+#include "layers/cpu_kernels/conv_winograd_f63.simd_declarations.hpp"
+
+namespace cv {
+namespace dnn {
+/* FP32 entry point: hands the call to the getWinofunc_F32 variant that CV_CPU_DISPATCH
+   selects among CV_CPU_DISPATCH_MODES_ALL. There is no explicit return statement here;
+   the macro is expected to return the selected variant's Winofunc (confirm against its definition). */
+cv::dnn::Winofunc getWinofunc_F32()
+{
+    CV_CPU_DISPATCH(getWinofunc_F32, (), CV_CPU_DISPATCH_MODES_ALL);
+}
+/* FP16 entry point: same dispatch scheme, applied to getWinofunc_F16. */
+cv::dnn::Winofunc getWinofunc_F16()
+{
+    CV_CPU_DISPATCH(getWinofunc_F16, (), CV_CPU_DISPATCH_MODES_ALL);
+}
+
+}} // namespace cv::dnn
--- a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.neon.cpp
+++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.neon.cpp
@ -1,476 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-#include "../../precomp.hpp"
-#include "convolution.hpp"
-#include "opencv2/core/hal/intrin.hpp"
-
-namespace cv {
-namespace dnn {
-
-// NEON code work around.
-namespace opt_NEON
-{
-
-#if CV_NEON && CV_NEON_AARCH64
-
-/* Accumulate: for each of the winoNatomF32 atoms, sums over the Cg channels the lane-wise
-   products of 4 weight vectors (read from wptr) with the input vectors (read from inwptr)
-   and stores the sums to outbuf, 64 floats apart. outbuf advances by winoAtomF32 per atom;
-   inwptr and wptr keep advancing across atoms.
-   Requires winoIblock == 6, winoKblock == 4 and winoAtomF32 == 4 (asserted).
-   iblock > 3 : all 6 input vectors per channel are used (24 accumulators).
-   otherwise  : only the first 3 input vectors are used (12 accumulators); inwptr still
-                advances by winoIblock*winoAtomF32 per channel and the results keep the
-                6-wide slot numbering (0-2, 6-8, 12-14, 18-20). */
-void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
-                            const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32)
-{
-    CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF32 == 4);
-    if (iblock > 3)
-    {
-        for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
-                outbuf += winoAtomF32)
-        {
-            float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00;
-            float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00;
-            float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00;
-            float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00;
-            for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
-                                         wptr += winoKblock*winoAtomF32) {
-                float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4);
-                float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12);
-                float32x4_t x0, x1;
-                x0 = vld1q_f32(inwptr);
-                x1 = vld1q_f32(inwptr + 4);
-                s00 = vfmaq_f32(s00, w0, x0);
-                s01 = vfmaq_f32(s01, w0, x1);
-                s10 = vfmaq_f32(s10, w1, x0);
-                s11 = vfmaq_f32(s11, w1, x1);
-                s20 = vfmaq_f32(s20, w2, x0);
-                s21 = vfmaq_f32(s21, w2, x1);
-                s30 = vfmaq_f32(s30, w3, x0);
-                s31 = vfmaq_f32(s31, w3, x1);
-                x0 = vld1q_f32(inwptr + 8);
-                x1 = vld1q_f32(inwptr + 12);
-                s02 = vfmaq_f32(s02, w0, x0);
-                s03 = vfmaq_f32(s03, w0, x1);
-                s12 = vfmaq_f32(s12, w1, x0);
-                s13 = vfmaq_f32(s13, w1, x1);
-                s22 = vfmaq_f32(s22, w2, x0);
-                s23 = vfmaq_f32(s23, w2, x1);
-                s32 = vfmaq_f32(s32, w3, x0);
-                s33 = vfmaq_f32(s33, w3, x1);
-                x0 = vld1q_f32(inwptr + 16);
-                x1 = vld1q_f32(inwptr + 20);
-                s04 = vfmaq_f32(s04, w0, x0);
-                s05 = vfmaq_f32(s05, w0, x1);
-                s14 = vfmaq_f32(s14, w1, x0);
-                s15 = vfmaq_f32(s15, w1, x1);
-                s24 = vfmaq_f32(s24, w2, x0);
-                s25 = vfmaq_f32(s25, w2, x1);
-                s34 = vfmaq_f32(s34, w3, x0);
-                s35 = vfmaq_f32(s35, w3, x1);
-            }
-            // Accumulator sRC (weight vector R, input vector C) is stored at outbuf + (R*6 + C)*64.
-            vst1q_f32(outbuf, s00);
-            vst1q_f32(outbuf + 1*64, s01);
-            vst1q_f32(outbuf + 2*64, s02);
-            vst1q_f32(outbuf + 3*64, s03);
-            vst1q_f32(outbuf + 4*64, s04);
-            vst1q_f32(outbuf + 5*64, s05);
-
-            vst1q_f32(outbuf + 6*64, s10);
-            vst1q_f32(outbuf + 7*64, s11);
-            vst1q_f32(outbuf + 8*64, s12);
-            vst1q_f32(outbuf + 9*64, s13);
-            vst1q_f32(outbuf + 10*64, s14);
-            vst1q_f32(outbuf + 11*64, s15);
-
-            vst1q_f32(outbuf + 12*64, s20);
-            vst1q_f32(outbuf + 13*64, s21);
-            vst1q_f32(outbuf + 14*64, s22);
-            vst1q_f32(outbuf + 15*64, s23);
-            vst1q_f32(outbuf + 16*64, s24);
-            vst1q_f32(outbuf + 17*64, s25);
-
-            vst1q_f32(outbuf + 18*64, s30);
-            vst1q_f32(outbuf + 19*64, s31);
-            vst1q_f32(outbuf + 20*64, s32);
-            vst1q_f32(outbuf + 21*64, s33);
-            vst1q_f32(outbuf + 22*64, s34);
-            vst1q_f32(outbuf + 23*64, s35);
-        }
-    }
-    else
-    {
-        for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
-                outbuf += winoAtomF32)
-        {
-            float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00;
-            float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00;
-            float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00;
-            float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00;
-            for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
-                                         wptr += winoKblock*winoAtomF32) {
-                float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4);
-                float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12);
-                float32x4_t x0, x1, x2;
-                x0 = vld1q_f32(inwptr);
-                x1 = vld1q_f32(inwptr + 4);
-                x2 = vld1q_f32(inwptr + 8);
-                s00 = vfmaq_f32(s00, w0, x0);
-                s01 = vfmaq_f32(s01, w0, x1);
-                s02 = vfmaq_f32(s02, w0, x2);
-                s10 = vfmaq_f32(s10, w1, x0);
-                s11 = vfmaq_f32(s11, w1, x1);
-                s12 = vfmaq_f32(s12, w1, x2);
-                s20 = vfmaq_f32(s20, w2, x0);
-                s21 = vfmaq_f32(s21, w2, x1);
-                s22 = vfmaq_f32(s22, w2, x2);
-                s30 = vfmaq_f32(s30, w3, x0);
-                s31 = vfmaq_f32(s31, w3, x1);
-                s32 = vfmaq_f32(s32, w3, x2);
-            }
-            // Same (R*6 + C)*64 slot layout as the 6-wide branch; the slots for C = 3..5 are not written here.
-            vst1q_f32(outbuf, s00);
-            vst1q_f32(outbuf + 1*64, s01);
-            vst1q_f32(outbuf + 2*64, s02);
-            vst1q_f32(outbuf + 6*64, s10);
-            vst1q_f32(outbuf + 7*64, s11);
-            vst1q_f32(outbuf + 8*64, s12);
-            vst1q_f32(outbuf + 12*64, s20);
-            vst1q_f32(outbuf + 13*64, s21);
-            vst1q_f32(outbuf + 14*64, s22);
-            vst1q_f32(outbuf + 18*64, s30);
-            vst1q_f32(outbuf + 19*64, s31);
-            vst1q_f32(outbuf + 20*64, s32);
-        }
-    }
-}
-/* T4x4(a, b, c, d, tr0, tr1): transposes, in place, the 4x4 float matrix whose rows are the
-   vectors a, b, c, d. tr0 and tr1 are caller-provided scratch values that receive the
-   vtrnq_f32 results (the call sites in this file pass float32x4x2_t variables). */
-#undef T4x4
-#define T4x4(a, b, c, d, tr0, tr1) \
-    tr0 = vtrnq_f32(a, b); \
-    tr1 = vtrnq_f32(c, d); \
-    a = vcombine_f32(vget_low_f32(tr0.val[0]), vget_low_f32(tr1.val[0])); \
-    b = vcombine_f32(vget_low_f32(tr0.val[1]), vget_low_f32(tr1.val[1])); \
-    c = vcombine_f32(vget_high_f32(tr0.val[0]), vget_high_f32(tr1.val[0])); \
-    d = vcombine_f32(vget_high_f32(tr0.val[1]), vget_high_f32(tr1.val[1]))
-
-/*Input transform*/
-void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep,
-                          float* outptr, int Cg, const int winoIblock, const int winoAtomF32)
-{
-    float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4);
-    float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4);
-    float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4);
-    float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4);
-    float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4);
-    float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4);
-    float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4);
-    float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4);
-
-    float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71;
-
-    {
-        /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */
-        /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */
-        float32x4_t q5_25 = vdupq_n_f32(5.25f), t00, t01, t10, t11;
-        t00 = vsubq_f32(x40, x20);
-        t01 = vsubq_f32(x41, x21);
-        t10 = vsubq_f32(x30, x50);
-        t11 = vsubq_f32(x31, x51);
-        float32x4_t y00 = vfmaq_f32(vsubq_f32(x00, x60), t00, q5_25);
-        float32x4_t y01 = vfmaq_f32(vsubq_f32(x01, x61), t01, q5_25);
-        float32x4_t y70 = vfmaq_f32(vsubq_f32(x70, x10), t10, q5_25);
-        float32x4_t y71 = vfmaq_f32(vsubq_f32(x71, x11), t11, q5_25);
-
-        /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */
-        /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */
-        float32x4_t qm4_25 = vdupq_n_f32(-4.25f);
-        t00 = vfmaq_f32(vaddq_f32(x10, x50), x30, qm4_25);
-        t01 = vfmaq_f32(vaddq_f32(x11, x51), x31, qm4_25);
-        t10 = vfmaq_f32(vaddq_f32(x20, x60), x40, qm4_25);
-        t11 = vfmaq_f32(vaddq_f32(x21, x61), x41, qm4_25);
-
-        float32x4_t y10 = vaddq_f32(t00, t10), y11 = vaddq_f32(t01, t11);
-        float32x4_t y20 = vsubq_f32(t10, t00), y21 = vsubq_f32(t11, t01);
-
-        /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */
-        /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */
-        float32x4_t q0_5 = vdupq_n_f32(0.5f), q0_25 = vdupq_n_f32(0.25f);
-        float32x4_t qm2_5 = vdupq_n_f32(-2.5f), qm1_25 = vdupq_n_f32(-1.25f);
-        t00 = vfmaq_f32(vaddq_f32(x50, x50), x10, q0_5);
-        t01 = vfmaq_f32(vaddq_f32(x51, x51), x11, q0_5);
-        t10 = vfmaq_f32(x60, x20, q0_25);
-        t11 = vfmaq_f32(x61, x21, q0_25);
-        t00 = vfmaq_f32(t00, x30, qm2_5);
-        t01 = vfmaq_f32(t01, x31, qm2_5);
-        t10 = vfmaq_f32(t10, x40, qm1_25);
-        t11 = vfmaq_f32(t11, x41, qm1_25);
-
-        float32x4_t y30 = vaddq_f32(t00, t10), y31 = vaddq_f32(t01, t11);
-        float32x4_t y40 = vsubq_f32(t10, t00), y41 = vsubq_f32(t11, t01);
-
-        /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */
-        /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */
-        float32x4_t q4 = vdupq_n_f32(4.f), qm5 = vdupq_n_f32(-5.f);
-        t00 = vfmaq_f32(vaddq_f32(x10, x10), x50, q0_5);
-        t01 = vfmaq_f32(vaddq_f32(x11, x11), x51, q0_5);
-        t10 = vfmaq_f32(x60, x20, q4);
-        t11 = vfmaq_f32(x61, x21, q4);
-        t00 = vfmaq_f32(t00, x30, qm2_5);
-        t01 = vfmaq_f32(t01, x31, qm2_5);
-        t10 = vfmaq_f32(t10, x40, qm5);
-        t11 = vfmaq_f32(t11, x41, qm5);
-
-        float32x4_t y50 = vaddq_f32(t00, t10), y51 = vaddq_f32(t01, t11);
-        float32x4_t y60 = vsubq_f32(t10, t00), y61 = vsubq_f32(t11, t01);
-
-        /* transpose 8x8 matrix in-place with some renumeration of the elements: */
-        /* Y:              */
-        /*        y00 y01  */
-        /*        y10 y11  */
-        /*        ...      */
-        /*        y70 y71  */
-        /*   Y':           */
-        /*        y00 y40  */
-        /*        y10 y50  */
-        /*        y20 y60  */
-        /*        y30 y70  */
-        /*        y01 y41  */
-        /*        y11 y51  */
-        /*        y21 y61  */
-        /*        y31 y71  */
-        /*    in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */
-        float32x4x2_t tr0, tr1;
-
-        T4x4(y00, y10, y20, y30, tr0, tr1);
-        T4x4(y01, y11, y21, y31, tr0, tr1);
-        T4x4(y40, y50, y60, y70, tr0, tr1);
-        T4x4(y41, y51, y61, y71, tr0, tr1);
-
-        /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */
-        /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */
-        t00 = vsubq_f32(y01, y20);
-        t01 = vsubq_f32(y41, y60);
-        t10 = vsubq_f32(y30, y11);
-        t11 = vsubq_f32(y70, y51);
-        z00 = vfmaq_f32(vsubq_f32(y00, y21), t00, q5_25);
-        z01 = vfmaq_f32(vsubq_f32(y40, y61), t01, q5_25);
-        z70 = vfmaq_f32(vsubq_f32(y31, y10), t10, q5_25);
-        z71 = vfmaq_f32(vsubq_f32(y71, y50), t11, q5_25);
-
-        /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */
-        /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */
-        t00 = vfmaq_f32(vaddq_f32(y10, y11), y30, qm4_25);
-        t01 = vfmaq_f32(vaddq_f32(y50, y51), y70, qm4_25);
-        t10 = vfmaq_f32(vaddq_f32(y20, y21), y01, qm4_25);
-        t11 = vfmaq_f32(vaddq_f32(y60, y61), y41, qm4_25);
-
-        z10 = vaddq_f32(t00, t10); z11 = vaddq_f32(t01, t11);
-        z20 = vsubq_f32(t10, t00); z21 = vsubq_f32(t11, t01);
-
-        /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */
-        /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */
-        t00 = vfmaq_f32(vaddq_f32(y11, y11), y10, q0_5);
-        t01 = vfmaq_f32(vaddq_f32(y51, y51), y50, q0_5);
-        t10 = vfmaq_f32(y21, y20, q0_25);
-        t11 = vfmaq_f32(y61, y60, q0_25);
-        t00 = vfmaq_f32(t00, y30, qm2_5);
-        t01 = vfmaq_f32(t01, y70, qm2_5);
-        t10 = vfmaq_f32(t10, y01, qm1_25);
-        t11 = vfmaq_f32(t11, y41, qm1_25);
-
-        z30 = vaddq_f32(t00, t10); z31 = vaddq_f32(t01, t11);
-        z40 = vsubq_f32(t10, t00); z41 = vsubq_f32(t11, t01);
-
-        /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */
-        /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */
-        t00 = vfmaq_f32(vaddq_f32(y10, y10), y11, q0_5);
-        t01 = vfmaq_f32(vaddq_f32(y50, y50), y51, q0_5);
-        t10 = vfmaq_f32(y21, y20, q4);
-        t11 = vfmaq_f32(y61, y60, q4);
-        t00 = vfmaq_f32(t00, y30, qm2_5);
-        t01 = vfmaq_f32(t01, y70, qm2_5);
-        t10 = vfmaq_f32(t10, y01, qm5);
-        t11 = vfmaq_f32(t11, y41, qm5);
-
-        z50 = vaddq_f32(t00, t10); z51 = vaddq_f32(t01, t11);
-        z60 = vsubq_f32(t10, t00); z61 = vsubq_f32(t11, t01);
-    }
-
-    const int outstep = winoIblock*winoAtomF32*Cg;
-
-    vst1q_f32(outptr, z00);
-    vst1q_f32(outptr + outstep, z01);
-    vst1q_f32(outptr + outstep*2, z10);
-    vst1q_f32(outptr + outstep*3, z11);
-    vst1q_f32(outptr + outstep*4, z20);
-    vst1q_f32(outptr + outstep*5, z21);
-    vst1q_f32(outptr + outstep*6, z30);
-    vst1q_f32(outptr + outstep*7, z31);
-    vst1q_f32(outptr + outstep*8, z40);
-    vst1q_f32(outptr + outstep*9, z41);
-    vst1q_f32(outptr + outstep*10, z50);
-    vst1q_f32(outptr + outstep*11, z51);
-    vst1q_f32(outptr + outstep*12, z60);
-    vst1q_f32(outptr + outstep*13, z61);
-    vst1q_f32(outptr + outstep*14, z70);
-    vst1q_f32(outptr + outstep*15, z71);
-}
-
-/*Output transform*/
-void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep,
-                          float* bpptr, int bpstep, float* outptr, int outstep,
-                          float bias, float minval, float maxval, bool ifMinMaxAct)
-{
-    float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4);
-    float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4);
-    float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4);
-    float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4);
-    float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4);
-    float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4);
-    float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4);
-    float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4);
-    float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51;
-
-    {
-        float32x4_t s12_0, s12_1, s34_0, s34_1, s56_0, s56_1;
-        s12_0 = vaddq_f32(x10, x20); s12_1 = vaddq_f32(x11, x21);
-        s34_0 = vaddq_f32(x30, x40); s34_1 = vaddq_f32(x31, x41);
-        s56_0 = vaddq_f32(x50, x60); s56_1 = vaddq_f32(x51, x61);
-
-        float32x4_t y00 = vaddq_f32(vaddq_f32(vaddq_f32(x00, s12_0), s34_0), s56_0);
-        float32x4_t y01 = vaddq_f32(vaddq_f32(vaddq_f32(x01, s12_1), s34_1), s56_1);
-        float32x4_t y20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f);
-        float32x4_t y21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f);
-        float32x4_t y40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16);
-        float32x4_t y41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16);
-
-        s12_0 = vsubq_f32(x10, x20); s12_1 = vsubq_f32(x11, x21);
-        s34_0 = vsubq_f32(x30, x40); s34_1 = vsubq_f32(x31, x41);
-        s56_0 = vsubq_f32(x50, x60); s56_1 = vsubq_f32(x51, x61);
-
-        float32x4_t y50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x70, s12_0),
-                                      s34_0, 32.f), s56_0, 1.f/32);
-        float32x4_t y51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x71, s12_1),
-                                      s34_1, 32.f), s56_1, 1.f/32);
-        float32x4_t y10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f);
-        float32x4_t y11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f);
-        float32x4_t y30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f);
-        float32x4_t y31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f);
-        float32x4_t y60 = vdupq_n_f32(0.f), y61 = y60, y70 = y60, y71 = y60;
-
-        /* transpose 8x8 matrix in-place with some renumeration of the elements: */
-        /*  Y: */
-        /*        y00 y01 */
-        /*        y10 y11 */
-        /*        ... */
-        /*        y50 y51 */
-        /*        0   0 */
-        /*        0   0 */
-        /*   Y': */
-        /*        y00 y40 */
-        /*        y10 y50 */
-        /*        y20 y60 */
-        /*        y30 y70 */
-        /*        y01 y41 */
-        /*        y11 y51 */
-        /*        y21 y61 */
-        /*        y31 y71 */
-        /*    in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */
-        float32x4x2_t tr0, tr1;
-
-        T4x4(y00, y10, y20, y30, tr0, tr1);
-        T4x4(y01, y11, y21, y31, tr0, tr1);
-        T4x4(y40, y50, y60, y70, tr0, tr1);
-        T4x4(y41, y51, y61, y71, tr0, tr1);
-
-        s12_0 = vaddq_f32(y10, y20); s12_1 = vaddq_f32(y50, y60);
-        s34_0 = vaddq_f32(y30, y01); s34_1 = vaddq_f32(y70, y41);
-        s56_0 = vaddq_f32(y11, y21); s56_1 = vaddq_f32(y51, y61);
-
-        z00 = vaddq_f32(vaddq_f32(vaddq_f32(y00, s12_0), s34_0), s56_0);
-        z01 = vaddq_f32(vaddq_f32(vaddq_f32(y40, s12_1), s34_1), s56_1);
-        z20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f);
-        z21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f);
-        z40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16);
-        z41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16);
-
-        s12_0 = vsubq_f32(y10, y20); s12_1 = vsubq_f32(y50, y60);
-        s34_0 = vsubq_f32(y30, y01); s34_1 = vsubq_f32(y70, y41);
-        s56_0 = vsubq_f32(y11, y21); s56_1 = vsubq_f32(y51, y61);
-
-        z50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y31, s12_0),
-                          s34_0, 32.f), s56_0, 1.f/32);
-        z51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y71, s12_1),
-                          s34_1, 32.f), s56_1, 1.f/32);
-        z10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f);
-        z11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f);
-        z30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f);
-        z31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f);
-        float32x4_t vbias = vdupq_n_f32(bias);
-
-        z00 = vaddq_f32(z00, vbias);
-        z01 = vaddq_f32(z01, vbias);
-        z10 = vaddq_f32(z10, vbias);
-        z11 = vaddq_f32(z11, vbias);
-        z20 = vaddq_f32(z20, vbias);
-        z21 = vaddq_f32(z21, vbias);
-        z30 = vaddq_f32(z30, vbias);
-        z31 = vaddq_f32(z31, vbias);
-        z40 = vaddq_f32(z40, vbias);
-        z41 = vaddq_f32(z41, vbias);
-        z50 = vaddq_f32(z50, vbias);
-        z51 = vaddq_f32(z51, vbias);
-    }
-
-    if (bpptr)
-    {
-        float32x2_t zhalf = vdup_n_f32(0.f);
-        z00 = vaddq_f32(z00, vld1q_f32(bpptr));
-        z01 = vaddq_f32(z01, vcombine_f32(vld1_f32(bpptr + 4), zhalf));
-        z10 = vaddq_f32(z10, vld1q_f32(bpptr + bpstep));
-        z11 = vaddq_f32(z11, vcombine_f32(vld1_f32(bpptr + bpstep + 4), zhalf));
-        z20 = vaddq_f32(z20, vld1q_f32(bpptr + bpstep*2));
-        z21 = vaddq_f32(z21, vcombine_f32(vld1_f32(bpptr + bpstep*2 + 4), zhalf));
-        z30 = vaddq_f32(z30, vld1q_f32(bpptr + bpstep*3));
-        z31 = vaddq_f32(z31, vcombine_f32(vld1_f32(bpptr + bpstep*3 + 4), zhalf));
-        z40 = vaddq_f32(z40, vld1q_f32(bpptr + bpstep*4));
-        z41 = vaddq_f32(z41, vcombine_f32(vld1_f32(bpptr + bpstep*4 + 4), zhalf));
-        z50 = vaddq_f32(z50, vld1q_f32(bpptr + bpstep*5));
-        z51 = vaddq_f32(z51, vcombine_f32(vld1_f32(bpptr + bpstep*5 + 4), zhalf));
-    }
-
-    if (ifMinMaxAct)
-    {
-        float32x4_t vmax = vdupq_n_f32(maxval);
-        float32x4_t vmin = vdupq_n_f32(minval);
-
-        z00 = vminq_f32(vmaxq_f32(z00, vmin), vmax);
-        z01 = vminq_f32(vmaxq_f32(z01, vmin), vmax);
-        z10 = vminq_f32(vmaxq_f32(z10, vmin), vmax);
-        z11 = vminq_f32(vmaxq_f32(z11, vmin), vmax);
-        z20 = vminq_f32(vmaxq_f32(z20, vmin), vmax);
-        z21 = vminq_f32(vmaxq_f32(z21, vmin), vmax);
-        z30 = vminq_f32(vmaxq_f32(z30, vmin), vmax);
-        z31 = vminq_f32(vmaxq_f32(z31, vmin), vmax);
-        z40 = vminq_f32(vmaxq_f32(z40, vmin), vmax);
-        z41 = vminq_f32(vmaxq_f32(z41, vmin), vmax);
-        z50 = vminq_f32(vmaxq_f32(z50, vmin), vmax);
-        z51 = vminq_f32(vmaxq_f32(z51, vmin), vmax);
-    }
-
-    vst1q_f32(outptr, z00);
-    vst1_f32(outptr + 4, vget_low_f32(z01));
-    vst1q_f32(outptr + outstep, z10);
-    vst1_f32(outptr + outstep + 4, vget_low_f32(z11));
-    vst1q_f32(outptr + outstep*2, z20);
-    vst1_f32(outptr + outstep*2 + 4, vget_low_f32(z21));
-    vst1q_f32(outptr + outstep*3, z30);
-    vst1_f32(outptr + outstep*3 + 4, vget_low_f32(z31));
-    vst1q_f32(outptr + outstep*4, z40);
-    vst1_f32(outptr + outstep*4 + 4, vget_low_f32(z41));
-    vst1q_f32(outptr + outstep*5, z50);
-    vst1_f32(outptr + outstep*5 + 4, vget_low_f32(z51));
-}
-
-#endif
-}
-
-}} // namespace
--- a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp
+++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp
--- a/modules/dnn/src/layers/cpu_kernels/convolution.hpp
+++ b/modules/dnn/src/layers/cpu_kernels/convolution.hpp
@ -6,6 +6,7 @@
 #define OPENCV_FAST_CONVOLUTION_HPP

 #include "opencv2/core/hal/intrin.hpp"
+#include "opencv2/dnn/all_layers.hpp"

 #ifndef CONV_PRAM
 #define CONV_PRAM
@ -119,25 +120,30 @@ void convBlock_F32(int np, const float* a, const float* b, float* c, int ldc, bo

 void convBlockMR1_F32(int np, const float* a, const float* b, float* c, const float bias, bool init_c,
                      const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR);
-
-#if CV_NEON_AARCH64
-/* Accumulate */
-void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
-                    const int winoIblock, const int winoKblock, const int winoAtom, const int winoNatom);
-
-/*Input transform*/
-void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep,
-                       float* outptr, int Cg, const int winoIblock, const int winoAtom);
-
-/*Output transform*/
-void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep,
-                       float* bpptr, int bpstep, float* outptr, int outstep,
-                       float bias, float minval, float maxval, bool ifMinMaxAct);
-#endif // CV_NEON_AARCH64
 #endif // CV_NEON
 } // namespace opt_NEON.


+
+// === Function tables
+struct Winofunc // Table of Winograd kernel entry points plus three integer parameters; returned by getWinofunc_F32() and getWinofunc_F16().
+{
+    void (*accum)(const uchar* inwptr, const uchar* wptr, uchar* outbuf, int Cg, int iblock, const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32); // accumulate kernel; buffers are untyped uchar pointers, presumably so F32 and F16 data share one signature - confirm
+    void (*BtXB_8x8)(const float* inptr, int inpstep, uchar* outptr, int Cg, const int winoIblock, const int winoAtomF32); // input transform: reads float input, writes into an untyped uchar buffer
+    void (*AtXA_8x8)(const uchar* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep, float bias, float minval, float maxval, bool ifMinMaxAct); // output transform: reads an untyped uchar buffer, writes float output
+    int iblock; // NOTE(review): presumably the block size passed to the kernels as winoIblock - confirm against the dispatch source
+    int natom; // NOTE(review): presumably the atom count passed to accum as winoNatomF32 - confirm
+    int esz; // NOTE(review): presumably the element size in bytes of the uchar buffers - confirm
+
+    bool isGood() const { return accum && BtXB_8x8 && AtXA_8x8 && iblock > 0 && natom > 0 && esz > 0; } // true only when all three pointers are set and all three integers are positive
+    static Winofunc empty() { return {0, 0, 0, 0, 0, 0}; } // all-zero table: null pointers and zero sizes, so isGood() is false for it
+};
+
+// === wrapper calls (implemented in .dispatch.cpp)
+Winofunc getWinofunc_F32(); // returns the kernel table for the F32 variant; NOTE(review): presumably Winofunc::empty() when no implementation is available, so check isGood() - confirm
+Winofunc getWinofunc_F16(); // returns the kernel table for the F16 variant; same caveat as getWinofunc_F32()
+
+
 } // namespace dnn
 } // namespace cv

--- a/modules/flann/include/opencv2/flann/hdf5.h
+++ b/modules/flann/include/opencv2/flann/hdf5.h
@ -1,235 +0,0 @@
-/***********************************************************************
- * Software License Agreement (BSD License)
- *
- * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
- * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *************************************************************************/
-
-
-#ifndef OPENCV_FLANN_HDF5_H_
-#define OPENCV_FLANN_HDF5_H_
-
-//! @cond IGNORED
-
-#include <hdf5.h>
-
-#include "matrix.h"
-
-
-namespace cvflann
-{
-
-namespace
-{
-
-template<typename T>
-hid_t get_hdf5_type()
-{
-    throw FLANNException("Unsupported type for IO operations");
-}
-
-template<>
-hid_t get_hdf5_type<char>() { return H5T_NATIVE_CHAR; }
-template<>
-hid_t get_hdf5_type<unsigned char>() { return H5T_NATIVE_UCHAR; }
-template<>
-hid_t get_hdf5_type<short int>() { return H5T_NATIVE_SHORT; }
-template<>
-hid_t get_hdf5_type<unsigned short int>() { return H5T_NATIVE_USHORT; }
-template<>
-hid_t get_hdf5_type<int>() { return H5T_NATIVE_INT; }
-template<>
-hid_t get_hdf5_type<unsigned int>() { return H5T_NATIVE_UINT; }
-template<>
-hid_t get_hdf5_type<long>() { return H5T_NATIVE_LONG; }
-template<>
-hid_t get_hdf5_type<unsigned long>() { return H5T_NATIVE_ULONG; }
-template<>
-hid_t get_hdf5_type<float>() { return H5T_NATIVE_FLOAT; }
-template<>
-hid_t get_hdf5_type<double>() { return H5T_NATIVE_DOUBLE; }
-}
-
-
-#define CHECK_ERROR(x,y) if ((x)<0) throw FLANNException((y));
-
-template<typename T>
-void save_to_file(const cvflann::Matrix<T>& dataset, const String& filename, const String& name)
-{
-
-#if H5Eset_auto_vers == 2
-    H5Eset_auto( H5E_DEFAULT, NULL, NULL );
-#else
-    H5Eset_auto( NULL, NULL );
-#endif
-
-    herr_t status;
-    hid_t file_id;
-    file_id = H5Fopen(filename.c_str(), H5F_ACC_RDWR, H5P_DEFAULT);
-    if (file_id < 0) {
-        file_id = H5Fcreate(filename.c_str(), H5F_ACC_EXCL, H5P_DEFAULT, H5P_DEFAULT);
-    }
-    CHECK_ERROR(file_id,"Error creating hdf5 file.");
-
-    hsize_t     dimsf[2];              // dataset dimensions
-    dimsf[0] = dataset.rows;
-    dimsf[1] = dataset.cols;
-
-    hid_t space_id = H5Screate_simple(2, dimsf, NULL);
-    hid_t memspace_id = H5Screate_simple(2, dimsf, NULL);
-
-    hid_t dataset_id;
-#if H5Dcreate_vers == 2
-    dataset_id = H5Dcreate2(file_id, name.c_str(), get_hdf5_type<T>(), space_id, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
-#else
-    dataset_id = H5Dcreate(file_id, name.c_str(), get_hdf5_type<T>(), space_id, H5P_DEFAULT);
-#endif
-
-    if (dataset_id<0) {
-#if H5Dopen_vers == 2
-        dataset_id = H5Dopen2(file_id, name.c_str(), H5P_DEFAULT);
-#else
-        dataset_id = H5Dopen(file_id, name.c_str());
-#endif
-    }
-    CHECK_ERROR(dataset_id,"Error creating or opening dataset in file.");
-
-    status = H5Dwrite(dataset_id, get_hdf5_type<T>(), memspace_id, space_id, H5P_DEFAULT, dataset.data );
-    CHECK_ERROR(status, "Error writing to dataset");
-
-    H5Sclose(memspace_id);
-    H5Sclose(space_id);
-    H5Dclose(dataset_id);
-    H5Fclose(file_id);
-
-}
-
-
-template<typename T>
-void load_from_file(cvflann::Matrix<T>& dataset, const String& filename, const String& name)
-{
-    herr_t status;
-    hid_t file_id = H5Fopen(filename.c_str(), H5F_ACC_RDWR, H5P_DEFAULT);
-    CHECK_ERROR(file_id,"Error opening hdf5 file.");
-
-    hid_t dataset_id;
-#if H5Dopen_vers == 2
-    dataset_id = H5Dopen2(file_id, name.c_str(), H5P_DEFAULT);
-#else
-    dataset_id = H5Dopen(file_id, name.c_str());
-#endif
-    CHECK_ERROR(dataset_id,"Error opening dataset in file.");
-
-    hid_t space_id = H5Dget_space(dataset_id);
-
-    hsize_t dims_out[2];
-    H5Sget_simple_extent_dims(space_id, dims_out, NULL);
-
-    dataset = cvflann::Matrix<T>(new T[dims_out[0]*dims_out[1]], dims_out[0], dims_out[1]);
-
-    status = H5Dread(dataset_id, get_hdf5_type<T>(), H5S_ALL, H5S_ALL, H5P_DEFAULT, dataset[0]);
-    CHECK_ERROR(status, "Error reading dataset");
-
-    H5Sclose(space_id);
-    H5Dclose(dataset_id);
-    H5Fclose(file_id);
-}
-
-
-#ifdef HAVE_MPI
-
-namespace mpi
-{
-/**
- * Loads a the hyperslice corresponding to this processor from a hdf5 file.
- * @param flann_dataset Dataset where the data is loaded
- * @param filename HDF5 file name
- * @param name Name of dataset inside file
- */
-template<typename T>
-void load_from_file(cvflann::Matrix<T>& dataset, const String& filename, const String& name)
-{
-    MPI_Comm comm  = MPI_COMM_WORLD;
-    MPI_Info info  = MPI_INFO_NULL;
-
-    int mpi_size, mpi_rank;
-    MPI_Comm_size(comm, &mpi_size);
-    MPI_Comm_rank(comm, &mpi_rank);
-
-    herr_t status;
-
-    hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS);
-    H5Pset_fapl_mpio(plist_id, comm, info);
-    hid_t file_id = H5Fopen(filename.c_str(), H5F_ACC_RDWR, plist_id);
-    CHECK_ERROR(file_id,"Error opening hdf5 file.");
-    H5Pclose(plist_id);
-    hid_t dataset_id;
-#if H5Dopen_vers == 2
-    dataset_id = H5Dopen2(file_id, name.c_str(), H5P_DEFAULT);
-#else
-    dataset_id = H5Dopen(file_id, name.c_str());
-#endif
-    CHECK_ERROR(dataset_id,"Error opening dataset in file.");
-
-    hid_t space_id = H5Dget_space(dataset_id);
-    hsize_t dims[2];
-    H5Sget_simple_extent_dims(space_id, dims, NULL);
-
-    hsize_t count[2];
-    hsize_t offset[2];
-
-    hsize_t item_cnt = dims[0]/mpi_size+(dims[0]%mpi_size==0 ? 0 : 1);
-    hsize_t cnt = (mpi_rank<mpi_size-1 ? item_cnt : dims[0]-item_cnt*(mpi_size-1));
-
-    count[0] = cnt;
-    count[1] = dims[1];
-    offset[0] = mpi_rank*item_cnt;
-    offset[1] = 0;
-
-    hid_t memspace_id = H5Screate_simple(2,count,NULL);
-
-    H5Sselect_hyperslab(space_id, H5S_SELECT_SET, offset, NULL, count, NULL);
-
-    dataset.rows = count[0];
-    dataset.cols = count[1];
-    dataset.data = new T[dataset.rows*dataset.cols];
-
-    plist_id = H5Pcreate(H5P_DATASET_XFER);
-    H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE);
-    status = H5Dread(dataset_id, get_hdf5_type<T>(), memspace_id, space_id, plist_id, dataset.data);
-    CHECK_ERROR(status, "Error reading dataset");
-
-    H5Pclose(plist_id);
-    H5Sclose(space_id);
-    H5Sclose(memspace_id);
-    H5Dclose(dataset_id);
-    H5Fclose(file_id);
-}
-}
-#endif // HAVE_MPI
-} // namespace cvflann::mpi
-
-//! @endcond
-
-#endif /* OPENCV_FLANN_HDF5_H_ */
--- a/modules/imgcodecs/CMakeLists.txt
+++ b/modules/imgcodecs/CMakeLists.txt
@ -190,7 +190,7 @@ endif()
 if(TARGET opencv_test_imgcodecs AND HAVE_OPENEXR AND "$ENV{OPENCV_IO_ENABLE_OPENEXR}")
  ocv_target_compile_definitions(opencv_test_imgcodecs PRIVATE OPENCV_IMGCODECS_ENABLE_OPENEXR_TESTS=1)
 endif()
-if(TARGET opencv_test_imgcodecs AND ((HAVE_PNG AND NOT (PNG_VERSION VERSION_LESS "1.6.31")) OR HAVE_SPNG))
+if(TARGET opencv_test_imgcodecs AND ((HAVE_PNG AND NOT (PNG_VERSION_STRING VERSION_LESS "1.6.31")) OR HAVE_SPNG))
  # details: https://github.com/glennrp/libpng/commit/68cb0aaee3de6371b81a4613476d9b33e43e95b1
  ocv_target_compile_definitions(opencv_test_imgcodecs PRIVATE OPENCV_IMGCODECS_PNG_WITH_EXIF=1)
 endif()
--- a/modules/imgcodecs/src/grfmt_exr.cpp
+++ b/modules/imgcodecs/src/grfmt_exr.cpp
@ -754,7 +754,10 @@ bool  ExrEncoder::write( const Mat& img, const std::vector<int>& params )
            case IMWRITE_EXR_COMPRESSION_B44A:
                header.compression() = B44A_COMPRESSION;
                break;
-#if ((OPENEXR_VERSION_MAJOR * 1000 + OPENEXR_VERSION_MINOR) >= (2 * 1000 + 2)) // available since version 2.2.0
+// version macros introduced in openexr 2.0.1.
+// - https://github.com/AcademySoftwareFoundation/openexr/commit/60cdff8a6f5c4e25a374e5f366d6e9b4efd869b3#diff-c4bae0726aebe410e407db9abd406d9cf2684f82dd8a08f46d84e8b7c35cf22aR67
+#if defined(OPENEXR_VERSION_MAJOR) && defined(OPENEXR_VERSION_MINOR) && OPENEXR_VERSION_MAJOR * 1000 + OPENEXR_VERSION_MINOR >= 2 * 1000 + 2
+            // available since version 2.2.0
            case IMWRITE_EXR_COMPRESSION_DWAA:
                header.compression() = DWAA_COMPRESSION;
                break;
@ -768,10 +771,12 @@ bool  ExrEncoder::write( const Mat& img, const std::vector<int>& params )
        }
        if (params[i] == IMWRITE_EXR_DWA_COMPRESSION_LEVEL)
        {
-#if OPENEXR_VERSION_MAJOR >= 3
-            header.dwaCompressionLevel() = params[i + 1];
-#else
+#if !defined(OPENEXR_VERSION_MAJOR)
+            CV_LOG_ONCE_WARNING(NULL, "Setting `IMWRITE_EXR_DWA_COMPRESSION_LEVEL` not supported in unknown OpenEXR version possibly prior to 2.0.1 (version 3 is required)");
+#elif OPENEXR_VERSION_MAJOR < 3
            CV_LOG_ONCE_WARNING(NULL, "Setting `IMWRITE_EXR_DWA_COMPRESSION_LEVEL` not supported in OpenEXR version " + std::to_string(OPENEXR_VERSION_MAJOR) + " (version 3 is required)");
+#else
+            header.dwaCompressionLevel() = params[i + 1];
 #endif
        }
    }
--- a/modules/imgcodecs/src/loadsave.cpp
+++ b/modules/imgcodecs/src/loadsave.cpp
@ -83,6 +83,9 @@ static Size validateInputImageSize(const Size& size)

 static inline int calcType(int type, int flags)
 {
+    if ( (flags & (IMREAD_COLOR | IMREAD_ANYCOLOR | IMREAD_ANYDEPTH)) == (IMREAD_COLOR | IMREAD_ANYCOLOR | IMREAD_ANYDEPTH))
+        return type;
+
    if( (flags & IMREAD_LOAD_GDAL) != IMREAD_LOAD_GDAL && flags != IMREAD_UNCHANGED )
    {
        if( (flags & IMREAD_ANYDEPTH) == 0 )
--- a/modules/imgcodecs/test/test_avif.cpp
+++ b/modules/imgcodecs/test/test_avif.cpp
@ -187,51 +187,6 @@ INSTANTIATE_TEST_CASE_P(

 ////////////////////////////////////////////////////////////////////////////////

-typedef testing::TestWithParam<string> Imgcodecs_AVIF_Exif;
-
-TEST_P(Imgcodecs_AVIF_Exif, exif_orientation) {
-  const string root = cvtest::TS::ptr()->get_data_path();
-  const string filename = root + GetParam();
-  const int colorThresholdHigh = 250;
-  const int colorThresholdLow = 5;
-
-  Mat m_img = imread(filename);
-  ASSERT_FALSE(m_img.empty());
-  Vec3b vec;
-
-  // Checking the first quadrant (with supposed red)
-  vec = m_img.at<Vec3b>(2, 2);  // some point inside the square
-  EXPECT_LE(vec.val[0], colorThresholdLow);
-  EXPECT_LE(vec.val[1], colorThresholdLow);
-  EXPECT_GE(vec.val[2], colorThresholdHigh);
-
-  // Checking the second quadrant (with supposed green)
-  vec = m_img.at<Vec3b>(2, 7);  // some point inside the square
-  EXPECT_LE(vec.val[0], colorThresholdLow);
-  EXPECT_GE(vec.val[1], colorThresholdHigh);
-  EXPECT_LE(vec.val[2], colorThresholdLow);
-
-  // Checking the third quadrant (with supposed blue)
-  vec = m_img.at<Vec3b>(7, 2);  // some point inside the square
-  EXPECT_GE(vec.val[0], colorThresholdHigh);
-  EXPECT_LE(vec.val[1], colorThresholdLow);
-  EXPECT_LE(vec.val[2], colorThresholdLow);
-}
-
-const string exif_files[] = {"readwrite/testExifOrientation_1.avif",
-                             "readwrite/testExifOrientation_2.avif",
-                             "readwrite/testExifOrientation_3.avif",
-                             "readwrite/testExifOrientation_4.avif",
-                             "readwrite/testExifOrientation_5.avif",
-                             "readwrite/testExifOrientation_6.avif",
-                             "readwrite/testExifOrientation_7.avif",
-                             "readwrite/testExifOrientation_8.avif"};
-
-INSTANTIATE_TEST_CASE_P(ExifFiles, Imgcodecs_AVIF_Exif,
-                        testing::ValuesIn(exif_files));
-
-////////////////////////////////////////////////////////////////////////////////
-
 class Imgcodecs_Avif_Animation_RoundTripSuite
    : public Imgcodecs_Avif_RoundTripSuite {
 public:
--- a/modules/imgcodecs/test/test_exif.cpp
+++ b/modules/imgcodecs/test/test_exif.cpp
@ -0,0 +1,151 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level
+// directory of this distribution and at http://opencv.org/license.html
+
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+/**
+ * Test to check whether the EXIF orientation tag was processed successfully or not.
+ * The test uses a set of 8 images named testExifOrientation_{1 to 8}.(extension).
+ * Each test image is a 10x10 square, divided into four smaller sub-squares:
+ * (R corresponds to Red, G to Green, B to Blue, W to White)
+ * ---------             ---------
+ * | R | G |             | G | R |
+ * |-------| - (tag 1)   |-------| - (tag 2)
+ * | B | W |             | W | B |
+ * ---------             ---------
+ *
+ * ---------             ---------
+ * | W | B |             | B | W |
+ * |-------| - (tag 3)   |-------| - (tag 4)
+ * | G | R |             | R | G |
+ * ---------             ---------
+ *
+ * ---------             ---------
+ * | R | B |             | G | W |
+ * |-------| - (tag 5)   |-------| - (tag 6)
+ * | G | W |             | R | B |
+ * ---------             ---------
+ *
+ * ---------             ---------
+ * | W | G |             | B | R |
+ * |-------| - (tag 7)   |-------| - (tag 8)
+ * | B | R |             | W | G |
+ * ---------             ---------
+ *
+ *
+ * Each image contains an EXIF field with an orientation tag (0x112).
+ * After reading each image and applying the orientation tag,
+ * the resulting image should be:
+ * ---------
+ * | R | G |
+ * |-------|
+ * | B | W |
+ * ---------
+ *
+ * Note:
+ * The flags parameter of the imread function is set as IMREAD_COLOR | IMREAD_ANYCOLOR | IMREAD_ANYDEPTH.
+ * Using this combination is an undocumented trick to load images similarly to the IMREAD_UNCHANGED flag,
+ * preserving the alpha channel (if present) while also applying the orientation.
+ */
+
+typedef testing::TestWithParam<string> Exif; // fixture whose parameter is an image path relative to the test data root
+
+TEST_P(Exif, exif_orientation) // Reads one image and checks that three of its quadrants show the colors of the upright R/G/B/W layout.
+{
+    const string root = cvtest::TS::ptr()->get_data_path();
+    const string filename = root + GetParam(); // test data root followed by the parameterized relative path
+    const int colorThresholdHigh = 250; // a channel expected to be saturated must be at least this value
+    const int colorThresholdLow = 5; // a channel expected to be empty must be at most this value
+
+    // Flag combination as described in the note of the file header comment: keep the stored channels while still applying the orientation.
+    Mat m_img = imread(filename, IMREAD_COLOR | IMREAD_ANYCOLOR | IMREAD_ANYDEPTH);
+    ASSERT_FALSE(m_img.empty()); // stop this case here if imread returned an empty matrix
+
+    if (m_img.channels() == 3)
+    {
+        Vec3b vec;
+
+        // Top-left quadrant, expected red: channels 0 and 1 low, channel 2 high (BGR order)
+        vec = m_img.at<Vec3b>(2, 2); // row 2, column 2: a point inside the top-left square
+        EXPECT_LE(vec.val[0], colorThresholdLow);
+        EXPECT_LE(vec.val[1], colorThresholdLow);
+        EXPECT_GE(vec.val[2], colorThresholdHigh);
+
+        // Top-right quadrant, expected green: only channel 1 high
+        vec = m_img.at<Vec3b>(2, 7);  // row 2, column 7: a point inside the top-right square
+        EXPECT_LE(vec.val[0], colorThresholdLow);
+        EXPECT_GE(vec.val[1], colorThresholdHigh);
+        EXPECT_LE(vec.val[2], colorThresholdLow);
+
+        // Bottom-left quadrant, expected blue: only channel 0 high
+        vec = m_img.at<Vec3b>(7, 2);  // row 7, column 2: a point inside the bottom-left square
+        EXPECT_GE(vec.val[0], colorThresholdHigh);
+        EXPECT_LE(vec.val[1], colorThresholdLow);
+        EXPECT_LE(vec.val[2], colorThresholdLow);
+    } // the bottom-right (white) quadrant is not checked in this branch
+    else // NOTE(review): every channel count other than 3 is read as Vec4b, which assumes exactly 4 channels - confirm no test file decodes to 1 or 2 channels
+    {
+        Vec4b vec;
+
+        // Top-left quadrant, expected red; the fourth channel is not inspected in any of the checks below
+        vec = m_img.at<Vec4b>(2, 2); // row 2, column 2: a point inside the top-left square
+        EXPECT_LE(vec.val[0], colorThresholdLow);
+        EXPECT_LE(vec.val[1], colorThresholdLow);
+        EXPECT_GE(vec.val[2], colorThresholdHigh);
+
+        // Top-right quadrant, expected green: only channel 1 high
+        vec = m_img.at<Vec4b>(2, 7);  // row 2, column 7: a point inside the top-right square
+        EXPECT_LE(vec.val[0], colorThresholdLow);
+        EXPECT_GE(vec.val[1], colorThresholdHigh);
+        EXPECT_LE(vec.val[2], colorThresholdLow);
+
+        // Bottom-left quadrant, expected blue: only channel 0 high
+        vec = m_img.at<Vec4b>(7, 2);  // row 7, column 2: a point inside the bottom-left square
+        EXPECT_GE(vec.val[0], colorThresholdHigh);
+        EXPECT_LE(vec.val[1], colorThresholdLow);
+        EXPECT_LE(vec.val[2], colorThresholdLow);
+    } // the bottom-right (white) quadrant is not checked in this branch either
+}
+
+const string exif_files[] = // Test inputs relative to the test data root; each group of eight is compiled in only when its codec macro is defined.
+{ // NOTE(review): this initializer list is empty when none of the three macros below is defined - confirm at least one is always set when this file is built
+#ifdef HAVE_JPEG
+    "readwrite/testExifOrientation_1.jpg",
+    "readwrite/testExifOrientation_2.jpg",
+    "readwrite/testExifOrientation_3.jpg",
+    "readwrite/testExifOrientation_4.jpg",
+    "readwrite/testExifOrientation_5.jpg",
+    "readwrite/testExifOrientation_6.jpg",
+    "readwrite/testExifOrientation_7.jpg",
+    "readwrite/testExifOrientation_8.jpg",
+#endif
+#ifdef OPENCV_IMGCODECS_PNG_WITH_EXIF
+    "readwrite/testExifOrientation_1.png",
+    "readwrite/testExifOrientation_2.png",
+    "readwrite/testExifOrientation_3.png",
+    "readwrite/testExifOrientation_4.png",
+    "readwrite/testExifOrientation_5.png",
+    "readwrite/testExifOrientation_6.png",
+    "readwrite/testExifOrientation_7.png",
+    "readwrite/testExifOrientation_8.png",
+#endif
+#ifdef HAVE_AVIF
+    "readwrite/testExifOrientation_1.avif",
+    "readwrite/testExifOrientation_2.avif",
+    "readwrite/testExifOrientation_3.avif",
+    "readwrite/testExifOrientation_4.avif",
+    "readwrite/testExifOrientation_5.avif",
+    "readwrite/testExifOrientation_6.avif",
+    "readwrite/testExifOrientation_7.avif",
+    "readwrite/testExifOrientation_8.avif",
+#endif
+};
+
+INSTANTIATE_TEST_CASE_P(Imgcodecs, Exif, // one exif_orientation case per entry of exif_files, under the instantiation prefix Imgcodecs
+    testing::ValuesIn(exif_files));
+
+}
+}
--- a/modules/imgcodecs/test/test_jpeg.cpp
+++ b/modules/imgcodecs/test/test_jpeg.cpp
@ -11,95 +11,6 @@ extern "C" {
 #include "jpeglib.h"
 }

-/**
- * Test for check whether reading exif orientation tag was processed successfully or not
- * The test info is the set of 8 images named testExifRotate_{1 to 8}.jpg
- * The test image is the square 10x10 points divided by four sub-squares:
- * (R corresponds to Red, G to Green, B to Blue, W to white)
- * ---------             ---------
- * | R | G |             | G | R |
- * |-------| - (tag 1)   |-------| - (tag 2)
- * | B | W |             | W | B |
- * ---------             ---------
- *
- * ---------             ---------
- * | W | B |             | B | W |
- * |-------| - (tag 3)   |-------| - (tag 4)
- * | G | R |             | R | G |
- * ---------             ---------
- *
- * ---------             ---------
- * | R | B |             | G | W |
- * |-------| - (tag 5)   |-------| - (tag 6)
- * | G | W |             | R | B |
- * ---------             ---------
- *
- * ---------             ---------
- * | W | G |             | B | R |
- * |-------| - (tag 7)   |-------| - (tag 8)
- * | B | R |             | W | G |
- * ---------             ---------
- *
- *
- * Every image contains exif field with orientation tag (0x112)
- * After reading each image the corresponding matrix must be read as
- * ---------
- * | R | G |
- * |-------|
- * | B | W |
- * ---------
- *
- */
-
-typedef testing::TestWithParam<string> Imgcodecs_Jpeg_Exif;
-
-TEST_P(Imgcodecs_Jpeg_Exif, exif_orientation)
-{
-    const string root = cvtest::TS::ptr()->get_data_path();
-    const string filename = root + GetParam();
-    const int colorThresholdHigh = 250;
-    const int colorThresholdLow = 5;
-
-    Mat m_img = imread(filename);
-    ASSERT_FALSE(m_img.empty());
-    Vec3b vec;
-
-    //Checking the first quadrant (with supposed red)
-    vec = m_img.at<Vec3b>(2, 2); //some point inside the square
-    EXPECT_LE(vec.val[0], colorThresholdLow);
-    EXPECT_LE(vec.val[1], colorThresholdLow);
-    EXPECT_GE(vec.val[2], colorThresholdHigh);
-
-    //Checking the second quadrant (with supposed green)
-    vec = m_img.at<Vec3b>(2, 7);  //some point inside the square
-    EXPECT_LE(vec.val[0], colorThresholdLow);
-    EXPECT_GE(vec.val[1], colorThresholdHigh);
-    EXPECT_LE(vec.val[2], colorThresholdLow);
-
-    //Checking the third quadrant (with supposed blue)
-    vec = m_img.at<Vec3b>(7, 2);  //some point inside the square
-    EXPECT_GE(vec.val[0], colorThresholdHigh);
-    EXPECT_LE(vec.val[1], colorThresholdLow);
-    EXPECT_LE(vec.val[2], colorThresholdLow);
-}
-
-const string exif_files[] =
-{
-    "readwrite/testExifOrientation_1.jpg",
-    "readwrite/testExifOrientation_2.jpg",
-    "readwrite/testExifOrientation_3.jpg",
-    "readwrite/testExifOrientation_4.jpg",
-    "readwrite/testExifOrientation_5.jpg",
-    "readwrite/testExifOrientation_6.jpg",
-    "readwrite/testExifOrientation_7.jpg",
-    "readwrite/testExifOrientation_8.jpg"
-};
-
-INSTANTIATE_TEST_CASE_P(ExifFiles, Imgcodecs_Jpeg_Exif,
-                        testing::ValuesIn(exif_files));
-
-//==================================================================================================
-
 TEST(Imgcodecs_Jpeg, encode_empty)
 {
    cv::Mat img;
--- a/modules/imgcodecs/test/test_png.cpp
+++ b/modules/imgcodecs/test/test_png.cpp
@ -109,100 +109,6 @@ TEST(Imgcodecs_Png, read_color_palette_with_alpha)
    EXPECT_EQ(img.at<Vec3b>(0, 1), Vec3b(255, 0, 0));
 }

-/**
- * Test for check whether reading exif orientation tag was processed successfully or not
- * The test info is the set of 8 images named testExifRotate_{1 to 8}.png
- * The test image is the square 10x10 points divided by four sub-squares:
- * (R corresponds to Red, G to Green, B to Blue, W to white)
- * ---------             ---------
- * | R | G |             | G | R |
- * |-------| - (tag 1)   |-------| - (tag 2)
- * | B | W |             | W | B |
- * ---------             ---------
- *
- * ---------             ---------
- * | W | B |             | B | W |
- * |-------| - (tag 3)   |-------| - (tag 4)
- * | G | R |             | R | G |
- * ---------             ---------
- *
- * ---------             ---------
- * | R | B |             | G | W |
- * |-------| - (tag 5)   |-------| - (tag 6)
- * | G | W |             | R | B |
- * ---------             ---------
- *
- * ---------             ---------
- * | W | G |             | B | R |
- * |-------| - (tag 7)   |-------| - (tag 8)
- * | B | R |             | W | G |
- * ---------             ---------
- *
- *
- * Every image contains exif field with orientation tag (0x112)
- * After reading each image and applying the orientation tag,
- * the resulting image should be:
- * ---------
- * | R | G |
- * |-------|
- * | B | W |
- * ---------
- *
- */
-
-typedef testing::TestWithParam<string> Imgcodecs_PNG_Exif;
-
-// Solution to issue 16579: PNG read doesn't support Exif orientation data
-#ifdef OPENCV_IMGCODECS_PNG_WITH_EXIF
-TEST_P(Imgcodecs_PNG_Exif, exif_orientation)
-#else
-TEST_P(Imgcodecs_PNG_Exif, DISABLED_exif_orientation)
-#endif
-{
-    const string root = cvtest::TS::ptr()->get_data_path();
-    const string filename = root + GetParam();
-    const int colorThresholdHigh = 250;
-    const int colorThresholdLow = 5;
-
-    Mat m_img = imread(filename);
-    ASSERT_FALSE(m_img.empty());
-    Vec3b vec;
-
-    //Checking the first quadrant (with supposed red)
-    vec = m_img.at<Vec3b>(2, 2); //some point inside the square
-    EXPECT_LE(vec.val[0], colorThresholdLow);
-    EXPECT_LE(vec.val[1], colorThresholdLow);
-    EXPECT_GE(vec.val[2], colorThresholdHigh);
-
-    //Checking the second quadrant (with supposed green)
-    vec = m_img.at<Vec3b>(2, 7);  //some point inside the square
-    EXPECT_LE(vec.val[0], colorThresholdLow);
-    EXPECT_GE(vec.val[1], colorThresholdHigh);
-    EXPECT_LE(vec.val[2], colorThresholdLow);
-
-    //Checking the third quadrant (with supposed blue)
-    vec = m_img.at<Vec3b>(7, 2);  //some point inside the square
-    EXPECT_GE(vec.val[0], colorThresholdHigh);
-    EXPECT_LE(vec.val[1], colorThresholdLow);
-    EXPECT_LE(vec.val[2], colorThresholdLow);
-}
-
-const string exif_files[] =
-{
-    "readwrite/testExifOrientation_1.png",
-    "readwrite/testExifOrientation_2.png",
-    "readwrite/testExifOrientation_3.png",
-    "readwrite/testExifOrientation_4.png",
-    "readwrite/testExifOrientation_5.png",
-    "readwrite/testExifOrientation_6.png",
-    "readwrite/testExifOrientation_7.png",
-    "readwrite/testExifOrientation_8.png"
-};
-
-INSTANTIATE_TEST_CASE_P(ExifFiles, Imgcodecs_PNG_Exif,
-    testing::ValuesIn(exif_files));
-
-
 typedef testing::TestWithParam<string> Imgcodecs_Png_PngSuite;

 TEST_P(Imgcodecs_Png_PngSuite, decode)