diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index 285326a963..96c190f5b8 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -13,8 +13,9 @@ ocv_add_dispatched_file(split SSE2 AVX2)
 ocv_add_dispatched_file(sum SSE2 AVX2)
 
 # dispatching for accuracy tests
-ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2)
-ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2)
+ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2 AVX512_SKX)
+ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2 AVX512_SKX)
+ocv_add_dispatched_file_force_all(test_intrin512 TEST AVX512_SKX)
 
 ocv_add_module(core
     OPTIONAL opencv_cudev
diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
index 460c5c5900..adce1b3fb1 100644
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -180,6 +180,18 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 
 #endif
 
+// AVX512 can be used together with SSE2 and AVX2, so
+// we define those sets of intrinsics at once.
+// Some AVX512 intrinsics get the v512_ prefix instead of v_, e.g. v512_load() vs v_load().
+// Wide intrinsics are mapped to their v512_ counterparts in this case (e.g. vx_load() => v512_load()).
+#if CV_AVX512_SKX
+
+#define CV__SIMD_FORWARD 512
+#include "opencv2/core/hal/intrin_forward.hpp"
+#include "opencv2/core/hal/intrin_avx512.hpp"
+
+#endif
+
 //! @cond IGNORED
 
 namespace cv {
@@ -321,13 +333,41 @@ template<typename _Tp> struct V_RegTraits
     CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8);
 #endif
 
+#if CV_SIMD512
+    CV_DEF_REG_TRAITS(v512, v_uint8x64, uchar, u8, v_uint8x64, v_uint16x32, v_uint32x16, v_int8x64, void);
+    CV_DEF_REG_TRAITS(v512, v_int8x64, schar, s8, v_uint8x64, v_int16x32, v_int32x16, v_int8x64, void);
+    CV_DEF_REG_TRAITS(v512, v_uint16x32, ushort, u16, v_uint16x32, v_uint32x16, v_uint64x8, v_int16x32, void);
+    CV_DEF_REG_TRAITS(v512, v_int16x32, short, s16, v_uint16x32, v_int32x16, v_int64x8, v_int16x32, void);
+    CV_DEF_REG_TRAITS(v512, v_uint32x16, unsigned, u32, v_uint32x16, v_uint64x8, void, v_int32x16, void);
+    CV_DEF_REG_TRAITS(v512, v_int32x16, int, s32, v_uint32x16, v_int64x8, void, v_int32x16, void);
+    CV_DEF_REG_TRAITS(v512, v_float32x16, float, f32, v_float32x16, v_float64x8, void, v_int32x16, v_int32x16);
+    CV_DEF_REG_TRAITS(v512, v_uint64x8, uint64, u64, v_uint64x8, void, void, v_int64x8, void);
+    CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void);
+    CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16);
+#endif
+
 #if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
 #define CV__SIMD_NAMESPACE simd512
 namespace CV__SIMD_NAMESPACE {
     #define CV_SIMD 1
     #define CV_SIMD_64F CV_SIMD512_64F
+    #define CV_SIMD_FP16 CV_SIMD512_FP16
     #define CV_SIMD_WIDTH 64
-    // TODO typedef v_uint8 / v_int32 / etc types here
+    typedef v_uint8x64 v_uint8;
+    typedef v_int8x64 v_int8;
+    typedef v_uint16x32 v_uint16;
+    typedef v_int16x32 v_int16;
+    typedef v_uint32x16 v_uint32;
+    typedef v_int32x16 v_int32;
+    typedef v_uint64x8 v_uint64;
+    typedef v_int64x8 v_int64;
+    typedef v_float32x16 v_float32;
+    CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v512)
+#if CV_SIMD512_64F
+    typedef v_float64x8 v_float64;
+    CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v512, load)
+#endif
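// Usage sketch (illustrative only; the scale_add helper below is hypothetical, the vx_*/v_* calls
// are the standard width-agnostic universal intrinsics): with the typedefs and wide mapping above,
// the same kernel compiles to 512-bit operations whenever CV_SIMD512 is selected at dispatch time.
//
//     void scale_add(const float* src, float* dst, int n, float k)
//     {
//         v_float32 vk = vx_setall_f32(k);                    // -> v512_setall_f32 under AVX512_SKX
//         int i = 0;
//         for (; i <= n - v_float32::nlanes; i += v_float32::nlanes)   // 16 floats per iteration
//             v_store(dst + i, v_fma(vx_load(src + i), vk, vx_load(dst + i)));
//         for (; i < n; ++i)                                  // scalar tail
//             dst[i] = src[i] * k + dst[i];
//         vx_cleanup();
//     }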
+ inline void vx_cleanup() { v512_cleanup(); } } // namespace using namespace CV__SIMD_NAMESPACE; #elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp new file mode 100644 index 0000000000..69d8d8398d --- /dev/null +++ b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp @@ -0,0 +1,2743 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#ifndef OPENCV_HAL_INTRIN_AVX512_HPP +#define OPENCV_HAL_INTRIN_AVX512_HPP + +#define CVT_ROUND_MODES_IMPLEMENTED 0 + +#define CV_SIMD512 1 +#define CV_SIMD512_64F 1 +#define CV_SIMD512_FP16 0 // no native operations with FP16 type. Only load/store from float32x8 are available (if CV_FP16 == 1) + +#define _v512_set_epu64(a7, a6, a5, a4, a3, a2, a1, a0) _mm512_set_epi64((int64)(a7),(int64)(a6),(int64)(a5),(int64)(a4),(int64)(a3),(int64)(a2),(int64)(a1),(int64)(a0)) +#define _v512_set_epu32(a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \ + _mm512_set_epi64(((int64)(a15)<<32)|(int64)(a14), ((int64)(a13)<<32)|(int64)(a12), ((int64)(a11)<<32)|(int64)(a10), ((int64)( a9)<<32)|(int64)( a8), \ + ((int64)( a7)<<32)|(int64)( a6), ((int64)( a5)<<32)|(int64)( a4), ((int64)( a3)<<32)|(int64)( a2), ((int64)( a1)<<32)|(int64)( a0)) +#define _v512_set_epu16(a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \ + a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \ + _v512_set_epu32(((unsigned)(a31)<<16)|(unsigned)(a30), ((unsigned)(a29)<<16)|(unsigned)(a28), ((unsigned)(a27)<<16)|(unsigned)(a26), ((unsigned)(a25)<<16)|(unsigned)(a24), \ + ((unsigned)(a23)<<16)|(unsigned)(a22), ((unsigned)(a21)<<16)|(unsigned)(a20), ((unsigned)(a19)<<16)|(unsigned)(a18), ((unsigned)(a17)<<16)|(unsigned)(a16), \ + ((unsigned)(a15)<<16)|(unsigned)(a14), ((unsigned)(a13)<<16)|(unsigned)(a12), ((unsigned)(a11)<<16)|(unsigned)(a10), ((unsigned)( a9)<<16)|(unsigned)( a8), \ + ((unsigned)( a7)<<16)|(unsigned)( a6), ((unsigned)( a5)<<16)|(unsigned)( a4), ((unsigned)( a3)<<16)|(unsigned)( a2), ((unsigned)( a1)<<16)|(unsigned)( a0)) +#define _v512_set_epu8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \ + a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \ + a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \ + a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \ + _v512_set_epu32(((unsigned)(a63)<<24)|((unsigned)(a62)<<16)|((unsigned)(a61)<<8)|(unsigned)(a60),((unsigned)(a59)<<24)|((unsigned)(a58)<<16)|((unsigned)(a57)<<8)|(unsigned)(a56), \ + ((unsigned)(a55)<<24)|((unsigned)(a54)<<16)|((unsigned)(a53)<<8)|(unsigned)(a52),((unsigned)(a51)<<24)|((unsigned)(a50)<<16)|((unsigned)(a49)<<8)|(unsigned)(a48), \ + ((unsigned)(a47)<<24)|((unsigned)(a46)<<16)|((unsigned)(a45)<<8)|(unsigned)(a44),((unsigned)(a43)<<24)|((unsigned)(a42)<<16)|((unsigned)(a41)<<8)|(unsigned)(a40), \ + ((unsigned)(a39)<<24)|((unsigned)(a38)<<16)|((unsigned)(a37)<<8)|(unsigned)(a36),((unsigned)(a35)<<24)|((unsigned)(a34)<<16)|((unsigned)(a33)<<8)|(unsigned)(a32), \ + ((unsigned)(a31)<<24)|((unsigned)(a30)<<16)|((unsigned)(a29)<<8)|(unsigned)(a28),((unsigned)(a27)<<24)|((unsigned)(a26)<<16)|((unsigned)(a25)<<8)|(unsigned)(a24), \ + 
((unsigned)(a23)<<24)|((unsigned)(a22)<<16)|((unsigned)(a21)<<8)|(unsigned)(a20),((unsigned)(a19)<<24)|((unsigned)(a18)<<16)|((unsigned)(a17)<<8)|(unsigned)(a16), \ + ((unsigned)(a15)<<24)|((unsigned)(a14)<<16)|((unsigned)(a13)<<8)|(unsigned)(a12),((unsigned)(a11)<<24)|((unsigned)(a10)<<16)|((unsigned)( a9)<<8)|(unsigned)( a8), \ + ((unsigned)( a7)<<24)|((unsigned)( a6)<<16)|((unsigned)( a5)<<8)|(unsigned)( a4),((unsigned)( a3)<<24)|((unsigned)( a2)<<16)|((unsigned)( a1)<<8)|(unsigned)( a0)) +#define _v512_set_epi8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \ + a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \ + a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \ + a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \ + _v512_set_epu8((uchar)(a63), (uchar)(a62), (uchar)(a61), (uchar)(a60), (uchar)(a59), (uchar)(a58), (uchar)(a57), (uchar)(a56), \ + (uchar)(a55), (uchar)(a54), (uchar)(a53), (uchar)(a52), (uchar)(a51), (uchar)(a50), (uchar)(a49), (uchar)(a48), \ + (uchar)(a47), (uchar)(a46), (uchar)(a45), (uchar)(a44), (uchar)(a43), (uchar)(a42), (uchar)(a41), (uchar)(a40), \ + (uchar)(a39), (uchar)(a38), (uchar)(a37), (uchar)(a36), (uchar)(a35), (uchar)(a34), (uchar)(a33), (uchar)(a32), \ + (uchar)(a31), (uchar)(a30), (uchar)(a29), (uchar)(a28), (uchar)(a27), (uchar)(a26), (uchar)(a25), (uchar)(a24), \ + (uchar)(a23), (uchar)(a22), (uchar)(a21), (uchar)(a20), (uchar)(a19), (uchar)(a18), (uchar)(a17), (uchar)(a16), \ + (uchar)(a15), (uchar)(a14), (uchar)(a13), (uchar)(a12), (uchar)(a11), (uchar)(a10), (uchar)( a9), (uchar)( a8), \ + (uchar)( a7), (uchar)( a6), (uchar)( a5), (uchar)( a4), (uchar)( a3), (uchar)( a2), (uchar)( a1), (uchar)( a0)) + +#ifndef _mm512_cvtpd_pslo +#ifdef _mm512_zextsi256_si512 +#define _mm512_cvtpd_pslo(a) _mm512_zextps256_ps512(_mm512_cvtpd_ps(a)) +#else +//if preferred way to extend with zeros is unavailable +#define _mm512_cvtpd_pslo(a) _mm512_castps256_ps512(_mm512_cvtpd_ps(a)) +#endif +#endif +///////// Utils //////////// + +namespace +{ + +inline __m512i _v512_combine(const __m256i& lo, const __m256i& hi) +{ return _mm512_inserti32x8(_mm512_castsi256_si512(lo), hi, 1); } + +inline __m512 _v512_combine(const __m256& lo, const __m256& hi) +{ return _mm512_insertf32x8(_mm512_castps256_ps512(lo), hi, 1); } + +inline __m512d _v512_combine(const __m256d& lo, const __m256d& hi) +{ return _mm512_insertf64x4(_mm512_castpd256_pd512(lo), hi, 1); } + +inline int _v_cvtsi512_si32(const __m512i& a) +{ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); } + +inline __m256i _v512_extract_high(const __m512i& v) +{ return _mm512_extracti32x8_epi32(v, 1); } + +inline __m256 _v512_extract_high(const __m512& v) +{ return _mm512_extractf32x8_ps(v, 1); } + +inline __m256d _v512_extract_high(const __m512d& v) +{ return _mm512_extractf64x4_pd(v, 1); } + +inline __m256i _v512_extract_low(const __m512i& v) +{ return _mm512_castsi512_si256(v); } + +inline __m256 _v512_extract_low(const __m512& v) +{ return _mm512_castps512_ps256(v); } + +inline __m256d _v512_extract_low(const __m512d& v) +{ return _mm512_castpd512_pd256(v); } + +inline __m512i _v512_insert(const __m512i& a, const __m256i& b) +{ return _mm512_inserti32x8(a, b, 0); } + +inline __m512 _v512_insert(const __m512& a, const __m256& b) +{ return _mm512_insertf32x8(a, b, 0); } + +inline __m512d _v512_insert(const __m512d& a, const __m256d& b) +{ return _mm512_insertf64x4(a, b, 0); } + +} + +namespace cv +{ + +//! 
@cond IGNORED + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +///////// Types //////////// + +struct v_uint8x64 +{ + typedef uchar lane_type; + enum { nlanes = 64 }; + __m512i val; + + explicit v_uint8x64(__m512i v) : val(v) {} + v_uint8x64(uchar v0, uchar v1, uchar v2, uchar v3, + uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, + uchar v12, uchar v13, uchar v14, uchar v15, + uchar v16, uchar v17, uchar v18, uchar v19, + uchar v20, uchar v21, uchar v22, uchar v23, + uchar v24, uchar v25, uchar v26, uchar v27, + uchar v28, uchar v29, uchar v30, uchar v31, + uchar v32, uchar v33, uchar v34, uchar v35, + uchar v36, uchar v37, uchar v38, uchar v39, + uchar v40, uchar v41, uchar v42, uchar v43, + uchar v44, uchar v45, uchar v46, uchar v47, + uchar v48, uchar v49, uchar v50, uchar v51, + uchar v52, uchar v53, uchar v54, uchar v55, + uchar v56, uchar v57, uchar v58, uchar v59, + uchar v60, uchar v61, uchar v62, uchar v63) + { + val = _v512_set_epu8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48, + v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32, + v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, + v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0); + } + v_uint8x64() : val(_mm512_setzero_si512()) {} + uchar get0() const { return (uchar)_v_cvtsi512_si32(val); } +}; + +struct v_int8x64 +{ + typedef schar lane_type; + enum { nlanes = 64 }; + __m512i val; + + explicit v_int8x64(__m512i v) : val(v) {} + v_int8x64(schar v0, schar v1, schar v2, schar v3, + schar v4, schar v5, schar v6, schar v7, + schar v8, schar v9, schar v10, schar v11, + schar v12, schar v13, schar v14, schar v15, + schar v16, schar v17, schar v18, schar v19, + schar v20, schar v21, schar v22, schar v23, + schar v24, schar v25, schar v26, schar v27, + schar v28, schar v29, schar v30, schar v31, + schar v32, schar v33, schar v34, schar v35, + schar v36, schar v37, schar v38, schar v39, + schar v40, schar v41, schar v42, schar v43, + schar v44, schar v45, schar v46, schar v47, + schar v48, schar v49, schar v50, schar v51, + schar v52, schar v53, schar v54, schar v55, + schar v56, schar v57, schar v58, schar v59, + schar v60, schar v61, schar v62, schar v63) + { + val = _v512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48, + v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32, + v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, + v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0); + } + v_int8x64() : val(_mm512_setzero_si512()) {} + schar get0() const { return (schar)_v_cvtsi512_si32(val); } +}; + +struct v_uint16x32 +{ + typedef ushort lane_type; + enum { nlanes = 32 }; + __m512i val; + + explicit v_uint16x32(__m512i v) : val(v) {} + v_uint16x32(ushort v0, ushort v1, ushort v2, ushort v3, + ushort v4, ushort v5, ushort v6, ushort v7, + ushort v8, ushort v9, ushort v10, ushort v11, + ushort v12, ushort v13, ushort v14, ushort v15, + ushort v16, ushort v17, ushort v18, ushort v19, + ushort v20, ushort v21, ushort v22, ushort v23, + ushort v24, ushort v25, ushort v26, ushort v27, + ushort v28, ushort v29, ushort v30, ushort v31) + { + val = _v512_set_epu16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, + v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0); + } + v_uint16x32() : val(_mm512_setzero_si512()) {} + ushort get0() const 
{ return (ushort)_v_cvtsi512_si32(val); } +}; + +struct v_int16x32 +{ + typedef short lane_type; + enum { nlanes = 32 }; + __m512i val; + + explicit v_int16x32(__m512i v) : val(v) {} + v_int16x32(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7, + short v8, short v9, short v10, short v11, short v12, short v13, short v14, short v15, + short v16, short v17, short v18, short v19, short v20, short v21, short v22, short v23, + short v24, short v25, short v26, short v27, short v28, short v29, short v30, short v31) + { + val = _v512_set_epu16((ushort)v31, (ushort)v30, (ushort)v29, (ushort)v28, (ushort)v27, (ushort)v26, (ushort)v25, (ushort)v24, + (ushort)v23, (ushort)v22, (ushort)v21, (ushort)v20, (ushort)v19, (ushort)v18, (ushort)v17, (ushort)v16, + (ushort)v15, (ushort)v14, (ushort)v13, (ushort)v12, (ushort)v11, (ushort)v10, (ushort)v9 , (ushort)v8, + (ushort)v7 , (ushort)v6 , (ushort)v5 , (ushort)v4 , (ushort)v3 , (ushort)v2 , (ushort)v1 , (ushort)v0); + } + v_int16x32() : val(_mm512_setzero_si512()) {} + short get0() const { return (short)_v_cvtsi512_si32(val); } +}; + +struct v_uint32x16 +{ + typedef unsigned lane_type; + enum { nlanes = 16 }; + __m512i val; + + explicit v_uint32x16(__m512i v) : val(v) {} + v_uint32x16(unsigned v0, unsigned v1, unsigned v2, unsigned v3, + unsigned v4, unsigned v5, unsigned v6, unsigned v7, + unsigned v8, unsigned v9, unsigned v10, unsigned v11, + unsigned v12, unsigned v13, unsigned v14, unsigned v15) + { + val = _mm512_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3, (int)v4, (int)v5, (int)v6, (int)v7, + (int)v8, (int)v9, (int)v10, (int)v11, (int)v12, (int)v13, (int)v14, (int)v15); + } + v_uint32x16() : val(_mm512_setzero_si512()) {} + unsigned get0() const { return (unsigned)_v_cvtsi512_si32(val); } +}; + +struct v_int32x16 +{ + typedef int lane_type; + enum { nlanes = 16 }; + __m512i val; + + explicit v_int32x16(__m512i v) : val(v) {} + v_int32x16(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7, + int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15) + { + val = _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + v_int32x16() : val(_mm512_setzero_si512()) {} + int get0() const { return _v_cvtsi512_si32(val); } +}; + +struct v_float32x16 +{ + typedef float lane_type; + enum { nlanes = 16 }; + __m512 val; + + explicit v_float32x16(__m512 v) : val(v) {} + v_float32x16(float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7, + float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15) + { + val = _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + v_float32x16() : val(_mm512_setzero_ps()) {} + float get0() const { return _mm_cvtss_f32(_mm512_castps512_ps128(val)); } +}; + +struct v_uint64x8 +{ + typedef uint64 lane_type; + enum { nlanes = 8 }; + __m512i val; + + explicit v_uint64x8(__m512i v) : val(v) {} + v_uint64x8(uint64 v0, uint64 v1, uint64 v2, uint64 v3, uint64 v4, uint64 v5, uint64 v6, uint64 v7) + { val = _mm512_setr_epi64((int64)v0, (int64)v1, (int64)v2, (int64)v3, (int64)v4, (int64)v5, (int64)v6, (int64)v7); } + v_uint64x8() : val(_mm512_setzero_si512()) {} + uint64 get0() const + { + #if defined __x86_64__ || defined _M_X64 + return (uint64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val)); + #else + int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val)); + int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32))); + return (unsigned)a | 
((uint64)(unsigned)b << 32); + #endif + } +}; + +struct v_int64x8 +{ + typedef int64 lane_type; + enum { nlanes = 8 }; + __m512i val; + + explicit v_int64x8(__m512i v) : val(v) {} + v_int64x8(int64 v0, int64 v1, int64 v2, int64 v3, int64 v4, int64 v5, int64 v6, int64 v7) + { val = _mm512_setr_epi64(v0, v1, v2, v3, v4, v5, v6, v7); } + v_int64x8() : val(_mm512_setzero_si512()) {} + + int64 get0() const + { + #if defined __x86_64__ || defined _M_X64 + return (int64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val)); + #else + int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val)); + int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32))); + return (int64)((unsigned)a | ((uint64)(unsigned)b << 32)); + #endif + } +}; + +struct v_float64x8 +{ + typedef double lane_type; + enum { nlanes = 8 }; + __m512d val; + + explicit v_float64x8(__m512d v) : val(v) {} + v_float64x8(double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7) + { val = _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); } + v_float64x8() : val(_mm512_setzero_pd()) {} + double get0() const { return _mm_cvtsd_f64(_mm512_castpd512_pd128(val)); } +}; + +//////////////// Load and store operations /////////////// + +#define OPENCV_HAL_IMPL_AVX512_LOADSTORE(_Tpvec, _Tp) \ + inline _Tpvec v512_load(const _Tp* ptr) \ + { return _Tpvec(_mm512_loadu_si512((const __m512i*)ptr)); } \ + inline _Tpvec v512_load_aligned(const _Tp* ptr) \ + { return _Tpvec(_mm512_load_si512((const __m512i*)ptr)); } \ + inline _Tpvec v512_load_low(const _Tp* ptr) \ + { \ + __m256i v256 = _mm256_loadu_si256((const __m256i*)ptr); \ + return _Tpvec(_mm512_castsi256_si512(v256)); \ + } \ + inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + __m256i vlo = _mm256_loadu_si256((const __m256i*)ptr0); \ + __m256i vhi = _mm256_loadu_si256((const __m256i*)ptr1); \ + return _Tpvec(_v512_combine(vlo, vhi)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { _mm512_storeu_si512((__m512i*)ptr, a.val); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { _mm512_store_si512((__m512i*)ptr, a.val); } \ + inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ + { _mm512_stream_si512((__m512i*)ptr, a.val); } \ + inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \ + { \ + if( mode == hal::STORE_UNALIGNED ) \ + _mm512_storeu_si512((__m512i*)ptr, a.val); \ + else if( mode == hal::STORE_ALIGNED_NOCACHE ) \ + _mm512_stream_si512((__m512i*)ptr, a.val); \ + else \ + _mm512_store_si512((__m512i*)ptr, a.val); \ + } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_low(a.val)); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_high(a.val)); } + +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint8x64, uchar) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int8x64, schar) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint16x32, ushort) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int16x32, short) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint32x16, unsigned) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int32x16, int) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint64x8, uint64) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int64x8, int64) + +#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \ + inline _Tpvec v512_load(const _Tp* ptr) \ + { return _Tpvec(_mm512_loadu_##suffix(ptr)); } \ + inline _Tpvec v512_load_aligned(const _Tp* ptr) \ + { return _Tpvec(_mm512_load_##suffix(ptr)); } \ + inline _Tpvec 
v512_load_low(const _Tp* ptr) \ + { \ + return _Tpvec(_mm512_cast##suffix##256_##suffix##512 \ + (_mm256_loadu_##suffix(ptr))); \ + } \ + inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + halfreg vlo = _mm256_loadu_##suffix(ptr0); \ + halfreg vhi = _mm256_loadu_##suffix(ptr1); \ + return _Tpvec(_v512_combine(vlo, vhi)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { _mm512_storeu_##suffix(ptr, a.val); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { _mm512_store_##suffix(ptr, a.val); } \ + inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ + { _mm512_stream_##suffix(ptr, a.val); } \ + inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \ + { \ + if( mode == hal::STORE_UNALIGNED ) \ + _mm512_storeu_##suffix(ptr, a.val); \ + else if( mode == hal::STORE_ALIGNED_NOCACHE ) \ + _mm512_stream_##suffix(ptr, a.val); \ + else \ + _mm512_store_##suffix(ptr, a.val); \ + } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_##suffix(ptr, _v512_extract_low(a.val)); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_##suffix(ptr, _v512_extract_high(a.val)); } + +OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float32x16, float, ps, __m256) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float64x8, double, pd, __m256d) + +#define OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, _Tpvecf, suffix, cast) \ + inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \ + { return _Tpvec(cast(a.val)); } + +#define OPENCV_HAL_IMPL_AVX512_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \ + inline _Tpvec v512_setzero_##suffix() \ + { return _Tpvec(_mm512_setzero_si512()); } \ + inline _Tpvec v512_setall_##suffix(_Tp v) \ + { return _Tpvec(_mm512_set1_##ssuffix((ctype_s)v)); } \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float32x16, suffix, _mm512_castps_si512) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float64x8, suffix, _mm512_castpd_si512) + +OPENCV_HAL_IMPL_AVX512_INIT(v_uint8x64, uchar, u8, epi8, char) +OPENCV_HAL_IMPL_AVX512_INIT(v_int8x64, schar, s8, epi8, char) +OPENCV_HAL_IMPL_AVX512_INIT(v_uint16x32, ushort, u16, epi16, short) +OPENCV_HAL_IMPL_AVX512_INIT(v_int16x32, short, s16, epi16, short) +OPENCV_HAL_IMPL_AVX512_INIT(v_uint32x16, unsigned, u32, epi32, int) +OPENCV_HAL_IMPL_AVX512_INIT(v_int32x16, int, s32, epi32, int) +OPENCV_HAL_IMPL_AVX512_INIT(v_uint64x8, uint64, u64, epi64, int64) +OPENCV_HAL_IMPL_AVX512_INIT(v_int64x8, int64, s64, epi64, int64) + +#define OPENCV_HAL_IMPL_AVX512_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \ + inline _Tpvec v512_setzero_##suffix() \ + { return _Tpvec(_mm512_setzero_##zsuffix()); } \ + inline _Tpvec v512_setall_##suffix(_Tp v) \ + { return _Tpvec(_mm512_set1_##zsuffix(v)); } \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, cast) \ 
+ OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8, suffix, cast) + +OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float32x16, float, f32, ps, _mm512_castsi512_ps) +OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float64x8, double, f64, pd, _mm512_castsi512_pd) + +inline v_float32x16 v_reinterpret_as_f32(const v_float32x16& a) +{ return a; } +inline v_float32x16 v_reinterpret_as_f32(const v_float64x8& a) +{ return v_float32x16(_mm512_castpd_ps(a.val)); } + +inline v_float64x8 v_reinterpret_as_f64(const v_float64x8& a) +{ return a; } +inline v_float64x8 v_reinterpret_as_f64(const v_float32x16& a) +{ return v_float64x8(_mm512_castps_pd(a.val)); } + +// FP16 +inline v_float32x16 v512_load_expand(const float16_t* ptr) +{ + return v_float32x16(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)ptr))); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x16& a) +{ + __m256i ah = _mm512_cvtps_ph(a.val, 0); + _mm256_storeu_si256((__m256i*)ptr, ah); +} + +/* Recombine & ZIP */ +inline void v_zip(const v_int8x64& a, const v_int8x64& b, v_int8x64& ab0, v_int8x64& ab1) +{ +#if CV_AVX_512VBMI + __m512i mask0 = _v512_set_epu8( 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, + 87, 23, 86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, + 79, 15, 78, 14, 77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, + 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1, 64, 0); + ab0 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask0, b.val)); + __m512i mask1 = _v512_set_epu8(127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, + 119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, + 111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, + 103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32); + ab1 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask1, b.val)); +#else + __m512i low = _mm512_unpacklo_epi8(a.val, b.val); + __m512i high = _mm512_unpackhi_epi8(a.val, b.val); + ab0 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(11, 10, 3, 2, 9, 8, 1, 0), high)); + ab1 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(15, 14, 7, 6, 13, 12, 5, 4), high)); +#endif +} +inline void v_zip(const v_int16x32& a, const v_int16x32& b, v_int16x32& ab0, v_int16x32& ab1) +{ + __m512i mask0 = _v512_set_epu16(47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, + 39, 7, 38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0); + ab0 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask0, b.val)); + __m512i mask1 = _v512_set_epu16(63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, + 55, 23, 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16); + ab1 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask1, b.val)); +} +inline void v_zip(const v_int32x16& a, const v_int32x16& b, v_int32x16& ab0, v_int32x16& ab1) +{ + __m512i mask0 = _v512_set_epu32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + ab0 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask0, b.val)); + __m512i mask1 = _v512_set_epu32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8); + ab1 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask1, b.val)); +} +inline void v_zip(const v_int64x8& a, const v_int64x8& b, v_int64x8& ab0, v_int64x8& ab1) +{ + __m512i mask0 = _v512_set_epu64(11, 3, 10, 2, 9, 1, 8, 0); + ab0 
= v_int64x8(_mm512_permutex2var_epi64(a.val, mask0, b.val)); + __m512i mask1 = _v512_set_epu64(15, 7, 14, 6, 13, 5, 12, 4); + ab1 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask1, b.val)); +} + +inline void v_zip(const v_uint8x64& a, const v_uint8x64& b, v_uint8x64& ab0, v_uint8x64& ab1) +{ + v_int8x64 i0, i1; + v_zip(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b), i0, i1); + ab0 = v_reinterpret_as_u8(i0); + ab1 = v_reinterpret_as_u8(i1); +} +inline void v_zip(const v_uint16x32& a, const v_uint16x32& b, v_uint16x32& ab0, v_uint16x32& ab1) +{ + v_int16x32 i0, i1; + v_zip(v_reinterpret_as_s16(a), v_reinterpret_as_s16(b), i0, i1); + ab0 = v_reinterpret_as_u16(i0); + ab1 = v_reinterpret_as_u16(i1); +} +inline void v_zip(const v_uint32x16& a, const v_uint32x16& b, v_uint32x16& ab0, v_uint32x16& ab1) +{ + v_int32x16 i0, i1; + v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1); + ab0 = v_reinterpret_as_u32(i0); + ab1 = v_reinterpret_as_u32(i1); +} +inline void v_zip(const v_uint64x8& a, const v_uint64x8& b, v_uint64x8& ab0, v_uint64x8& ab1) +{ + v_int64x8 i0, i1; + v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1); + ab0 = v_reinterpret_as_u64(i0); + ab1 = v_reinterpret_as_u64(i1); +} +inline void v_zip(const v_float32x16& a, const v_float32x16& b, v_float32x16& ab0, v_float32x16& ab1) +{ + v_int32x16 i0, i1; + v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1); + ab0 = v_reinterpret_as_f32(i0); + ab1 = v_reinterpret_as_f32(i1); +} +inline void v_zip(const v_float64x8& a, const v_float64x8& b, v_float64x8& ab0, v_float64x8& ab1) +{ + v_int64x8 i0, i1; + v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1); + ab0 = v_reinterpret_as_f64(i0); + ab1 = v_reinterpret_as_f64(i1); +} + +#define OPENCV_HAL_IMPL_AVX512_COMBINE(_Tpvec, suffix) \ + inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_v512_combine(_v512_extract_low(a.val), _v512_extract_low(b.val))); } \ + inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_v512_insert(b.val, _v512_extract_high(a.val))); } \ + inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& c, _Tpvec& d) \ + { \ + c.val = _v512_combine(_v512_extract_low(a.val),_v512_extract_low(b.val)); \ + d.val = _v512_insert(b.val,_v512_extract_high(a.val)); \ + } + + +OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint8x64, epi8) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_int8x64, epi8) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint16x32, epi16) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_int16x32, epi16) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint32x16, epi32) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_int32x16, epi32) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint64x8, epi64) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_int64x8, epi64) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_float32x16, ps) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_float64x8, pd) + +////////// Arithmetic, bitwise and comparison operations ///////// + +/* Element-wise binary and unary operations */ + +/** Non-saturating arithmetics **/ +#define OPENCV_HAL_IMPL_AVX512_BIN_FUNC(func, _Tpvec, intrin) \ + inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint8x64, _mm512_add_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int8x64, _mm512_add_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint16x32, _mm512_add_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int16x32, _mm512_add_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint8x64, _mm512_sub_epi8) 
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int8x64, _mm512_sub_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint16x32, _mm512_sub_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int16x32, _mm512_sub_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_uint16x32, _mm512_mullo_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_int16x32, _mm512_mullo_epi16) + +inline v_uint8x64 v_mul_wrap(const v_uint8x64& a, const v_uint8x64& b) +{ + __m512i ad = _mm512_srai_epi16(a.val, 8); + __m512i bd = _mm512_srai_epi16(b.val, 8); + __m512i p0 = _mm512_mullo_epi16(a.val, b.val); // even + __m512i p1 = _mm512_slli_epi16(_mm512_mullo_epi16(ad, bd), 8); // odd + return v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, p0, p1)); +} +inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b) +{ + return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b))); +} + +#define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } \ + inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ + { a.val = intrin(a.val, b.val); return a; } + +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16, _mm512_add_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16, _mm512_sub_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8, _mm512_add_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8, _mm512_sub_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8, _mm512_add_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8, _mm512_sub_epi64) + +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16, _mm512_mullo_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8, _mm512_mullo_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8, _mm512_mullo_epi64) + +/** Saturating arithmetics **/ +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64, _mm512_adds_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64, _mm512_subs_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64, _mm512_adds_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64, _mm512_subs_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32, _mm512_adds_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32, _mm512_subs_epi16) + +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8, _mm512_add_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8, _mm512_sub_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8, _mm512_mul_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8, _mm512_div_pd) + +// saturating multiply +inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b) +{ + v_uint16x32 c, d; + v_mul_expand(a, b, c, d); + return v_pack(c, d); +} +inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b) +{ + v_int16x32 c, d; + v_mul_expand(a, b, c, d); + return v_pack(c, d); +} +inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b) +{ + __m512i pl = _mm512_mullo_epi16(a.val, b.val); + __m512i ph = _mm512_mulhi_epu16(a.val, b.val); + __m512i p0 = 
_mm512_unpacklo_epi16(pl, ph); + __m512i p1 = _mm512_unpackhi_epi16(pl, ph); + + const __m512i m = _mm512_set1_epi32(65535); + return v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(p0, m), _mm512_min_epu32(p1, m))); +} +inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b) +{ + __m512i pl = _mm512_mullo_epi16(a.val, b.val); + __m512i ph = _mm512_mulhi_epi16(a.val, b.val); + __m512i p0 = _mm512_unpacklo_epi16(pl, ph); + __m512i p1 = _mm512_unpackhi_epi16(pl, ph); + return v_int16x32(_mm512_packs_epi32(p0, p1)); +} + +inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b) +{ a = a * b; return a; } +inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b) +{ a = a * b; return a; } +inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b) +{ a = a * b; return a; } +inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b) +{ a = a * b; return a; } + +inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); } +inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); } + +// Multiply and expand +inline void v_mul_expand(const v_uint8x64& a, const v_uint8x64& b, + v_uint16x32& c, v_uint16x32& d) +{ + v_uint16x32 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + +inline void v_mul_expand(const v_int8x64& a, const v_int8x64& b, + v_int16x32& c, v_int16x32& d) +{ + v_int16x32 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + +inline void v_mul_expand(const v_int16x32& a, const v_int16x32& b, + v_int32x16& c, v_int32x16& d) +{ + v_int16x32 v0, v1; + v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1); + + c = v_reinterpret_as_s32(v0); + d = v_reinterpret_as_s32(v1); +} + +inline void v_mul_expand(const v_uint16x32& a, const v_uint16x32& b, + v_uint32x16& c, v_uint32x16& d) +{ + v_uint16x32 v0, v1; + v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1); + + c = v_reinterpret_as_u32(v0); + d = v_reinterpret_as_u32(v1); +} + +inline void v_mul_expand(const v_uint32x16& a, const v_uint32x16& b, + v_uint64x8& c, v_uint64x8& d) +{ + v_zip(v_uint64x8(_mm512_mul_epu32(a.val, b.val)), + v_uint64x8(_mm512_mul_epu32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d); +} + +inline void v_mul_expand(const v_int32x16& a, const v_int32x16& b, + v_int64x8& c, v_int64x8& d) +{ + v_zip(v_int64x8(_mm512_mul_epi32(a.val, b.val)), + v_int64x8(_mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d); +} + +/** Bitwise shifts **/ +#define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \ + inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ + { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \ + inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ + { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \ + inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ + { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \ + inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ + { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } \ + template \ + inline _Tpuvec v_shl(const _Tpuvec& a) \ + { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \ + template \ + inline _Tpsvec v_shl(const _Tpsvec& a) \ + { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \ + template \ + inline _Tpuvec v_shr(const _Tpuvec& a) \ + { return 
_Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \ + template \ + inline _Tpsvec v_shr(const _Tpsvec& a) \ + { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } + +OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint16x32, v_int16x32, epi16) +OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint32x16, v_int32x16, epi32) +OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8, v_int64x8, epi64) + + +/** Bitwise logic **/ +#define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix) \ + inline _Tpvec operator ~ (const _Tpvec& a) \ + { return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); } + +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int8x64, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint16x32, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int16x32, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint32x16, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int32x16, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint64x8, si512, _mm512_set1_epi64(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int64x8, si512, _mm512_set1_epi64(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float32x16, ps, _mm512_castsi512_ps(_mm512_set1_epi32(-1))) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float64x8, pd, _mm512_castsi512_pd(_mm512_set1_epi32(-1))) + +/** Select **/ +#define OPENCV_HAL_IMPL_AVX512_SELECT(_Tpvec, suffix, zsuf) \ + inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm512_mask_blend_##suffix(_mm512_cmp_##suffix##_mask(mask.val, _mm512_setzero_##zsuf(), _MM_CMPINT_EQ), a.val, b.val)); } + +OPENCV_HAL_IMPL_AVX512_SELECT(v_uint8x64, epi8, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_int8x64, epi8, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_uint16x32, epi16, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_int16x32, epi16, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_uint32x16, epi32, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_int32x16, epi32, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_uint64x8, epi64, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_int64x8, epi64, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_float32x16, ps, ps) +OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8, pd, pd) + +/** Comparison **/ +#define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); } + +#define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(<, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(>, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval) + +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64, epu8, epi8, (char)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64, epi8, epi8, (char)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint16x32, epu16, epi16, (short)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int16x32, epi16, epi16, (short)-1) 
+OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint32x16, epu32, epi32, (int)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int32x16, epi32, epi32, (int)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8, epu64, epi64, (int64)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8, epi64, epi64, (int64)-1) + +#define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); } + +#define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval) + +OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8, pd, epi64, (int64)-1) + +inline v_float32x16 v_not_nan(const v_float32x16& a) +{ return v_float32x16(_mm512_castsi512_ps(_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a.val, a.val, _CMP_ORD_Q), (int)-1))); } +inline v_float64x8 v_not_nan(const v_float64x8& a) +{ return v_float64x8(_mm512_castsi512_pd(_mm512_maskz_set1_epi64(_mm512_cmp_pd_mask(a.val, a.val, _CMP_ORD_Q), (int64)-1))); } + +/** min/max **/ +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint8x64, _mm512_min_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint8x64, _mm512_max_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int8x64, _mm512_min_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int8x64, _mm512_max_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint16x32, _mm512_min_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint16x32, _mm512_max_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int16x32, _mm512_min_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int16x32, _mm512_max_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint32x16, _mm512_min_epu32) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint32x16, _mm512_max_epu32) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int32x16, _mm512_min_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int32x16, _mm512_max_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint64x8, _mm512_min_epu64) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint64x8, _mm512_max_epu64) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int64x8, _mm512_min_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int64x8, _mm512_max_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float32x16, _mm512_min_ps) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float32x16, _mm512_max_ps) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float64x8, _mm512_min_pd) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float64x8, _mm512_max_pd) + +/** Rotate **/ +template +inline v_int8x64 v_rotate_right(const v_int8x64& a, const v_int8x64& b) +{ + if (imm == 0) return a; + if (imm == 64) return b; + if (imm >= 128) return v_int8x64(); +#if CV_AVX_512VBMI + return v_int8x64(_mm512_permutex2var_epi8(a.val, + _v512_set_epu8(0x3f + imm,0x3e + imm,0x3d + imm,0x3c + imm,0x3b + imm,0x3a + imm,0x39 + imm,0x38 + imm, + 0x37 + imm,0x36 + imm,0x35 + imm,0x34 + imm,0x33 + imm,0x32 + imm,0x31 + imm,0x30 + imm, + 0x2f + imm,0x2e + imm,0x2d + imm,0x2c + 
imm,0x2b + imm,0x2a + imm,0x29 + imm,0x28 + imm, + 0x27 + imm,0x26 + imm,0x25 + imm,0x24 + imm,0x23 + imm,0x22 + imm,0x21 + imm,0x20 + imm, + 0x1f + imm,0x1e + imm,0x1d + imm,0x1c + imm,0x1b + imm,0x1a + imm,0x19 + imm,0x18 + imm, + 0x17 + imm,0x16 + imm,0x15 + imm,0x14 + imm,0x13 + imm,0x12 + imm,0x11 + imm,0x10 + imm, + 0x0f + imm,0x0e + imm,0x0d + imm,0x0c + imm,0x0b + imm,0x0a + imm,0x09 + imm,0x08 + imm, + 0x07 + imm,0x06 + imm,0x05 + imm,0x04 + imm,0x03 + imm,0x02 + imm,0x01 + imm,0x00 + imm), b.val)); +#else + __m512i pre = _mm512_alignr_epi32(b.val, a.val, imm/4); + if (imm % 4) + { + __m512i post; + if (imm/4 < 15) + post = _mm512_alignr_epi32(b.val, a.val, imm/4 + 1); + else if (imm/4 == 15) + post = b.val; + else + post = _mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm/4 - 15); + return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(pre, (imm % 4)*8), _mm512_slli_epi32(post, (4 - imm % 4)*8))); + } + else + return v_int8x64(pre); +#endif +} +template +inline v_int8x64 v_rotate_left(const v_int8x64& a, const v_int8x64& b) +{ + if (imm == 0) return a; + if (imm == 64) return b; + if (imm >= 128) return v_int8x64(); +#if CV_AVX_512VBMI + return v_int8x64(_mm512_permutex2var_epi8(b.val, + _v512_set_epi8(0x7f - imm,0x7e - imm,0x7d - imm,0x7c - imm,0x7b - imm,0x7a - imm,0x79 - imm,0x78 - imm, + 0x77 - imm,0x76 - imm,0x75 - imm,0x74 - imm,0x73 - imm,0x72 - imm,0x71 - imm,0x70 - imm, + 0x6f - imm,0x6e - imm,0x6d - imm,0x6c - imm,0x6b - imm,0x6a - imm,0x69 - imm,0x68 - imm, + 0x67 - imm,0x66 - imm,0x65 - imm,0x64 - imm,0x63 - imm,0x62 - imm,0x61 - imm,0x60 - imm, + 0x5f - imm,0x5e - imm,0x5d - imm,0x5c - imm,0x5b - imm,0x5a - imm,0x59 - imm,0x58 - imm, + 0x57 - imm,0x56 - imm,0x55 - imm,0x54 - imm,0x53 - imm,0x52 - imm,0x51 - imm,0x50 - imm, + 0x4f - imm,0x4e - imm,0x4d - imm,0x4c - imm,0x4b - imm,0x4a - imm,0x49 - imm,0x48 - imm, + 0x47 - imm,0x46 - imm,0x45 - imm,0x44 - imm,0x43 - imm,0x42 - imm,0x41 - imm,0x40 - imm), a.val)); +#else + if (imm < 64) return v_rotate_right<64 - imm>(b, a); + else return v_rotate_right<128 - imm>(v512_setzero_s8(), b); +#endif +} +template +inline v_int8x64 v_rotate_right(const v_int8x64& a) +{ + if (imm == 0) return a; + if (imm >= 64) return v_int8x64(); +#if CV_AVX_512VBMI + return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF >> imm, + _v512_set_epu8(0x3f + imm,0x3e + imm,0x3d + imm,0x3c + imm,0x3b + imm,0x3a + imm,0x39 + imm,0x38 + imm, + 0x37 + imm,0x36 + imm,0x35 + imm,0x34 + imm,0x33 + imm,0x32 + imm,0x31 + imm,0x30 + imm, + 0x2f + imm,0x2e + imm,0x2d + imm,0x2c + imm,0x2b + imm,0x2a + imm,0x29 + imm,0x28 + imm, + 0x27 + imm,0x26 + imm,0x25 + imm,0x24 + imm,0x23 + imm,0x22 + imm,0x21 + imm,0x20 + imm, + 0x1f + imm,0x1e + imm,0x1d + imm,0x1c + imm,0x1b + imm,0x1a + imm,0x19 + imm,0x18 + imm, + 0x17 + imm,0x16 + imm,0x15 + imm,0x14 + imm,0x13 + imm,0x12 + imm,0x11 + imm,0x10 + imm, + 0x0f + imm,0x0e + imm,0x0d + imm,0x0c + imm,0x0b + imm,0x0a + imm,0x09 + imm,0x08 + imm, + 0x07 + imm,0x06 + imm,0x05 + imm,0x04 + imm,0x03 + imm,0x02 + imm,0x01 + imm,0x00 + imm), a.val)); +#else + return v_rotate_right(a, v512_setzero_s8()); +#endif +} +template +inline v_int8x64 v_rotate_left(const v_int8x64& a) +{ + if (imm == 0) return a; + if (imm >= 64) return v_int8x64(); +#if CV_AVX_512VBMI + return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF << imm, + _v512_set_epi8(0x3f - imm,0x3e - imm,0x3d - imm,0x3c - imm,0x3b - imm,0x3a - imm,0x39 - imm,0x38 - imm, + 0x37 - imm,0x36 - imm,0x35 - imm,0x34 - imm,0x33 - imm,0x32 - imm,0x31 - 
imm,0x30 - imm, + 0x2f - imm,0x2e - imm,0x2d - imm,0x2c - imm,0x2b - imm,0x2a - imm,0x29 - imm,0x28 - imm, + 0x27 - imm,0x26 - imm,0x25 - imm,0x24 - imm,0x23 - imm,0x22 - imm,0x21 - imm,0x20 - imm, + 0x1f - imm,0x1e - imm,0x1d - imm,0x1c - imm,0x1b - imm,0x1a - imm,0x19 - imm,0x18 - imm, + 0x17 - imm,0x16 - imm,0x15 - imm,0x14 - imm,0x13 - imm,0x12 - imm,0x11 - imm,0x10 - imm, + 0x0f - imm,0x0e - imm,0x0d - imm,0x0c - imm,0x0b - imm,0x0a - imm,0x09 - imm,0x08 - imm, + 0x07 - imm,0x06 - imm,0x05 - imm,0x04 - imm,0x03 - imm,0x02 - imm,0x01 - imm,0x00 - imm), a.val)); +#else + return v_rotate_right<64 - imm>(v512_setzero_s8(), a); +#endif +} + +#define OPENCV_HAL_IMPL_AVX512_ROTATE_PM(_Tpvec, suffix) \ +template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ +{ return v_reinterpret_as_##suffix(v_rotate_left(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \ +template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ +{ return v_reinterpret_as_##suffix(v_rotate_right(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \ +template inline _Tpvec v_rotate_left(const _Tpvec& a) \ +{ return v_reinterpret_as_##suffix(v_rotate_left(v_reinterpret_as_s8(a))); } \ +template inline _Tpvec v_rotate_right(const _Tpvec& a) \ +{ return v_reinterpret_as_##suffix(v_rotate_right(v_reinterpret_as_s8(a))); } + +#define OPENCV_HAL_IMPL_AVX512_ROTATE_EC(_Tpvec, suffix) \ +template \ +inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ +{ \ + enum { SHIFT2 = _Tpvec::nlanes - imm }; \ + enum { MASK = (1 << _Tpvec::nlanes) - 1 }; \ + if (imm == 0) return a; \ + if (imm == _Tpvec::nlanes) return b; \ + if (imm >= 2*_Tpvec::nlanes) return _Tpvec(); \ + return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << SHIFT2)&MASK, b.val), (MASK << imm)&MASK, a.val)); \ +} \ +template \ +inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ +{ \ + enum { SHIFT2 = _Tpvec::nlanes - imm }; \ + enum { MASK = (1 << _Tpvec::nlanes) - 1 }; \ + if (imm == 0) return a; \ + if (imm == _Tpvec::nlanes) return b; \ + if (imm >= 2*_Tpvec::nlanes) return _Tpvec(); \ + return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << imm)&MASK, a.val), (MASK << SHIFT2)&MASK, b.val)); \ +} \ +template \ +inline _Tpvec v_rotate_left(const _Tpvec& a) \ +{ \ + enum { SHIFT2 = _Tpvec::nlanes - imm }; \ + enum { MASK = (1 << _Tpvec::nlanes) - 1 }; \ + if (imm == 0) return a; \ + if (imm >= _Tpvec::nlanes) return _Tpvec(); \ + return _Tpvec(_mm512_maskz_expand_##suffix((MASK << imm)&MASK, a.val)); \ +} \ +template \ +inline _Tpvec v_rotate_right(const _Tpvec& a) \ +{ \ + enum { SHIFT2 = _Tpvec::nlanes - imm }; \ + enum { MASK = (1 << _Tpvec::nlanes) - 1 }; \ + if (imm == 0) return a; \ + if (imm >= _Tpvec::nlanes) return _Tpvec(); \ + return _Tpvec(_mm512_maskz_compress_##suffix((MASK << imm)&MASK, a.val)); \ +} + +OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint8x64, u8) +OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint16x32, u16) +OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_int16x32, s16) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint32x16, epi32) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int32x16, epi32) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint64x8, epi64) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int64x8, epi64) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float32x16, ps) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float64x8, pd) + +////////// Reduce ///////// + +/** Reduce **/ +#define OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64(a, b) a + b +#define OPENCV_HAL_IMPL_AVX512_REDUCE_8(sctype, func, _Tpvec, ifunc, scop) \ + 
inline sctype v_reduce_##func(const _Tpvec& a) \ + { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + sctype CV_DECL_ALIGNED(64) idx[2]; \ + _mm_store_si128((__m128i*)idx, _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1))); \ + return scop(idx[0], idx[1]); } +OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, min, v_uint64x8, min_epu64, min) +OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, max, v_uint64x8, max_epu64, max) +OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, sum, v_uint64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64) +OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, min, v_int64x8, min_epi64, min) +OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, max, v_int64x8, max_epi64, max) +OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, sum, v_int64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64) + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_8F(func, ifunc, scop) \ + inline double v_reduce_##func(const v_float64x8& a) \ + { __m256d half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + double CV_DECL_ALIGNED(64) idx[2]; \ + _mm_store_pd(idx, _mm_##ifunc(_mm256_castpd256_pd128(half), _mm256_extractf128_pd(half, 1))); \ + return scop(idx[0], idx[1]); } +OPENCV_HAL_IMPL_AVX512_REDUCE_8F(min, min_pd, min) +OPENCV_HAL_IMPL_AVX512_REDUCE_8F(max, max_pd, max) +OPENCV_HAL_IMPL_AVX512_REDUCE_8F(sum, add_pd, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64) + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_16(sctype, func, _Tpvec, ifunc) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \ + return (sctype)_mm_cvtsi128_si32(quarter); } +OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, min, v_uint32x16, min_epu32) +OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, max, v_uint32x16, max_epu32) +OPENCV_HAL_IMPL_AVX512_REDUCE_16(int, min, v_int32x16, min_epi32) +OPENCV_HAL_IMPL_AVX512_REDUCE_16(int, max, v_int32x16, max_epi32) + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_16F(func, ifunc) \ + inline float v_reduce_##func(const v_float32x16& a) \ + { __m256 half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + __m128 quarter = _mm_##ifunc(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1)); \ + quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 3, 2))); \ + quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 0, 1))); \ + return _mm_cvtss_f32(quarter); } +OPENCV_HAL_IMPL_AVX512_REDUCE_16F(min, min_ps) +OPENCV_HAL_IMPL_AVX512_REDUCE_16F(max, max_ps) + +inline float v_reduce_sum(const v_float32x16& a) +{ + __m256 half = _mm256_add_ps(_v512_extract_low(a.val), _v512_extract_high(a.val)); + __m128 quarter = _mm_add_ps(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1)); + quarter = _mm_hadd_ps(quarter, quarter); + return _mm_cvtss_f32(_mm_hadd_ps(quarter, quarter)); +} +inline int v_reduce_sum(const v_int32x16& a) +{ + __m256i half = _mm256_add_epi32(_v512_extract_low(a.val), _v512_extract_high(a.val)); + __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); + quarter = _mm_hadd_epi32(quarter, quarter); + return _mm_cvtsi128_si32(_mm_hadd_epi32(quarter, quarter)); +} +inline uint v_reduce_sum(const v_uint32x16& a) +{ return (uint)v_reduce_sum(v_reinterpret_as_s32(a)); } + 
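// Usage sketch (illustrative, hypothetical sum_array helper using the universal intrinsics API):
// the reductions above fold a 512-bit register to a scalar by halving 512 -> 256 -> 128 bits and
// finishing with horizontal adds, so an accumulation loop can stay fully vectorized until one
// final v_reduce_sum at the end.
//
//     float sum_array(const float* src, int n)
//     {
//         v_float32 s = vx_setzero_f32();
//         int i = 0;
//         for (; i <= n - v_float32::nlanes; i += v_float32::nlanes)
//             s += vx_load(src + i);              // 16 floats per iteration with CV_SIMD512
//         float total = v_reduce_sum(s);          // single horizontal reduction
//         for (; i < n; ++i) total += src[i];     // scalar tail
//         return total;
//     }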
+#define OPENCV_HAL_IMPL_AVX512_REDUCE_32(sctype, func, _Tpvec, ifunc) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \ + return (sctype)_mm_cvtsi128_si32(quarter); } +OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, min, v_uint16x32, min_epu16) +OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, max, v_uint16x32, max_epu16) +OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, min, v_int16x32, min_epi16) +OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, max, v_int16x32, max_epi16) + +inline int v_reduce_sum(const v_int16x32& a) +{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +inline uint v_reduce_sum(const v_uint16x32& a) +{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 1)); \ + return (sctype)_mm_cvtsi128_si32(quarter); } +OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, min, v_uint8x64, min_epu8) +OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, max, v_uint8x64, max_epu8) +OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, min, v_int8x64, min_epi8) +OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, max, v_int8x64, max_epi8) + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(sctype, _Tpvec, suffix) \ + inline sctype v_reduce_sum(const _Tpvec& a) \ + { __m512i a16 = _mm512_add_epi16(_mm512_cvt##suffix##_epi16(_v512_extract_low(a.val)), \ + _mm512_cvt##suffix##_epi16(_v512_extract_high(a.val))); \ + a16 = _mm512_cvtepi16_epi32(_mm256_add_epi16(_v512_extract_low(a16), _v512_extract_high(a16))); \ + __m256i a8 = _mm256_add_epi32(_v512_extract_low(a16), _v512_extract_high(a16)); \ + __m128i a4 = _mm_add_epi32(_mm256_castsi256_si128(a8), _mm256_extracti128_si256(a8, 1)); \ + a4 = _mm_hadd_epi32(a4, a4); \ + return (sctype)_mm_cvtsi128_si32(_mm_hadd_epi32(a4, a4)); } +OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(uint, v_uint8x64, epu8) +OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(int, v_int8x64, epi8) + +inline v_float32x16 v_reduce_sum4(const v_float32x16& a, const v_float32x16& b, + const v_float32x16& c, const v_float32x16& d) +{ + __m256 abl = _mm256_hadd_ps(_v512_extract_low(a.val), _v512_extract_low(b.val)); + __m256 abh = _mm256_hadd_ps(_v512_extract_high(a.val), _v512_extract_high(b.val)); + __m256 cdl = _mm256_hadd_ps(_v512_extract_low(c.val), _v512_extract_low(d.val)); + __m256 cdh = _mm256_hadd_ps(_v512_extract_high(c.val), _v512_extract_high(d.val)); + return v_float32x16(_v512_combine(_mm256_hadd_ps(abl, cdl), _mm256_hadd_ps(abh, cdh))); +} + +inline unsigned v_reduce_sad(const v_uint8x64& a, const v_uint8x64& b) +{ + __m512i val = _mm512_sad_epu8(a.val, b.val); + __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val)); + __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), 
_mm256_extracti128_si256(half, 1)); + return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))); +} +inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b) +{ + __m512i val = _mm512_set1_epi8(0x80); + val = _mm512_sad_epu8(_mm512_add_epi8(a.val, val), _mm512_add_epi8(b.val, val)); + __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val)); + __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); + return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))); +} +inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b) +{ return v_reduce_sum(v_add_wrap(a - b, b - a)); } +inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b) +{ return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); } +inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b) +{ return v_reduce_sum(v_max(a, b) - v_min(a, b)); } +inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b) +{ return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); } +inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b) +{ return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); } +inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b) +{ return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); } + +/** Popcount **/ +inline v_uint8x64 v_popcount(const v_int8x64& a) +{ +#if CV_AVX_512BITALG + return v_uint8x64(_mm512_popcnt_epi8(a.val)); +#elif CV_AVX_512VBMI + __m512i _popcnt_table0 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, + 5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1, + 5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1, + 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0); + __m512i _popcnt_table1 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, + 6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2, + 6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2, + 5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1); + return v_uint8x64(_mm512_sub_epi8(_mm512_permutex2var_epi8(_popcnt_table0, a.val, _popcnt_table1), _mm512_movm_epi8(_mm512_movepi8_mask(a.val)))); +#else + __m512i _popcnt_table = _mm512_set4_epi32(0x04030302, 0x03020201, 0x03020201, 0x02010100); + __m512i _popcnt_mask = _mm512_set1_epi8(0x0F); + + return v_uint8x64(_mm512_add_epi8(_mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512( a.val, _popcnt_mask)), + _mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512(_mm512_srli_epi16(a.val, 4), _popcnt_mask)))); +#endif +} +inline v_uint16x32 v_popcount(const v_int16x32& a) +{ +#if CV_AVX_512BITALG + return v_uint16x32(_mm512_popcnt_epi16(a.val)); +#elif CV_AVX_512VPOPCNTDQ + __m512i zero = _mm512_setzero_si512(); + return v_uint16x32(_mm512_packs_epi32(_mm512_popcnt_epi32(_mm512_unpacklo_epi16(a.val, zero)), + _mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zero)))); +#else + v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a)); + p += v_rotate_right<1>(p); + return v_reinterpret_as_u16(p) & v512_setall_u16(0x00ff); +#endif +} +inline v_uint32x16 v_popcount(const v_int32x16& a) +{ +#if CV_AVX_512VPOPCNTDQ + return v_uint32x16(_mm512_popcnt_epi32(a.val)); +#else + v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a)); + p += v_rotate_right<1>(p); + p += v_rotate_right<2>(p); + return v_reinterpret_as_u32(p) & v512_setall_u32(0x000000ff); +#endif +} +inline 
v_uint64x8 v_popcount(const v_int64x8& a) +{ +#if CV_AVX_512VPOPCNTDQ + return v_uint64x8(_mm512_popcnt_epi64(a.val)); +#else + return v_uint64x8(_mm512_sad_epu8(v_popcount(v_reinterpret_as_s8(a)).val, _mm512_setzero_si512())); +#endif +} + + +inline v_uint8x64 v_popcount(const v_uint8x64& a) { return v_popcount(v_reinterpret_as_s8 (a)); } +inline v_uint16x32 v_popcount(const v_uint16x32& a) { return v_popcount(v_reinterpret_as_s16(a)); } +inline v_uint32x16 v_popcount(const v_uint32x16& a) { return v_popcount(v_reinterpret_as_s32(a)); } +inline v_uint64x8 v_popcount(const v_uint64x8& a) { return v_popcount(v_reinterpret_as_s64(a)); } + + +////////// Other math ///////// + +/** Some frequent operations **/ +#define OPENCV_HAL_IMPL_AVX512_MULADD(_Tpvec, suffix) \ + inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_sqrt(const _Tpvec& x) \ + { return _Tpvec(_mm512_sqrt_##suffix(x.val)); } \ + inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_fma(a, a, b * b); } \ + inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_sqrt(v_fma(a, a, b * b)); } + +OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps) +OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8, pd) + +inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c) +{ return a * b + c; } +inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c) +{ return v_fma(a, b, c); } + +inline v_float32x16 v_invsqrt(const v_float32x16& x) +{ +#if CV_AVX_512ER + return v_float32x16(_mm512_rsqrt28_ps(x.val)); +#else + v_float32x16 half = x * v512_setall_f32(0.5); + v_float32x16 t = v_float32x16(_mm512_rsqrt14_ps(x.val)); + t *= v512_setall_f32(1.5) - ((t * t) * half); + return t; +#endif +} + +inline v_float64x8 v_invsqrt(const v_float64x8& x) +{ +#if CV_AVX_512ER + return v_float64x8(_mm512_rsqrt28_pd(x.val)); +#else + return v512_setall_f64(1.) 
/ v_sqrt(x); +// v_float64x8 half = x * v512_setall_f64(0.5); +// v_float64x8 t = v_float64x8(_mm512_rsqrt14_pd(x.val)); +// t *= v512_setall_f64(1.5) - ((t * t) * half); +// t *= v512_setall_f64(1.5) - ((t * t) * half); +// return t; +#endif +} + +/** Absolute values **/ +#define OPENCV_HAL_IMPL_AVX512_ABS(_Tpvec, _Tpuvec, suffix) \ + inline _Tpuvec v_abs(const _Tpvec& x) \ + { return _Tpuvec(_mm512_abs_##suffix(x.val)); } + +OPENCV_HAL_IMPL_AVX512_ABS(v_int8x64, v_uint8x64, epi8) +OPENCV_HAL_IMPL_AVX512_ABS(v_int16x32, v_uint16x32, epi16) +OPENCV_HAL_IMPL_AVX512_ABS(v_int32x16, v_uint32x16, epi32) +OPENCV_HAL_IMPL_AVX512_ABS(v_int64x8, v_uint64x8, epi64) + +inline v_float32x16 v_abs(const v_float32x16& x) +{ +#ifdef _mm512_abs_pd + return v_float32x16(_mm512_abs_ps(x.val)); +#else + return v_float32x16(_mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(x.val), + _v512_set_epu64(0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, + 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF)))); +#endif +} + +inline v_float64x8 v_abs(const v_float64x8& x) +{ +#ifdef _mm512_abs_pd + #if defined __GNUC__ && (__GNUC__ < 7 || (__GNUC__ == 7 && __GNUC_MINOR__ <= 3) || (__GNUC__ == 8 && __GNUC_MINOR__ <= 2)) + // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87476 + return v_float64x8(_mm512_abs_pd(_mm512_castpd_ps(x.val))); + #else + return v_float64x8(_mm512_abs_pd(x.val)); + #endif +#else + return v_float64x8(_mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(x.val), + _v512_set_epu64(0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, + 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF)))); +#endif +} + +/** Absolute difference **/ +inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b) +{ return v_max(a, b) - v_min(a, b); } + +inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b) +{ + v_int8x64 d = v_sub_wrap(a, b); + v_int8x64 m = a < b; + return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); +} + +inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b) +{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); } + +inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b) +{ + v_int32x16 d = a - b; + v_int32x16 m = a < b; + return v_reinterpret_as_u32((d ^ m) - m); +} + +inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b) +{ return v_abs(a - b); } + +inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b) +{ return v_abs(a - b); } + +/** Saturating absolute difference **/ +inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b) +{ + v_int8x64 d = a - b; + v_int8x64 m = a < b; + return (d ^ m) - m; +} +inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b) +{ return v_max(a, b) - v_min(a, b); } + +////////// Conversions ///////// + +/** Rounding **/ +inline v_int32x16 v_round(const v_float32x16& a) +{ return v_int32x16(_mm512_cvtps_epi32(a.val)); } + +inline v_int32x16 v_round(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(a.val))); } + +inline v_int32x16 v_round(const v_float64x8& a, const v_float64x8& b) +{ return v_int32x16(_v512_combine(_mm512_cvtpd_epi32(a.val), 
_mm512_cvtpd_epi32(b.val))); } + +inline v_int32x16 v_trunc(const v_float32x16& a) +{ return v_int32x16(_mm512_cvttps_epi32(a.val)); } + +inline v_int32x16 v_trunc(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvttpd_epi32(a.val))); } + +#if CVT_ROUND_MODES_IMPLEMENTED +inline v_int32x16 v_floor(const v_float32x16& a) +{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); } + +inline v_int32x16 v_floor(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC))); } + +inline v_int32x16 v_ceil(const v_float32x16& a) +{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); } + +inline v_int32x16 v_ceil(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC))); } +#else +inline v_int32x16 v_floor(const v_float32x16& a) +{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 1))); } + +inline v_int32x16 v_floor(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 1)))); } + +inline v_int32x16 v_ceil(const v_float32x16& a) +{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 2))); } + +inline v_int32x16 v_ceil(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 2)))); } +#endif + +/** To float **/ +inline v_float32x16 v_cvt_f32(const v_int32x16& a) +{ return v_float32x16(_mm512_cvtepi32_ps(a.val)); } + +inline v_float32x16 v_cvt_f32(const v_float64x8& a) +{ return v_float32x16(_mm512_cvtpd_pslo(a.val)); } + +inline v_float32x16 v_cvt_f32(const v_float64x8& a, const v_float64x8& b) +{ return v_float32x16(_v512_combine(_mm512_cvtpd_ps(a.val), _mm512_cvtpd_ps(b.val))); } + +inline v_float64x8 v_cvt_f64(const v_int32x16& a) +{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_low(a.val))); } + +inline v_float64x8 v_cvt_f64_high(const v_int32x16& a) +{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_high(a.val))); } + +inline v_float64x8 v_cvt_f64(const v_float32x16& a) +{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_low(a.val))); } + +inline v_float64x8 v_cvt_f64_high(const v_float32x16& a) +{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_high(a.val))); } + +////////////// Lookup table access //////////////////// + +inline v_int8x64 v512_lut(const schar* tab, const int* idx) +{ + __m128i p0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 1)); + __m128i p1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1)); + __m128i p2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 2), (const int *)tab, 1)); + __m128i p3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 3), (const int *)tab, 1)); + return v_int8x64(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(p0), p1, 1), p2, 2), p3, 3)); +} +inline v_int8x64 v512_lut_pairs(const schar* tab, const int* idx) +{ + __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 1)); + __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1)); + return v_int8x64(_v512_combine(p0, p1)); +} 
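The gather-based LUT helpers in this block have simple scalar semantics. A reference sketch for the 8-bit variants, assuming OpenCV's schar typedef; lut_ref and lut_pairs_ref are hypothetical names used only for illustration, not part of the patch:

    // Scalar reference for the 8-bit LUT loads above.
    inline void lut_ref(const schar* tab, const int* idx, schar dst[64])
    {
        for (int i = 0; i < 64; i++)      // v512_lut: one index per output byte
            dst[i] = tab[idx[i]];
    }
    inline void lut_pairs_ref(const schar* tab, const int* idx, schar dst[64])
    {
        for (int i = 0; i < 32; i++)      // v512_lut_pairs: each index selects 2 consecutive bytes
        {
            dst[2*i]     = tab[idx[i]];
            dst[2*i + 1] = tab[idx[i] + 1];
        }
    }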
+inline v_int8x64 v512_lut_quads(const schar* tab, const int* idx) +{ + return v_int8x64(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 1)); +} +inline v_uint8x64 v512_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut((const schar *)tab, idx)); } +inline v_uint8x64 v512_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_pairs((const schar *)tab, idx)); } +inline v_uint8x64 v512_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_quads((const schar *)tab, idx)); } + +inline v_int16x32 v512_lut(const short* tab, const int* idx) +{ + __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 2)); + __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 2)); + return v_int16x32(_v512_combine(p0, p1)); +} +inline v_int16x32 v512_lut_pairs(const short* tab, const int* idx) +{ + return v_int16x32(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 2)); +} +inline v_int16x32 v512_lut_quads(const short* tab, const int* idx) +{ +#if defined(__GNUC__) + return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 2)); +#else + return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 2)); +#endif +} +inline v_uint16x32 v512_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut((const short *)tab, idx)); } +inline v_uint16x32 v512_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_pairs((const short *)tab, idx)); } +inline v_uint16x32 v512_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_quads((const short *)tab, idx)); } + +inline v_int32x16 v512_lut(const int* tab, const int* idx) +{ + return v_int32x16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), tab, 4)); +} +inline v_int32x16 v512_lut_pairs(const int* tab, const int* idx) +{ +#if defined(__GNUC__) + return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 4)); +#else + return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 4)); +#endif +} +inline v_int32x16 v512_lut_quads(const int* tab, const int* idx) +{ + return v_int32x16(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( + _mm_loadu_si128((const __m128i*)(tab + idx[0]))), + _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1), + _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2), + _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3)); +} +inline v_uint32x16 v512_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut((const int *)tab, idx)); } +inline v_uint32x16 v512_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_pairs((const int *)tab, idx)); } +inline v_uint32x16 v512_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_quads((const int *)tab, idx)); } + +inline v_int64x8 v512_lut(const int64* tab, const int* idx) +{ +#if defined(__GNUC__) + return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 8)); +#else + return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), tab , 8)); +#endif +} +inline v_int64x8 
v512_lut_pairs(const int64* tab, const int* idx) +{ + return v_int64x8(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( + _mm_loadu_si128((const __m128i*)(tab + idx[0]))), + _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1), + _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2), + _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3)); +} +inline v_uint64x8 v512_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut((const int64 *)tab, idx)); } +inline v_uint64x8 v512_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut_pairs((const int64 *)tab, idx)); } + +inline v_float32x16 v512_lut(const float* tab, const int* idx) +{ + return v_float32x16(_mm512_i32gather_ps(_mm512_loadu_si512((const __m512i*)idx), tab, 4)); +} +inline v_float32x16 v512_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_pairs((const int *)tab, idx)); } +inline v_float32x16 v512_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_quads((const int *)tab, idx)); } + +inline v_float64x8 v512_lut(const double* tab, const int* idx) +{ + return v_float64x8(_mm512_i32gather_pd(_mm256_loadu_si256((const __m256i*)idx), tab, 8)); +} +inline v_float64x8 v512_lut_pairs(const double* tab, const int* idx) +{ + return v_float64x8(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_castpd128_pd512( + _mm_loadu_pd(tab + idx[0])), + _mm_loadu_pd(tab + idx[1]), 1), + _mm_loadu_pd(tab + idx[2]), 2), + _mm_loadu_pd(tab + idx[3]), 3)); +} + +inline v_int32x16 v_lut(const int* tab, const v_int32x16& idxvec) +{ + return v_int32x16(_mm512_i32gather_epi32(idxvec.val, tab, 4)); +} + +inline v_uint32x16 v_lut(const unsigned* tab, const v_int32x16& idxvec) +{ + return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec)); +} + +inline v_float32x16 v_lut(const float* tab, const v_int32x16& idxvec) +{ + return v_float32x16(_mm512_i32gather_ps(idxvec.val, tab, 4)); +} + +inline v_float64x8 v_lut(const double* tab, const v_int32x16& idxvec) +{ + return v_float64x8(_mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8)); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x16& idxvec, v_float32x16& x, v_float32x16& y) +{ + x.val = _mm512_i32gather_ps(idxvec.val, tab, 4); + y.val = _mm512_i32gather_ps(idxvec.val, tab + 1, 4); +} + +inline void v_lut_deinterleave(const double* tab, const v_int32x16& idxvec, v_float64x8& x, v_float64x8& y) +{ + x.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8); + y.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab + 1, 8); +} + +inline v_int8x64 v_interleave_pairs(const v_int8x64& vec) +{ + return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0d0e0c, 0x0b090a08, 0x07050604, 0x03010200))); +} +inline v_uint8x64 v_interleave_pairs(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } +inline v_int8x64 v_interleave_quads(const v_int8x64& vec) +{ + return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0b0e0a, 0x0d090c08, 0x07030602, 0x05010400))); +} +inline v_uint8x64 v_interleave_quads(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } + +inline v_int16x32 v_interleave_pairs(const v_int16x32& vec) +{ + return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0b0a, 0x0d0c0908, 0x07060302, 0x05040100))); +} +inline v_uint16x32 v_interleave_pairs(const v_uint16x32& vec) { return 
v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } +inline v_int16x32 v_interleave_quads(const v_int16x32& vec) +{ + return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0706, 0x0d0c0504, 0x0b0a0302, 0x09080100))); +} +inline v_uint16x32 v_interleave_quads(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } + +inline v_int32x16 v_interleave_pairs(const v_int32x16& vec) +{ + return v_int32x16(_mm512_shuffle_epi32(vec.val, _MM_PERM_ACBD)); +} +inline v_uint32x16 v_interleave_pairs(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } +inline v_float32x16 v_interleave_pairs(const v_float32x16& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } + +inline v_int8x64 v_pack_triplets(const v_int8x64& vec) +{ + return v_int8x64(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a, + 0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), + _mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0xffffff0f, 0x0e0d0c0a, 0x09080605, 0x04020100)))); +} +inline v_uint8x64 v_pack_triplets(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } + +inline v_int16x32 v_pack_triplets(const v_int16x32& vec) +{ + return v_int16x32(_mm512_permutexvar_epi16(_v512_set_epu64(0x001f001f001f001f, 0x001f001f001f001f, 0x001e001d001c001a, 0x0019001800160015, + 0x0014001200110010, 0x000e000d000c000a, 0x0009000800060005, 0x0004000200010000), vec.val)); +} +inline v_uint16x32 v_pack_triplets(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } + +inline v_int32x16 v_pack_triplets(const v_int32x16& vec) +{ + return v_int32x16(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a, + 0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val)); +} +inline v_uint32x16 v_pack_triplets(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); } +inline v_float32x16 v_pack_triplets(const v_float32x16& vec) +{ + return v_float32x16(_mm512_permutexvar_ps(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a, + 0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val)); +} + +////////// Matrix operations ///////// + +inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b) +{ return v_int32x16(_mm512_madd_epi16(a.val, b.val)); } + +inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c) +{ return v_dotprod(a, b) + c; } + +#define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \ + v_float32x16(_mm512_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im))) + +inline v_float32x16 v_matmul(const v_float32x16& v, + const v_float32x16& m0, const v_float32x16& m1, + const v_float32x16& m2, const v_float32x16& m3) +{ + v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0); + v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1); + v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2); + v_float32x16 v37 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3))); +} + +inline v_float32x16 v_matmuladd(const v_float32x16& v, + const v_float32x16& m0, const v_float32x16& m1, + const v_float32x16& m2, const v_float32x16& a) +{ + 
v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0); + v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1); + v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a))); +} + +#define OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \ + inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ + const _Tpvec& a2, const _Tpvec& a3, \ + _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \ + { \ + __m512i t0 = cast_from(_mm512_unpacklo_##suffix(a0.val, a1.val)); \ + __m512i t1 = cast_from(_mm512_unpacklo_##suffix(a2.val, a3.val)); \ + __m512i t2 = cast_from(_mm512_unpackhi_##suffix(a0.val, a1.val)); \ + __m512i t3 = cast_from(_mm512_unpackhi_##suffix(a2.val, a3.val)); \ + b0.val = cast_to(_mm512_unpacklo_epi64(t0, t1)); \ + b1.val = cast_to(_mm512_unpackhi_epi64(t0, t1)); \ + b2.val = cast_to(_mm512_unpacklo_epi64(t2, t3)); \ + b3.val = cast_to(_mm512_unpackhi_epi64(t2, t3)); \ + } + +OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_uint32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_int32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_float32x16, ps, _mm512_castps_si512, _mm512_castsi512_ps) + +//////////////// Value reordering /////////////// + +/* Expand */ +#define OPENCV_HAL_IMPL_AVX512_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \ + inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ + { \ + b0.val = intrin(_v512_extract_low(a.val)); \ + b1.val = intrin(_v512_extract_high(a.val)); \ + } \ + inline _Tpwvec v_expand_low(const _Tpvec& a) \ + { return _Tpwvec(intrin(_v512_extract_low(a.val))); } \ + inline _Tpwvec v_expand_high(const _Tpvec& a) \ + { return _Tpwvec(intrin(_v512_extract_high(a.val))); } \ + inline _Tpwvec v512_load_expand(const _Tp* ptr) \ + { \ + __m256i a = _mm256_loadu_si256((const __m256i*)ptr); \ + return _Tpwvec(intrin(a)); \ + } + +OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint8x64, v_uint16x32, uchar, _mm512_cvtepu8_epi16) +OPENCV_HAL_IMPL_AVX512_EXPAND(v_int8x64, v_int16x32, schar, _mm512_cvtepi8_epi16) +OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint16x32, v_uint32x16, ushort, _mm512_cvtepu16_epi32) +OPENCV_HAL_IMPL_AVX512_EXPAND(v_int16x32, v_int32x16, short, _mm512_cvtepi16_epi32) +OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint32x16, v_uint64x8, unsigned, _mm512_cvtepu32_epi64) +OPENCV_HAL_IMPL_AVX512_EXPAND(v_int32x16, v_int64x8, int, _mm512_cvtepi32_epi64) + +#define OPENCV_HAL_IMPL_AVX512_EXPAND_Q(_Tpvec, _Tp, intrin) \ + inline _Tpvec v512_load_expand_q(const _Tp* ptr) \ + { \ + __m128i a = _mm_loadu_si128((const __m128i*)ptr); \ + return _Tpvec(intrin(a)); \ + } + +OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_uint32x16, uchar, _mm512_cvtepu8_epi32) +OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_int32x16, schar, _mm512_cvtepi8_epi32) + +/* pack */ +// 16 +inline v_int8x64 v_pack(const v_int16x32& a, const v_int16x32& b) +{ return v_int8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); } + +inline v_uint8x64 v_pack(const v_uint16x32& a, const v_uint16x32& b) +{ + const __m512i t = _mm512_set1_epi16(255); + return v_uint8x64(_v512_combine(_mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, t)), _mm512_cvtepi16_epi8(_mm512_min_epu16(b.val, t)))); +} + +inline v_uint8x64 v_pack_u(const v_int16x32& a, const v_int16x32& b) +{ + return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi16(a.val, b.val))); +} + +inline void v_pack_store(schar* ptr, const v_int16x32& a) +{ 
v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(uchar* ptr, const v_uint16x32& a)
+{
+    const __m512i m = _mm512_set1_epi16(255);
+    _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, m)));
+}
+
+inline void v_pack_u_store(uchar* ptr, const v_int16x32& a)
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
+template<int n> inline
+v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b)
+{
+    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
+    v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
+    return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
+                    v_reinterpret_as_s16((b + delta) >> n));
+}
+
+template<int n> inline
+void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a)
+{
+    v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
+    v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
+}
+
+template<int n> inline
+v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b)
+{
+    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
+    return v_pack_u((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a)
+{
+    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
+    v_pack_u_store(ptr, (a + delta) >> n);
+}
+
+template<int n> inline
+v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b)
+{
+    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
+    return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(schar* ptr, const v_int16x32& a)
+{
+    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
+    v_pack_store(ptr, (a + delta) >> n);
+}
+
+// 32
+inline v_int16x32 v_pack(const v_int32x16& a, const v_int32x16& b)
+{ return v_int16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi32(a.val, b.val))); }
+
+inline v_uint16x32 v_pack(const v_uint32x16& a, const v_uint32x16& b)
+{
+    const __m512i m = _mm512_set1_epi32(65535);
+    return v_uint16x32(_v512_combine(_mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)), _mm512_cvtepi32_epi16(_mm512_min_epu32(b.val, m))));
+}
+
+inline v_uint16x32 v_pack_u(const v_int32x16& a, const v_int32x16& b)
+{ return v_uint16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi32(a.val, b.val))); }
+
+inline void v_pack_store(short* ptr, const v_int32x16& a)
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(ushort* ptr, const v_uint32x16& a)
+{
+    const __m512i m = _mm512_set1_epi32(65535);
+    _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)));
+}
+
+inline void v_pack_u_store(ushort* ptr, const v_int32x16& a)
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
+
+template<int n> inline
+v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b)
+{
+    v_uint32x16 delta = v512_setall_u32(1 << (n-1));
+    return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
+                    v_reinterpret_as_s32((b + delta) >> n));
+}
+
+template<int n> inline
+void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a)
+{
+    v_uint32x16 delta = v512_setall_u32(1 << (n-1));
+    v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
+}
+
+template<int n> inline
+v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b)
+{
+    v_int32x16 delta = v512_setall_s32(1 << (n-1));
+    return v_pack_u((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a)
+{
+    v_int32x16 delta = v512_setall_s32(1 << (n-1));
+    v_pack_u_store(ptr, (a + delta) >> n);
+}
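Every v_rshr_pack* overload above applies the same round-half-up right shift before the saturating pack. A scalar sketch of a single 16-bit to 8-bit lane, assuming 0 < n < 16; rshr_pack_u_ref is a hypothetical reference name, not part of the patch:

    // Per-lane reference for v_rshr_pack_u<n>(a, b) on signed 16-bit inputs.
    inline uchar rshr_pack_u_ref(short v, int n)
    {
        int r = (v + (1 << (n - 1))) >> n;                    // add 2^(n-1), then shift: round half up
        return (uchar)(r < 0 ? 0 : (r > 255 ? 255 : r));      // saturate to the narrower type
    }
    // e.g. n = 4:  v = 40 -> (40 + 8) >> 4 = 3;  v = 1000 -> 63;  v = -5 -> 0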
+
+template<int n> inline
+v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b)
+{
+    v_int32x16 delta = v512_setall_s32(1 << (n-1));
+    return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(short* ptr, const v_int32x16& a)
+{
+    v_int32x16 delta = v512_setall_s32(1 << (n-1));
+    v_pack_store(ptr, (a + delta) >> n);
+}
+
+// 64
+// Non-saturating pack
+inline v_uint32x16 v_pack(const v_uint64x8& a, const v_uint64x8& b)
+{ return v_uint32x16(_v512_combine(_mm512_cvtepi64_epi32(a.val), _mm512_cvtepi64_epi32(b.val))); }
+
+inline v_int32x16 v_pack(const v_int64x8& a, const v_int64x8& b)
+{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
+
+inline void v_pack_store(unsigned* ptr, const v_uint64x8& a)
+{ _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi64_epi32(a.val)); }
+
+inline void v_pack_store(int* ptr, const v_int64x8& b)
+{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
+
+template<int n> inline
+v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b)
+{
+    v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
+    return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a)
+{
+    v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
+    v_pack_store(ptr, (a + delta) >> n);
+}
+
+template<int n> inline
+v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b)
+{
+    v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
+    return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(int* ptr, const v_int64x8& a)
+{
+    v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
+    v_pack_store(ptr, (a + delta) >> n);
+}
+
+// pack boolean
+inline v_uint8x64 v_pack_b(const v_uint16x32& a, const v_uint16x32& b)
+{ return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
+
+inline v_uint8x64 v_pack_b(const v_uint32x16& a, const v_uint32x16& b,
+                           const v_uint32x16& c, const v_uint32x16& d)
+{
+    __m512i ab = _mm512_packs_epi32(a.val, b.val);
+    __m512i cd = _mm512_packs_epi32(c.val, d.val);
+
+    return v_uint8x64(_mm512_permutexvar_epi32(_v512_set_epu32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), _mm512_packs_epi16(ab, cd)));
+}
+
+inline v_uint8x64 v_pack_b(const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
+                           const v_uint64x8& d, const v_uint64x8& e, const v_uint64x8& f,
+                           const v_uint64x8& g, const v_uint64x8& h)
+{
+    __m512i ab = _mm512_packs_epi32(a.val, b.val);
+    __m512i cd = _mm512_packs_epi32(c.val, d.val);
+    __m512i ef = _mm512_packs_epi32(e.val, f.val);
+    __m512i gh = _mm512_packs_epi32(g.val, h.val);
+
+    __m512i abcd = _mm512_packs_epi32(ab, cd);
+    __m512i efgh = _mm512_packs_epi32(ef, gh);
+
+    return v_uint8x64(_mm512_permutexvar_epi16(_v512_set_epu16(31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4,
+                                                               27, 19, 11, 3, 26, 18, 10, 2, 25, 17, 9, 1, 24, 16, 8, 0), _mm512_packs_epi16(abcd, efgh)));
+}
+
+/* Recombine */
+// it's up there with the load and store operations
+
+/* Extract */
+#define OPENCV_HAL_IMPL_AVX512_EXTRACT(_Tpvec) \
+    template<int s> \
+    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+    { return v_rotate_right<s>(a, b); }
+
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint8x64)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int8x64)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint16x32)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int16x32)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint32x16)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int32x16)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint64x8) +OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int64x8) +OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float32x16) +OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float64x8) + + +///////////////////// load deinterleave ///////////////////////////// + +inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b ) +{ + __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); +#if CV_AVX_512VBMI + __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96, + 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, + 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, + 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97, + 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, + 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, + 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + a = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask0, ab1)); + b = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask1, ab1)); +#else + __m512i mask0 = _mm512_set4_epi32(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200); + __m512i a0b0 = _mm512_shuffle_epi8(ab0, mask0); + __m512i a1b1 = _mm512_shuffle_epi8(ab1, mask0); + __m512i mask1 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask2 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1); + a = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask1, a1b1)); + b = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask2, a1b1)); +#endif +} + +inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b ) +{ + __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); + __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, + 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, + 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + a = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask0, ab1)); + b = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask1, ab1)); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b ) +{ + __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); + __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + a = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask0, ab1)); + b = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask1, ab1)); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b ) +{ + __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 8)); + __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1); + a = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask0, ab1)); + b = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask1, ab1)); +} + +inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g, v_uint8x64& r ) +{ + __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); + 
__m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 128)); + +#if CV_AVX_512VBMI2 + __m512i mask0 = _v512_set_epu8(126, 123, 120, 117, 114, 111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81, + 78, 75, 72, 69, 66, 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33, + 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 62, 59, 56, 53, 50, + 47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2); + __m512i r0b01 = _mm512_permutex2var_epi8(bgr0, mask0, bgr1); + __m512i b1g12 = _mm512_permutex2var_epi8(bgr1, mask0, bgr2); + __m512i r12b2 = _mm512_permutex2var_epi8(bgr1, + _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80, + 77, 74, 71, 68, 65, 127, 124, 121, 118, 115, 112, 109, 106, 103, 100, 97, + 94, 91, 88, 85, 82, 79, 76, 73, 70, 67, 64, 61, 58, 55, 52, 49, + 46, 43, 40, 37, 34, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1), bgr2); + b = v_uint8x64(_mm512_mask_compress_epi8(r12b2, 0xffffffffffe00000, r0b01)); + g = v_uint8x64(_mm512_mask_compress_epi8(b1g12, 0x2492492492492492, bgr0)); + r = v_uint8x64(_mm512_mask_expand_epi8(r0b01, 0xffffffffffe00000, r12b2)); +#elif CV_AVX_512VBMI + __m512i b0g0b1 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr1, bgr0); + __m512i g1r1g2 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr2, bgr1); + __m512i r2b2r0 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr0, bgr2); + b = v_uint8x64(_mm512_permutex2var_epi8(b0g0b1, _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80, + 77, 74, 71, 68, 65, 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, + 46, 45, 43, 42, 40, 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, + 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0), bgr2)); + g = v_uint8x64(_mm512_permutex2var_epi8(g1r1g2, _v512_set_epu8( 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, 46, 45, 43, 42, 40, + 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, 23, 21, 20, 18, 17, + 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 126, 123, 120, 117, 114, + 111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81, 78, 75, 72, 69, 66), bgr0)); + r = v_uint8x64(_mm512_permutex2var_epi8(r2b2r0, _v512_set_epu8( 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33, 30, 27, 24, 21, 18, + 15, 12, 9, 6, 3, 0, 125, 122, 119, 116, 113, 110, 107, 104, 101, 98, + 95, 92, 89, 86, 83, 80, 77, 74, 71, 68, 65, 62, 59, 56, 53, 50, + 47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2), bgr1)); +#else + __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48, + 45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); + __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1); + __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2); + __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0); + + __m512i b0g0 = _mm512_mask_blend_epi32(0xf800, b01g1, r12b2); + __m512i r0b1 = _mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17, + 14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0); + __m512i g1r1 = _mm512_alignr_epi32(r12b2, g20r0, 11); + b = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b0g0, r0b1)); + r = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r0b1, g1r1)); + g = v_uint8x64(_mm512_shuffle_epi8(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1r1, b0g0), _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001))); +#endif +} + +inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& b, v_uint16x32& g, v_uint16x32& r ) +{ + __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgr1 = 
_mm512_loadu_si512((const __m512i*)(ptr + 32)); + __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); + + __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48, + 45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); + __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1); + __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2); + __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0); + + b = v_uint16x32(_mm512_mask_blend_epi32(0xf800, b01g1, r12b2)); + g = v_uint16x32(_mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17, + 14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0)); + r = v_uint16x32(_mm512_alignr_epi32(r12b2, g20r0, 11)); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& b, v_uint32x16& g, v_uint32x16& r ) +{ + __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); + __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); + + __m512i mask0 = _v512_set_epu32(29, 26, 23, 20, 17, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); + __m512i b01r1 = _mm512_permutex2var_epi32(bgr0, mask0, bgr1); + __m512i g12b2 = _mm512_permutex2var_epi32(bgr1, mask0, bgr2); + __m512i r20g0 = _mm512_permutex2var_epi32(bgr2, mask0, bgr0); + + b = v_uint32x16(_mm512_mask_blend_epi32(0xf800, b01r1, g12b2)); + g = v_uint32x16(_mm512_alignr_epi32(g12b2, r20g0, 11)); + r = v_uint32x16(_mm512_permutex2var_epi32(bgr1, _v512_set_epu32(21, 20, 19, 18, 17, 16, 13, 10, 7, 4, 1, 26, 25, 24, 23, 22), r20g0)); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& b, v_uint64x8& g, v_uint64x8& r ) +{ + __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 8)); + __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); + + __m512i mask0 = _v512_set_epu64(13, 10, 15, 12, 9, 6, 3, 0); + __m512i b01g1 = _mm512_permutex2var_epi64(bgr0, mask0, bgr1); + __m512i r12b2 = _mm512_permutex2var_epi64(bgr1, mask0, bgr2); + __m512i g20r0 = _mm512_permutex2var_epi64(bgr2, mask0, bgr0); + + b = v_uint64x8(_mm512_mask_blend_epi64(0xc0, b01g1, r12b2)); + r = v_uint64x8(_mm512_alignr_epi64(r12b2, g20r0, 6)); + g = v_uint64x8(_mm512_permutex2var_epi64(bgr1, _v512_set_epu64(10, 9, 8, 5, 2, 13, 12, 11), g20r0)); +} + +inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g, v_uint8x64& r, v_uint8x64& a ) +{ + __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); + __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 128)); + __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 192)); + +#if CV_AVX_512VBMI + __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96, + 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, + 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, + 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97, + 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, + 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, + 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + __m512i br01 = _mm512_permutex2var_epi8(bgra0, mask0, bgra1); + __m512i ga01 = _mm512_permutex2var_epi8(bgra0, 
mask1, bgra1); + __m512i br23 = _mm512_permutex2var_epi8(bgra2, mask0, bgra3); + __m512i ga23 = _mm512_permutex2var_epi8(bgra2, mask1, bgra3); + + b = v_uint8x64(_mm512_permutex2var_epi8(br01, mask0, br23)); + r = v_uint8x64(_mm512_permutex2var_epi8(br01, mask1, br23)); + g = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask0, ga23)); + a = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask1, ga23)); +#else + __m512i mask = _mm512_set4_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400); + __m512i b0g0r0a0 = _mm512_shuffle_epi8(bgra0, mask); + __m512i b1g1r1a1 = _mm512_shuffle_epi8(bgra1, mask); + __m512i b2g2r2a2 = _mm512_shuffle_epi8(bgra2, mask); + __m512i b3g3r3a3 = _mm512_shuffle_epi8(bgra3, mask); + + __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + __m512i br01 = _mm512_permutex2var_epi32(b0g0r0a0, mask0, b1g1r1a1); + __m512i ga01 = _mm512_permutex2var_epi32(b0g0r0a0, mask1, b1g1r1a1); + __m512i br23 = _mm512_permutex2var_epi32(b2g2r2a2, mask0, b3g3r3a3); + __m512i ga23 = _mm512_permutex2var_epi32(b2g2r2a2, mask1, b3g3r3a3); + + b = v_uint8x64(_mm512_permutex2var_epi32(br01, mask0, br23)); + r = v_uint8x64(_mm512_permutex2var_epi32(br01, mask1, br23)); + g = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask0, ga23)); + a = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask1, ga23)); +#endif +} + +inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& b, v_uint16x32& g, v_uint16x32& r, v_uint16x32& a ) +{ + __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); + __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); + __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 96)); + + __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, + 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, + 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + __m512i br01 = _mm512_permutex2var_epi16(bgra0, mask0, bgra1); + __m512i ga01 = _mm512_permutex2var_epi16(bgra0, mask1, bgra1); + __m512i br23 = _mm512_permutex2var_epi16(bgra2, mask0, bgra3); + __m512i ga23 = _mm512_permutex2var_epi16(bgra2, mask1, bgra3); + + b = v_uint16x32(_mm512_permutex2var_epi16(br01, mask0, br23)); + r = v_uint16x32(_mm512_permutex2var_epi16(br01, mask1, br23)); + g = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask0, ga23)); + a = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask1, ga23)); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& b, v_uint32x16& g, v_uint32x16& r, v_uint32x16& a ) +{ + __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); + __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); + __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 48)); + + __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + __m512i br01 = _mm512_permutex2var_epi32(bgra0, mask0, bgra1); + __m512i ga01 = _mm512_permutex2var_epi32(bgra0, mask1, bgra1); + __m512i br23 = _mm512_permutex2var_epi32(bgra2, mask0, bgra3); + __m512i ga23 = _mm512_permutex2var_epi32(bgra2, mask1, bgra3); + + b = 
v_uint32x16(_mm512_permutex2var_epi32(br01, mask0, br23)); + r = v_uint32x16(_mm512_permutex2var_epi32(br01, mask1, br23)); + g = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask0, ga23)); + a = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask1, ga23)); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& b, v_uint64x8& g, v_uint64x8& r, v_uint64x8& a ) +{ + __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 8)); + __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); + __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 24)); + + __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1); + + __m512i br01 = _mm512_permutex2var_epi64(bgra0, mask0, bgra1); + __m512i ga01 = _mm512_permutex2var_epi64(bgra0, mask1, bgra1); + __m512i br23 = _mm512_permutex2var_epi64(bgra2, mask0, bgra3); + __m512i ga23 = _mm512_permutex2var_epi64(bgra2, mask1, bgra3); + + b = v_uint64x8(_mm512_permutex2var_epi64(br01, mask0, br23)); + r = v_uint64x8(_mm512_permutex2var_epi64(br01, mask1, br23)); + g = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask0, ga23)); + a = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask1, ga23)); +} + +///////////////////////////// store interleave ///////////////////////////////////// + +inline void v_store_interleave( uchar* ptr, const v_uint8x64& x, const v_uint8x64& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint8x64 low, high; + v_zip(x, y, low, high); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, low.val); + _mm512_stream_si512((__m512i*)(ptr + 64), high.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, low.val); + _mm512_store_si512((__m512i*)(ptr + 64), high.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, low.val); + _mm512_storeu_si512((__m512i*)(ptr + 64), high.val); + } +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x32& x, const v_uint16x32& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint16x32 low, high; + v_zip(x, y, low, high); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, low.val); + _mm512_stream_si512((__m512i*)(ptr + 32), high.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, low.val); + _mm512_store_si512((__m512i*)(ptr + 32), high.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, low.val); + _mm512_storeu_si512((__m512i*)(ptr + 32), high.val); + } +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x16& x, const v_uint32x16& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint32x16 low, high; + v_zip(x, y, low, high); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, low.val); + _mm512_stream_si512((__m512i*)(ptr + 16), high.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, low.val); + _mm512_store_si512((__m512i*)(ptr + 16), high.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, low.val); + _mm512_storeu_si512((__m512i*)(ptr + 16), high.val); + } +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x8& x, const v_uint64x8& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint64x8 low, high; + v_zip(x, y, low, high); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, low.val); + _mm512_stream_si512((__m512i*)(ptr + 8), high.val); + } + else if( 
mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, low.val); + _mm512_store_si512((__m512i*)(ptr + 8), high.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, low.val); + _mm512_storeu_si512((__m512i*)(ptr + 8), high.val); + } +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x64& b, const v_uint8x64& g, const v_uint8x64& r, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ +#if CV_AVX_512VBMI + __m512i mask0 = _v512_set_epu8(127, 84, 20, 126, 83, 19, 125, 82, 18, 124, 81, 17, 123, 80, 16, 122, + 79, 15, 121, 78, 14, 120, 77, 13, 119, 76, 12, 118, 75, 11, 117, 74, + 10, 116, 73, 9, 115, 72, 8, 114, 71, 7, 113, 70, 6, 112, 69, 5, + 111, 68, 4, 110, 67, 3, 109, 66, 2, 108, 65, 1, 107, 64, 0, 106); + __m512i mask1 = _v512_set_epu8( 21, 42, 105, 20, 41, 104, 19, 40, 103, 18, 39, 102, 17, 38, 101, 16, + 37, 100, 15, 36, 99, 14, 35, 98, 13, 34, 97, 12, 33, 96, 11, 32, + 95, 10, 31, 94, 9, 30, 93, 8, 29, 92, 7, 28, 91, 6, 27, 90, + 5, 26, 89, 4, 25, 88, 3, 24, 87, 2, 23, 86, 1, 22, 85, 0); + __m512i mask2 = _v512_set_epu8(106, 127, 63, 105, 126, 62, 104, 125, 61, 103, 124, 60, 102, 123, 59, 101, + 122, 58, 100, 121, 57, 99, 120, 56, 98, 119, 55, 97, 118, 54, 96, 117, + 53, 95, 116, 52, 94, 115, 51, 93, 114, 50, 92, 113, 49, 91, 112, 48, + 90, 111, 47, 89, 110, 46, 88, 109, 45, 87, 108, 44, 86, 107, 43, 85); + __m512i r2g0r0 = _mm512_permutex2var_epi8(g.val, mask0, r.val); + __m512i b0r1b1 = _mm512_permutex2var_epi8(b.val, mask1, r.val); + __m512i g1b2g2 = _mm512_permutex2var_epi8(b.val, mask2, g.val); + + __m512i bgr0 = _mm512_mask_blend_epi8(0x9249249249249249, r2g0r0, b0r1b1); + __m512i bgr1 = _mm512_mask_blend_epi8(0x9249249249249249, b0r1b1, g1b2g2); + __m512i bgr2 = _mm512_mask_blend_epi8(0x9249249249249249, g1b2g2, r2g0r0); +#else + __m512i g1g0 = _mm512_shuffle_epi8(g.val, _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001)); + __m512i b0g0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, g1g0); + __m512i r0b1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r.val, b.val); + __m512i g1r1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1g0, r.val); + + __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37, + 5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0); + __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16, + 47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42); + __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58, + 26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21); + __m512i b0g0b2 = _mm512_permutex2var_epi16(b0g0, mask0, r0b1); + __m512i r1b1r0 = _mm512_permutex2var_epi16(b0g0, mask1, g1r1); + __m512i g2r2g1 = _mm512_permutex2var_epi16(r0b1, mask2, g1r1); + + __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0); + __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1); + __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2); +#endif + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgr0); + _mm512_stream_si512((__m512i*)(ptr + 64), bgr1); + _mm512_stream_si512((__m512i*)(ptr + 128), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgr0); + _mm512_store_si512((__m512i*)(ptr + 64), bgr1); + _mm512_store_si512((__m512i*)(ptr + 128), bgr2); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgr0); + _mm512_storeu_si512((__m512i*)(ptr + 64), bgr1); + 
_mm512_storeu_si512((__m512i*)(ptr + 128), bgr2); + } +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x32& b, const v_uint16x32& g, const v_uint16x32& r, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37, + 5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0); + __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16, + 47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42); + __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58, + 26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21); + __m512i b0g0b2 = _mm512_permutex2var_epi16(b.val, mask0, g.val); + __m512i r1b1r0 = _mm512_permutex2var_epi16(b.val, mask1, r.val); + __m512i g2r2g1 = _mm512_permutex2var_epi16(g.val, mask2, r.val); + + __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0); + __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1); + __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgr0); + _mm512_stream_si512((__m512i*)(ptr + 32), bgr1); + _mm512_stream_si512((__m512i*)(ptr + 64), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgr0); + _mm512_store_si512((__m512i*)(ptr + 32), bgr1); + _mm512_store_si512((__m512i*)(ptr + 64), bgr2); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgr0); + _mm512_storeu_si512((__m512i*)(ptr + 32), bgr1); + _mm512_storeu_si512((__m512i*)(ptr + 64), bgr2); + } +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x16& b, const v_uint32x16& g, const v_uint32x16& r, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + __m512i mask0 = _v512_set_epu32(26, 31, 15, 25, 30, 14, 24, 29, 13, 23, 28, 12, 22, 27, 11, 21); + __m512i mask1 = _v512_set_epu32(31, 10, 25, 30, 9, 24, 29, 8, 23, 28, 7, 22, 27, 6, 21, 26); + __m512i g1b2g2 = _mm512_permutex2var_epi32(b.val, mask0, g.val); + __m512i r2r1b1 = _mm512_permutex2var_epi32(b.val, mask1, r.val); + + __m512i bgr0 = _mm512_mask_expand_epi32(_mm512_mask_expand_epi32(_mm512_maskz_expand_epi32(0x9249, b.val), 0x2492, g.val), 0x4924, r.val); + __m512i bgr1 = _mm512_mask_blend_epi32(0x9249, r2r1b1, g1b2g2); + __m512i bgr2 = _mm512_mask_blend_epi32(0x9249, g1b2g2, r2r1b1); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgr0); + _mm512_stream_si512((__m512i*)(ptr + 16), bgr1); + _mm512_stream_si512((__m512i*)(ptr + 32), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgr0); + _mm512_store_si512((__m512i*)(ptr + 16), bgr1); + _mm512_store_si512((__m512i*)(ptr + 32), bgr2); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgr0); + _mm512_storeu_si512((__m512i*)(ptr + 16), bgr1); + _mm512_storeu_si512((__m512i*)(ptr + 32), bgr2); + } +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x8& b, const v_uint64x8& g, const v_uint64x8& r, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + __m512i mask0 = _v512_set_epu64( 5, 12, 7, 4, 11, 6, 3, 10); + __m512i mask1 = _v512_set_epu64(15, 7, 4, 14, 6, 3, 13, 5); + __m512i r1b1b2 = _mm512_permutex2var_epi64(b.val, mask0, r.val); + __m512i g2r2g1 = _mm512_permutex2var_epi64(g.val, mask1, r.val); + + __m512i bgr0 = _mm512_mask_expand_epi64(_mm512_mask_expand_epi64(_mm512_maskz_expand_epi64(0x49, b.val), 0x92, g.val), 0x24, r.val); + 
__m512i bgr1 = _mm512_mask_blend_epi64(0xdb, g2r2g1, r1b1b2); + __m512i bgr2 = _mm512_mask_blend_epi64(0xdb, r1b1b2, g2r2g1); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgr0); + _mm512_stream_si512((__m512i*)(ptr + 8), bgr1); + _mm512_stream_si512((__m512i*)(ptr + 16), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgr0); + _mm512_store_si512((__m512i*)(ptr + 8), bgr1); + _mm512_store_si512((__m512i*)(ptr + 16), bgr2); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgr0); + _mm512_storeu_si512((__m512i*)(ptr + 8), bgr1); + _mm512_storeu_si512((__m512i*)(ptr + 16), bgr2); + } +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x64& b, const v_uint8x64& g, + const v_uint8x64& r, const v_uint8x64& a, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint8x64 br01, br23, ga01, ga23; + v_zip(b, r, br01, br23); + v_zip(g, a, ga01, ga23); + v_uint8x64 bgra0, bgra1, bgra2, bgra3; + v_zip(br01, ga01, bgra0, bgra1); + v_zip(br23, ga23, bgra2, bgra3); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgra0.val); + _mm512_stream_si512((__m512i*)(ptr + 64), bgra1.val); + _mm512_stream_si512((__m512i*)(ptr + 128), bgra2.val); + _mm512_stream_si512((__m512i*)(ptr + 192), bgra3.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgra0.val); + _mm512_store_si512((__m512i*)(ptr + 64), bgra1.val); + _mm512_store_si512((__m512i*)(ptr + 128), bgra2.val); + _mm512_store_si512((__m512i*)(ptr + 192), bgra3.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgra0.val); + _mm512_storeu_si512((__m512i*)(ptr + 64), bgra1.val); + _mm512_storeu_si512((__m512i*)(ptr + 128), bgra2.val); + _mm512_storeu_si512((__m512i*)(ptr + 192), bgra3.val); + } +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x32& b, const v_uint16x32& g, + const v_uint16x32& r, const v_uint16x32& a, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint16x32 br01, br23, ga01, ga23; + v_zip(b, r, br01, br23); + v_zip(g, a, ga01, ga23); + v_uint16x32 bgra0, bgra1, bgra2, bgra3; + v_zip(br01, ga01, bgra0, bgra1); + v_zip(br23, ga23, bgra2, bgra3); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgra0.val); + _mm512_stream_si512((__m512i*)(ptr + 32), bgra1.val); + _mm512_stream_si512((__m512i*)(ptr + 64), bgra2.val); + _mm512_stream_si512((__m512i*)(ptr + 96), bgra3.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgra0.val); + _mm512_store_si512((__m512i*)(ptr + 32), bgra1.val); + _mm512_store_si512((__m512i*)(ptr + 64), bgra2.val); + _mm512_store_si512((__m512i*)(ptr + 96), bgra3.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgra0.val); + _mm512_storeu_si512((__m512i*)(ptr + 32), bgra1.val); + _mm512_storeu_si512((__m512i*)(ptr + 64), bgra2.val); + _mm512_storeu_si512((__m512i*)(ptr + 96), bgra3.val); + } +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x16& b, const v_uint32x16& g, + const v_uint32x16& r, const v_uint32x16& a, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint32x16 br01, br23, ga01, ga23; + v_zip(b, r, br01, br23); + v_zip(g, a, ga01, ga23); + v_uint32x16 bgra0, bgra1, bgra2, bgra3; + v_zip(br01, ga01, bgra0, bgra1); + v_zip(br23, ga23, bgra2, bgra3); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgra0.val); + _mm512_stream_si512((__m512i*)(ptr + 16), bgra1.val); + 
_mm512_stream_si512((__m512i*)(ptr + 32), bgra2.val); + _mm512_stream_si512((__m512i*)(ptr + 48), bgra3.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgra0.val); + _mm512_store_si512((__m512i*)(ptr + 16), bgra1.val); + _mm512_store_si512((__m512i*)(ptr + 32), bgra2.val); + _mm512_store_si512((__m512i*)(ptr + 48), bgra3.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgra0.val); + _mm512_storeu_si512((__m512i*)(ptr + 16), bgra1.val); + _mm512_storeu_si512((__m512i*)(ptr + 32), bgra2.val); + _mm512_storeu_si512((__m512i*)(ptr + 48), bgra3.val); + } +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x8& b, const v_uint64x8& g, + const v_uint64x8& r, const v_uint64x8& a, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint64x8 br01, br23, ga01, ga23; + v_zip(b, r, br01, br23); + v_zip(g, a, ga01, ga23); + v_uint64x8 bgra0, bgra1, bgra2, bgra3; + v_zip(br01, ga01, bgra0, bgra1); + v_zip(br23, ga23, bgra2, bgra3); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgra0.val); + _mm512_stream_si512((__m512i*)(ptr + 8), bgra1.val); + _mm512_stream_si512((__m512i*)(ptr + 16), bgra2.val); + _mm512_stream_si512((__m512i*)(ptr + 24), bgra3.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgra0.val); + _mm512_store_si512((__m512i*)(ptr + 8), bgra1.val); + _mm512_store_si512((__m512i*)(ptr + 16), bgra2.val); + _mm512_store_si512((__m512i*)(ptr + 24), bgra3.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgra0.val); + _mm512_storeu_si512((__m512i*)(ptr + 8), bgra1.val); + _mm512_storeu_si512((__m512i*)(ptr + 16), bgra2.val); + _mm512_storeu_si512((__m512i*)(ptr + 24), bgra3.val); + } +} + +#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \ +{ \ + _Tpvec1 a1, b1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \ +{ \ + _Tpvec1 a1, b1, c1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \ +{ \ + _Tpvec1 a1, b1, c1, d1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ + d0 = v_reinterpret_as_##suffix0(d1); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + hal::StoreMode mode=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, mode); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \ + hal::StoreMode mode=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + const _Tpvec0& c0, const _Tpvec0& d0, \ + 
hal::StoreMode mode=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \ +} + +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int8x64, schar, s8, v_uint8x64, uchar, u8) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int16x32, short, s16, v_uint16x32, ushort, u16) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int32x16, int, s32, v_uint32x16, unsigned, u32) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float32x16, float, f32, v_uint32x16, unsigned, u32) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int64x8, int64, s64, v_uint64x8, uint64, u64) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float64x8, double, f64, v_uint64x8, uint64, u64) + +////////// Mask and checks ///////// + +/** Mask **/ +inline int64 v_signmask(const v_int8x64& a) { return (int64)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } +inline int v_signmask(const v_int16x32& a) { return (int)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } +inline int v_signmask(const v_int32x16& a) { return (int)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } +inline int v_signmask(const v_int64x8& a) { return (int)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } + +inline int64 v_signmask(const v_uint8x64& a) { return v_signmask(v_reinterpret_as_s8(a)); } +inline int v_signmask(const v_uint16x32& a) { return v_signmask(v_reinterpret_as_s16(a)); } +inline int v_signmask(const v_uint32x16& a) { return v_signmask(v_reinterpret_as_s32(a)); } +inline int v_signmask(const v_uint64x8& a) { return v_signmask(v_reinterpret_as_s64(a)); } +inline int v_signmask(const v_float32x16& a) { return v_signmask(v_reinterpret_as_s32(a)); } +inline int v_signmask(const v_float64x8& a) { return v_signmask(v_reinterpret_as_s64(a)); } + +/** Checks **/ +inline bool v_check_all(const v_int8x64& a) { return !(bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } +inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } +inline bool v_check_all(const v_int16x32& a) { return !(bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } +inline bool v_check_any(const v_int16x32& a) { return (bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } +inline bool v_check_all(const v_int32x16& a) { return !(bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } +inline bool v_check_any(const v_int32x16& a) { return (bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } +inline bool v_check_all(const v_int64x8& a) { return !(bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } +inline bool v_check_any(const v_int64x8& a) { return (bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } + +inline bool v_check_all(const v_float32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); } +inline bool v_check_any(const v_float32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); } +inline bool v_check_all(const v_float64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); } +inline bool v_check_any(const v_float64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); } +inline bool v_check_all(const v_uint8x64& a) { return 
v_check_all(v_reinterpret_as_s8(a)); } +inline bool v_check_all(const v_uint16x32& a) { return v_check_all(v_reinterpret_as_s16(a)); } +inline bool v_check_all(const v_uint32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); } +inline bool v_check_all(const v_uint64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); } +inline bool v_check_any(const v_uint8x64& a) { return v_check_any(v_reinterpret_as_s8(a)); } +inline bool v_check_any(const v_uint16x32& a) { return v_check_any(v_reinterpret_as_s16(a)); } +inline bool v_check_any(const v_uint32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); } +inline bool v_check_any(const v_uint64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); } + +inline void v512_cleanup() { _mm256_zeroall(); } + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +//! @endcond + +} // cv:: + +#endif // OPENCV_HAL_INTRIN_AVX512_HPP diff --git a/modules/core/include/opencv2/core/hal/intrin_forward.hpp b/modules/core/include/opencv2/core/hal/intrin_forward.hpp index 4618552907..6873633165 100644 --- a/modules/core/include/opencv2/core/hal/intrin_forward.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_forward.hpp @@ -14,9 +14,32 @@ namespace cv CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN /** Types **/ -#if CV__SIMD_FORWARD == 512 -// [todo] 512 -#error "AVX512 Not implemented yet" +#if CV__SIMD_FORWARD == 1024 +// [todo] 1024 +#error "1024-long ops not implemented yet" +#elif CV__SIMD_FORWARD == 512 +// 512 +#define __CV_VX(fun) v512_##fun +#define __CV_V_UINT8 v_uint8x64 +#define __CV_V_INT8 v_int8x64 +#define __CV_V_UINT16 v_uint16x32 +#define __CV_V_INT16 v_int16x32 +#define __CV_V_UINT32 v_uint32x16 +#define __CV_V_INT32 v_int32x16 +#define __CV_V_UINT64 v_uint64x8 +#define __CV_V_INT64 v_int64x8 +#define __CV_V_FLOAT32 v_float32x16 +#define __CV_V_FLOAT64 v_float64x8 +struct v_uint8x64; +struct v_int8x64; +struct v_uint16x32; +struct v_int16x32; +struct v_uint32x16; +struct v_int32x16; +struct v_uint64x8; +struct v_int64x8; +struct v_float32x16; +struct v_float64x8; #elif CV__SIMD_FORWARD == 256 // 256 #define __CV_VX(fun) v256_##fun diff --git a/modules/core/test/test_intrin.cpp b/modules/core/test/test_intrin.cpp index 15ace206c8..9bc4981cda 100644 --- a/modules/core/test/test_intrin.cpp +++ b/modules/core/test/test_intrin.cpp @@ -7,11 +7,15 @@ #include "test_intrin128.simd_declarations.hpp" #undef CV_CPU_DISPATCH_MODES_ALL - #include "opencv2/core/cv_cpu_dispatch.h" #include "test_intrin256.simd.hpp" #include "test_intrin256.simd_declarations.hpp" +#undef CV_CPU_DISPATCH_MODES_ALL +#include "opencv2/core/cv_cpu_dispatch.h" +#include "test_intrin512.simd.hpp" +#include "test_intrin512.simd_declarations.hpp" + #ifdef _MSC_VER # pragma warning(disable:4702) // unreachable code #endif @@ -30,6 +34,11 @@ namespace opencv_test { namespace hal { throw SkipTestException("SIMD256 (" #cpu_opt ") is not available or disabled"); \ } while(0) +#define DISPATCH_SIMD512(fn, cpu_opt) do { \ + CV_CPU_CALL_ ## cpu_opt ## _(fn, ()); \ + throw SkipTestException("SIMD512 (" #cpu_opt ") is not available or disabled"); \ +} while(0) + #define DEFINE_SIMD_TESTS(simd_size, cpu_opt) \ TEST(hal_intrin ## simd_size, uint8x16_ ## cpu_opt) { DISPATCH_SIMD ## simd_size(test_hal_intrin_uint8, cpu_opt); } \ TEST(hal_intrin ## simd_size, int8x16_ ## cpu_opt) { DISPATCH_SIMD ## simd_size(test_hal_intrin_int8, cpu_opt); } \ @@ -67,6 +76,9 @@ DEFINE_SIMD_TESTS(128, AVX) #if defined CV_CPU_DISPATCH_COMPILE_AVX2 || defined CV_CPU_BASELINE_COMPILE_AVX2 DEFINE_SIMD_TESTS(128, AVX2) #endif +#if 
defined CV_CPU_DISPATCH_COMPILE_AVX512_SKX || defined CV_CPU_BASELINE_COMPILE_AVX512_SKX +DEFINE_SIMD_TESTS(128, AVX512_SKX) +#endif TEST(hal_intrin128, float16x8_FP16) { @@ -91,6 +103,10 @@ namespace intrin256 { DEFINE_SIMD_TESTS(256, AVX2) #endif +#if defined CV_CPU_DISPATCH_COMPILE_AVX512_SKX || defined CV_CPU_BASELINE_COMPILE_AVX512_SKX +DEFINE_SIMD_TESTS(256, AVX512_SKX) +#endif + TEST(hal_intrin256, float16x16_FP16) { //CV_CPU_CALL_FP16_(test_hal_intrin_float16, ()); @@ -101,4 +117,19 @@ TEST(hal_intrin256, float16x16_FP16) } // namespace intrin256 +namespace intrin512 { + +#if defined CV_CPU_DISPATCH_COMPILE_AVX512_SKX || defined CV_CPU_BASELINE_COMPILE_AVX512_SKX + DEFINE_SIMD_TESTS(512, AVX512_SKX) +#endif + +TEST(hal_intrin512, float16x32_FP16) +{ + CV_CPU_CALL_AVX512_SKX_(test_hal_intrin_float16, ()); + throw SkipTestException("Unsupported hardware: FP16 is not available"); +} + + +} // namespace intrin512 + }} // namespace \ No newline at end of file diff --git a/modules/core/test/test_intrin512.simd.hpp b/modules/core/test/test_intrin512.simd.hpp new file mode 100644 index 0000000000..0e941bc189 --- /dev/null +++ b/modules/core/test/test_intrin512.simd.hpp @@ -0,0 +1,23 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +#if !defined CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY && \ + !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS // TODO? C++ fallback implementation for SIMD512 + +#define CV__SIMD_FORCE_WIDTH 512 +#include "opencv2/core/hal/intrin.hpp" +#undef CV__SIMD_FORCE_WIDTH + +#if CV_SIMD_WIDTH != 64 +#error "Invalid build configuration" +#endif + +#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +namespace opencv_test { namespace hal { namespace intrin512 { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN + +#include "test_intrin_utils.hpp" + +CV_CPU_OPTIMIZATION_NAMESPACE_END +}}} //namespace diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 7438d9d1af..177d61f9c7 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -811,7 +811,9 @@ template struct TheTest R a = dataA, b = dataB, c = dataC, d = dataD, e = dataE; EXPECT_EQ(2, v_signmask(a)); +#if CV_SIMD_WIDTH <= 32 EXPECT_EQ(2 | (1 << (R::nlanes / 2)) | (1 << (R::nlanes - 1)), v_signmask(b)); +#endif EXPECT_EQ(false, v_check_all(a)); EXPECT_EQ(false, v_check_all(b)); diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp index b952296279..c086050b0e 100644 --- a/modules/imgproc/src/contours.cpp +++ b/modules/imgproc/src/contours.cpp @@ -1061,10 +1061,16 @@ cvFindNextContour( CvContourScanner scanner ) } else { - v_uint8 v_prev = vx_setall_u8((uchar)prev); - for (; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) +#if CV_SIMD_WIDTH > 16 + v_uint8 vx_prev = vx_setall_u8((uchar)prev); + while (x <= width - v_uint8::nlanes && + v_check_all(vx_load((uchar*)(img + x)) == vx_prev)) + x += v_uint8::nlanes; +#endif + v_uint8x16 v_prev = v_setall_u8((uchar)prev); + for (; x <= width - v_uint8x16::nlanes; x += v_uint8x16::nlanes) { - unsigned int mask = (unsigned int)v_signmask(vx_load((uchar*)(img + x)) != v_prev); + unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(img + x)) != v_prev); if (mask) { p = img[(x += cv::trailingZeros32(mask))]; @@ -1328,10 +1334,16 @@ CvLinkedRunPoint; inline int findStartContourPoint(uchar *src_data, CvSize 
img_size, int j) { #if CV_SIMD - v_uint8 v_zero = vx_setzero_u8(); - for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes) +#if CV_SIMD_WIDTH > 16 + v_uint8 vx_zero = vx_setzero_u8(); + while (j <= img_size.width - v_uint8::nlanes && + v_check_all(vx_load((uchar*)(src_data + j)) == vx_zero)) + j += v_uint8::nlanes; +#endif + v_uint8x16 v_zero = v_setzero_u8(); + for (; j <= img_size.width - v_uint8x16::nlanes; j += v_uint8x16::nlanes) { - unsigned int mask = (unsigned int)v_signmask(vx_load((uchar*)(src_data + j)) != v_zero); + unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(src_data + j)) != v_zero); if (mask) { j += cv::trailingZeros32(mask); @@ -1353,10 +1365,16 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j) } else { - v_uint8 v_zero = vx_setzero_u8(); +#if CV_SIMD_WIDTH > 16 + v_uint8 vx_zero = vx_setzero_u8(); + while (j <= img_size.width - v_uint8::nlanes && + v_check_all(vx_load((uchar*)(src_data + j)) != vx_zero)) + j += v_uint8::nlanes; +#endif + v_uint8x16 v_zero = v_setzero_u8(); - for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes) + for (; j <= img_size.width - v_uint8x16::nlanes; j += v_uint8x16::nlanes) { - unsigned int mask = (unsigned int)v_signmask(vx_load((uchar*)(src_data + j)) == v_zero); + unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(src_data + j)) == v_zero); if (mask) { j += cv::trailingZeros32(mask); diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp index 2882f26341..ab0d3fe89f 100644 --- a/modules/imgproc/src/resize.cpp +++ b/modules/imgproc/src/resize.cpp @@ -2148,6 +2148,7 @@ public: v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); bl = s0 + s3; gl = s1 + s4; rl = s2 + s5; #elif CV_SIMD_WIDTH == 64 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; #endif @@ -2167,6 +2168,7 @@ public: v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); bh = s0 + s3; gh = s1 + s4; rh = s2 + s5; #elif CV_SIMD_WIDTH == 64 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; #endif diff --git a/modules/imgproc/src/sumpixels.cpp b/modules/imgproc/src/sumpixels.cpp index 8531fc61f4..2052b02e41 100755 --- a/modules/imgproc/src/sumpixels.cpp +++ b/modules/imgproc/src/sumpixels.cpp @@ -127,7 +127,7 @@ struct Integral_SIMD { v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); v_int32 el4l, el4h; -#if CV_AVX2 +#if CV_AVX2 && CV_SIMD_WIDTH == 32 __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2)); vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4)); vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8)); @@ -138,7 +138,7 @@ struct Integral_SIMD #else el8 += v_rotate_left<1>(el8); el8 += v_rotate_left<2>(el8); -#if CV_SIMD_WIDTH == 32 +#if CV_SIMD_WIDTH >= 32 el8 += v_rotate_left<4>(el8); #if CV_SIMD_WIDTH == 64 el8 += v_rotate_left<8>(el8); @@ -194,7 +194,7 @@ struct Integral_SIMD { v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); v_float32 el4l, el4h; -#if CV_AVX2 +#if CV_AVX2 && CV_SIMD_WIDTH == 32 __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2)); vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4)); vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8)); @@ -205,7 +205,7 @@ struct Integral_SIMD #else el8 += v_rotate_left<1>(el8); el8 += v_rotate_left<2>(el8); -#if 
CV_SIMD_WIDTH == 32 +#if CV_SIMD_WIDTH >= 32 el8 += v_rotate_left<4>(el8); #if CV_SIMD_WIDTH == 64 el8 += v_rotate_left<8>(el8);
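Editorial note on the sumpixels.cpp hunks above: relaxing "#if CV_SIMD_WIDTH == 32" to ">= 32" lets the v_rotate_left shift-and-add chain run for 512-bit registers as well. Each step of the form "el8 += v_rotate_left<k>(el8)" adds to every lane the partial sum k lanes below it, so repeating the step while doubling k yields an inclusive prefix sum across the register in log2(nlanes) steps. The scalar sketch below illustrates only that technique; it is not part of the patch, and the lane count, the array name "row", and the all-ones input are placeholders chosen for the example.

// Editorial sketch: scalar equivalent of the doubling v_rotate_left<k>() prefix-sum
// chain used by Integral_SIMD. Lane count and names are illustrative placeholders.
#include <cstdio>

int main()
{
    const int lanes = 32;                 // e.g. v_int16 lanes when CV_SIMD_WIDTH == 64
    short row[lanes];
    for (int i = 0; i < lanes; ++i)
        row[i] = 1;                       // sample input: all ones

    // Each pass mirrors one "el8 += v_rotate_left<k>(el8)" step with doubling k.
    // Iterating i downward reads only values from before the current pass,
    // matching the SIMD semantics where all lanes update simultaneously
    // (lanes shifted in from below the register are zero).
    for (int k = 1; k < lanes; k <<= 1)
        for (int i = lanes - 1; i >= k; --i)
            row[i] = (short)(row[i] + row[i - k]);

    for (int i = 0; i < lanes; ++i)
        std::printf("%d ", row[i]);       // inclusive prefix sums: 1 2 3 ... 32
    std::printf("\n");
    return 0;
}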