Mirror of https://github.com/opencv/opencv.git
Merge pull request #14210 from terfendail:wui_512
AVX512 wide universal intrinsics (#14210)

Squashed commits, each originally prefixed "Added implementation of 512-bit wide universal intrinsics (WIP)":

* Initial implementation of 512-bit wide universal intrinsics
* Implemented the WUI vector types
* Implemented load/store
* Implemented fp16 load/store
* Implemented recombine and zip; implemented non-saturating and saturating arithmetic
* Implemented bit operations
* Implemented comparisons
* Implemented lane shifts and reduction
* Implemented absolute values
* Implemented rounding and cast to float
* Implemented LUT
* Implemented type extension/narrowing and matrix operations
* Implemented load_deinterleave for 2- and 3-channel images
* Reimplemented load_deinterleave for 2-channel images and implemented it for 4-channel images
* Implemented store_interleave
* Implemented signmask and checks
* Build fixes
* Reimplemented popcount for the case when AVX512_BITALG is unavailable
* Reimplemented zip
* Reimplemented rotate for s8 and s16
* Reimplemented interleave/deinterleave for s8 and s16
* Updated the v512_set macros
* Fix for GCC's wrong _mm512_abs_pd definition
* Reworked v_zip to avoid AVX512_VBMI intrinsics
* Reworked v_invsqrt to avoid AVX512_ER intrinsics
* Reworked v_rotate, v_popcount and interleave/deinterleave for u8 to avoid AVX512_VBMI intrinsics
* Fixed the SIMD part of the integral image computation
* Fixed warnings
* Fixed load_deinterleave for u8 and u16
* Fixed v_invsqrt accuracy for f64
* Fixed interleave/deinterleave for u32 and u64
* Fixed interleave_pairs, interleave_quads and pack_triplets
* Fixed rotate_left
* Fixed rotate_left/right, part 2
* Fixed the resize implementation based on 512-bit wide universal intrinsics
* Fixed findContours by avoiding the uint64-dependent 512-wide v_signmask()
* Fixed trailing whitespace
* Reworked the parts that depend on specific intrinsic sets to check availability via CPU feature group defines
* Updated the AVX512 implementation of v_popcount to avoid AVX512VPOPCNTDQ intrinsics when they are unavailable
* Fixed universal intrinsics data initialisation, v_mul_wrap, v_floor, v_ceil and v_signmask
* Removed hasSIMD512()
* Fixes for the GCC build
* Reworked the v_signmask, v_check_any() and v_check_all() implementation

A short usage sketch of the resulting wide universal intrinsics follows the commit metadata below.
Parent: 3289a0aff9
Commit: 3b015dfc7d
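Code written once against the width-agnostic wide universal intrinsics automatically runs with 512-bit registers when the dispatched file is built with AVX512_SKX. A minimal sketch of such a kernel, assuming an OpenCV build that includes this patch; the function name add_saturate_u8 and the buffers are purely illustrative:

    #include <opencv2/core.hpp>
    #include "opencv2/core/hal/intrin.hpp"

    // Saturating per-byte addition written against the width-agnostic types:
    // v_uint8 has 16, 32 or 64 lanes depending on the compiled SIMD width,
    // and vx_load()/vx_setall_u8() map to v_load/v256_load/v512_load accordingly.
    void add_saturate_u8(const uchar* a, const uchar* b, uchar* dst, int n)
    {
        int i = 0;
    #if CV_SIMD
        for (; i <= n - cv::v_uint8::nlanes; i += cv::v_uint8::nlanes)
        {
            cv::v_uint8 va = cv::vx_load(a + i);
            cv::v_uint8 vb = cv::vx_load(b + i);
            cv::v_store(dst + i, va + vb);   // operator+ saturates for u8 lanes
        }
        cv::vx_cleanup();
    #endif
        for (; i < n; ++i)                   // scalar tail
            dst[i] = cv::saturate_cast<uchar>(a[i] + b[i]);
    }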
@@ -13,8 +13,9 @@ ocv_add_dispatched_file(split SSE2 AVX2)
ocv_add_dispatched_file(sum SSE2 AVX2)

# dispatching for accuracy tests
-ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2)
-ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2)
+ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2 AVX512_SKX)
+ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2 AVX512_SKX)
+ocv_add_dispatched_file_force_all(test_intrin512 TEST AVX512_SKX)

ocv_add_module(core
        OPTIONAL opencv_cudev
@@ -180,6 +180,18 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#endif

+// AVX512 can be used together with SSE2 and AVX2, so
+// we define those sets of intrinsics at once.
+// Some of the AVX512 intrinsics get a v512_ prefix instead of v_, e.g. v512_load() vs v_load().
+// Wide intrinsics are mapped to their v512_ counterparts in this case (e.g. vx_load() => v512_load()).
+#if CV_AVX512_SKX
+#define CV__SIMD_FORWARD 512
+#include "opencv2/core/hal/intrin_forward.hpp"
+#include "opencv2/core/hal/intrin_avx512.hpp"
+#endif
+
//! @cond IGNORED

namespace cv {
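The comment block added in this hunk describes the naming scheme the patch introduces. A brief sketch of the difference between the fixed-width and width-agnostic spellings, assuming a build where AVX512_SKX intrinsics are enabled; the function name and buffers are illustrative and must hold at least v_float32::nlanes elements:

    #include "opencv2/core/hal/intrin.hpp"

    void naming_example(const float* src, float* dst)
    {
    #if CV_SIMD512
        // Explicit 512-bit type and v512_-prefixed load: always 16 float lanes.
        cv::v_float32x16 a = cv::v512_load(src);
        cv::v_store(dst, a);
    #endif
    #if CV_SIMD
        // Width-agnostic spelling: v_float32 / vx_load() resolve to the widest
        // enabled implementation, i.e. to v_float32x16 / v512_load() under AVX512_SKX.
        cv::v_float32 b = cv::vx_load(src);
        cv::v_store(dst, b);
    #endif
    }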
@@ -321,13 +333,41 @@ template<typename _Tp> struct V_RegTraits
CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8);
#endif

+#if CV_SIMD512
+CV_DEF_REG_TRAITS(v512, v_uint8x64, uchar, u8, v_uint8x64, v_uint16x32, v_uint32x16, v_int8x64, void);
+CV_DEF_REG_TRAITS(v512, v_int8x64, schar, s8, v_uint8x64, v_int16x32, v_int32x16, v_int8x64, void);
+CV_DEF_REG_TRAITS(v512, v_uint16x32, ushort, u16, v_uint16x32, v_uint32x16, v_uint64x8, v_int16x32, void);
+CV_DEF_REG_TRAITS(v512, v_int16x32, short, s16, v_uint16x32, v_int32x16, v_int64x8, v_int16x32, void);
+CV_DEF_REG_TRAITS(v512, v_uint32x16, unsigned, u32, v_uint32x16, v_uint64x8, void, v_int32x16, void);
+CV_DEF_REG_TRAITS(v512, v_int32x16, int, s32, v_uint32x16, v_int64x8, void, v_int32x16, void);
+CV_DEF_REG_TRAITS(v512, v_float32x16, float, f32, v_float32x16, v_float64x8, void, v_int32x16, v_int32x16);
+CV_DEF_REG_TRAITS(v512, v_uint64x8, uint64, u64, v_uint64x8, void, void, v_int64x8, void);
+CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void);
+CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16);
+#endif
+
+#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
+#define CV__SIMD_NAMESPACE simd512
+namespace CV__SIMD_NAMESPACE {
+#define CV_SIMD 1
+#define CV_SIMD_64F CV_SIMD512_64F
+#define CV_SIMD_FP16 CV_SIMD512_FP16
+#define CV_SIMD_WIDTH 64
+// TODO typedef v_uint8 / v_int32 / etc types here
+typedef v_uint8x64 v_uint8;
+typedef v_int8x64 v_int8;
+typedef v_uint16x32 v_uint16;
+typedef v_int16x32 v_int16;
+typedef v_uint32x16 v_uint32;
+typedef v_int32x16 v_int32;
+typedef v_uint64x8 v_uint64;
+typedef v_int64x8 v_int64;
+typedef v_float32x16 v_float32;
+CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v512)
+#if CV_SIMD512_64F
+typedef v_float64x8 v_float64;
+CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v512, load)
+#endif
+inline void vx_cleanup() { v512_cleanup(); }
+} // namespace
+using namespace CV__SIMD_NAMESPACE;
+#elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256)
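The CV__SIMD_FORCE_WIDTH check in the condition above is what the new test_intrin512.simd.hpp (further down) relies on to pin a translation unit to one specific width. A minimal sketch of that mechanism, assuming the file is compiled with AVX512_SKX optimizations enabled (otherwise the #error fires):

    // Force the 512-bit implementation regardless of what other widths are
    // available in this build, then verify that the width-agnostic types
    // really resolved to the 64-byte registers.
    #define CV__SIMD_FORCE_WIDTH 512
    #include "opencv2/core/hal/intrin.hpp"
    #undef CV__SIMD_FORCE_WIDTH

    #if CV_SIMD_WIDTH != 64
    #error "512-bit universal intrinsics were not selected"
    #endif

    #include <cstdio>

    int main()
    {
        // v_uint8 is v_uint8x64 here, so nlanes is 64 and vx_setall_u8()
        // resolves to v512_setall_u8().
        cv::v_uint8 v = cv::vx_setall_u8(7);
        (void)v;
        std::printf("lanes: %d\n", cv::v_uint8::nlanes);
        return 0;
    }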
modules/core/include/opencv2/core/hal/intrin_avx512.hpp (new file, 2743 lines)
File diff suppressed because it is too large.
@@ -14,9 +14,32 @@ namespace cv
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

/** Types **/
-#if CV__SIMD_FORWARD == 512
-// [todo] 512
-#error "AVX512 Not implemented yet"
+#if CV__SIMD_FORWARD == 1024
+// [todo] 1024
+#error "1024-long ops not implemented yet"
+#elif CV__SIMD_FORWARD == 512
+// 512
+#define __CV_VX(fun) v512_##fun
+#define __CV_V_UINT8 v_uint8x64
+#define __CV_V_INT8 v_int8x64
+#define __CV_V_UINT16 v_uint16x32
+#define __CV_V_INT16 v_int16x32
+#define __CV_V_UINT32 v_uint32x16
+#define __CV_V_INT32 v_int32x16
+#define __CV_V_UINT64 v_uint64x8
+#define __CV_V_INT64 v_int64x8
+#define __CV_V_FLOAT32 v_float32x16
+#define __CV_V_FLOAT64 v_float64x8
+struct v_uint8x64;
+struct v_int8x64;
+struct v_uint16x32;
+struct v_int16x32;
+struct v_uint32x16;
+struct v_int32x16;
+struct v_uint64x8;
+struct v_int64x8;
+struct v_float32x16;
+struct v_float64x8;
#elif CV__SIMD_FORWARD == 256
// 256
#define __CV_VX(fun) v256_##fun
@@ -7,11 +7,15 @@
#include "test_intrin128.simd_declarations.hpp"

#undef CV_CPU_DISPATCH_MODES_ALL

#include "opencv2/core/cv_cpu_dispatch.h"
#include "test_intrin256.simd.hpp"
#include "test_intrin256.simd_declarations.hpp"

#undef CV_CPU_DISPATCH_MODES_ALL
+#include "opencv2/core/cv_cpu_dispatch.h"
+#include "test_intrin512.simd.hpp"
+#include "test_intrin512.simd_declarations.hpp"
+
#ifdef _MSC_VER
# pragma warning(disable:4702) // unreachable code
#endif
@@ -30,6 +34,11 @@ namespace opencv_test { namespace hal {
    throw SkipTestException("SIMD256 (" #cpu_opt ") is not available or disabled"); \
} while(0)

+#define DISPATCH_SIMD512(fn, cpu_opt) do { \
+    CV_CPU_CALL_ ## cpu_opt ## _(fn, ()); \
+    throw SkipTestException("SIMD512 (" #cpu_opt ") is not available or disabled"); \
+} while(0)
+
#define DEFINE_SIMD_TESTS(simd_size, cpu_opt) \
TEST(hal_intrin ## simd_size, uint8x16_ ## cpu_opt) { DISPATCH_SIMD ## simd_size(test_hal_intrin_uint8, cpu_opt); } \
TEST(hal_intrin ## simd_size, int8x16_ ## cpu_opt) { DISPATCH_SIMD ## simd_size(test_hal_intrin_int8, cpu_opt); } \
@@ -67,6 +76,9 @@ DEFINE_SIMD_TESTS(128, AVX)
#if defined CV_CPU_DISPATCH_COMPILE_AVX2 || defined CV_CPU_BASELINE_COMPILE_AVX2
DEFINE_SIMD_TESTS(128, AVX2)
#endif
+#if defined CV_CPU_DISPATCH_COMPILE_AVX512_SKX || defined CV_CPU_BASELINE_COMPILE_AVX512_SKX
+DEFINE_SIMD_TESTS(128, AVX512_SKX)
+#endif

TEST(hal_intrin128, float16x8_FP16)
{
@@ -91,6 +103,10 @@ namespace intrin256 {
DEFINE_SIMD_TESTS(256, AVX2)
#endif

+#if defined CV_CPU_DISPATCH_COMPILE_AVX512_SKX || defined CV_CPU_BASELINE_COMPILE_AVX512_SKX
+DEFINE_SIMD_TESTS(256, AVX512_SKX)
+#endif
+
TEST(hal_intrin256, float16x16_FP16)
{
    //CV_CPU_CALL_FP16_(test_hal_intrin_float16, ());
@@ -101,4 +117,19 @@ TEST(hal_intrin256, float16x16_FP16)

} // namespace intrin256

+namespace intrin512 {
+
+#if defined CV_CPU_DISPATCH_COMPILE_AVX512_SKX || defined CV_CPU_BASELINE_COMPILE_AVX512_SKX
+DEFINE_SIMD_TESTS(512, AVX512_SKX)
+#endif
+
+TEST(hal_intrin512, float16x32_FP16)
+{
+    CV_CPU_CALL_AVX512_SKX_(test_hal_intrin_float16, ());
+    throw SkipTestException("Unsupported hardware: FP16 is not available");
+}
+
+
+} // namespace intrin512
+
}} // namespace
modules/core/test/test_intrin512.simd.hpp (new file, 23 lines)
@@ -0,0 +1,23 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#if !defined CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY && \
+    !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS // TODO? C++ fallback implementation for SIMD512
+
+#define CV__SIMD_FORCE_WIDTH 512
+#include "opencv2/core/hal/intrin.hpp"
+#undef CV__SIMD_FORCE_WIDTH
+
+#if CV_SIMD_WIDTH != 64
+#error "Invalid build configuration"
+#endif
+
+#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+namespace opencv_test { namespace hal { namespace intrin512 {
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+
+#include "test_intrin_utils.hpp"
+
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+}}} //namespace
@@ -811,7 +811,9 @@ template<typename R> struct TheTest
        R a = dataA, b = dataB, c = dataC, d = dataD, e = dataE;

        EXPECT_EQ(2, v_signmask(a));
+#if CV_SIMD_WIDTH <= 32
        EXPECT_EQ(2 | (1 << (R::nlanes / 2)) | (1 << (R::nlanes - 1)), v_signmask(b));
+#endif

        EXPECT_EQ(false, v_check_all(a));
        EXPECT_EQ(false, v_check_all(b));
@@ -1061,10 +1061,16 @@ cvFindNextContour( CvContourScanner scanner )
        }
        else
        {
-            v_uint8 v_prev = vx_setall_u8((uchar)prev);
-            for (; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
+#if CV_SIMD_WIDTH > 16
+            v_uint8 vx_prev = vx_setall_u8((uchar)prev);
+            while (x <= width - v_uint8::nlanes &&
+                   v_check_all(vx_load((uchar*)(img + x)) == vx_prev))
+                x += v_uint8::nlanes;
+#endif
+            v_uint8x16 v_prev = v_setall_u8((uchar)prev);
+            for (; x <= width - v_uint8x16::nlanes; x += v_uint8x16::nlanes)
            {
-                unsigned int mask = (unsigned int)v_signmask(vx_load((uchar*)(img + x)) != v_prev);
+                unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(img + x)) != v_prev);
                if (mask)
                {
                    p = img[(x += cv::trailingZeros32(mask))];
@@ -1328,10 +1334,16 @@ CvLinkedRunPoint;
inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j)
{
#if CV_SIMD
-    v_uint8 v_zero = vx_setzero_u8();
-    for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes)
+#if CV_SIMD_WIDTH > 16
+    v_uint8 vx_zero = vx_setzero_u8();
+    while (j <= img_size.width - v_uint8::nlanes &&
+           v_check_all(vx_load((uchar*)(src_data + j)) == vx_zero))
+        j += v_uint8::nlanes;
+#endif
+    v_uint8x16 v_zero = v_setzero_u8();
+    for (; j <= img_size.width - v_uint8x16::nlanes; j += v_uint8x16::nlanes)
    {
-        unsigned int mask = (unsigned int)v_signmask(vx_load((uchar*)(src_data + j)) != v_zero);
+        unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(src_data + j)) != v_zero);
        if (mask)
        {
            j += cv::trailingZeros32(mask);
@@ -1353,10 +1365,16 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j)
    }
    else
    {
-        v_uint8 v_zero = vx_setzero_u8();
+#if CV_SIMD_WIDTH > 16
+        v_uint8 vx_zero = vx_setzero_u8();
+        while (j <= img_size.width - v_uint8::nlanes &&
+               v_check_all(vx_load((uchar*)(src_data + j)) != vx_zero))
+            j += v_uint8::nlanes;
+#endif
+        v_uint8x16 v_zero = v_setzero_u8();
        for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes)
        {
-            unsigned int mask = (unsigned int)v_signmask(vx_load((uchar*)(src_data + j)) == v_zero);
+            unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(src_data + j)) == v_zero);
            if (mask)
            {
                j += cv::trailingZeros32(mask);
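The three contour hunks above all follow the same two-stage pattern: the widest available registers are used only with v_check_all() to skip long uniform runs, and the exact position is then located with a 16-lane v_signmask(), whose result always fits in an int (a 512-bit u8 signmask would need 64 bits, which is the uint64-dependent v_signmask the commit message mentions). A self-contained sketch of that pattern on a plain byte buffer, assuming an OpenCV build with the universal intrinsics available; the function name is illustrative:

    #include <opencv2/core.hpp>
    #include "opencv2/core/hal/intrin.hpp"

    // Return the index of the first byte in [j, width) that differs from `value`,
    // or `width` if there is none.
    int find_first_not_equal(const uchar* row, int width, int j, uchar value)
    {
    #if CV_SIMD
    #if CV_SIMD_WIDTH > 16
        // Stage 1: skip whole wide registers while every lane still equals `value`.
        // v_check_all() works for any register width, so this part scales to 512 bit.
        cv::v_uint8 vx_value = cv::vx_setall_u8(value);
        while (j <= width - cv::v_uint8::nlanes &&
               cv::v_check_all(cv::vx_load(row + j) == vx_value))
            j += cv::v_uint8::nlanes;
    #endif
        // Stage 2: 128-bit registers only; a 16-lane v_signmask() fits in an int,
        // so the exact offset can be read with trailingZeros32().
        cv::v_uint8x16 v_value = cv::v_setall_u8(value);
        for (; j <= width - cv::v_uint8x16::nlanes; j += cv::v_uint8x16::nlanes)
        {
            unsigned mask = (unsigned)cv::v_signmask(cv::v_load(row + j) != v_value);
            if (mask)
                return j + cv::trailingZeros32(mask);
        }
    #endif
        for (; j < width; ++j)          // scalar tail
            if (row[j] != value)
                return j;
        return width;
    }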
@@ -2148,6 +2148,7 @@ public:
            v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
            bl = s0 + s3; gl = s1 + s4; rl = s2 + s5;
#elif CV_SIMD_WIDTH == 64
            v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
            v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
            bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
#endif
@@ -2167,6 +2168,7 @@ public:
            v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
            bh = s0 + s3; gh = s1 + s4; rh = s2 + s5;
#elif CV_SIMD_WIDTH == 64
            v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
            v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
            bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
#endif
@@ -127,7 +127,7 @@ struct Integral_SIMD<uchar, int, double>
            {
                v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
                v_int32 el4l, el4h;
-#if CV_AVX2
+#if CV_AVX2 && CV_SIMD_WIDTH == 32
                __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2));
                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4));
                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8));
@@ -138,7 +138,7 @@ struct Integral_SIMD<uchar, int, double>
#else
                el8 += v_rotate_left<1>(el8);
                el8 += v_rotate_left<2>(el8);
-#if CV_SIMD_WIDTH == 32
+#if CV_SIMD_WIDTH >= 32
                el8 += v_rotate_left<4>(el8);
#if CV_SIMD_WIDTH == 64
                el8 += v_rotate_left<8>(el8);
@@ -194,7 +194,7 @@ struct Integral_SIMD<uchar, float, double>
            {
                v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
                v_float32 el4l, el4h;
-#if CV_AVX2
+#if CV_AVX2 && CV_SIMD_WIDTH == 32
                __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2));
                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4));
                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8));
@@ -205,7 +205,7 @@ struct Integral_SIMD<uchar, float, double>
#else
                el8 += v_rotate_left<1>(el8);
                el8 += v_rotate_left<2>(el8);
-#if CV_SIMD_WIDTH == 32
+#if CV_SIMD_WIDTH >= 32
                el8 += v_rotate_left<4>(el8);
#if CV_SIMD_WIDTH == 64
                el8 += v_rotate_left<8>(el8);
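The rotate-and-add lines in these integral-image hunks accumulate a running sum across the 16-bit lanes by repeated doubling: each el8 += v_rotate_left<k>(el8) step doubles the number of neighbouring lanes folded into each lane, so wider registers simply need more steps, which is why the width test is relaxed to >= 32 and the <8> step applies to 64-byte registers. A small standalone sketch of the doubling idea on a plain array (not the OpenCV code itself):

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Inclusive prefix sum by doubling: the k-th pass adds the vector shifted
    // left by k lanes (zero-filled), mirroring el8 += v_rotate_left<k>(el8)
    // for k = 1, 2, 4, 8, ...
    template <size_t N>
    void prefix_sum_by_doubling(std::array<int16_t, N>& v)
    {
        for (size_t k = 1; k < N; k *= 2)
        {
            std::array<int16_t, N> shifted{};        // lanes below k become zero
            for (size_t i = k; i < N; ++i)
                shifted[i] = v[i - k];
            for (size_t i = 0; i < N; ++i)
                v[i] = static_cast<int16_t>(v[i] + shifted[i]);
        }
    }

    int main()
    {
        std::array<int16_t, 8> lanes = {1, 2, 3, 4, 5, 6, 7, 8};
        prefix_sum_by_doubling(lanes);               // 1 3 6 10 15 21 28 36
        for (int16_t x : lanes)
            std::printf("%d ", x);
        std::printf("\n");
        return 0;
    }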