diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index 04f10aee28..8aace0eb0c 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -8,6 +8,7 @@ # CPU_{opt}_SUPPORTED=ON/OFF - compiler support (possibly with additional flag) # CPU_{opt}_IMPLIES= # CPU_{opt}_FORCE= - subset of "implies" list +# CPU_{opt}_GROUP= - similar to "implies" list, but additionally merges compiler flags # CPU_{opt}_FLAGS_ON="" # CPU_{opt}_FEATURE_ALIAS - mapping to CV_CPU_* HWFeature enum @@ -26,7 +27,7 @@ # # CPU_DISPATCH_FLAGS_${opt} - flags for source files compiled separately (.avx2.cpp) -set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F") +set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F;AVX512_SKX") list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16) list(APPEND CPU_ALL_OPTIMIZATIONS VSX) list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS) @@ -145,7 +146,9 @@ elseif(" ${CMAKE_CXX_FLAGS} " MATCHES " -march=native | -xHost | /QxHost ") endif() if(X86 OR X86_64) - ocv_update(CPU_KNOWN_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;POPCNT;SSE4_2;FP16;FMA3;AVX;AVX2;AVX_512F") + ocv_update(CPU_KNOWN_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;POPCNT;SSE4_2;FP16;FMA3;AVX;AVX2;AVX_512F;AVX512_SKX") + + ocv_update(CPU_AVX512_SKX_GROUP "AVX_512F;AVX_512CD;AVX_512BW;AVX_512DQ;AVX_512VL") ocv_update(CPU_SSE_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_sse.cpp") ocv_update(CPU_SSE2_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_sse2.cpp") @@ -158,6 +161,7 @@ if(X86 OR X86_64) ocv_update(CPU_AVX2_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx2.cpp") ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp") ocv_update(CPU_AVX_512F_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx512.cpp") + ocv_update(CPU_AVX512_SKX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx512skx.cpp") if(NOT OPENCV_CPU_OPT_IMPLIES_IGNORE) ocv_update(CPU_AVX_512F_IMPLIES "AVX2") @@ -206,6 +210,7 @@ if(X86 OR X86_64) ocv_intel_compiler_optimization_option(SSE "-msse" "/arch:SSE") endif() ocv_intel_compiler_optimization_option(AVX_512F "-march=common-avx512" "/arch:COMMON-AVX512") + ocv_intel_compiler_optimization_option(AVX512_SKX "-march=core-avx512" "/arch:CORE-AVX512") elseif(CMAKE_COMPILER_IS_GNUCXX) ocv_update(CPU_AVX2_FLAGS_ON "-mavx2") ocv_update(CPU_FP16_FLAGS_ON "-mf16c") @@ -221,6 +226,7 @@ if(X86 OR X86_64) if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.0") # -mavx512f -mavx512pf -mavx512er -mavx512cd -mavx512vl -mavx512bw -mavx512dq -mavx512ifma -mavx512vbmi ocv_update(CPU_AVX_512F_FLAGS_ON "-mavx512f") + ocv_update(CPU_AVX512_SKX_FLAGS_ON "-mavx512f -mavx512cd -mavx512vl -mavx512bw -mavx512dq") endif() elseif(MSVC) ocv_update(CPU_AVX2_FLAGS_ON "/arch:AVX2") @@ -348,6 +354,18 @@ endmacro() foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS}) set(CPU_${OPT}_USAGE_COUNT 0 CACHE INTERNAL "") + if(DEFINED CPU_${OPT}_GROUP) + if(NOT DEFINED CPU_${OPT}_IMPLIES) + set(CPU_${OPT}_IMPLIES "${CPU_${OPT}_GROUP}") + endif() + if(NOT DEFINED CPU_${OPT}_FLAGS_ON) + set(__flags "") + foreach(OPT2 ${CPU_${OPT}_GROUP}) + set(__flags "${__flags} ${CPU_${OPT2}_FLAGS_ON}") + endforeach() + set(CPU_${OPT}_FLAGS_ON "${__flags}") + endif() + endif() if(NOT DEFINED CPU_${OPT}_FORCE) set(CPU_${OPT}_FORCE "${CPU_${OPT}_IMPLIES}") endif() diff --git a/cmake/checks/cpu_avx512skx.cpp b/cmake/checks/cpu_avx512skx.cpp new file mode 100644 index 0000000000..375b62ea0c --- /dev/null +++ b/cmake/checks/cpu_avx512skx.cpp @@ -0,0 +1,14 @@ +#if defined __AVX512__ || defined __AVX512F__ +#include +void test() +{ + __m512i zmm = _mm512_setzero_si512(); + __m256i a = _mm256_setzero_si256(); + __m256i b = _mm256_abs_epi64(a); // VL + __m512i c = _mm512_abs_epi8(zmm); // BW + __m512i d = _mm512_broadcast_i32x8(b); // DQ +} +#else +#error "AVX512-SKX is not supported" +#endif +int main() { return 0; } diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h index 5df7b8b4ef..bb0fed4e12 100644 --- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h +++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h @@ -86,6 +86,10 @@ # include # define CV_AVX_512F 1 #endif +#ifdef CV_CPU_COMPILE_AVX512_SKX +# include +# define CV_AVX512_SKX 1 +#endif #ifdef CV_CPU_COMPILE_FMA3 # define CV_FMA3 1 #endif @@ -222,6 +226,9 @@ struct VZeroUpperGuard { #ifndef CV_AVX_512VL # define CV_AVX_512VL 0 #endif +#ifndef CV_AVX512_SKX +# define CV_AVX512_SKX 0 +#endif #ifndef CV_NEON # define CV_NEON 0 diff --git a/modules/core/include/opencv2/core/cv_cpu_helper.h b/modules/core/include/opencv2/core/cv_cpu_helper.h index 1b939a0a19..66895a7e39 100644 --- a/modules/core/include/opencv2/core/cv_cpu_helper.h +++ b/modules/core/include/opencv2/core/cv_cpu_helper.h @@ -180,6 +180,21 @@ #endif #define __CV_CPU_DISPATCH_CHAIN_AVX_512F(fn, args, mode, ...) CV_CPU_CALL_AVX_512F(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) +#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_SKX +# define CV_TRY_AVX512_SKX 1 +# define CV_CPU_HAS_SUPPORT_AVX512_SKX 1 +# define CV_CPU_CALL_AVX512_SKX(fn, args) return (opt_AVX512_SKX::fn args) +#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_SKX +# define CV_TRY_AVX512_SKX 1 +# define CV_CPU_HAS_SUPPORT_AVX512_SKX (cv::checkHardwareSupport(CV_CPU_AVX512_SKX)) +# define CV_CPU_CALL_AVX512_SKX(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_SKX) return (opt_AVX512_SKX::fn args) +#else +# define CV_TRY_AVX512_SKX 0 +# define CV_CPU_HAS_SUPPORT_AVX512_SKX 0 +# define CV_CPU_CALL_AVX512_SKX(fn, args) +#endif +#define __CV_CPU_DISPATCH_CHAIN_AVX512_SKX(fn, args, mode, ...) CV_CPU_CALL_AVX512_SKX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) + #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON # define CV_TRY_NEON 1 # define CV_CPU_HAS_SUPPORT_NEON 1 diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index c1fcc6a172..bd28b7a9d4 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -146,7 +146,8 @@ namespace cv { namespace debug_build_guard { } using namespace debug_build_guard #define CV_CPU_AVX_512CD 15 #define CV_CPU_AVX_512DQ 16 #define CV_CPU_AVX_512ER 17 -#define CV_CPU_AVX_512IFMA512 18 +#define CV_CPU_AVX_512IFMA512 18 // deprecated +#define CV_CPU_AVX_512IFMA 18 #define CV_CPU_AVX_512PF 19 #define CV_CPU_AVX_512VBMI 20 #define CV_CPU_AVX_512VL 21 @@ -155,8 +156,11 @@ namespace cv { namespace debug_build_guard { } using namespace debug_build_guard #define CV_CPU_VSX 200 +// CPU features groups +#define CV_CPU_AVX512_SKX 256 + // when adding to this list remember to update the following enum -#define CV_HARDWARE_MAX_FEATURE 255 +#define CV_HARDWARE_MAX_FEATURE 512 /** @brief Available CPU features. */ @@ -179,14 +183,19 @@ enum CpuFeatures { CPU_AVX_512CD = 15, CPU_AVX_512DQ = 16, CPU_AVX_512ER = 17, - CPU_AVX_512IFMA512 = 18, + CPU_AVX_512IFMA512 = 18, // deprecated + CPU_AVX_512IFMA = 18, CPU_AVX_512PF = 19, CPU_AVX_512VBMI = 20, CPU_AVX_512VL = 21, CPU_NEON = 100, - CPU_VSX = 200 + CPU_VSX = 200, + + CPU_AVX512_SKX = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL + + CPU_MAX_FEATURE = 512 // see CV_HARDWARE_MAX_FEATURE }; diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index b8b65b6b42..ccf2ca0e28 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -301,7 +301,7 @@ struct HWFeatures g_hwFeatureNames[CPU_AVX_512CD] = "AVX512CD"; g_hwFeatureNames[CPU_AVX_512DQ] = "AVX512DQ"; g_hwFeatureNames[CPU_AVX_512ER] = "AVX512ER"; - g_hwFeatureNames[CPU_AVX_512IFMA512] = "AVX512IFMA"; + g_hwFeatureNames[CPU_AVX_512IFMA] = "AVX512IFMA"; g_hwFeatureNames[CPU_AVX_512PF] = "AVX512PF"; g_hwFeatureNames[CPU_AVX_512VBMI] = "AVX512VBMI"; g_hwFeatureNames[CPU_AVX_512VL] = "AVX512VL"; @@ -309,6 +309,8 @@ struct HWFeatures g_hwFeatureNames[CPU_NEON] = "NEON"; g_hwFeatureNames[CPU_VSX] = "VSX"; + + g_hwFeatureNames[CPU_AVX512_SKX] = "AVX512-SKX"; } void initialize(void) @@ -456,6 +458,11 @@ struct HWFeatures have[CV_CPU_AVX_512VBMI] = false; have[CV_CPU_AVX_512VL] = false; } + + if (have[CV_CPU_AVX_512F]) + { + have[CV_CPU_AVX512_SKX] = have[CV_CPU_AVX_512F] & have[CV_CPU_AVX_512CD] & have[CV_CPU_AVX_512BW] & have[CV_CPU_AVX_512DQ] & have[CV_CPU_AVX_512VL]; + } } #else CV_UNUSED(cpuid_data); diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt index 1bd56f7310..abe07cf9f7 100644 --- a/modules/dnn/CMakeLists.txt +++ b/modules/dnn/CMakeLists.txt @@ -13,7 +13,7 @@ endif() set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass") -ocv_add_dispatched_file("layers/layers_common" AVX AVX2 AVX_512F) +ocv_add_dispatched_file("layers/layers_common" AVX AVX2 AVX512_SKX) ocv_add_module(dnn opencv_core opencv_imgproc WRAP python matlab java js) ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-shadow -Wno-parentheses -Wmaybe-uninitialized -Wsign-promo diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index f533962f38..cd02799b62 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -384,7 +384,7 @@ public: p.is1x1_ = kernel == Size(0,0) && pad == Size(0, 0); p.useAVX = checkHardwareSupport(CPU_AVX); p.useAVX2 = checkHardwareSupport(CPU_AVX2); - p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX_512F; + p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX; int ncn = std::min(inpCn, (int)BLK_SIZE_CN); p.ofstab_.resize(kernel.width*kernel.height*ncn); @@ -564,10 +564,10 @@ public: // now compute dot product of the weights // and im2row-transformed part of the tensor int bsz = ofs1 - ofs0; - #if CV_TRY_AVX_512F + #if CV_TRY_AVX512_SKX /* AVX512 convolution requires an alignment of 16, and ROI is only there for larger vector sizes */ if(useAVX512) - opt_AVX_512F::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, + opt_AVX512_SKX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, outShape, bsz, vsz, vsz_a, relu, cn0 == 0); else #endif @@ -1102,7 +1102,7 @@ public: nstripes_ = nstripes; useAVX = checkHardwareSupport(CPU_AVX); useAVX2 = checkHardwareSupport(CPU_AVX2); - useAVX512 = CV_CPU_HAS_SUPPORT_AVX_512F; + useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX; } void operator()(const Range& range_) const @@ -1120,9 +1120,9 @@ public: size_t bstep = b_->step1(); size_t cstep = c_->step1(); - #if CV_TRY_AVX_512F + #if CV_TRY_AVX512_SKX if( useAVX512 ) - opt_AVX_512F::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax ); + opt_AVX512_SKX::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax ); else #endif #if CV_TRY_AVX2 diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index 88279d23dd..a64a895a7f 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -161,7 +161,7 @@ public: p.activ = activ; p.useAVX = checkHardwareSupport(CPU_AVX); p.useAVX2 = checkHardwareSupport(CPU_AVX2); - p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX_512F; + p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX; parallel_for_(Range(0, nstripes), p, nstripes); } @@ -196,9 +196,9 @@ public: memcpy(sptr, sptr_, vecsize*sizeof(sptr[0])); - #if CV_TRY_AVX_512F + #if CV_TRY_AVX512_SKX if( useAVX512 ) - opt_AVX_512F::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize); + opt_AVX512_SKX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize); else #endif #if CV_TRY_AVX2 diff --git a/modules/dnn/src/layers/layers_common.simd.hpp b/modules/dnn/src/layers/layers_common.simd.hpp index a480426ba6..99d5538631 100644 --- a/modules/dnn/src/layers/layers_common.simd.hpp +++ b/modules/dnn/src/layers/layers_common.simd.hpp @@ -301,7 +301,7 @@ void fastGEMM( const float* aptr, size_t astep, const float* bptr, { int n = 0; -#if CV_AVX_512F +#if CV_AVX512_SKX // AVX512VL is necessary to avoid register spilling for( ; n <= nb - 32; n += 32 ) { for( int m = 0; m < ma; m += 4 )