diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h index 42651aed5e..540fbb605c 100644 --- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h +++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h @@ -216,6 +216,11 @@ struct VZeroUpperGuard { # define CV_VSX 1 #endif +#ifdef __F16C__ +# include <immintrin.h> +# define CV_FP16 1 +#endif + #endif // !__OPENCV_BUILD && !__CUDACC (Compatibility code) diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 5bd3af33a4..6488b8bd4f 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -765,7 +765,7 @@ protected: float16_t() {} explicit float16_t(float x) { - #if CV_AVX2 + #if CV_FP16 __m128 v = _mm_load_ss(&x); w = (ushort)_mm_cvtsi128_si32(_mm_cvtps_ph(v, 0)); #else @@ -796,7 +796,7 @@ protected: operator float() const { - #if CV_AVX2 + #if CV_FP16 float f; _mm_store_ss(&f, _mm_cvtph_ps(_mm_cvtsi32_si128(w))); return f; diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index 5dc5bb567d..54e8927192 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -3121,18 +3121,39 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, un OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64) OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64) +// // FP16 +// + inline v_float32x8 v256_load_expand(const float16_t* ptr) { +#if CV_FP16 return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr))); +#else + float CV_DECL_ALIGNED(32) buf[8]; + for (int i = 0; i < 8; i++) + buf[i] = (float)ptr[i]; + return v256_load_aligned(buf); +#endif } inline void v_pack_store(float16_t* ptr, const v_float32x8& a) { +#if CV_FP16 __m128i ah = 
_mm256_cvtps_ph(a.val, 0); _mm_storeu_si128((__m128i*)ptr, ah); +#else + float CV_DECL_ALIGNED(32) buf[8]; + v_store_aligned(buf, a); + for (int i = 0; i < 8; i++) + ptr[i] = float16_t(buf[i]); +#endif } +// +// end of FP16 +// + inline void v256_cleanup() { _mm256_zeroall(); } CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END diff --git a/modules/core/src/convert.simd.hpp b/modules/core/src/convert.simd.hpp index a16a1a8405..4af5533870 100644 --- a/modules/core/src/convert.simd.hpp +++ b/modules/core/src/convert.simd.hpp @@ -5,6 +5,11 @@ #include "precomp.hpp" #include "convert.hpp" +#if !defined(OPENCV_SUPRESS_WARNING_AVX2_WITHOUT_FP16C) && \ + (defined(__GNUC__) && defined(__AVX2__) && !defined(__F16C__)) +#warning "Non-optimal compiler flags: AVX2 without FP16. Generated code is very slow. Consider adding '-mf16c' compiler option." +#endif + namespace cv { CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN diff --git a/modules/core/test/test_intrin.cpp b/modules/core/test/test_intrin.cpp index 321fa64264..71d61e14e0 100644 --- a/modules/core/test/test_intrin.cpp +++ b/modules/core/test/test_intrin.cpp @@ -126,9 +126,11 @@ DEFINE_SIMD_TESTS(256, AVX512_SKX) TEST(hal_intrin256, float16x16_FP16) { +#if CV_TRY_FP16 //CV_CPU_CALL_FP16_(test_hal_intrin_float16, ()); CV_CPU_CALL_AVX2_(test_hal_intrin_float16, ()); - throw SkipTestException("Unsupported hardware: FP16 is not available"); +#endif + throw SkipTestException("Unsupported: FP16 is not available"); } @@ -142,8 +144,10 @@ namespace intrin512 { TEST(hal_intrin512, float16x32_FP16) { +#if CV_TRY_FP16 CV_CPU_CALL_AVX512_SKX_(test_hal_intrin_float16, ()); - throw SkipTestException("Unsupported hardware: FP16 is not available"); +#endif + throw SkipTestException("Unsupported: FP16 is not available"); } diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 6731091463..84da496b42 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -1902,21 
+1902,21 @@ void test_hal_intrin_float64() #endif } -#if CV_FP16 void test_hal_intrin_float16() { DUMP_ENTRY(v_float16); #if CV_FP16 TheTest<v_float32>() .test_loadstore_fp16_f32() -#endif #if CV_SIMD_FP16 .test_loadstore_fp16() .test_float_cvt_fp16() #endif ; -} +#else + std::cout << "SKIP: CV_FP16 is not available" << std::endl; #endif +} /*#if defined(CV_CPU_DISPATCH_MODE_FP16) && CV_CPU_DISPATCH_MODE == FP16 void test_hal_intrin_float16()