core(intrin): restrict FP16 operations

Universal intrinsics must map to efficient native instructions, so don't declare the FP16 vector type and its operations when there is no native FP16 support.

- CV_FP16: FP16 data can be loaded into / stored from float32 vectors (hardware conversion only, no FP16 arithmetic)
- CV_SIMD_FP16: FP16 vector types and native FP16 operations are declared
Author: Alexander Alekhin
Date:   2018-08-20 13:54:03 +03:00
Parent: 3f5c3ddd27
Commit: 67d46dfc6c
5 changed files with 137 additions and 127 deletions
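For illustration only (not part of the commit): a minimal sketch of how a caller would use the conversion-only tier. Only vx_load_fp16_f32, vx_store_fp16, vx_setall_f32 and v_float32 come from the universal intrinsics API touched by this patch; the function, its loop and the omitted scalar tail are assumptions.

```cpp
#include <opencv2/core/hal/intrin.hpp>

// Hypothetical helper: multiply an FP16 buffer by a scalar, keeping the data in
// FP16 storage while doing the arithmetic in float32 registers.
static void scale_fp16(const short* src, short* dst, int n, float alpha)
{
    using namespace cv;
    int i = 0;
#if CV_SIMD && CV_FP16
    const v_float32 valpha = vx_setall_f32(alpha);
    for (; i <= n - v_float32::nlanes; i += v_float32::nlanes)
    {
        v_float32 v = vx_load_fp16_f32(src + i);  // FP16 -> float32 widening load
        vx_store_fp16(dst + i, v * valpha);       // float32 -> FP16 narrowing store
    }
#endif
    // Scalar tail / pure-scalar fallback omitted for brevity; it would convert
    // each element through float in the same way.
    (void)src; (void)dst; (void)n; (void)alpha; (void)i;
}
```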

modules/core/include/opencv2/core/hal/intrin.hpp

@@ -204,6 +204,18 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define CV_SIMD512_64F 0
#endif
#ifndef CV_SIMD128_FP16
#define CV_SIMD128_FP16 0
#endif
#ifndef CV_SIMD256_FP16
#define CV_SIMD256_FP16 0
#endif
#ifndef CV_SIMD512_FP16
#define CV_SIMD512_FP16 0
#endif
//==================================================================================================
#define CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
@@ -274,8 +286,8 @@ template<typename _Tp> struct V_RegTraits
#if CV_SIMD128_64F
CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4);
#endif
#if CV_FP16
CV_DEF_REG_TRAITS(v, v_float16x8, short, f16, v_float32x4, void, void, v_int16x8, v_int16x8);
#if CV_SIMD128_FP16
CV_DEF_REG_TRAITS(v, v_float16x8, short, f16, v_float16x8, void, void, v_int16x8, v_int16x8);
#endif
#endif
@@ -290,8 +302,8 @@ template<typename _Tp> struct V_RegTraits
CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void);
CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void);
CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8);
#if CV_FP16
CV_DEF_REG_TRAITS(v256, v_float16x16, short, f16, v_float32x8, void, void, v_int16x16, void);
#if CV_SIMD256_FP16
CV_DEF_REG_TRAITS(v256, v_float16x16, short, f16, v_float16x16, void, void, v_int16x16, void);
#endif
#endif
@@ -309,6 +321,7 @@ using namespace CV__SIMD_NAMESPACE;
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD 1
#define CV_SIMD_64F CV_SIMD256_64F
#define CV_SIMD_FP16 CV_SIMD256_FP16
#define CV_SIMD_WIDTH 32
typedef v_uint8x32 v_uint8;
typedef v_int8x32 v_int8;
@@ -323,6 +336,10 @@ namespace CV__SIMD_NAMESPACE {
typedef v_float64x4 v_float64;
#endif
#if CV_FP16
#define vx_load_fp16_f32 v256_load_fp16_f32
#define vx_store_fp16 v_store_fp16
#endif
#if CV_SIMD256_FP16
typedef v_float16x16 v_float16;
CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v256, load_f16)
#endif
@@ -336,6 +353,7 @@ using namespace CV__SIMD_NAMESPACE;
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD CV_SIMD128
#define CV_SIMD_64F CV_SIMD128_64F
#define CV_SIMD_FP16 CV_SIMD128_FP16
#define CV_SIMD_WIDTH 16
typedef v_uint8x16 v_uint8;
typedef v_int8x16 v_int8;
@@ -350,6 +368,10 @@ namespace CV__SIMD_NAMESPACE {
typedef v_float64x2 v_float64;
#endif
#if CV_FP16
#define vx_load_fp16_f32 v128_load_fp16_f32
#define vx_store_fp16 v_store_fp16
#endif
#if CV_SIMD128_FP16
typedef v_float16x8 v_float16;
CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v, load_f16)
#endif
@@ -393,6 +415,11 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
#define CV_SIMD_64F 0
#endif
#ifndef CV_SIMD_FP16
#define CV_SIMD_FP16 0 //!< Defined to 1 on native support of operations with float16x8_t / float16x16_t (SIMD256) types
#endif
#ifndef CV_SIMD
#define CV_SIMD 0
#endif
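With the defaults above, any translation unit that includes intrin.hpp can pick its FP16 strategy at compile time. A hedged probe illustrating the three tiers; only the macros and the v_float16 typedef come from the header, the function itself is hypothetical:

```cpp
#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

// Hypothetical probe: report which FP16 tier this translation unit was built with.
static void report_fp16_support()
{
#if CV_SIMD_FP16
    std::printf("native FP16 vectors: v_float16 with %d lanes\n", (int)cv::v_float16::nlanes);
#elif CV_FP16
    std::printf("conversion only: vx_load_fp16_f32 / vx_store_fp16 are available\n");
#else
    std::printf("no FP16 hardware support; use software conversion (e.g. cv::convertFp16)\n");
#endif
}
```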

modules/core/include/opencv2/core/hal/intrin_avx.hpp

@@ -7,6 +7,7 @@
#define CV_SIMD256 1
#define CV_SIMD256_64F 1
#define CV_SIMD256_FP16 0 // no native operations with FP16 type. Only load/store from float32x8 are available (if CV_FP16 == 1)
namespace cv
{
@@ -262,26 +263,6 @@ struct v_float64x4
double get0() const { return _mm_cvtsd_f64(_mm256_castpd256_pd128(val)); }
};
struct v_float16x16
{
typedef short lane_type;
enum { nlanes = 16 };
__m256i val;
explicit v_float16x16(__m256i v) : val(v) {}
v_float16x16(short v0, short v1, short v2, short v3,
short v4, short v5, short v6, short v7,
short v8, short v9, short v10, short v11,
short v12, short v13, short v14, short v15)
{
val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
}
v_float16x16() : val(_mm256_setzero_si256()) {}
short get0() const { return (short)_v_cvtsi256_si32(val); }
};
inline v_float16x16 v256_setzero_f16() { return v_float16x16(_mm256_setzero_si256()); }
inline v_float16x16 v256_setall_f16(short val) { return v_float16x16(_mm256_set1_epi16(val)); }
//////////////// Load and store operations ///////////////
#define OPENCV_HAL_IMPL_AVX_LOADSTORE(_Tpvec, _Tp) \
@@ -424,20 +405,18 @@ inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a)
inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
{ return v_float64x4(_mm256_castps_pd(a.val)); }
inline v_float16x16 v256_load_f16(const short* ptr)
{ return v_float16x16(_mm256_loadu_si256((const __m256i*)ptr)); }
inline v_float16x16 v256_load_f16_aligned(const short* ptr)
{ return v_float16x16(_mm256_load_si256((const __m256i*)ptr)); }
#if CV_FP16
inline v_float32x8 v256_load_fp16_f32(const short* ptr)
{
return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
}
inline v_float16x16 v256_load_f16_low(const short* ptr)
{ return v_float16x16(v256_load_low(ptr).val); }
inline v_float16x16 v256_load_f16_halves(const short* ptr0, const short* ptr1)
{ return v_float16x16(v256_load_halves(ptr0, ptr1).val); }
inline void v_store(short* ptr, const v_float16x16& a)
{ _mm256_storeu_si256((__m256i*)ptr, a.val); }
inline void v_store_aligned(short* ptr, const v_float16x16& a)
{ _mm256_store_si256((__m256i*)ptr, a.val); }
inline void v_store_fp16(short* ptr, const v_float32x8& a)
{
__m128i fp16_value = _mm256_cvtps_ph(a.val, 0);
_mm_store_si128((__m128i*)ptr, fp16_value);
}
#endif
/* Recombine */
/*#define OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, perm) \
@@ -1262,20 +1241,6 @@ inline v_float64x4 v_cvt_f64(const v_float32x8& a)
inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_high(a.val))); }
#if CV_FP16
inline v_float32x8 v_cvt_f32(const v_float16x16& a)
{ return v_float32x8(_mm256_cvtph_ps(_v256_extract_low(a.val))); }
inline v_float32x8 v_cvt_f32_high(const v_float16x16& a)
{ return v_float32x8(_mm256_cvtph_ps(_v256_extract_high(a.val))); }
inline v_float16x16 v_cvt_f16(const v_float32x8& a, const v_float32x8& b)
{
__m128i ah = _mm256_cvtps_ph(a.val, 0), bh = _mm256_cvtps_ph(b.val, 0);
return v_float16x16(_mm256_inserti128_si256(_mm256_castsi128_si256(ah), bh, 1));
}
#endif
////////////// Lookup table access ////////////////////
inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)

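Because only float32x8 conversions remain on AVX, one 256-bit register's worth of FP16 data (16 values) now takes two conversion loads. A round-trip sketch; only v256_load_fp16_f32 and v_store_fp16 come from this file, the wrapper function and its guard are assumptions:

```cpp
#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD256 && CV_FP16
// Hypothetical helper: widen 16 packed FP16 values to float32 and write them back.
static void fp16_roundtrip16(const short* src, short* dst)
{
    using namespace cv;
    // Before this patch: v_float16x16 h = v256_load_f16(src);
    //                    v_float32x8 lo = v_cvt_f32(h), hi = v_cvt_f32_high(h);
    v_float32x8 lo = v256_load_fp16_f32(src);      // lanes 0..7  -> float32
    v_float32x8 hi = v256_load_fp16_f32(src + 8);  // lanes 8..15 -> float32
    v_store_fp16(dst,     lo);                     // float32 -> FP16
    v_store_fp16(dst + 8, hi);
}
#endif
```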
modules/core/include/opencv2/core/hal/intrin_neon.hpp

@@ -62,6 +62,15 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define CV_SIMD128_64F 0
#endif
#ifndef CV_SIMD128_FP16
# if CV_FP16 && (defined(__GNUC__) && __GNUC__ >= 5) // #12027: float16x8_t is missing in GCC 4.8.2
# define CV_SIMD128_FP16 1
# endif
#endif
#ifndef CV_SIMD128_FP16
# define CV_SIMD128_FP16 0
#endif
#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
template <typename T> static inline \
@@ -280,28 +289,9 @@ struct v_float64x2
#if CV_FP16
// Workaround for old compilers
static inline int16x8_t vreinterpretq_s16_f16(float16x8_t a) { return (int16x8_t)a; }
static inline float16x8_t vreinterpretq_f16_s16(int16x8_t a) { return (float16x8_t)a; }
static inline int16x4_t vreinterpret_s16_f16(float16x4_t a) { return (int16x4_t)a; }
static inline float16x4_t vreinterpret_f16_s16(int16x4_t a) { return (float16x4_t)a; }
static inline float16x8_t cv_vld1q_f16(const void* ptr)
{
#ifndef vld1q_f16 // APPLE compiler defines vld1_f16 as macro
return vreinterpretq_f16_s16(vld1q_s16((const short*)ptr));
#else
return vld1q_f16((const __fp16*)ptr);
#endif
}
static inline void cv_vst1q_f16(void* ptr, float16x8_t a)
{
#ifndef vst1q_f16 // APPLE compiler defines vst1_f16 as macro
vst1q_s16((short*)ptr, vreinterpretq_s16_f16(a));
#else
vst1q_f16((__fp16*)ptr, a);
#endif
}
static inline float16x4_t cv_vld1_f16(const void* ptr)
{
#ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
@@ -323,6 +313,45 @@ static inline void cv_vst1_f16(void* ptr, float16x4_t a)
#define vdup_n_f16(v) (float16x4_t){v, v, v, v}
#endif
#endif // CV_FP16
#if CV_FP16
inline v_float32x4 v128_load_fp16_f32(const short* ptr)
{
float16x4_t a = cv_vld1_f16((const __fp16*)ptr);
return v_float32x4(vcvt_f32_f16(a));
}
inline void v_store_fp16(short* ptr, const v_float32x4& a)
{
float16x4_t fp16 = vcvt_f16_f32(a.val);
cv_vst1_f16((short*)ptr, fp16);
}
#endif
#if CV_SIMD128_FP16
// Workaround for old compilers
static inline int16x8_t vreinterpretq_s16_f16(float16x8_t a) { return (int16x8_t)a; }
static inline float16x8_t vreinterpretq_f16_s16(int16x8_t a) { return (float16x8_t)a; }
static inline float16x8_t cv_vld1q_f16(const void* ptr)
{
#ifndef vld1q_f16 // APPLE compiler defines vld1_f16 as macro
return vreinterpretq_f16_s16(vld1q_s16((const short*)ptr));
#else
return vld1q_f16((const __fp16*)ptr);
#endif
}
static inline void cv_vst1q_f16(void* ptr, float16x8_t a)
{
#ifndef vst1q_f16 // APPLE compiler defines vst1_f16 as macro
vst1q_s16((short*)ptr, vreinterpretq_s16_f16(a));
#else
vst1q_f16((__fp16*)ptr, a);
#endif
}
struct v_float16x8
{
typedef short lane_type;
@@ -344,7 +373,8 @@ struct v_float16x8
inline v_float16x8 v_setzero_f16() { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16((short)0))); }
inline v_float16x8 v_setall_f16(short v) { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16(v))); }
#endif
#endif // CV_SIMD128_FP16
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
@@ -889,7 +919,7 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
#endif
#if CV_FP16
#if CV_SIMD128_FP16
// Workaround for old compilers
inline v_float16x8 v_load_f16(const short* ptr)
{ return v_float16x8(cv_vld1q_f16(ptr)); }
@@ -1462,7 +1492,7 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
}
#endif
#if CV_FP16
#if CV_SIMD128_FP16
inline v_float32x4 v_cvt_f32(const v_float16x8& a)
{
return v_float32x4(vcvt_f32_f16(vget_low_f16(a.val)));

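On NEON with a new enough compiler the native tier keeps the data in float16x8_t registers; v_load_f16 and v_cvt_f32 above are the entry points. A minimal sketch (the wrapper is illustrative, not part of OpenCV):

```cpp
#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128_FP16
// Hypothetical helper: load 8 half floats natively and widen the lower 4 lanes.
static cv::v_float32x4 low_half_as_f32(const short* ptr)
{
    using namespace cv;
    v_float16x8 h = v_load_f16(ptr);  // native float16x8_t load
    return v_cvt_f32(h);              // lower 4 lanes -> float32
}
#endif
```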
modules/core/include/opencv2/core/hal/intrin_sse.hpp

@@ -50,6 +50,7 @@
#define CV_SIMD128 1
#define CV_SIMD128_64F 1
#define CV_SIMD128_FP16 0 // no native operations with FP16 type.
namespace cv
{
@@ -272,28 +273,6 @@ struct v_float64x2
__m128d val;
};
struct v_float16x8
{
typedef short lane_type;
typedef __m128i vector_type;
enum { nlanes = 8 };
v_float16x8() : val(_mm_setzero_si128()) {}
explicit v_float16x8(__m128i v) : val(v) {}
v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
{
val = _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7);
}
short get0() const
{
return (short)_mm_cvtsi128_si32(val);
}
__m128i val;
};
inline v_float16x8 v_setzero_f16() { return v_float16x8(_mm_setzero_si128()); }
inline v_float16x8 v_setall_f16(short val) { return v_float16x8(_mm_set1_epi16(val)); }
namespace hal_sse_internal
{
template <typename to_sse_type, typename from_sse_type>
@@ -1330,21 +1309,6 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
inline v_float16x8 v_load_f16(const short* ptr)
{ return v_float16x8(_mm_loadu_si128((const __m128i*)ptr)); }
inline v_float16x8 v_load_f16_aligned(const short* ptr)
{ return v_float16x8(_mm_load_si128((const __m128i*)ptr)); }
inline v_float16x8 v_load_f16_low(const short* ptr)
{ return v_float16x8(v_load_low(ptr).val); }
inline v_float16x8 v_load_f16_halves(const short* ptr0, const short* ptr1)
{ return v_float16x8(v_load_halves(ptr0, ptr1).val); }
inline void v_store(short* ptr, const v_float16x8& a)
{ _mm_storeu_si128((__m128i*)ptr, a.val); }
inline void v_store_aligned(short* ptr, const v_float16x8& a)
{ _mm_store_si128((__m128i*)ptr, a.val); }
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
{ \
@@ -2622,19 +2586,15 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
}
#if CV_FP16
inline v_float32x4 v_cvt_f32(const v_float16x8& a)
inline v_float32x4 v128_load_fp16_f32(const short* ptr)
{
return v_float32x4(_mm_cvtph_ps(a.val));
return v_float32x4(_mm_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
}
inline v_float32x4 v_cvt_f32_high(const v_float16x8& a)
inline void v_store_fp16(short* ptr, const v_float32x4& a)
{
return v_float32x4(_mm_cvtph_ps(_mm_unpackhi_epi64(a.val, a.val)));
}
inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b)
{
return v_float16x8(_mm_unpacklo_epi64(_mm_cvtps_ph(a.val, 0), _mm_cvtps_ph(b.val, 0)));
__m128i fp16_value = _mm_cvtps_ph(a.val, 0);
_mm_storel_epi64((__m128i*)ptr, fp16_value);
}
#endif

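With v_float16x8 gone from the SSE backend, code that used v_load_f16 plus v_cvt_f32 switches to the combined conversion load. A sketch of the new pattern, assuming a 128-bit backend (SSE with F16C, or NEON) where CV_FP16 is enabled; the wrapper function is hypothetical:

```cpp
#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128 && CV_FP16
// Hypothetical helper: round-trip 4 FP16 values through float32 registers.
static void fp16_roundtrip4(const short* src, short* dst)
{
    using namespace cv;
    // Before: v_float16x8 h = v_load_f16(src); v_float32x4 v = v_cvt_f32(h);
    v_float32x4 v = v128_load_fp16_f32(src);  // 4 half floats -> float32x4
    v_store_fp16(dst, v);                     // float32x4 -> 4 half floats
}
#endif
```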
modules/core/test/test_intrin_utils.hpp

@@ -1123,9 +1123,37 @@ template<typename R> struct TheTest
return *this;
}
#if CV_FP16
TheTest & test_loadstore_fp16_f32()
{
printf("test_loadstore_fp16_f32 ...\n");
AlignedData<v_uint16> data; data.a.clear();
data.a.d[0] = 0x3c00; // 1.0
data.a.d[R::nlanes - 1] = (unsigned short)0xc000; // -2.0
AlignedData<v_float32> data_f32; data_f32.a.clear();
AlignedData<v_uint16> out;
R r1 = vx_load_fp16_f32((short*)data.a.d);
R r2(r1);
EXPECT_EQ(1.0f, r1.get0());
vx_store(data_f32.a.d, r2);
EXPECT_EQ(-2.0f, data_f32.a.d[R::nlanes - 1]);
out.a.clear();
vx_store_fp16((short*)out.a.d, r2);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(data.a[i], out.a[i]) << "i=" << i;
}
return *this;
}
#endif
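The two constants above are raw IEEE 754 binary16 bit patterns. A standalone decoder (not part of the test) showing why 0x3c00 reads back as 1.0 and 0xc000 as -2.0:

```cpp
#include <cmath>

// binary16 layout: 1 sign bit, 5 exponent bits (bias 15), 10 mantissa bits.
// Handles normal numbers only, which covers the two test constants.
static float fp16_bits_to_float(unsigned short h)
{
    int sign = (h >> 15) & 0x1;
    int exp  = (h >> 10) & 0x1f;
    int mant =  h        & 0x3ff;
    float value = std::ldexp(1.0f + mant / 1024.0f, exp - 15);
    return sign ? -value : value;
}
// fp16_bits_to_float(0x3c00) ==  1.0f   (0 01111 0000000000)
// fp16_bits_to_float(0xc000) == -2.0f   (1 10000 0000000000)
```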
#if CV_SIMD_FP16
TheTest & test_loadstore_fp16()
{
#if CV_FP16 && CV_SIMD
printf("test_loadstore_fp16 ...\n");
AlignedData<R> data;
AlignedData<R> out;
@@ -1149,12 +1177,10 @@ template<typename R> struct TheTest
EXPECT_EQ(data.a, out.a);
return *this;
#endif
}
TheTest & test_float_cvt_fp16()
{
#if CV_FP16 && CV_SIMD
printf("test_float_cvt_fp16 ...\n");
AlignedData<v_float32> data;
// check conversion
@@ -1165,9 +1191,8 @@ template<typename R> struct TheTest
EXPECT_EQ(r3.get0(), r1.get0());
return *this;
#endif
}
#endif
};
@@ -1448,7 +1473,10 @@ void test_hal_intrin_float64()
void test_hal_intrin_float16()
{
DUMP_ENTRY(v_float16);
#if CV_SIMD_WIDTH > 16
#if CV_FP16
TheTest<v_float32>().test_loadstore_fp16_f32();
#endif
#if CV_SIMD_FP16
TheTest<v_float16>()
.test_loadstore_fp16()
.test_float_cvt_fp16()