diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 0e951aa2b8..a58543405b 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -165,6 +165,9 @@ CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int); CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned); CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int); +#if CV_FP16_TYPE +CV_INTRIN_DEF_TYPE_TRAITS(__fp16, short, ushort, __fp16, float, double, float); +#endif CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(unsigned, int, unsigned, unsigned, uint64, unsigned); CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int, int, unsigned, unsigned, int64, int); CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(float, int, unsigned, float, double, float); @@ -366,6 +369,9 @@ template struct V_RegTraits CV_DEF_REG_TRAITS(v, v_int8x16, schar, s8, v_uint8x16, v_int16x8, v_int32x4, v_int8x16, void); CV_DEF_REG_TRAITS(v, v_uint16x8, ushort, u16, v_uint16x8, v_uint32x4, v_uint64x2, v_int16x8, void); CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void); +#if CV_SIMD128_FP16 + CV_DEF_REG_TRAITS(v, v_float16x8, __fp16, f16, v_float16x8, v_float32x4, v_float64x2, v_int16x8, v_int16x8); +#endif CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void); CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void); #if CV_SIMD128_64F || CV_SIMD128_CPP @@ -499,6 +505,7 @@ using namespace CV__SIMD_NAMESPACE; #endif namespace CV__SIMD_NAMESPACE { #define CV_SIMD CV_SIMD128 + #define CV_SIMD_FP16 CV_SIMD128_FP16 #define CV_SIMD_64F CV_SIMD128_64F #define CV_SIMD_WIDTH 16 //! @addtogroup core_hal_intrin @@ -511,6 +518,10 @@ namespace CV__SIMD_NAMESPACE { typedef v_uint16x8 v_uint16; //! @brief Maximum available vector register capacity 16-bit signed integer values typedef v_int16x8 v_int16; + #if CV_SIMD128_FP16 + //! @brief Maximum available vector register capacity 16-bit floating point values (half precision) + typedef v_float16x8 v_float16; + #endif //! @brief Maximum available vector register capacity 32-bit unsigned integer values typedef v_uint32x4 v_uint32; //! 
@brief Maximum available vector register capacity 32-bit signed integer values @@ -558,6 +569,9 @@ namespace CV__SIMD_NAMESPACE { inline v_int8 vx_setall_s8(schar v) { return VXPREFIX(_setall_s8)(v); } inline v_uint16 vx_setall_u16(ushort v) { return VXPREFIX(_setall_u16)(v); } inline v_int16 vx_setall_s16(short v) { return VXPREFIX(_setall_s16)(v); } +#if CV_SIMD_FP16 + inline v_float16 vx_setall_f16(__fp16 v) { return VXPREFIX(_setall_f16)(v); } +#endif inline v_int32 vx_setall_s32(int v) { return VXPREFIX(_setall_s32)(v); } inline v_uint32 vx_setall_u32(unsigned v) { return VXPREFIX(_setall_u32)(v); } inline v_float32 vx_setall_f32(float v) { return VXPREFIX(_setall_f32)(v); } @@ -575,6 +589,9 @@ namespace CV__SIMD_NAMESPACE { inline v_int8 vx_setzero_s8() { return VXPREFIX(_setzero_s8)(); } inline v_uint16 vx_setzero_u16() { return VXPREFIX(_setzero_u16)(); } inline v_int16 vx_setzero_s16() { return VXPREFIX(_setzero_s16)(); } +#if CV_SIMD_FP16 + inline v_float16 vx_setzero_f16() { return VXPREFIX(_setzero_f16)(); } +#endif inline v_int32 vx_setzero_s32() { return VXPREFIX(_setzero_s32)(); } inline v_uint32 vx_setzero_u32() { return VXPREFIX(_setzero_u32)(); } inline v_float32 vx_setzero_f32() { return VXPREFIX(_setzero_f32)(); } @@ -592,6 +609,9 @@ namespace CV__SIMD_NAMESPACE { inline v_int8 vx_load(const schar * ptr) { return VXPREFIX(_load)(ptr); } inline v_uint16 vx_load(const ushort * ptr) { return VXPREFIX(_load)(ptr); } inline v_int16 vx_load(const short * ptr) { return VXPREFIX(_load)(ptr); } +#if CV_SIMD_FP16 + inline v_float16 vx_load(const __fp16 * ptr) { return VXPREFIX(_load)(ptr); } +#endif inline v_int32 vx_load(const int * ptr) { return VXPREFIX(_load)(ptr); } inline v_uint32 vx_load(const unsigned * ptr) { return VXPREFIX(_load)(ptr); } inline v_float32 vx_load(const float * ptr) { return VXPREFIX(_load)(ptr); } @@ -609,6 +629,9 @@ namespace CV__SIMD_NAMESPACE { inline v_int8 vx_load_aligned(const schar * ptr) { return VXPREFIX(_load_aligned)(ptr); } inline v_uint16 vx_load_aligned(const ushort * ptr) { return VXPREFIX(_load_aligned)(ptr); } inline v_int16 vx_load_aligned(const short * ptr) { return VXPREFIX(_load_aligned)(ptr); } +#if CV_SIMD_FP16 + inline v_float16 vx_load_aligned(const __fp16 * ptr) { return VXPREFIX(_load_aligned)(ptr); } +#endif inline v_int32 vx_load_aligned(const int * ptr) { return VXPREFIX(_load_aligned)(ptr); } inline v_uint32 vx_load_aligned(const unsigned * ptr) { return VXPREFIX(_load_aligned)(ptr); } inline v_float32 vx_load_aligned(const float * ptr) { return VXPREFIX(_load_aligned)(ptr); } @@ -626,6 +649,9 @@ namespace CV__SIMD_NAMESPACE { inline v_int8 vx_load_low(const schar * ptr) { return VXPREFIX(_load_low)(ptr); } inline v_uint16 vx_load_low(const ushort * ptr) { return VXPREFIX(_load_low)(ptr); } inline v_int16 vx_load_low(const short * ptr) { return VXPREFIX(_load_low)(ptr); } +#if CV_SIMD_FP16 + inline v_float16 vx_load_low(const __fp16 * ptr) { return VXPREFIX(_load_low)(ptr); } +#endif inline v_int32 vx_load_low(const int * ptr) { return VXPREFIX(_load_low)(ptr); } inline v_uint32 vx_load_low(const unsigned * ptr) { return VXPREFIX(_load_low)(ptr); } inline v_float32 vx_load_low(const float * ptr) { return VXPREFIX(_load_low)(ptr); } @@ -643,6 +669,9 @@ namespace CV__SIMD_NAMESPACE { inline v_int8 vx_load_halves(const schar * ptr0, const schar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } inline v_uint16 vx_load_halves(const ushort * ptr0, const ushort * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } inline 
v_int16 vx_load_halves(const short * ptr0, const short * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } +#if CV_SIMD_FP16 + inline v_float16 vx_load_halves(const __fp16 * ptr0, const __fp16 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } +#endif inline v_int32 vx_load_halves(const int * ptr0, const int * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } inline v_uint32 vx_load_halves(const unsigned * ptr0, const unsigned * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } inline v_float32 vx_load_halves(const float * ptr0, const float * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } @@ -660,6 +689,9 @@ namespace CV__SIMD_NAMESPACE { inline v_int8 vx_lut(const schar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } inline v_uint16 vx_lut(const ushort * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } inline v_int16 vx_lut(const short* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } +#if CV_SIMD_FP16 + inline v_float16 vx_lut(const __fp16 * ptr, const int * idx) { return VXPREFIX(_lut)(ptr, idx); } +#endif inline v_int32 vx_lut(const int* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } inline v_uint32 vx_lut(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } inline v_float32 vx_lut(const float* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } @@ -677,6 +709,9 @@ namespace CV__SIMD_NAMESPACE { inline v_int8 vx_lut_pairs(const schar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } inline v_uint16 vx_lut_pairs(const ushort * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } inline v_int16 vx_lut_pairs(const short* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } +#if CV_SIMD_FP16 + inline v_float16 vx_lut_pairs(const __fp16 * ptr, const int * idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } +#endif inline v_int32 vx_lut_pairs(const int* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } inline v_uint32 vx_lut_pairs(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } inline v_float32 vx_lut_pairs(const float* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } @@ -1180,6 +1215,9 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64) + #if CV_SIMD_FP16 + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float16) + #endif OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32) #if CV_SIMD_64F OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64) @@ -1196,6 +1234,9 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32) OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16) OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32) + #if CV_SIMD_FP16 + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float16) + #endif OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32) #if CV_SIMD_64F OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64) @@ -1215,6 +1256,9 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_EXTRACT(v_int32) OPENCV_HAL_WRAP_EXTRACT(v_uint64) OPENCV_HAL_WRAP_EXTRACT(v_int64) + #if CV_SIMD_FP16 + OPENCV_HAL_WRAP_EXTRACT(v_float16) + #endif OPENCV_HAL_WRAP_EXTRACT(v_float32) #if CV_SIMD_64F OPENCV_HAL_WRAP_EXTRACT(v_float64) diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index d598107bde..4fa3120ac7 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -61,6 +61,11 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN #else #define CV_SIMD128_64F 0 #endif +#if 
defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + #define CV_SIMD128_FP16 1 +#else + #define CV_SIMD128_FP16 0 +#endif // The following macro checks if the code is being compiled for the // AArch64 execution state of Armv8, to enable the 128-bit @@ -124,6 +129,9 @@ OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint16x8, uint16x4, u16) OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int16x8, int16x4, s16) OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint32x4, uint32x2, u32) OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int32x4, int32x2, s32) +#if CV_SIMD128_FP16 +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float16x8, float16x4, f16); +#endif OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float32x4, float32x2, f32) OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(uint64x2, uint64x1, u64) OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2, int64x1, s64) @@ -285,6 +293,31 @@ private: } }; +#if CV_SIMD128_FP16 +struct v_float16x8 +{ + v_float16x8() {} + explicit v_float16x8(float16x8_t v) : val(v) {} + v_float16x8(__fp16 v0, __fp16 v1, __fp16 v2, __fp16 v3, __fp16 v4, __fp16 v5, __fp16 v6, __fp16 v7) + { + __fp16 v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + val = vld1q_f16(v); + } + float16x8_t val; + +private: + friend struct VTraits; + enum { nlanes = 8 }; + typedef __fp16 lane_type; + + friend typename VTraits::lane_type v_get0(const v_float16x8& v); + __fp16 get0() const + { + return vgetq_lane_f16(val, 0); + } +}; +#endif + struct v_float32x4 { v_float32x4() {} @@ -400,6 +433,23 @@ OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32) OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32) OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64) OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64) +#if CV_SIMD128_FP16 +OPENCV_HAL_IMPL_NEON_INIT(float16x8, __fp16, f16); +#define OPENCV_HAL_IMPL_NEON_INIT_FP16(_Tpv, suffix) \ +inline v_float16x8 v_reinterpret_as_f16(const v_##_Tpv& v) { return v_float16x8(vreinterpretq_f16_##suffix(v.val)); } +OPENCV_HAL_IMPL_NEON_INIT_FP16(uint8x16, u8) +OPENCV_HAL_IMPL_NEON_INIT_FP16(int8x16, s8) +OPENCV_HAL_IMPL_NEON_INIT_FP16(uint16x8, u16) +OPENCV_HAL_IMPL_NEON_INIT_FP16(int16x8, s16) +OPENCV_HAL_IMPL_NEON_INIT_FP16(uint32x4, u32) +OPENCV_HAL_IMPL_NEON_INIT_FP16(int32x4, s32) +OPENCV_HAL_IMPL_NEON_INIT_FP16(uint64x2, u64) +OPENCV_HAL_IMPL_NEON_INIT_FP16(int64x2, s64) +OPENCV_HAL_IMPL_NEON_INIT_FP16(float32x4, f32) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_NEON_INIT_FP16(float64x2, f64) +#endif +#endif OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32) #if CV_SIMD128_64F #define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \ @@ -413,6 +463,9 @@ OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32) OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32) OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64) OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64) +#if CV_SIMD128_FP16 +OPENCV_HAL_IMPL_NEON_INIT_64(float16x8, f16) +#endif OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32) OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64) #endif @@ -505,6 +558,47 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, return v_float32x4(res); } +#if CV_SIMD128_FP16 +// res = m0 * v[0] + m1 * v[1] + ... 
+ m7 * v[7] +inline v_float16x8 v_matmul(const v_float16x8 &v, + const v_float16x8 &m0, const v_float16x8 &m1, + const v_float16x8 &m2, const v_float16x8 &m3, + const v_float16x8 &m4, const v_float16x8 &m5, + const v_float16x8 &m6, const v_float16x8 &m7) +{ + float16x4_t vl = vget_low_f16(v.val), vh = vget_high_f16(v.val); + float16x8_t res = vmulq_lane_f16(m0.val, vl, 0); + res = vfmaq_lane_f16(res, m1.val, vl, 1); + res = vfmaq_lane_f16(res, m2.val, vl, 2); + res = vfmaq_lane_f16(res, m3.val, vl, 3); + res = vfmaq_lane_f16(res, m4.val, vh, 0); + res = vfmaq_lane_f16(res, m5.val, vh, 1); + res = vfmaq_lane_f16(res, m6.val, vh, 2); + res = vfmaq_lane_f16(res, m7.val, vh, 3); + return v_float16x8(res); +} + +// res = m0 * v[0] + m1 * v[1] + ... + m6 * v[6] + a +inline v_float16x8 v_matmuladd(const v_float16x8 &v, + const v_float16x8 &m0, const v_float16x8 &m1, + const v_float16x8 &m2, const v_float16x8 &m3, + const v_float16x8 &m4, const v_float16x8 &m5, + const v_float16x8 &m6, + const v_float16x8 &a) +{ + float16x4_t vl = vget_low_f16(v.val), vh = vget_high_f16(v.val); + float16x8_t res = vmulq_lane_f16(m0.val, vl, 0); + res = vfmaq_lane_f16(res, m1.val, vl, 1); + res = vfmaq_lane_f16(res, m2.val, vl, 2); + res = vfmaq_lane_f16(res, m3.val, vl, 3); + res = vfmaq_lane_f16(res, m4.val, vh, 0); + res = vfmaq_lane_f16(res, m5.val, vh, 1); + res = vfmaq_lane_f16(res, m6.val, vh, 2); + res = vaddq_f16(res, a.val); + return v_float16x8(res); +} +#endif + #define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \ inline _Tpvec bin_op (const _Tpvec& a, const _Tpvec& b) \ { \ @@ -525,6 +619,12 @@ OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_int32x4, vmulq_s32) OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint32x4, vaddq_u32) OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint32x4, vsubq_u32) OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_uint32x4, vmulq_u32) +#if CV_SIMD128_FP16 +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float16x8, vaddq_f16) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float16x8, vsubq_f16) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float16x8, vmulq_f16) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float16x8, vdivq_f16) +#endif OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float32x4, vaddq_f32) OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float32x4, vsubq_f32) OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float32x4, vmulq_f32) @@ -944,6 +1044,21 @@ OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32) OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64) OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64) +#if CV_SIMD128_FP16 +#define OPENCV_HAL_IMPL_NEON_FP16_BIT_OP(bin_op, intrin) \ +inline v_float16x8 bin_op (const v_float16x8& a, const v_float16x8& b) \ +{ \ + return v_float16x8(vreinterpretq_f16_s16(intrin(vreinterpretq_s16_f16(a.val), vreinterpretq_s16_f16(b.val)))); \ +} +OPENCV_HAL_IMPL_NEON_FP16_BIT_OP(v_and, vandq_s16) +OPENCV_HAL_IMPL_NEON_FP16_BIT_OP(v_or, vorrq_s16) +OPENCV_HAL_IMPL_NEON_FP16_BIT_OP(v_xor, veorq_s16) +inline v_float16x8 v_not (const v_float16x8& a) +{ + return v_float16x8(vreinterpretq_f16_s16(vmvnq_s16(vreinterpretq_s16_f16(a.val)))); +} +#endif + #define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \ inline v_float32x4 bin_op (const v_float32x4& a, const v_float32x4& b) \ { \ @@ -959,6 +1074,19 @@ inline v_float32x4 v_not (const v_float32x4& a) return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val)))); } +#if CV_SIMD128_FP16 +inline v_float16x8 v_sqrt(const v_float16x8& x) +{ + return v_float16x8(vsqrtq_f16(x.val)); +} + +inline v_float16x8 v_invsqrt(const v_float16x8& x) +{ + v_float16x8 one = v_setall_f16(1.0f); + return 
v_div(one, v_sqrt(x)); +} +#endif + #if CV_SIMD128_64F inline v_float32x4 v_sqrt(const v_float32x4& x) { @@ -996,9 +1124,14 @@ OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16, v_int8x16, u8, s8) OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16) OPENCV_HAL_IMPL_NEON_ABS(v_uint32x4, v_int32x4, u32, s32) -inline v_float32x4 v_abs(v_float32x4 x) +inline v_float32x4 v_abs(const v_float32x4 &x) { return v_float32x4(vabsq_f32(x.val)); } +#if CV_SIMD128_FP16 +inline v_float16x8 v_abs(const v_float16x8 &x) +{ return v_float16x8(vabsq_f16(x.val)); } +#endif + #if CV_SIMD128_64F #define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \ inline v_float64x2 bin_op (const v_float64x2& a, const v_float64x2& b) \ @@ -1052,6 +1185,10 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32) +#if CV_SIMD128_FP16 +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float16x8, v_min, vminq_f16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float16x8, v_max, vmaxq_f16) +#endif #if CV_SIMD128_64F OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64) @@ -1075,6 +1212,9 @@ OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8) OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8) OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16) OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16) +#if CV_SIMD128_FP16 +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float16x8, vreinterpretq_f16_u16, f16, u16) +#endif OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32) OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32) OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32) @@ -1139,6 +1279,10 @@ static inline v_int64x2 v_lt (const v_int64x2& a, const v_int64x2& b) OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64) #endif +#if CV_SIMD128_FP16 +inline v_float16x8 v_not_nan(const v_float16x8& a) +{ return v_float16x8(vreinterpretq_f16_u16(vceqq_f16(a.val, a.val))); } +#endif inline v_float32x4 v_not_nan(const v_float32x4& a) { return v_float32x4(vreinterpretq_f32_u32(vceqq_f32(a.val, a.val))); } #if CV_SIMD128_64F @@ -1162,6 +1306,9 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32) +#if CV_SIMD128_FP16 +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float16x8, v_absdiff, vabdq_f16) +#endif OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32) #if CV_SIMD128_64F OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64) @@ -1183,6 +1330,29 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_abs OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16) OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32) +#if CV_SIMD128_FP16 +inline v_float16x8 v_magnitude(const v_float16x8& a, const v_float16x8& b) +{ + v_float16x8 x(vaddq_f16(vmulq_f16(a.val, a.val), vmulq_f16(b.val, b.val))); + return v_sqrt(x); +} + +inline v_float16x8 v_sqr_magnitude(const v_float16x8& a, const v_float16x8& b) +{ + return v_float16x8(vaddq_f16(vmulq_f16(a.val, a.val), vmulq_f16(b.val, 
b.val))); +} + +inline v_float16x8 v_fma(const v_float16x8& a, const v_float16x8& b, const v_float16x8& c) +{ + return v_float16x8(vfmaq_f16(c.val, a.val, b.val)); +} + +inline v_float16x8 v_muladd(const v_float16x8& a, const v_float16x8& b, const v_float16x8& c) +{ + return v_fma(a, b, c); +} +#endif + inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b) { v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val)); @@ -1285,6 +1455,9 @@ OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16) OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16) OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32) OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32) +#if CV_SIMD128_FP16 +OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float16x8, f16) +#endif OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32) OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64) OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64) @@ -1336,6 +1509,9 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32) OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32) OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64) OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64) +#if CV_SIMD128_FP16 +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float16x8, __fp16, f16) +#endif OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32) #if CV_SIMD128_64F OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64) @@ -1428,6 +1604,10 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, max, max, u16) OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, min, min, u16) OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16) OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16) +#if CV_SIMD128_FP16 +OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_float16x8, float16x4, __fp16, max, max, f16) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_float16x8, float16x4, __fp16, min, min, f16) +#endif #if CV_NEON_AARCH64 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \ @@ -1498,6 +1678,24 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, #endif // #if CV_NEON_AARCH64 } +#if CV_SIMD128_FP16 +inline v_float16x8 v_reduce_sum8(const v_float16x8 &a, const v_float16x8 &b, + const v_float16x8 &c, const v_float16x8 &d, + const v_float16x8 &w, const v_float16x8 &x, + const v_float16x8 &y, const v_float16x8 &z) +{ + float16x8_t ab = vpaddq_f16(a.val, b.val); // a0+a1 a2+a3 a4+a5 a6+a7 b0+b1 b2+b3 b4+b5 b6+b7 + float16x8_t cd = vpaddq_f16(c.val, d.val); // c0+c1 c2+c3 c4+c5 c6+c7 d0+d1 d2+d3 d4+d5 d6+d7 + float16x8_t wx = vpaddq_f16(w.val, x.val); // w0+w1 w2+w3 w4+w5 w6+w7 x0+x1 x2+x3 x4+x5 x6+x7 + float16x8_t yz = vpaddq_f16(y.val, z.val); // y0+y1 y2+y3 y4+y5 y6+y7 z0+z1 z2+z3 z4+z5 z6+z7 + + float16x8_t abcd = vpaddq_f16(ab, cd); // a0+a1+a2+a3 a4+a5+a6+a7 b0+b1+b2+b3 b4+b5+b6+b7 c0+c1+c2+c3 c4+c5+c6+c7 d0+d1+d2+d3 d4+d5+d6+d7 + float16x8_t wxyz = vpaddq_f16(wx, yz); // w0+w1+w2+w3 w4+w5+w6+w7 x0+x1+x2+x3 x4+x5+x6+x7 y0+y1+y2+y3 y4+y5+y6+y7 z0+z1+z2+z3 z4+z5+z6+z7 + + return v_float16x8(vpaddq_f16(abcd, wxyz)); +} +#endif + inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) { #if CV_NEON_AARCH64 @@ -1635,6 +1833,10 @@ inline int v_signmask(const v_uint16x8& a) } inline int v_signmask(const v_int16x8& a) { return v_signmask(v_reinterpret_as_u16(a)); } +#if CV_SIMD128_FP16 +inline int v_signmask(const v_float16x8& a) +{ return v_signmask(v_reinterpret_as_u16(a)); } +#endif inline int v_signmask(const v_uint32x4& a) { @@ -1678,6 +1880,9 @@ inline int 
v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmas inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); } inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); } inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); } +#if CV_SIMD128_FP16 +inline int v_scan_forward(const v_float16x8& a) { return trailingZeros32(v_signmask(a)); } +#endif inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); } inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); } inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); } @@ -1732,6 +1937,12 @@ inline bool v_check_all(const v_int8x16& a) { return v_check_all(v_reinterpret_as_u8(a)); } inline bool v_check_all(const v_int16x8& a) { return v_check_all(v_reinterpret_as_u16(a)); } +#if CV_SIMD128_FP16 +inline bool v_check_all(const v_float16x8& a) +{ return v_check_all(v_reinterpret_as_u16(a)); } +inline bool v_check_any(const v_float16x8& a) +{ return v_check_any(v_reinterpret_as_u16(a)); } +#endif inline bool v_check_all(const v_int32x4& a) { return v_check_all(v_reinterpret_as_u32(a)); } inline bool v_check_all(const v_float32x4& a) @@ -1767,6 +1978,9 @@ OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8) OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8) OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16) OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16) +#if CV_SIMD128_FP16 +OPENCV_HAL_IMPL_NEON_SELECT(v_float16x8, f16, u16) +#endif OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32) OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32) OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32) @@ -1884,6 +2098,9 @@ OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8) OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8) OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16) OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16) +#if CV_SIMD128_FP16 +OPENCV_HAL_IMPL_NEON_UNPACKS(float16x8, f16) +#endif OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32) OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32) OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32) @@ -1909,6 +2126,11 @@ inline v_uint16x8 v_reverse(const v_uint16x8 &a) inline v_int16x8 v_reverse(const v_int16x8 &a) { return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } +#if CV_SIMD128_FP16 +inline v_float16x8 v_reverse(const v_float16x8 &a) +{ return v_reinterpret_as_f16(v_reverse(v_reinterpret_as_u16(a))); } +#endif + inline v_uint32x4 v_reverse(const v_uint32x4 &a) { uint32x4_t vec = vrev64q_u32(a.val); @@ -1948,6 +2170,9 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8) OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8) OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16) OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16) +#if CV_SIMD128_FP16 +OPENCV_HAL_IMPL_NEON_EXTRACT(float16x8, f16) +#endif OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32) OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32) OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64) @@ -1964,6 +2189,9 @@ OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint8x16, uchar, u8) OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int8x16, schar, s8) OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint16x8, ushort, u16) OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16) +#if CV_SIMD128_FP16 +OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float16x8, __fp16, f16) +#endif OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint32x4, uint, u32) OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32) OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint64x2, uint64, u64) @@ -1980,6 +2208,9 @@ OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint8x16, uchar, u8) 
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int8x16, schar, s8) OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint16x8, ushort, u16) OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16) +#if CV_SIMD128_FP16 +OPENCV_HAL_IMPL_NEON_BROADCAST(v_float16x8, __fp16, f16) +#endif OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint32x4, uint, u32) OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32) OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint64x2, uint64, u64) @@ -1989,6 +2220,32 @@ OPENCV_HAL_IMPL_NEON_BROADCAST(v_float32x4, float, f32) OPENCV_HAL_IMPL_NEON_BROADCAST(v_float64x2, double, f64) #endif +#if CV_SIMD128_FP16 +inline v_int16x8 v_round(const v_float16x8 &a) +{ + return v_int16x8(vcvtnq_s16_f16(a.val)); +} + +inline v_int16x8 v_floor(const v_float16x8 &a) +{ + int16x8_t a1 = vcvtq_s16_f16(a.val); + uint16x8_t mask = vcgtq_f16(vcvtq_f16_s16(a1), a.val); + return v_int16x8(vaddq_s16(a1, vreinterpretq_s16_u16(mask))); +} + +inline v_int16x8 v_ceil(const v_float16x8 &a) +{ + int16x8_t a1 = vcvtq_s16_f16(a.val); + uint16x8_t mask = vcgtq_f16(a.val, vcvtq_f16_s16(a1)); + return v_int16x8(vsubq_s16(a1, vreinterpretq_s16_u16(mask))); +} + +inline v_int16x8 v_trunc(const v_float16x8 &a) +{ + return v_int16x8(vcvtq_s16_f16(a.val)); +} +#endif + #if CV_SIMD128_64F inline v_int32x4 v_round(const v_float32x4& a) { @@ -2124,6 +2381,47 @@ OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32) OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32) #endif // #if CV_NEON_AARCH64 +#if CV_SIMD128_FP16 +inline void v_transpose8x8(const v_float16x8 &a0, const v_float16x8 &a1, + const v_float16x8 &a2, const v_float16x8 &a3, + const v_float16x8 &a4, const v_float16x8 &a5, + const v_float16x8 &a6, const v_float16x8 &a7, + v_float16x8 &b0, v_float16x8 &b1, + v_float16x8 &b2, v_float16x8 &b3, + v_float16x8 &b4, v_float16x8 &b5, + v_float16x8 &b6, v_float16x8 &b7) +{ + float32x4_t s0 = vreinterpretq_f32_f64(vtrn1q_f64(vreinterpretq_f64_f16(a0.val), vreinterpretq_f64_f16(a4.val))); + float32x4_t s1 = vreinterpretq_f32_f64(vtrn1q_f64(vreinterpretq_f64_f16(a1.val), vreinterpretq_f64_f16(a5.val))); + float32x4_t s2 = vreinterpretq_f32_f64(vtrn1q_f64(vreinterpretq_f64_f16(a2.val), vreinterpretq_f64_f16(a6.val))); + float32x4_t s3 = vreinterpretq_f32_f64(vtrn1q_f64(vreinterpretq_f64_f16(a3.val), vreinterpretq_f64_f16(a7.val))); + + float32x4_t s4 = vreinterpretq_f32_f64(vtrn2q_f64(vreinterpretq_f64_f16(a0.val), vreinterpretq_f64_f16(a4.val))); + float32x4_t s5 = vreinterpretq_f32_f64(vtrn2q_f64(vreinterpretq_f64_f16(a1.val), vreinterpretq_f64_f16(a5.val))); + float32x4_t s6 = vreinterpretq_f32_f64(vtrn2q_f64(vreinterpretq_f64_f16(a2.val), vreinterpretq_f64_f16(a6.val))); + float32x4_t s7 = vreinterpretq_f32_f64(vtrn2q_f64(vreinterpretq_f64_f16(a3.val), vreinterpretq_f64_f16(a7.val))); + + float16x8_t t0 = vreinterpretq_f16_f32(vtrn1q_f32(s0, s2)); + float16x8_t t1 = vreinterpretq_f16_f32(vtrn1q_f32(s1, s3)); + float16x8_t t2 = vreinterpretq_f16_f32(vtrn2q_f32(s0, s2)); + float16x8_t t3 = vreinterpretq_f16_f32(vtrn2q_f32(s1, s3)); + + float16x8_t t4 = vreinterpretq_f16_f32(vtrn1q_f32(s4, s6)); + float16x8_t t5 = vreinterpretq_f16_f32(vtrn1q_f32(s5, s7)); + float16x8_t t6 = vreinterpretq_f16_f32(vtrn2q_f32(s4, s6)); + float16x8_t t7 = vreinterpretq_f16_f32(vtrn2q_f32(s5, s7)); + + b0.val = vtrn1q_f16(t0, t1); + b1.val = vtrn2q_f16(t0, t1); + b2.val = vtrn1q_f16(t2, t3); + b3.val = vtrn2q_f16(t2, t3); + b4.val = vtrn1q_f16(t4, t5); + b5.val = vtrn2q_f16(t4, t5); + b6.val = vtrn1q_f16(t6, t7); + b7.val = vtrn2q_f16(t6, t7); +} +#endif + #define 
OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \ { \ @@ -2257,6 +2555,9 @@ OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8) OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8) OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16) OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16) +#if CV_SIMD128_FP16 +OPENCV_HAL_IMPL_NEON_INTERLEAVED(float16x8, __fp16, f16) +#endif OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32) OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32) OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32) @@ -2267,6 +2568,30 @@ OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64) OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64) OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64) +#if CV_SIMD128_FP16 +inline v_float16x8 v_cvt_f16(const v_float32x4 &a) +{ + float16x4_t zero = vdup_n_f16((__fp16)0.0f); + return v_float16x8(vcombine_f16(vcvt_f16_f32(a.val), zero)); +} +inline v_float16x8 v_cvt_f16(const v_float32x4 &a, const v_float32x4 &b) +{ + return v_float16x8(vcombine_f16(vcvt_f16_f32(a.val), vcvt_f16_f32(b.val))); +} +inline v_float16x8 v_cvt_f16(const v_int16x8 &a) +{ + return v_float16x8(vcvtq_f16_s16(a.val)); +} +inline v_float32x4 v_cvt_f32(const v_float16x8 &a) +{ + return v_float32x4(vcvt_f32_f16(vget_low_f16(a.val))); +} +inline v_float32x4 v_cvt_f32_high(const v_float16x8 &a) +{ + return v_float32x4(vcvt_f32_f16(vget_high_f16(a.val))); +} +#endif + inline v_float32x4 v_cvt_f32(const v_int32x4& a) { return v_float32x4(vcvtq_f32_s32(a.val)); @@ -2422,6 +2747,46 @@ inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpre inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); } inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); } +#if CV_SIMD128_FP16 +inline v_float16x8 v_lut(const float16_t *tab, const int *idx) +{ + const __fp16 *t = (const __fp16*)tab; + __fp16 CV_DECL_ALIGNED(32) elems[8] = + { + t[idx[0]], + t[idx[1]], + t[idx[2]], + t[idx[3]], + t[idx[4]], + t[idx[5]], + t[idx[6]], + t[idx[7]], + }; + return v_float16x8(vld1q_f16(elems)); +} +inline v_float16x8 v_lut_pairs(const float16_t *tab, const int *idx) +{ + const __fp16 *t = (const __fp16*)tab; + __fp16 CV_DECL_ALIGNED(32) elems[8] = + { + t[idx[0]], + t[idx[0] + 1], + t[idx[1]], + t[idx[1] + 1], + t[idx[2]], + t[idx[2] + 1], + t[idx[3]], + t[idx[3] + 1], + }; + return v_float16x8(vld1q_f16(elems)); +} +inline v_float16x8 v_lut_quads(const float16_t *tab, const int *idx) +{ + const __fp16 *t = (const __fp16*)tab; + return v_float16x8(vcombine_f16(vld1_f16(t + idx[0]), vld1_f16(t + idx[1]))); +} +#endif + inline v_int32x4 v_lut(const int* tab, const int* idx) { int CV_DECL_ALIGNED(32) elems[4] = diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index a8c565ec46..b50e459903 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -182,6 +182,13 @@ template<> inline void EXPECT_COMPARE_EQ_(const double a, const double b EXPECT_DOUBLE_EQ( a, b ); } +#if CV_SIMD_FP16 +template<> inline void EXPECT_COMPARE_EQ_<__fp16>(const __fp16 a, const __fp16 b) +{ + EXPECT_LT(std::abs(float(a - b)), 0.126); +} +#endif + // pack functions do not do saturation when converting from 64-bit types template inline T pack_saturate_cast(W a) { return 
saturate_cast(a); } @@ -554,6 +561,27 @@ template struct TheTest return *this; } + // Handle accuracy for fp16 + TheTest & test_div_fp16() + { +#if CV_SIMD_FP16 + Data dataA, dataB; + dataB.reverse(); + R a = dataA, b = dataB; + + Data resC = v_div(a, b); + for (int i = 0; i < VTraits::vlanes(); ++i) + { + SCOPED_TRACE(cv::format("i=%d", i)); + EXPECT_LT(std::abs(float((dataA[i] / dataB[i]) - resC[i])), 2e-4); + } +#else + std::cout << "SKIP: test_div_fp16, CV_SIMD_FP16 is not available" << std::endl; +#endif + + return *this; + } + TheTest & test_mul_expand() { typedef typename V_RegTraits::w_reg Rx2; @@ -604,11 +632,34 @@ template struct TheTest a = v_sub(a, b); Data resC = v_abs(a); + auto R_type_lowest = std::numeric_limits::lowest(); for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); - R_type ssub = dataA[i] - dataB[i] < std::numeric_limits::lowest() ? std::numeric_limits::lowest() : dataA[i] - dataB[i]; + R_type ssub = (dataA[i] - dataB[i]) < R_type_lowest ? R_type_lowest : dataA[i] - dataB[i]; + EXPECT_EQ((u_type)std::abs(ssub), resC[i]); + } + + return *this; + } + + TheTest & test_abs_fp16() + { + typedef typename V_RegTraits::u_reg Ru; // v_float16x8 + typedef typename VTraits::lane_type u_type; // __fp16 + typedef typename VTraits::lane_type R_type; // __fp16 + Data dataA, dataB(10); + R a = dataA, b = dataB; + a = v_sub(a, b); + + Data resC = v_abs(a); + R_type R_type_lowest = R_type(-65504); // 0 11110 1111111111 + + for (int i = 0; i < VTraits::vlanes(); ++i) + { + SCOPED_TRACE(cv::format("i=%d", i)); + R_type ssub = (dataA[i] - dataB[i]) < R_type_lowest ? R_type_lowest : dataA[i] - dataB[i]; EXPECT_EQ((u_type)std::abs(ssub), resC[i]); } @@ -1492,6 +1543,54 @@ template struct TheTest return *this; } + TheTest & test_matmul_fp16() + { +#if CV_SIMD_FP16 + Data dataV, data0, data1, data2, data3, data4, data5, data6, data7; + data1.reverse(); + data2 += 2; + data3 *= 0.3; + data5.reverse(); + data6 += 1; + data7 *= 0.4; + R v = dataV, m0 = data0, m1 = data1, m2 = data2, m3 = data3, m4 = data4, m5 = data5, m6 = data6, m7 = data7; + + Data res = v_matmul(v, m0, m1, m2, m3, m4, m5, m6, m7); + int i = 0; + for (int j = i; j < i + 8; ++j) { + SCOPED_TRACE(cv::format("i=%d j=%d", i, j)); + LaneType val = dataV[i] * data0[j] + + dataV[i + 1] * data1[j] + + dataV[i + 2] * data2[j] + + dataV[i + 3] * data3[j] + + dataV[i + 4] * data4[j] + + dataV[i + 5] * data5[j] + + dataV[i + 6] * data6[j] + + dataV[i + 7] * data7[j]; + EXPECT_COMPARE_EQ(val, res[j]); + } + + Data resAdd = v_matmuladd(v, m0, m1, m2, m3, m4, m5, m6, m7); + i = 0; + for (int j = i; j < i + 8; ++j) { + SCOPED_TRACE(cv::format("i=%d j=%d", i, j)); + LaneType val = dataV[i] * data0[j] + + dataV[i + 1] * data1[j] + + dataV[i + 2] * data2[j] + + dataV[i + 3] * data3[j] + + dataV[i + 4] * data4[j] + + dataV[i + 5] * data5[j] + + dataV[i + 6] * data6[j] + + data7[j]; + EXPECT_COMPARE_EQ(val, resAdd[j]); + } +#else + std::cout << "SKIP: test_matmul_fp16, CV_SIMD_FP16 is not available" << std::endl; +#endif + + return *this; + } + TheTest & test_transpose() { Data dataA, dataB, dataC, dataD; @@ -1527,6 +1626,41 @@ template struct TheTest return *this; } + TheTest & test_transpose8x8_fp16() + { +#if CV_SIMD_FP16 + Data dataA0, dataA1, dataA2, dataA3, dataA4, dataA5, dataA6, dataA7; + dataA1 *= 2; + dataA2 *= 4; + dataA3 *= 6; + dataA4 *= 8; + dataA5 *= 10; + dataA6 *= 12; + dataA7 *= 14; + + R a0 = dataA0, a1 = dataA1, a2 = dataA2, a3 = dataA3, + a4 = dataA4, a5 = dataA5, a6 = dataA6, a7 = dataA7; + 
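// Transpose the 8x8 fp16 block whose rows are a0..a7 into b0..b7 and verify that input element (i, j) lands at output element (j, i).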
R b0, b1, b2, b3, b4, b5, b6, b7; + + v_transpose8x8(a0, a1, a2, a3, a4, a5, a6, a7, + b0, b1, b2, b3, b4, b5, b6, b7); + Data res0 = b0, res1 = b1, res2 = b2, res3 = b3, res4 = b4, res5 = b5, res6 = b6, res7 = b7; + + const Data ref[] = {dataA0, dataA1, dataA2, dataA3, dataA4, dataA5, dataA6, dataA7}; + const Data res[] = { res0, res1, res2, res3, res4, res5, res6, res7}; + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) { + SCOPED_TRACE(cv::format("i=%d j=%d", i, j)); + EXPECT_EQ(ref[i][j], res[j][i]); + } + } +#else + std::cout << "SKIP: test_transpose8x8_fp16, CV_SIMD_FP16 is not available" << std::endl; +#endif + + return *this; + } + TheTest & test_reduce_sum4() { Data dataA, dataB, dataC, dataD; @@ -1548,9 +1682,43 @@ template struct TheTest return *this; } + TheTest & test_reduce_sum8() + { +#if CV_SIMD_FP16 + Data dataA, dataB, dataC, dataD, dataW, dataX, dataY, dataZ; + dataB *= 0.01f; + dataC *= 0.001f; + dataD *= 0.002f; + dataW += 0.1f; + dataX *= 0.2f; + dataY += 1; + dataZ *= 2; + + R a = dataA, b = dataB, c = dataC, d = dataD, + w = dataW, x = dataX, y = dataY, z = dataZ; + Data res = v_reduce_sum8(a, b, c, d, w, x, y, z); + + for (int i = 0; i < VTraits::vlanes(); i += 8) + { + SCOPED_TRACE(cv::format("i=%d", i)); + EXPECT_COMPARE_EQ(dataA.sum(i, 8), res[i]); + EXPECT_COMPARE_EQ(dataB.sum(i, 8), res[i + 1]); + EXPECT_COMPARE_EQ(dataC.sum(i, 8), res[i + 2]); + EXPECT_COMPARE_EQ(dataD.sum(i, 8), res[i + 3]); + EXPECT_COMPARE_EQ(dataW.sum(i, 8), res[i + 4]); + EXPECT_COMPARE_EQ(dataX.sum(i, 8), res[i + 5]); + EXPECT_COMPARE_EQ(dataY.sum(i, 8), res[i + 6]); + EXPECT_COMPARE_EQ(dataZ.sum(i, 8), res[i + 7]); + } +#else + std::cout << "SKIP: test_reduce_sum8, CV_SIMD_FP16 is not available" << std::endl; +#endif + + return *this; + } + TheTest & test_loadstore_fp16_f32() { - printf("test_loadstore_fp16_f32 ...\n"); AlignedData data; data.a.clear(); data.a.d[0] = 0x3c00; // 1.0 data.a.d[VTraits::vlanes() - 1] = (unsigned short)0xc000; // -2.0 @@ -1573,22 +1741,21 @@ template struct TheTest return *this; } -#if 0 TheTest & test_loadstore_fp16() { - printf("test_loadstore_fp16 ...\n"); +#if CV_SIMD_FP16 AlignedData data; AlignedData out; // check if addresses are aligned and unaligned respectively - EXPECT_EQ((size_t)0, (size_t)&data.a.d % VTraits::max_nlanes); - EXPECT_NE((size_t)0, (size_t)&data.u.d % VTraits::max_nlanes); - EXPECT_EQ((size_t)0, (size_t)&out.a.d % VTraits::max_nlanes); - EXPECT_NE((size_t)0, (size_t)&out.u.d % VTraits::max_nlanes); + EXPECT_EQ((size_t)0, (size_t)&data.a.d % (sizeof(typename VTraits::lane_type) * VTraits::vlanes())); + EXPECT_NE((size_t)0, (size_t)&data.u.d % (sizeof(typename VTraits::lane_type) * VTraits::vlanes())); + EXPECT_EQ((size_t)0, (size_t)&out.a.d % (sizeof(typename VTraits::lane_type) * VTraits::vlanes())); + EXPECT_NE((size_t)0, (size_t)&out.u.d % (sizeof(typename VTraits::lane_type) * VTraits::vlanes())); // check some initialization methods R r1 = data.u; - R r2 = vx_load_expand((const hfloat*)data.a.d); + R r2 = vx_load(data.a.d); R r3(r2); EXPECT_EQ(data.u[0], v_get0(r1)); EXPECT_EQ(data.a[0], v_get0(r2)); @@ -1598,24 +1765,30 @@ template struct TheTest out.a.clear(); v_store(out.a.d, r1); EXPECT_EQ(data.a, out.a); +#else + std::cout << "SKIP: test_loadstore_fp16, CV_SIMD_FP16 is not available" << std::endl; +#endif return *this; } + TheTest & test_float_cvt_fp16() { - printf("test_float_cvt_fp16 ...\n"); +#if CV_SIMD_FP16 AlignedData data; // check conversion v_float32 r1 = vx_load(data.a.d); v_float16 r2 = v_cvt_f16(r1, 
vx_setzero_f32()); v_float32 r3 = v_cvt_f32(r2); - EXPECT_EQ(0x3c00, v_get0(r2)); + EXPECT_EQ(1, v_get0(r2)); EXPECT_EQ(v_get0(r3), v_get0(r1)); +#else + std::cout << "SKIP: test_float_cvt_fp16, CV_SIMD_FP16 is not available" << std::endl; +#endif return *this; } -#endif void do_check_cmp64(const Data& dataA, const Data& dataB) { @@ -2029,11 +2202,32 @@ void test_hal_intrin_float16() { DUMP_ENTRY(v_float16); #if CV_FP16 - TheTest() - .test_loadstore_fp16_f32() + TheTest().test_loadstore_fp16_f32(); #if CV_SIMD_FP16 + TheTest() .test_loadstore_fp16() .test_float_cvt_fp16() + .test_interleave() + .test_addsub() + .test_mul() + .test_div_fp16() + .test_abs_fp16() + .test_cmp() + .test_sqrt_abs() + .test_min_max() + .test_float_absdiff() + .test_mask() + .test_unpack() + .test_float_math() + .test_matmul_fp16() + .test_transpose8x8_fp16() + .test_reduce_sum8() + .test_reverse() + .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>() + .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>() + .test_extract_highest() + .test_broadcast_element<0>().test_broadcast_element<1>() + .test_extract_n<0>().test_extract_n<1>() #endif ; #else @@ -2041,17 +2235,6 @@ void test_hal_intrin_float16() #endif } - -/*#if defined(CV_CPU_DISPATCH_MODE_FP16) && CV_CPU_DISPATCH_MODE == FP16 -void test_hal_intrin_float16() -{ - TheTest() - .test_loadstore_fp16() - .test_float_cvt_fp16() - ; -} -#endif*/ - #endif //CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY //CV_CPU_OPTIMIZATION_NAMESPACE_END
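// ============================================================================
// Editor's notes — illustrative usage sketches, not part of the patch above.
// The helper names below (hal_fp16_relu_example, hal_fp16_scale_example,
// hal_fp16_matvec_example) are hypothetical; only the v_float16x8 operations
// they call are the ones introduced by this diff.
// ============================================================================

// Sketch 1: a half-precision clamp (ReLU) built from the fp16 load/store,
// comparison and select support added to intrin_neon.hpp. Assumes n is a
// multiple of 8 (the fp16 lane count of a 128-bit NEON register).
#if CV_SIMD128_FP16
inline void hal_fp16_relu_example(const __fp16* src, __fp16* dst, int n)
{
    const v_float16x8 zero = v_setzero_f16();
    for (int i = 0; i < n; i += 8)
    {
        v_float16x8 x    = v_load(src + i);   // 8 half-precision values
        v_float16x8 mask = v_lt(x, zero);     // all-ones lanes where x < 0
        v_store(dst + i, v_select(mask, zero, x));
    }
}
#endif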
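// Sketch 2: running a float pipeline through half precision with the new
// v_cvt_f16 / v_cvt_f32 / v_cvt_f32_high conversion helpers. Hypothetical
// helper; assumes n is a multiple of 8 and that fp16 rounding of the inputs
// is acceptable for the caller.
#if CV_SIMD128_FP16
inline void hal_fp16_scale_example(const float* src, float* dst, int n, float s)
{
    const v_float16x8 vs = v_setall_f16((__fp16)s);
    for (int i = 0; i < n; i += 8)
    {
        v_float32x4 lo = v_load(src + i);
        v_float32x4 hi = v_load(src + i + 4);
        v_float16x8 h  = v_cvt_f16(lo, hi);       // narrow 8 floats to fp16
        h = v_mul(h, vs);                         // fp16 arithmetic
        v_store(dst + i,     v_cvt_f32(h));       // widen low half back to f32
        v_store(dst + i + 4, v_cvt_f32_high(h));  // widen high half back to f32
    }
}
#endif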
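// Sketch 3: an 8x8 half-precision matrix-vector product using the fp16
// v_matmul overload. Each register m0..m7 holds one matrix column, so the
// result is sum_k column_k * vec[k] = M * vec, matching the lane convention
// documented next to the implementation. The function name and the
// column-major storage of M are assumptions for this example.
#if CV_SIMD128_FP16
inline v_float16x8 hal_fp16_matvec_example(const __fp16* M /* 64 elements, column-major */,
                                           const v_float16x8& vec)
{
    return v_matmul(vec,
                    v_load(M +  0), v_load(M +  8), v_load(M + 16), v_load(M + 24),
                    v_load(M + 32), v_load(M + 40), v_load(M + 48), v_load(M + 56));
}
#endif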