mirror of
https://github.com/opencv/opencv.git
synced 2025-06-24 21:10:56 +08:00
Merge pull request #25796 from hanliutong:hfloat
Use hfloat instead of __fp16. #25796 Related: #25743 Currently, the type for the half-precision floating point data in the OpenCV source code is `__fp16`, which is a unique(?) type supported by the ARM compiler. Other compilers have very limited support for `__fp16`, so in order to introduce more backends that support FP16 (such as RISC-V), we may need a more general FP16 type. In this patch, we use `hfloat` instead of `__fp16` in non-ARM code blocks; the mainly affected parts are: - `core/hal/intrin.hpp`: Type Traits, REG Traits and `vx_` interface. - `core/hal/intrin_neon.hpp`: Universal Intrinsic API for FP16 type. - `core/test/test_intrin_utils.hpp`: Usage of Universal Intrinsic - `core/include/opencv2/core/cvdef.h`: Definition of class `hfloat` If I understand correctly, class `hfloat` acts as a wrapper around FP16 types on different platforms (`__fp16` for ARM and `_Float16` for RISC-V). Any OpenCV generic interface/source code should use `hfloat`, while platform-specific FP16 types are only used in macro-guarded code blocks. /cc @fengyuentau @mshabunin ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [ ] I agree to contribute to the project under Apache 2 License. - [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
parent
6a11847d57
commit
1d9ca7160b
@ -392,8 +392,10 @@ typedef union Cv16suf
|
||||
{
|
||||
short i;
|
||||
ushort u;
|
||||
#if CV_FP16_TYPE
|
||||
#if CV_FP16_TYPE && defined __ARM_FP16_FORMAT_IEEE
|
||||
__fp16 h;
|
||||
#elif CV_FP16_TYPE // Other platforms suggested to use _Float16
|
||||
_Float16 h;
|
||||
#endif
|
||||
}
|
||||
Cv16suf;
|
||||
@ -834,12 +836,16 @@ class hfloat
|
||||
{
|
||||
public:
|
||||
#if CV_FP16_TYPE
|
||||
|
||||
hfloat() : h(0) {}
|
||||
hfloat() = default;
|
||||
explicit hfloat(float x) { h = (__fp16)x; }
|
||||
operator float() const { return (float)h; }
|
||||
#if defined __ARM_FP16_FORMAT_IEEE
|
||||
protected:
|
||||
__fp16 h;
|
||||
#else
|
||||
protected:
|
||||
_Float16 h;
|
||||
#endif
|
||||
|
||||
#else
|
||||
hfloat() : w(0) {}
|
||||
|
@ -166,7 +166,7 @@ CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int);
|
||||
CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned);
|
||||
CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int);
|
||||
#if CV_FP16_TYPE
|
||||
CV_INTRIN_DEF_TYPE_TRAITS(__fp16, short, ushort, __fp16, float, double, float);
|
||||
CV_INTRIN_DEF_TYPE_TRAITS(hfloat, short, ushort, hfloat, float, double, float);
|
||||
#endif
|
||||
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(unsigned, int, unsigned, unsigned, uint64, unsigned);
|
||||
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int, int, unsigned, unsigned, int64, int);
|
||||
@ -370,7 +370,7 @@ template<typename _Tp> struct V_RegTraits
|
||||
CV_DEF_REG_TRAITS(v, v_uint16x8, ushort, u16, v_uint16x8, v_uint32x4, v_uint64x2, v_int16x8, void);
|
||||
CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void);
|
||||
#if CV_SIMD128_FP16
|
||||
CV_DEF_REG_TRAITS(v, v_float16x8, __fp16, f16, v_float16x8, v_float32x4, v_float64x2, v_int16x8, v_int16x8);
|
||||
CV_DEF_REG_TRAITS(v, v_float16x8, hfloat, f16, v_float16x8, v_float32x4, v_float64x2, v_int16x8, v_int16x8);
|
||||
#endif
|
||||
CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void);
|
||||
CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void);
|
||||
@ -570,7 +570,7 @@ namespace CV__SIMD_NAMESPACE {
|
||||
inline v_uint16 vx_setall_u16(ushort v) { return VXPREFIX(_setall_u16)(v); }
|
||||
inline v_int16 vx_setall_s16(short v) { return VXPREFIX(_setall_s16)(v); }
|
||||
#if CV_SIMD_FP16
|
||||
inline v_float16 vx_setall_f16(__fp16 v) { return VXPREFIX(_setall_f16)(v); }
|
||||
inline v_float16 vx_setall_f16(hfloat v) { return VXPREFIX(_setall_f16)(v); }
|
||||
#endif
|
||||
inline v_int32 vx_setall_s32(int v) { return VXPREFIX(_setall_s32)(v); }
|
||||
inline v_uint32 vx_setall_u32(unsigned v) { return VXPREFIX(_setall_u32)(v); }
|
||||
@ -610,7 +610,7 @@ namespace CV__SIMD_NAMESPACE {
|
||||
inline v_uint16 vx_load(const ushort * ptr) { return VXPREFIX(_load)(ptr); }
|
||||
inline v_int16 vx_load(const short * ptr) { return VXPREFIX(_load)(ptr); }
|
||||
#if CV_SIMD_FP16
|
||||
inline v_float16 vx_load(const __fp16 * ptr) { return VXPREFIX(_load)(ptr); }
|
||||
inline v_float16 vx_load(const hfloat * ptr) { return VXPREFIX(_load)(ptr); }
|
||||
#endif
|
||||
inline v_int32 vx_load(const int * ptr) { return VXPREFIX(_load)(ptr); }
|
||||
inline v_uint32 vx_load(const unsigned * ptr) { return VXPREFIX(_load)(ptr); }
|
||||
@ -630,7 +630,7 @@ namespace CV__SIMD_NAMESPACE {
|
||||
inline v_uint16 vx_load_aligned(const ushort * ptr) { return VXPREFIX(_load_aligned)(ptr); }
|
||||
inline v_int16 vx_load_aligned(const short * ptr) { return VXPREFIX(_load_aligned)(ptr); }
|
||||
#if CV_SIMD_FP16
|
||||
inline v_float16 vx_load_aligned(const __fp16 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
|
||||
inline v_float16 vx_load_aligned(const hfloat * ptr) { return VXPREFIX(_load_aligned)(ptr); }
|
||||
#endif
|
||||
inline v_int32 vx_load_aligned(const int * ptr) { return VXPREFIX(_load_aligned)(ptr); }
|
||||
inline v_uint32 vx_load_aligned(const unsigned * ptr) { return VXPREFIX(_load_aligned)(ptr); }
|
||||
@ -650,7 +650,7 @@ namespace CV__SIMD_NAMESPACE {
|
||||
inline v_uint16 vx_load_low(const ushort * ptr) { return VXPREFIX(_load_low)(ptr); }
|
||||
inline v_int16 vx_load_low(const short * ptr) { return VXPREFIX(_load_low)(ptr); }
|
||||
#if CV_SIMD_FP16
|
||||
inline v_float16 vx_load_low(const __fp16 * ptr) { return VXPREFIX(_load_low)(ptr); }
|
||||
inline v_float16 vx_load_low(const hfloat * ptr) { return VXPREFIX(_load_low)(ptr); }
|
||||
#endif
|
||||
inline v_int32 vx_load_low(const int * ptr) { return VXPREFIX(_load_low)(ptr); }
|
||||
inline v_uint32 vx_load_low(const unsigned * ptr) { return VXPREFIX(_load_low)(ptr); }
|
||||
@ -670,7 +670,7 @@ namespace CV__SIMD_NAMESPACE {
|
||||
inline v_uint16 vx_load_halves(const ushort * ptr0, const ushort * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
|
||||
inline v_int16 vx_load_halves(const short * ptr0, const short * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
|
||||
#if CV_SIMD_FP16
|
||||
inline v_float16 vx_load_halves(const __fp16 * ptr0, const __fp16 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
|
||||
inline v_float16 vx_load_halves(const hfloat * ptr0, const hfloat * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
|
||||
#endif
|
||||
inline v_int32 vx_load_halves(const int * ptr0, const int * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
|
||||
inline v_uint32 vx_load_halves(const unsigned * ptr0, const unsigned * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
|
||||
@ -690,7 +690,7 @@ namespace CV__SIMD_NAMESPACE {
|
||||
inline v_uint16 vx_lut(const ushort * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
|
||||
inline v_int16 vx_lut(const short* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
|
||||
#if CV_SIMD_FP16
|
||||
inline v_float16 vx_lut(const __fp16 * ptr, const int * idx) { return VXPREFIX(_lut)(ptr, idx); }
|
||||
inline v_float16 vx_lut(const hfloat * ptr, const int * idx) { return VXPREFIX(_lut)(ptr, idx); }
|
||||
#endif
|
||||
inline v_int32 vx_lut(const int* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
|
||||
inline v_uint32 vx_lut(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
|
||||
@ -710,7 +710,7 @@ namespace CV__SIMD_NAMESPACE {
|
||||
inline v_uint16 vx_lut_pairs(const ushort * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
|
||||
inline v_int16 vx_lut_pairs(const short* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
|
||||
#if CV_SIMD_FP16
|
||||
inline v_float16 vx_lut_pairs(const __fp16 * ptr, const int * idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
|
||||
inline v_float16 vx_lut_pairs(const hfloat * ptr, const int * idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
|
||||
#endif
|
||||
inline v_int32 vx_lut_pairs(const int* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
|
||||
inline v_uint32 vx_lut_pairs(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
|
||||
|
@ -298,9 +298,9 @@ struct v_float16x8
|
||||
{
|
||||
v_float16x8() {}
|
||||
explicit v_float16x8(float16x8_t v) : val(v) {}
|
||||
v_float16x8(__fp16 v0, __fp16 v1, __fp16 v2, __fp16 v3, __fp16 v4, __fp16 v5, __fp16 v6, __fp16 v7)
|
||||
v_float16x8(hfloat v0, hfloat v1, hfloat v2, hfloat v3, hfloat v4, hfloat v5, hfloat v6, hfloat v7)
|
||||
{
|
||||
__fp16 v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
|
||||
__fp16 v[] = {(__fp16)v0, (__fp16)v1, (__fp16)v2, (__fp16)v3, (__fp16)v4, (__fp16)v5, (__fp16)v6, (__fp16)v7};
|
||||
val = vld1q_f16(v);
|
||||
}
|
||||
float16x8_t val;
|
||||
@ -308,12 +308,12 @@ struct v_float16x8
|
||||
private:
|
||||
friend struct VTraits<v_float16x8>;
|
||||
enum { nlanes = 8 };
|
||||
typedef __fp16 lane_type;
|
||||
typedef hfloat lane_type;
|
||||
|
||||
friend typename VTraits<v_float16x8>::lane_type v_get0<v_float16x8>(const v_float16x8& v);
|
||||
__fp16 get0() const
|
||||
hfloat get0() const
|
||||
{
|
||||
return vgetq_lane_f16(val, 0);
|
||||
return (hfloat)vgetq_lane_f16(val, 0);
|
||||
}
|
||||
};
|
||||
#endif
|
||||
@ -411,9 +411,9 @@ private:
|
||||
};
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
|
||||
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
|
||||
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
|
||||
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, _TpCast, suffix) \
|
||||
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_TpCast)0)); } \
|
||||
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix((_TpCast)v)); } \
|
||||
inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
|
||||
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
|
||||
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
|
||||
@ -425,16 +425,16 @@ inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vr
|
||||
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
|
||||
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, uchar, u8)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, schar, s8)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, ushort, u16)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, short, s16)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, unsigned, u32)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, int, s32)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, uint64, u64)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, int64, s64)
|
||||
#if CV_SIMD128_FP16
|
||||
OPENCV_HAL_IMPL_NEON_INIT(float16x8, __fp16, f16);
|
||||
OPENCV_HAL_IMPL_NEON_INIT(float16x8, hfloat, __fp16, f16);
|
||||
#define OPENCV_HAL_IMPL_NEON_INIT_FP16(_Tpv, suffix) \
|
||||
inline v_float16x8 v_reinterpret_as_f16(const v_##_Tpv& v) { return v_float16x8(vreinterpretq_f16_##suffix(v.val)); }
|
||||
OPENCV_HAL_IMPL_NEON_INIT_FP16(uint8x16, u8)
|
||||
@ -450,11 +450,11 @@ OPENCV_HAL_IMPL_NEON_INIT_FP16(float32x4, f32)
|
||||
OPENCV_HAL_IMPL_NEON_INIT_FP16(float64x2, f64)
|
||||
#endif
|
||||
#endif
|
||||
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, float, f32)
|
||||
#if CV_SIMD128_64F
|
||||
#define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \
|
||||
inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); }
|
||||
OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, double, f64)
|
||||
OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8)
|
||||
OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8)
|
||||
OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16)
|
||||
@ -1082,7 +1082,7 @@ inline v_float16x8 v_sqrt(const v_float16x8& x)
|
||||
|
||||
inline v_float16x8 v_invsqrt(const v_float16x8& x)
|
||||
{
|
||||
v_float16x8 one = v_setall_f16(1.0f);
|
||||
v_float16x8 one = v_setall_f16((hfloat)1.0f);
|
||||
return v_div(one, v_sqrt(x));
|
||||
}
|
||||
#endif
|
||||
@ -1467,7 +1467,7 @@ OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
|
||||
|
||||
#if defined(__clang__) && defined(__aarch64__)
|
||||
// avoid LD2 instruction. details: https://github.com/opencv/opencv/issues/14863
|
||||
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
|
||||
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, _TpCast, suffix) \
|
||||
inline _Tpvec v_load_low(const _Tp* ptr) \
|
||||
{ \
|
||||
typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
|
||||
@ -1475,46 +1475,46 @@ uint64 v = *(unaligned_uint64*)ptr; \
|
||||
return _Tpvec(v_reinterpret_as_##suffix(v_uint64x2(v, (uint64)123456))); \
|
||||
}
|
||||
#else
|
||||
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
|
||||
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, _TpCast, suffix) \
|
||||
inline _Tpvec v_load_low(const _Tp* ptr) \
|
||||
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); }
|
||||
{ return _Tpvec(vcombine_##suffix(vld1_##suffix((const _TpCast *)ptr), vdup_n_##suffix((_Tp)0))); }
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
|
||||
#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, _TpCast, suffix) \
|
||||
inline _Tpvec v_load(const _Tp* ptr) \
|
||||
{ return _Tpvec(vld1q_##suffix(ptr)); } \
|
||||
{ return _Tpvec(vld1q_##suffix((const _TpCast *)ptr)); } \
|
||||
inline _Tpvec v_load_aligned(const _Tp* ptr) \
|
||||
{ return _Tpvec(vld1q_##suffix(ptr)); } \
|
||||
OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
|
||||
{ return _Tpvec(vld1q_##suffix((const _TpCast *)ptr)); } \
|
||||
OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, _TpCast, suffix) \
|
||||
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
|
||||
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
|
||||
{ return _Tpvec(vcombine_##suffix(vld1_##suffix((const _TpCast *)ptr0), vld1_##suffix((const _TpCast *)ptr1))); } \
|
||||
inline void v_store(_Tp* ptr, const _Tpvec& a) \
|
||||
{ vst1q_##suffix(ptr, a.val); } \
|
||||
{ vst1q_##suffix((_TpCast *)ptr, a.val); } \
|
||||
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
|
||||
{ vst1q_##suffix(ptr, a.val); } \
|
||||
{ vst1q_##suffix((_TpCast *)ptr, a.val); } \
|
||||
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
|
||||
{ vst1q_##suffix(ptr, a.val); } \
|
||||
{ vst1q_##suffix((_TpCast *)ptr, a.val); } \
|
||||
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
|
||||
{ vst1q_##suffix(ptr, a.val); } \
|
||||
{ vst1q_##suffix((_TpCast *)ptr, a.val); } \
|
||||
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
|
||||
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
|
||||
{ vst1_##suffix((_TpCast *)ptr, vget_low_##suffix(a.val)); } \
|
||||
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
|
||||
{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }
|
||||
{ vst1_##suffix((_TpCast *)ptr, vget_high_##suffix(a.val)); }
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, uchar, u8)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, schar, s8)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, ushort, u16)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, short, s16)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, unsigned, u32)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, int, s32)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, uint64, u64)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, int64, s64)
|
||||
#if CV_SIMD128_FP16
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float16x8, __fp16, f16)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float16x8, hfloat, __fp16, f16)
|
||||
#endif
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, float, f32)
|
||||
#if CV_SIMD128_64F
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, double, f64)
|
||||
#endif
|
||||
|
||||
inline unsigned v_reduce_sum(const v_uint8x16& a)
|
||||
@ -1588,7 +1588,7 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, min, min, s8)
|
||||
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
|
||||
inline scalartype v_reduce_##func(const _Tpvec& a) \
|
||||
{ \
|
||||
return v##vectorfunc##vq_##suffix(a.val); \
|
||||
return (scalartype)v##vectorfunc##vq_##suffix(a.val); \
|
||||
}
|
||||
#else // #if CV_NEON_AARCH64
|
||||
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
|
||||
@ -1605,8 +1605,8 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, min, min, u16)
|
||||
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
|
||||
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
|
||||
#if CV_SIMD128_FP16
|
||||
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_float16x8, float16x4, __fp16, max, max, f16)
|
||||
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_float16x8, float16x4, __fp16, min, min, f16)
|
||||
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_float16x8, float16x4, hfloat, max, max, f16)
|
||||
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_float16x8, float16x4, hfloat, min, min, f16)
|
||||
#endif
|
||||
|
||||
#if CV_NEON_AARCH64
|
||||
@ -2183,14 +2183,14 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_EXTRACT_N(_Tpvec, _Tp, suffix) \
|
||||
template<int i> inline _Tp v_extract_n(_Tpvec v) { return vgetq_lane_##suffix(v.val, i); }
|
||||
template<int i> inline _Tp v_extract_n(_Tpvec v) { return (_Tp)vgetq_lane_##suffix(v.val, i); }
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint8x16, uchar, u8)
|
||||
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int8x16, schar, s8)
|
||||
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint16x8, ushort, u16)
|
||||
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16)
|
||||
#if CV_SIMD128_FP16
|
||||
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float16x8, __fp16, f16)
|
||||
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float16x8, hfloat, f16)
|
||||
#endif
|
||||
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint32x4, uint, u32)
|
||||
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32)
|
||||
@ -2209,7 +2209,7 @@ OPENCV_HAL_IMPL_NEON_BROADCAST(v_int8x16, schar, s8)
|
||||
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint16x8, ushort, u16)
|
||||
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16)
|
||||
#if CV_SIMD128_FP16
|
||||
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float16x8, __fp16, f16)
|
||||
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float16x8, hfloat, f16)
|
||||
#endif
|
||||
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint32x4, uint, u32)
|
||||
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32)
|
||||
@ -2422,16 +2422,16 @@ inline void v_transpose8x8(const v_float16x8 &a0, const v_float16x8 &a1,
|
||||
}
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
|
||||
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, _TpCast, suffix) \
|
||||
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
|
||||
{ \
|
||||
_Tpvec##x2_t v = vld2q_##suffix(ptr); \
|
||||
_Tpvec##x2_t v = vld2q_##suffix((const _TpCast *)ptr); \
|
||||
a.val = v.val[0]; \
|
||||
b.val = v.val[1]; \
|
||||
} \
|
||||
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
|
||||
{ \
|
||||
_Tpvec##x3_t v = vld3q_##suffix(ptr); \
|
||||
_Tpvec##x3_t v = vld3q_##suffix((const _TpCast *)ptr); \
|
||||
a.val = v.val[0]; \
|
||||
b.val = v.val[1]; \
|
||||
c.val = v.val[2]; \
|
||||
@ -2439,7 +2439,7 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_
|
||||
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
|
||||
v_##_Tpvec& c, v_##_Tpvec& d) \
|
||||
{ \
|
||||
_Tpvec##x4_t v = vld4q_##suffix(ptr); \
|
||||
_Tpvec##x4_t v = vld4q_##suffix((const _TpCast *)ptr); \
|
||||
a.val = v.val[0]; \
|
||||
b.val = v.val[1]; \
|
||||
c.val = v.val[2]; \
|
||||
@ -2451,7 +2451,7 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
|
||||
_Tpvec##x2_t v; \
|
||||
v.val[0] = a.val; \
|
||||
v.val[1] = b.val; \
|
||||
vst2q_##suffix(ptr, v); \
|
||||
vst2q_##suffix((_TpCast *)ptr, v); \
|
||||
} \
|
||||
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
|
||||
const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
|
||||
@ -2460,7 +2460,7 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
|
||||
v.val[0] = a.val; \
|
||||
v.val[1] = b.val; \
|
||||
v.val[2] = c.val; \
|
||||
vst3q_##suffix(ptr, v); \
|
||||
vst3q_##suffix((_TpCast *)ptr, v); \
|
||||
} \
|
||||
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
|
||||
const v_##_Tpvec& c, const v_##_Tpvec& d, \
|
||||
@ -2471,7 +2471,7 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
|
||||
v.val[1] = b.val; \
|
||||
v.val[2] = c.val; \
|
||||
v.val[3] = d.val; \
|
||||
vst4q_##suffix(ptr, v); \
|
||||
vst4q_##suffix((_TpCast *)ptr, v); \
|
||||
}
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \
|
||||
@ -2551,18 +2551,18 @@ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2&
|
||||
vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, uchar, u8)
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, schar, s8)
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, ushort, u16)
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, short, s16)
|
||||
#if CV_SIMD128_FP16
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float16x8, __fp16, f16)
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float16x8, hfloat, __fp16, f16)
|
||||
#endif
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, unsigned, u32)
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, int, s32)
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, float, f32)
|
||||
#if CV_SIMD128_64F
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, double, f64)
|
||||
#endif
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64)
|
||||
@ -2748,7 +2748,7 @@ inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_rein
|
||||
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
|
||||
|
||||
#if CV_SIMD128_FP16
|
||||
inline v_float16x8 v_lut(const float16_t *tab, const int *idx)
|
||||
inline v_float16x8 v_lut(const hfloat *tab, const int *idx)
|
||||
{
|
||||
const __fp16 *t = (const __fp16*)tab;
|
||||
__fp16 CV_DECL_ALIGNED(32) elems[8] =
|
||||
@ -2764,7 +2764,7 @@ inline v_float16x8 v_lut(const float16_t *tab, const int *idx)
|
||||
};
|
||||
return v_float16x8(vld1q_f16(elems));
|
||||
}
|
||||
inline v_float16x8 v_lut_pairs(const float16_t *tab, const int *idx)
|
||||
inline v_float16x8 v_lut_pairs(const hfloat *tab, const int *idx)
|
||||
{
|
||||
const __fp16 *t = (const __fp16*)tab;
|
||||
__fp16 CV_DECL_ALIGNED(32) elems[8] =
|
||||
@ -2780,7 +2780,7 @@ inline v_float16x8 v_lut_pairs(const float16_t *tab, const int *idx)
|
||||
};
|
||||
return v_float16x8(vld1q_f16(elems));
|
||||
}
|
||||
inline v_float16x8 v_lut_quads(const float16_t *tab, const int *idx)
|
||||
inline v_float16x8 v_lut_quads(const hfloat *tab, const int *idx)
|
||||
{
|
||||
const __fp16 *t = (const __fp16*)tab;
|
||||
return v_float16x8(vcombine_f16(vld1_f16(t + idx[0]), vld1_f16(t + idx[1])));
|
||||
|
@ -55,13 +55,13 @@ template <typename R> struct Data
|
||||
template <typename T> Data<R> & operator*=(T m)
|
||||
{
|
||||
for (int i = 0; i < VTraits<R>::vlanes(); ++i)
|
||||
d[i] *= (LaneType)m;
|
||||
d[i] = (LaneType)(d[i] * m);
|
||||
return *this;
|
||||
}
|
||||
template <typename T> Data<R> & operator+=(T m)
|
||||
{
|
||||
for (int i = 0; i < VTraits<R>::vlanes(); ++i)
|
||||
d[i] += (LaneType)m;
|
||||
d[i] = (LaneType)(d[i] + m);
|
||||
return *this;
|
||||
}
|
||||
void fill(LaneType val, int s, int c = VTraits<R>::vlanes())
|
||||
@ -113,9 +113,9 @@ template <typename R> struct Data
|
||||
}
|
||||
LaneType sum(int s, int c)
|
||||
{
|
||||
LaneType res = 0;
|
||||
LaneType res = (LaneType)0;
|
||||
for (int i = s; i < s + c; ++i)
|
||||
res += d[i];
|
||||
res = (LaneType)(res + d[i]);
|
||||
return res;
|
||||
}
|
||||
LaneType sum()
|
||||
@ -131,7 +131,7 @@ template <typename R> struct Data
|
||||
}
|
||||
void clear()
|
||||
{
|
||||
fill(0);
|
||||
fill((LaneType)0);
|
||||
}
|
||||
bool isZero() const
|
||||
{
|
||||
@ -183,7 +183,7 @@ template<> inline void EXPECT_COMPARE_EQ_<double>(const double a, const double b
|
||||
}
|
||||
|
||||
#if CV_SIMD_FP16
|
||||
template<> inline void EXPECT_COMPARE_EQ_<__fp16>(const __fp16 a, const __fp16 b)
|
||||
template<> inline void EXPECT_COMPARE_EQ_<hfloat>(const hfloat a, const hfloat b)
|
||||
{
|
||||
EXPECT_LT(std::abs(float(a - b)), 0.126);
|
||||
}
|
||||
@ -352,9 +352,9 @@ template<typename R> struct TheTest
|
||||
TheTest & test_interleave()
|
||||
{
|
||||
Data<R> data1, data2, data3, data4;
|
||||
data2 += 20;
|
||||
data3 += 40;
|
||||
data4 += 60;
|
||||
data2 += (LaneType)20;
|
||||
data3 += (LaneType)40;
|
||||
data4 += (LaneType)60;
|
||||
|
||||
|
||||
R a = data1, b = data2, c = data3;
|
||||
@ -366,7 +366,7 @@ template<typename R> struct TheTest
|
||||
v_store_interleave(buf3, a, b, c);
|
||||
v_store_interleave(buf4, d, e, f, g);
|
||||
|
||||
Data<R> z(0);
|
||||
Data<R> z((LaneType)0);
|
||||
a = b = c = d = e = f = g = z;
|
||||
|
||||
v_load_deinterleave(buf3, a, b, c);
|
||||
@ -647,9 +647,9 @@ template<typename R> struct TheTest
|
||||
TheTest & test_abs_fp16()
|
||||
{
|
||||
typedef typename V_RegTraits<R>::u_reg Ru; // v_float16x8
|
||||
typedef typename VTraits<Ru>::lane_type u_type; // __fp16
|
||||
typedef typename VTraits<R>::lane_type R_type; // __fp16
|
||||
Data<R> dataA, dataB(10);
|
||||
typedef typename VTraits<Ru>::lane_type u_type; // hfloat
|
||||
typedef typename VTraits<R>::lane_type R_type; // hfloat
|
||||
Data<R> dataA, dataB((LaneType)10);
|
||||
R a = dataA, b = dataB;
|
||||
a = v_sub(a, b);
|
||||
|
||||
@ -659,7 +659,7 @@ template<typename R> struct TheTest
|
||||
for (int i = 0; i < VTraits<Ru>::vlanes(); ++i)
|
||||
{
|
||||
SCOPED_TRACE(cv::format("i=%d", i));
|
||||
R_type ssub = (dataA[i] - dataB[i]) < R_type_lowest ? R_type_lowest : dataA[i] - dataB[i];
|
||||
R_type ssub = (R_type)((dataA[i] - dataB[i]) < R_type_lowest ? R_type_lowest : dataA[i] - dataB[i]);
|
||||
EXPECT_EQ((u_type)std::abs(ssub), resC[i]);
|
||||
}
|
||||
|
||||
@ -930,10 +930,10 @@ template<typename R> struct TheTest
|
||||
{
|
||||
Data<R> dataA(std::numeric_limits<LaneType>::max()),
|
||||
dataB(std::numeric_limits<LaneType>::min());
|
||||
dataA[0] = -1;
|
||||
dataB[0] = 1;
|
||||
dataA[1] = 2;
|
||||
dataB[1] = -2;
|
||||
dataA[0] = (LaneType)-1;
|
||||
dataB[0] = (LaneType)1;
|
||||
dataA[1] = (LaneType)2;
|
||||
dataB[1] = (LaneType)-2;
|
||||
R a = dataA, b = dataB;
|
||||
Data<R> resC = v_absdiff(a, b);
|
||||
for (int i = 0; i < VTraits<R>::vlanes(); ++i)
|
||||
@ -1008,9 +1008,9 @@ template<typename R> struct TheTest
|
||||
typedef typename VTraits<int_reg>::lane_type int_type;
|
||||
typedef typename VTraits<uint_reg>::lane_type uint_type;
|
||||
|
||||
Data<R> dataA, dataB(0), dataC, dataD(1), dataE(2);
|
||||
Data<R> dataA, dataB((LaneType)0), dataC, dataD((LaneType)1), dataE((LaneType)2);
|
||||
dataA[0] = (LaneType)std::numeric_limits<int_type>::max();
|
||||
dataA[1] *= (LaneType)-1;
|
||||
dataA[1] = (LaneType)(dataA[1] * (LaneType)-1);
|
||||
union
|
||||
{
|
||||
LaneType l;
|
||||
@ -1025,7 +1025,7 @@ template<typename R> struct TheTest
|
||||
dataB[VTraits<R>::vlanes() / 2] = mask_one;
|
||||
dataC *= (LaneType)-1;
|
||||
R a = dataA, b = dataB, c = dataC, d = dataD, e = dataE;
|
||||
dataC[VTraits<R>::vlanes() - 1] = 0;
|
||||
dataC[VTraits<R>::vlanes() - 1] = (LaneType)0;
|
||||
R nl = dataC;
|
||||
|
||||
EXPECT_EQ(2, v_signmask(a));
|
||||
@ -1586,14 +1586,15 @@ template<typename R> struct TheTest
|
||||
int i = 0;
|
||||
for (int j = i; j < i + 8; ++j) {
|
||||
SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
|
||||
LaneType val = dataV[i] * data0[j] +
|
||||
LaneType val = (LaneType)(
|
||||
dataV[i] * data0[j] +
|
||||
dataV[i + 1] * data1[j] +
|
||||
dataV[i + 2] * data2[j] +
|
||||
dataV[i + 3] * data3[j] +
|
||||
dataV[i + 4] * data4[j] +
|
||||
dataV[i + 5] * data5[j] +
|
||||
dataV[i + 6] * data6[j] +
|
||||
dataV[i + 7] * data7[j];
|
||||
dataV[i + 7] * data7[j]);
|
||||
EXPECT_COMPARE_EQ(val, res[j]);
|
||||
}
|
||||
|
||||
@ -1601,14 +1602,15 @@ template<typename R> struct TheTest
|
||||
i = 0;
|
||||
for (int j = i; j < i + 8; ++j) {
|
||||
SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
|
||||
LaneType val = dataV[i] * data0[j] +
|
||||
LaneType val = (LaneType)(
|
||||
dataV[i] * data0[j] +
|
||||
dataV[i + 1] * data1[j] +
|
||||
dataV[i + 2] * data2[j] +
|
||||
dataV[i + 3] * data3[j] +
|
||||
dataV[i + 4] * data4[j] +
|
||||
dataV[i + 5] * data5[j] +
|
||||
dataV[i + 6] * data6[j] +
|
||||
data7[j];
|
||||
data7[j]);
|
||||
EXPECT_COMPARE_EQ(val, resAdd[j]);
|
||||
}
|
||||
#else
|
||||
|
Loading…
Reference in New Issue
Block a user