Mirror of https://github.com/opencv/opencv.git
Merge pull request #20011 from Developer-Ecosystem-Engineering:3.4
Improve performance on Arm64

* Improve performance on Apple silicon

  This patch will
  - Enable dot product intrinsics for macOS arm64 builds
  - Improve HAL primitives
    - reduction (sum, min, max, sad)
    - signmask
    - mul_expand
    - check_any / check_all

  Results on an M1 MacBook Pro

* Updates to #20011 based on feedback

  - Removes Apple Silicon specific workarounds
  - Makes #ifdef sections smaller for v_mul_expand cases
  - Moves dot product optimization to compiler optimization check
  - Adds 4x4 matrix transpose optimization

* Remove dotprod and fix v_transpose

  Based on the latest review feedback, we've removed dotprod entirely and will revisit it in a future PR. Added explicit casts with v_transpose4x4(). This should resolve all open items on this PR.

* Remove commented out lines

  Remove two extraneous comments
parent d3be58b6d7
commit 814550d2a6
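The commit message above names the HAL primitives that gained dedicated AArch64 paths. For orientation, the short sketch below (not part of the patch) exercises a few of them through OpenCV's public universal-intrinsics API; it assumes an OpenCV build with CV_SIMD128 enabled, and the program itself is made up for illustration.

    // Orientation only -- not part of this commit. Assumes CV_SIMD128 is enabled.
    #include <opencv2/core/hal/intrin.hpp>
    #include <cstdio>

    int main()
    {
    #if CV_SIMD128
        uchar buf[16];
        for (int i = 0; i < 16; i++)
            buf[i] = (uchar)(i * 3);

        cv::v_uint8x16 v = cv::v_load(buf);
        std::printf("sum=%u min=%d max=%d signmask=%d\n",
                    cv::v_reduce_sum(v),      // horizontal add of all 16 lanes
                    (int)cv::v_reduce_min(v), // smallest lane
                    (int)cv::v_reduce_max(v), // largest lane
                    cv::v_signmask(v));       // MSB of each lane packed into an int
    #endif
        return 0;
    }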
@@ -538,49 +538,81 @@ inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
                         v_int16x8& c, v_int16x8& d)
{
    c.val = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
#if CV_NEON_AARCH64
    d.val = vmull_high_s8(a.val, b.val);
#else // #if CV_NEON_AARCH64
    d.val = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
#endif // #if CV_NEON_AARCH64
}

inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                         v_uint16x8& c, v_uint16x8& d)
{
    c.val = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
#if CV_NEON_AARCH64
    d.val = vmull_high_u8(a.val, b.val);
#else // #if CV_NEON_AARCH64
    d.val = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
#endif // #if CV_NEON_AARCH64
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
#if CV_NEON_AARCH64
    d.val = vmull_high_s16(a.val, b.val);
#else // #if CV_NEON_AARCH64
    d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
#endif // #if CV_NEON_AARCH64
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
#if CV_NEON_AARCH64
    d.val = vmull_high_u16(a.val, b.val);
#else // #if CV_NEON_AARCH64
    d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
#endif // #if CV_NEON_AARCH64
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
#if CV_NEON_AARCH64
    d.val = vmull_high_u32(a.val, b.val);
#else // #if CV_NEON_AARCH64
    d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
#endif // #if CV_NEON_AARCH64
}

inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    return v_int16x8(vcombine_s16(
        vshrn_n_s32(vmull_s16( vget_low_s16(a.val), vget_low_s16(b.val)), 16),
        vshrn_n_s32(
#if CV_NEON_AARCH64
            vmull_high_s16(a.val, b.val)
#else // #if CV_NEON_AARCH64
            vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val))
#endif // #if CV_NEON_AARCH64
        , 16)
    ));
}
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_uint16x8(vcombine_u16(
        vshrn_n_u32(vmull_u16( vget_low_u16(a.val), vget_low_u16(b.val)), 16),
        vshrn_n_u32(
#if CV_NEON_AARCH64
            vmull_high_u16(a.val, b.val)
#else // #if CV_NEON_AARCH64
            vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val))
#endif // #if CV_NEON_AARCH64
        , 16)
    ));
}

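Not part of the diff: a small usage sketch of the two primitives reworked above, written against the public universal-intrinsics API (assumes CV_SIMD128; the helper name widen_and_mul is made up here). The point of the #if CV_NEON_AARCH64 branches is that vmull_high_* consumes the upper half of the 128-bit registers directly, so the separate vget_high_* extraction of the generic path disappears.

    // Usage sketch only -- not part of this commit.
    #include <opencv2/core/hal/intrin.hpp>

    #if CV_SIMD128
    static void widen_and_mul(const short* x, const short* y)
    {
        cv::v_int16x8 a = cv::v_load(x), b = cv::v_load(y);

        cv::v_int32x4 lo, hi;
        cv::v_mul_expand(a, b, lo, hi);       // full 32-bit products: lanes 0..3 and 4..7

        cv::v_int16x8 h = cv::v_mul_hi(a, b); // per lane: (x[i] * y[i]) >> 16
        (void)lo; (void)hi; (void)h;
    }
    #endif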
@@ -1254,29 +1286,56 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)

inline unsigned v_reduce_sum(const v_uint8x16& a)
{
#if CV_NEON_AARCH64
    uint16_t t0 = vaddlvq_u8(a.val);
    return t0;
#else // #if CV_NEON_AARCH64
    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a.val));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}
inline int v_reduce_sum(const v_int8x16& a)
{
#if CV_NEON_AARCH64
    int16_t t0 = vaddlvq_s8(a.val);
    return t0;
#else // #if CV_NEON_AARCH64
    int32x4_t t0 = vpaddlq_s16(vpaddlq_s8(a.val));
    int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
    return vget_lane_s32(vpadd_s32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}
inline unsigned v_reduce_sum(const v_uint16x8& a)
{
#if CV_NEON_AARCH64
    uint32_t t0 = vaddlvq_u16(a.val);
    return t0;
#else // #if CV_NEON_AARCH64
    uint32x4_t t0 = vpaddlq_u16(a.val);
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}
inline int v_reduce_sum(const v_int16x8& a)
{
#if CV_NEON_AARCH64
    int32_t t0 = vaddlvq_s16(a.val);
    return t0;
#else // #if CV_NEON_AARCH64
    int32x4_t t0 = vpaddlq_s16(a.val);
    int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
    return vget_lane_s32(vpadd_s32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}

#if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    return v##vectorfunc##vq_##suffix(a.val); \
}
#else // #if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
@@ -1285,12 +1344,20 @@ inline scalartype v_reduce_##func(const _Tpvec& a) \
    a0 = vp##vectorfunc##_##suffix(a0, a0); \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
}
#endif // #if CV_NEON_AARCH64

OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, max, max, u8)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, min, min, u8)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, max, max, s8)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, min, min, s8)

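Not part of the diff: a self-contained NEON sketch contrasting the two reduction strategies used above (the helper name sum16_bytes is made up here). On AArch64 a single across-vector add, vaddlvq_u8, sums all 16 byte lanes; generic NEON has to fold the vector with widening pairwise adds.

    // Standalone illustration only -- not part of this commit.
    #include <arm_neon.h>
    #include <stdint.h>

    static inline unsigned sum16_bytes(const uint8_t* p)
    {
        uint8x16_t v = vld1q_u8(p);
    #if defined(__aarch64__)
        return vaddlvq_u8(v);                        // one across-vector widening add
    #else
        uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(v));  // 16x u8 -> 8x u16 -> 4x u32 partial sums
        uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
        return vget_lane_u32(vpadd_u32(t1, t1), 0);  // fold the last two lanes
    #endif
    }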
#if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    return v##vectorfunc##vq_##suffix(a.val); \
}
#else // #if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
@@ -1298,18 +1365,27 @@ inline scalartype v_reduce_##func(const _Tpvec& a) \
    a0 = vp##vectorfunc##_##suffix(a0, a0); \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
}
#endif // #if CV_NEON_AARCH64

OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, max, max, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, min, min, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)

#if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    return v##vectorfunc##vq_##suffix(a.val); \
}
#else // #if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, vget_high_##suffix(a.val)),0); \
}
#endif // #if CV_NEON_AARCH64

OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
@@ -1322,9 +1398,21 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)

inline uint64 v_reduce_sum(const v_uint64x2& a)
{
#if CV_NEON_AARCH64
    return vaddvq_u64(a.val);
#else // #if CV_NEON_AARCH64
    return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)),0);
#endif // #if CV_NEON_AARCH64
}
inline int64 v_reduce_sum(const v_int64x2& a)
{
#if CV_NEON_AARCH64
    return vaddvq_s64(a.val);
#else // #if CV_NEON_AARCH64
    return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)),0);
#endif // #if CV_NEON_AARCH64
}
#if CV_SIMD128_64F
inline double v_reduce_sum(const v_float64x2& a)
{
@@ -1335,6 +1423,11 @@ inline double v_reduce_sum(const v_float64x2& a)
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
#if CV_NEON_AARCH64
    float32x4_t ab = vpaddq_f32(a.val, b.val); // a0+a1 a2+a3 b0+b1 b2+b3
    float32x4_t cd = vpaddq_f32(c.val, d.val); // c0+c1 c2+c3 d0+d1 d2+d3
    return v_float32x4(vpaddq_f32(ab, cd));    // sumA sumB sumC sumD
#else // #if CV_NEON_AARCH64
    float32x4x2_t ab = vtrnq_f32(a.val, b.val);
    float32x4x2_t cd = vtrnq_f32(c.val, d.val);

@@ -1345,49 +1438,91 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
    float32x4_t v1 = vcombine_f32(vget_high_f32(u0), vget_high_f32(u1));

    return v_float32x4(vaddq_f32(v0, v1));
#endif // #if CV_NEON_AARCH64
}

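Not part of the diff: what v_reduce_sum4 computes, shown as a usage sketch against the public API (assumes CV_SIMD128; the helper name row_sums is made up here). Lane i of the result is the horizontal sum of the i-th input vector, which is what the vpaddq_f32 chain above produces on AArch64.

    // Usage sketch only -- not part of this commit.
    #include <opencv2/core/hal/intrin.hpp>

    #if CV_SIMD128
    // Row sums of a 4x4 row-major matrix: lane i of the result = sum of row i.
    static cv::v_float32x4 row_sums(const float* m)
    {
        cv::v_float32x4 r0 = cv::v_load(m + 0), r1 = cv::v_load(m + 4);
        cv::v_float32x4 r2 = cv::v_load(m + 8), r3 = cv::v_load(m + 12);
        return cv::v_reduce_sum4(r0, r1, r2, r3);
    }
    #endif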
inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
#if CV_NEON_AARCH64
    uint8x16_t t0 = vabdq_u8(a.val, b.val);
    uint16_t t1 = vaddlvq_u8(t0);
    return t1;
#else // #if CV_NEON_AARCH64
    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vabdq_u8(a.val, b.val)));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{
#if CV_NEON_AARCH64
    uint8x16_t t0 = vreinterpretq_u8_s8(vabdq_s8(a.val, b.val));
    uint16_t t1 = vaddlvq_u8(t0);
    return t1;
#else // #if CV_NEON_AARCH64
    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s8(vabdq_s8(a.val, b.val))));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}
inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
#if CV_NEON_AARCH64
    uint16x8_t t0 = vabdq_u16(a.val, b.val);
    uint32_t t1 = vaddlvq_u16(t0);
    return t1;
#else // #if CV_NEON_AARCH64
    uint32x4_t t0 = vpaddlq_u16(vabdq_u16(a.val, b.val));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}
inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
#if CV_NEON_AARCH64
    uint16x8_t t0 = vreinterpretq_u16_s16(vabdq_s16(a.val, b.val));
    uint32_t t1 = vaddlvq_u16(t0);
    return t1;
#else // #if CV_NEON_AARCH64
    uint32x4_t t0 = vpaddlq_u16(vreinterpretq_u16_s16(vabdq_s16(a.val, b.val)));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}
inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
#if CV_NEON_AARCH64
    uint32x4_t t0 = vabdq_u32(a.val, b.val);
    uint32_t t1 = vaddvq_u32(t0);
    return t1;
#else // #if CV_NEON_AARCH64
    uint32x4_t t0 = vabdq_u32(a.val, b.val);
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}
inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
{
#if CV_NEON_AARCH64
    uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
    uint32_t t1 = vaddvq_u32(t0);
    return t1;
#else // #if CV_NEON_AARCH64
    uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}
inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
{
#if CV_NEON_AARCH64
    float32x4_t t0 = vabdq_f32(a.val, b.val);
    return vaddvq_f32(t0);
#else // #if CV_NEON_AARCH64
    float32x4_t t0 = vabdq_f32(a.val, b.val);
    float32x2_t t1 = vpadd_f32(vget_low_f32(t0), vget_high_f32(t0));
    return vget_lane_f32(vpadd_f32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}

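Not part of the diff: a typical consumer of v_reduce_sad, scoring one 16-pixel row of a candidate block against a reference row (assumes CV_SIMD128; the helper name row_sad16 is made up here). The scalar branch spells out the same sum of absolute differences.

    // Usage sketch only -- not part of this commit.
    #include <opencv2/core/hal/intrin.hpp>
    #include <cstdlib>

    static unsigned row_sad16(const uchar* ref, const uchar* cur)
    {
    #if CV_SIMD128
        return cv::v_reduce_sad(cv::v_load(ref), cv::v_load(cur));
    #else
        unsigned s = 0;
        for (int i = 0; i < 16; i++)
            s += (unsigned)std::abs((int)ref[i] - (int)cur[i]);
        return s;
    #endif
    }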
inline v_uint8x16 v_popcount(const v_uint8x16& a)
@@ -1409,30 +1544,54 @@ inline v_uint64x2 v_popcount(const v_int64x2& a)

inline int v_signmask(const v_uint8x16& a)
{
#if CV_NEON_AARCH64
    const int8x16_t signPosition = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7};
    const uint8x16_t byteOrder = {0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15};
    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), signPosition);
    uint8x16_t v1 = vqtbl1q_u8(v0, byteOrder);
    uint32_t t0 = vaddlvq_u16(vreinterpretq_u16_u8(v1));
    return t0;
#else // #if CV_NEON_AARCH64
    int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
#endif // #if CV_NEON_AARCH64
}

inline int v_signmask(const v_int8x16& a)
{ return v_signmask(v_reinterpret_as_u8(a)); }

inline int v_signmask(const v_uint16x8& a)
{
#if CV_NEON_AARCH64
    const int16x8_t signPosition = {0,1,2,3,4,5,6,7};
    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), signPosition);
    uint32_t t0 = vaddlvq_u16(v0);
    return t0;
#else // #if CV_NEON_AARCH64
    int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
#endif // #if CV_NEON_AARCH64
}
inline int v_signmask(const v_int16x8& a)
{ return v_signmask(v_reinterpret_as_u16(a)); }

inline int v_signmask(const v_uint32x4& a)
{
#if CV_NEON_AARCH64
    const int32x4_t signPosition = {0,1,2,3};
    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), signPosition);
    uint32_t t0 = vaddvq_u32(v0);
    return t0;
#else // #if CV_NEON_AARCH64
    int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(v0);
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
#endif // #if CV_NEON_AARCH64
}
inline int v_signmask(const v_int32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
@@ -1440,9 +1599,16 @@ inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_uint64x2& a)
{
#if CV_NEON_AARCH64
    const int64x2_t signPosition = {0,1};
    uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), signPosition);
    uint64_t t0 = vaddvq_u64(v0);
    return t0;
#else // #if CV_NEON_AARCH64
    int64x1_t m0 = vdup_n_s64(0);
    uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
    return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
#endif // #if CV_NEON_AARCH64
}
inline int v_signmask(const v_int64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
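Not part of the diff: what v_signmask returns, as a usage sketch (assumes CV_SIMD128; the helper name negative_lane_mask is made up here). Bit i of the result is the most significant bit of lane i, which is what the per-lane shift by signPosition followed by a horizontal add assembles above.

    // Usage sketch only -- not part of this commit.
    #include <opencv2/core/hal/intrin.hpp>

    static int negative_lane_mask(const short* p)
    {
    #if CV_SIMD128
        cv::v_int16x8 v = cv::v_load(p);
        return cv::v_signmask(v);   // e.g. {-1, 2, -3, 4, 5, -6, 7, 8} -> 0b00100101
    #else
        int m = 0;
        for (int i = 0; i < 8; i++)
            if (p[i] < 0) m |= (1 << i);
        return m;
    #endif
    }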
@@ -1464,6 +1630,17 @@ inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signma
inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
#endif

#if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
inline bool v_check_all(const v_##_Tpvec& a) \
{ \
    return (vminvq_##suffix(a.val) >> shift) != 0; \
} \
inline bool v_check_any(const v_##_Tpvec& a) \
{ \
    return (vmaxvq_##suffix(a.val) >> shift) != 0; \
}
#else // #if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
inline bool v_check_all(const v_##_Tpvec& a) \
{ \
@@ -1477,6 +1654,7 @@ inline bool v_check_any(const v_##_Tpvec& a) \
    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
}
#endif // #if CV_NEON_AARCH64

OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
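Not part of the diff: a usage sketch for the two predicates generated by the macro above (assumes CV_SIMD128; the helper name classify is made up here). Both take a comparison result whose lanes are all-ones or all-zeros; on AArch64 the new code reduces that mask with a single vminvq (all) or vmaxvq (any) and tests the top bit.

    // Usage sketch only -- not part of this commit.
    #include <opencv2/core/hal/intrin.hpp>

    #if CV_SIMD128
    static void classify(const float* p, bool& all_pos, bool& any_pos)
    {
        cv::v_float32x4 v = cv::v_load(p);
        cv::v_float32x4 mask = v > cv::v_setzero_f32(); // all-ones lanes where p[i] > 0
        all_pos = cv::v_check_all(mask);                // every lane strictly positive?
        any_pos = cv::v_check_any(mask);                // at least one lane strictly positive?
    }
    #endif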
@@ -1829,6 +2007,37 @@ inline v_int32x4 v_trunc(const v_float64x2& a)
}
#endif

#if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                           const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
                           v_##_Tpvec& b0, v_##_Tpvec& b1, \
                           v_##_Tpvec& b2, v_##_Tpvec& b3) \
{ \
    /* -- Pass 1: 64b transpose */ \
    _Tpvec##_t t0 = vreinterpretq_##suffix##32_##suffix##64( \
                        vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
                                            vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
    _Tpvec##_t t1 = vreinterpretq_##suffix##32_##suffix##64( \
                        vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
                                            vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
    _Tpvec##_t t2 = vreinterpretq_##suffix##32_##suffix##64( \
                        vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
                                            vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
    _Tpvec##_t t3 = vreinterpretq_##suffix##32_##suffix##64( \
                        vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
                                            vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
    /* -- Pass 2: 32b transpose */ \
    b0.val = vtrn1q_##suffix##32(t0, t1); \
    b1.val = vtrn2q_##suffix##32(t0, t1); \
    b2.val = vtrn1q_##suffix##32(t2, t3); \
    b3.val = vtrn2q_##suffix##32(t2, t3); \
}

OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f)
#else // #if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                           const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
@@ -1854,6 +2063,7 @@ inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
#endif // #if CV_NEON_AARCH64

#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
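Not part of the diff: the AArch64 transpose macro above, expanded by hand for float32x4_t (the helper name transpose4x4_f32 is made up here) so the two passes are easier to follow; the generic NEON fallback is untouched by this commit and omitted. The remaining hunks below add a test_mul_hi case to the universal-intrinsics test suite and enable it for the 16-bit types.

    // Hand expansion for illustration only -- not part of this commit.
    #include <arm_neon.h>

    #if defined(__aarch64__)
    static inline void transpose4x4_f32(float32x4_t a0, float32x4_t a1,
                                        float32x4_t a2, float32x4_t a3,
                                        float32x4_t& b0, float32x4_t& b1,
                                        float32x4_t& b2, float32x4_t& b3)
    {
        // Pass 1: 64-bit transpose -- pair rows (0,2) and (1,3).
        // t0 = {a00,a01,a20,a21}  t1 = {a10,a11,a30,a31}
        // t2 = {a02,a03,a22,a23}  t3 = {a12,a13,a32,a33}
        float32x4_t t0 = vreinterpretq_f32_f64(vtrn1q_f64(vreinterpretq_f64_f32(a0), vreinterpretq_f64_f32(a2)));
        float32x4_t t1 = vreinterpretq_f32_f64(vtrn1q_f64(vreinterpretq_f64_f32(a1), vreinterpretq_f64_f32(a3)));
        float32x4_t t2 = vreinterpretq_f32_f64(vtrn2q_f64(vreinterpretq_f64_f32(a0), vreinterpretq_f64_f32(a2)));
        float32x4_t t3 = vreinterpretq_f32_f64(vtrn2q_f64(vreinterpretq_f64_f32(a1), vreinterpretq_f64_f32(a3)));
        // Pass 2: 32-bit transpose -- interleave the pairs into columns.
        b0 = vtrn1q_f32(t0, t1); // {a00,a10,a20,a30}
        b1 = vtrn2q_f32(t0, t1); // {a01,a11,a21,a31}
        b2 = vtrn1q_f32(t2, t3); // {a02,a12,a22,a32}
        b3 = vtrn2q_f32(t2, t3); // {a03,a13,a23,a33}
    }
    #endif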
@@ -577,6 +577,25 @@ template<typename R> struct TheTest
        return *this;
    }

    TheTest & test_mul_hi()
    {
        // typedef typename V_RegTraits<R>::w_reg Rx2;
        Data<R> dataA, dataB(32767);
        R a = dataA, b = dataB;

        R c = v_mul_hi(a, b);

        Data<R> resC = c;
        const int n = R::nlanes / 2;
        for (int i = 0; i < n; ++i)
        {
            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ((typename R::lane_type)((dataA[i] * dataB[i]) >> 16), resC[i]);
        }

        return *this;
    }

    TheTest & test_abs()
    {
        typedef typename V_RegTraits<R>::u_reg Ru;
@@ -1663,6 +1682,7 @@ void test_hal_intrin_uint16()
        .test_arithm_wrap()
        .test_mul()
        .test_mul_expand()
        .test_mul_hi()
        .test_cmp()
        .test_shift<1>()
        .test_shift<8>()
@@ -1697,6 +1717,7 @@ void test_hal_intrin_int16()
        .test_arithm_wrap()
        .test_mul()
        .test_mul_expand()
        .test_mul_hi()
        .test_cmp()
        .test_shift<1>()
        .test_shift<8>()