Merge pull request #19486 from fpetrogalli:dotprod_fast-3.4

* [hal][neon] Optimize the v_dotprod_fast intrinsics for aarch64.

On Armv8 in AArch64 execution mode, we can skip the sequence

   v<op>_<ty>(vget_high_<ty>(x), vget_high_<ty>(y))

in favour of

   v<op>_high_<ty>(x, y)

This gives recent compilers a better chance of emitting fewer data-movement
operations and achieving better register allocation. See for example:

   https://godbolt.org/z/bPq7vd
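
As a minimal standalone sketch (illustration only, not code from this
patch; the function names are hypothetical), the two forms look like
this for the 16-bit case:

   #include <arm_neon.h>

   /* Portable form: extract the high halves, then widen multiply-accumulate. */
   int32x4_t mla_high_portable(int32x4_t acc, int16x8_t x, int16x8_t y)
   {
       return vmlal_s16(acc, vget_high_s16(x), vget_high_s16(y));
   }

   /* AArch64 form: the _high intrinsic reads the upper halves in place,
    * leaving fewer vector moves for the compiler to schedule. */
   int32x4_t mla_high_aarch64(int32x4_t acc, int16x8_t x, int16x8_t y)
   {
       return vmlal_high_s16(acc, x, y);
   }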

* [hal][neon] Fix build failure on armv7.

* [hal][neon] Address review comments in PR.

PR: https://github.com/opencv/opencv/pull/19486

* [hal][neon] Define macro to check for the AArch64 execution state of Armv8.

* [hal][neon] Fix macro definition for AArch64.

The fix is needed to prevent warnings when building for Armv7.
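
(A sketch of the reasoning, assuming the warning in question is
gcc/clang's -Wundef: an `#if` test that evaluates an undefined
identifier warns, so the flag is defined to 0 on Armv7 rather than
left undefined.)

   /* If CV_NEON_AARCH64 were left undefined on Armv7, this test would
    * evaluate an undefined identifier and could trigger -Wundef.
    * Defining the flag to 0 in the #else branch keeps the check silent. */
   #if CV_NEON_AARCH64
       /* AArch64-only fast path */
   #else
       /* portable fallback */
   #endif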
Francesco Petrogalli 2021-02-11 13:24:09 +00:00 committed by GitHub
parent cac5b0fe5a
commit 6ee23c9b85

@@ -62,6 +62,22 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 #define CV_SIMD128_64F 0
 #endif
+// The following macro checks if the code is being compiled for the
+// AArch64 execution state of Armv8, to enable the 128-bit
+// intrinsics. The macro `__ARM_64BIT_STATE` is the one recommended by
+// the Arm C Language Extension (ACLE) specifications [1] to check the
+// availability of 128-bit intrinsics, and it is supported by clang
+// and gcc. The macro `_M_ARM64` is the equivalent one for Microsoft
+// Visual Studio [2].
+//
+// [1] https://developer.arm.com/documentation/101028/0012/13--Advanced-SIMD--Neon--intrinsics
+// [2] https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
+#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
+#define CV_NEON_AARCH64 1
+#else
+#define CV_NEON_AARCH64 0
+#endif
 // TODO
 #define CV_NEON_DOT 0
@@ -726,41 +742,61 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
 // 16 >> 32
 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
 {
+#if CV_NEON_AARCH64
+    int32x4_t p = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
+    return v_int32x4(vmlal_high_s16(p, a.val, b.val));
+#else
     int16x4_t a0 = vget_low_s16(a.val);
     int16x4_t a1 = vget_high_s16(a.val);
     int16x4_t b0 = vget_low_s16(b.val);
     int16x4_t b1 = vget_high_s16(b.val);
     int32x4_t p = vmull_s16(a0, b0);
     return v_int32x4(vmlal_s16(p, a1, b1));
+#endif
 }
 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
 {
+#if CV_NEON_AARCH64
+    int32x4_t p = vmlal_s16(c.val, vget_low_s16(a.val), vget_low_s16(b.val));
+    return v_int32x4(vmlal_high_s16(p, a.val, b.val));
+#else
     int16x4_t a0 = vget_low_s16(a.val);
     int16x4_t a1 = vget_high_s16(a.val);
     int16x4_t b0 = vget_low_s16(b.val);
     int16x4_t b1 = vget_high_s16(b.val);
     int32x4_t p = vmlal_s16(c.val, a0, b0);
     return v_int32x4(vmlal_s16(p, a1, b1));
+#endif
 }
 // 32 >> 64
 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
 {
+#if CV_NEON_AARCH64
+    int64x2_t p = vmull_s32(vget_low_s32(a.val), vget_low_s32(b.val));
+    return v_int64x2(vmlal_high_s32(p, a.val, b.val));
+#else
     int32x2_t a0 = vget_low_s32(a.val);
     int32x2_t a1 = vget_high_s32(a.val);
     int32x2_t b0 = vget_low_s32(b.val);
     int32x2_t b1 = vget_high_s32(b.val);
     int64x2_t p = vmull_s32(a0, b0);
     return v_int64x2(vmlal_s32(p, a1, b1));
+#endif
 }
 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
 {
+#if CV_NEON_AARCH64
+    int64x2_t p = vmlal_s32(c.val, vget_low_s32(a.val), vget_low_s32(b.val));
+    return v_int64x2(vmlal_high_s32(p, a.val, b.val));
+#else
     int32x2_t a0 = vget_low_s32(a.val);
     int32x2_t a1 = vget_high_s32(a.val);
     int32x2_t b0 = vget_low_s32(b.val);
     int32x2_t b1 = vget_high_s32(b.val);
     int64x2_t p = vmlal_s32(c.val, a0, b0);
     return v_int64x2(vmlal_s32(p, a1, b1));
+#endif
 }
 // 8 >> 32
@@ -1292,7 +1328,7 @@ inline int64 v_reduce_sum(const v_int64x2& a)
 #if CV_SIMD128_64F
 inline double v_reduce_sum(const v_float64x2& a)
 {
-    return vgetq_lane_f64(a.val, 0) + vgetq_lane_f64(a.val, 1);
+    return vaddvq_f64(a.val);
 }
 #endif
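
As a usage sketch (assuming an OpenCV build with the NEON universal
intrinsics enabled; CV_SIMD128 is the feature guard used by that
header), the optimized path is reached transparently through the
portable v_dotprod_fast API:

   #include <opencv2/core/hal/intrin.hpp>

   int main()
   {
       using namespace cv;
   #if CV_SIMD128
       short ap[8] = {1, 2, 3, 4, 5, 6, 7, 8};
       short bp[8] = {8, 7, 6, 5, 4, 3, 2, 1};
       v_int16x8 a = v_load(ap);
       v_int16x8 b = v_load(bp);
       /* 16-bit x 16-bit -> 32-bit dot product; on AArch64 this now
        * lowers to vmull_s16 followed by vmlal_high_s16. */
       v_int32x4 d = v_dotprod_fast(a, b);
       return v_reduce_sum(d) == 120 ? 0 : 1; /* 1*8 + 2*7 + ... + 8*1 = 120 */
   #else
       return 0;
   #endif
   }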