From f38a61c66d8e45ad0b5e21bdc6e55223d1cdb59e Mon Sep 17 00:00:00 2001 From: "Paul E. Murphy" Date: Mon, 22 Jul 2019 14:23:56 -0500 Subject: [PATCH] fast_math: implement optimized PPC routines Implement cvRound using inline asm. No compiler support exists today to properly optimize this. This results in about a 4x speedup over the default rounding. Likewise, simplify the growing number of rounding function overloads. For P9-enabled targets, utilize the classification testing instruction to test for Inf/NaN values. Operation speedup is about 1.2x for FP32, and 1.5x for FP64 operands. For P8 targets, fall back to the GCC isnan builtin. It provides a 1.1/1.4x improvement for FP32/FP64 arguments. --- .../core/include/opencv2/core/fast_math.hpp | 84 ++++++++++++++----- 1 file changed, 62 insertions(+), 22 deletions(-) diff --git a/modules/core/include/opencv2/core/fast_math.hpp b/modules/core/include/opencv2/core/fast_math.hpp index 6eb6f1fa00..b1e8c4202d 100644 --- a/modules/core/include/opencv2/core/fast_math.hpp +++ b/modules/core/include/opencv2/core/fast_math.hpp @@ -74,7 +74,15 @@ # include "tegra_round.hpp" #endif -#if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__ && !defined(__CUDACC__) +#if defined __PPC64__ && defined __GNUC__ && defined _ARCH_PWR8 && !defined (__CUDACC__) +# include <altivec.h> +#endif + +#if ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \ + defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION + #define CV_INLINE_ROUND_DBL(value) TEGRA_ROUND_DBL(value); + #define CV_INLINE_ROUND_FLT(value) TEGRA_ROUND_FLT(value); +#elif defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__ && !defined(__CUDACC__) // 1. general scheme #define ARM_ROUND(_value, _asm_string) \ int res; \ @@ -84,12 +92,32 @@ return res // 2. 
version for double #ifdef __clang__ - #define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]") + #define CV_INLINE_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]") #else - #define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]") + #define CV_INLINE_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]") #endif // 3. version for float - #define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]") + #define CV_INLINE_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]") +#elif defined __PPC64__ && defined __GNUC__ && defined _ARCH_PWR8 && !defined (__CUDACC__) + // P8 and newer machines can convert fp32/64 to int quickly. + #define CV_INLINE_ROUND_DBL(value) \ + int out; \ + double temp; \ + __asm__( "fctiw %[temp],%[in]\n\tmffprwz %[out],%[temp]\n\t" : [out] "=r" (out), [temp] "=d" (temp) : [in] "d" ((double)(value)) : ); \ + return out; + + // FP32 also works with FP64 routine above + #define CV_INLINE_ROUND_FLT(value) CV_INLINE_ROUND_DBL(value) + + #ifdef _ARCH_PWR9 + #define CV_INLINE_ISINF_DBL(value) return scalar_test_data_class(value, 0x30); + #define CV_INLINE_ISNAN_DBL(value) return scalar_test_data_class(value, 0x40); + #define CV_INLINE_ISINF_FLT(value) CV_INLINE_ISINF_DBL(value) + #define CV_INLINE_ISNAN_FLT(value) CV_INLINE_ISNAN_DBL(value) + #endif +#elif defined CV_ICC || defined __GNUC__ + #define CV_INLINE_ROUND_DBL(value) return (int)(lrint(value)); + #define CV_INLINE_ROUND_FLT(value) return (int)(lrintf(value)); #endif #if defined __PPC64__ && !defined OPENCV_USE_FASTMATH_GCC_BUILTINS @@ -105,6 +133,16 @@ #define _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS #endif +/* Allow overrides for some functions which may benefit from tuning. 
Likewise, + note that isinf is not used as the return value is signed. */ +#if defined _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS && !defined CV_INLINE_ISNAN_DBL + #define CV_INLINE_ISNAN_DBL(value) return __builtin_isnan(value); +#endif + +#if defined _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS && !defined CV_INLINE_ISNAN_FLT + #define CV_INLINE_ISNAN_FLT(value) return __builtin_isnanf(value); +#endif + /** @brief Rounds floating-point number to the nearest integer @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the @@ -125,15 +163,8 @@ cvRound( double value ) fistp t; } return t; -#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \ - defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION - TEGRA_ROUND_DBL(value); -#elif defined CV_ICC || defined __GNUC__ -# if defined ARM_ROUND_DBL - ARM_ROUND_DBL(value); -# else - return (int)lrint(value); -# endif +#elif defined CV_INLINE_ROUND_DBL + CV_INLINE_ROUND_DBL(value); #else /* it's ok if round does not comply with IEEE754 standard; the tests should allow +/-1 difference when the tested functions use round */ @@ -184,10 +215,14 @@ CV_INLINE int cvCeil( double value ) otherwise. */ CV_INLINE int cvIsNaN( double value ) { +#if defined CV_INLINE_ISNAN_DBL + CV_INLINE_ISNAN_DBL(value); +#else Cv64suf ieee754; ieee754.f = value; return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) + ((unsigned)ieee754.u != 0) > 0x7ff00000; +#endif } /** @brief Determines if the argument is Infinity. @@ -198,10 +233,14 @@ CV_INLINE int cvIsNaN( double value ) and 0 otherwise. 
*/ CV_INLINE int cvIsInf( double value ) { +#if defined CV_INLINE_ISINF_DBL + CV_INLINE_ISINF_DBL(value); +#else Cv64suf ieee754; ieee754.f = value; return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 && (unsigned)ieee754.u == 0; +#endif } #ifdef __cplusplus @@ -221,15 +260,8 @@ CV_INLINE int cvRound(float value) fistp t; } return t; -#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \ - defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION - TEGRA_ROUND_FLT(value); -#elif defined CV_ICC || defined __GNUC__ -# if defined ARM_ROUND_FLT - ARM_ROUND_FLT(value); -# else - return (int)lrintf(value); -# endif +#elif defined CV_INLINE_ROUND_FLT + CV_INLINE_ROUND_FLT(value); #else /* it's ok if round does not comply with IEEE754 standard; the tests should allow +/-1 difference when the tested functions use round */ @@ -280,17 +312,25 @@ CV_INLINE int cvCeil( int value ) /** @overload */ CV_INLINE int cvIsNaN( float value ) { +#if defined CV_INLINE_ISNAN_FLT + CV_INLINE_ISNAN_FLT(value); +#else Cv32suf ieee754; ieee754.f = value; return (ieee754.u & 0x7fffffff) > 0x7f800000; +#endif } /** @overload */ CV_INLINE int cvIsInf( float value ) { +#if defined CV_INLINE_ISINF_FLT + CV_INLINE_ISINF_FLT(value); +#else Cv32suf ieee754; ieee754.f = value; return (ieee754.u & 0x7fffffff) == 0x7f800000; +#endif } #endif // __cplusplus