From def444d99fdc39b6341748db452e4b75288fa1a1 Mon Sep 17 00:00:00 2001
From: Sayed Adel
Date: Sat, 14 Oct 2017 04:30:04 +0000
Subject: [PATCH 1/2] core: Several improvements to Power/VSX

 - changed the behavior of vec_ctf, vec_ctu, vec_cts in gcc and clang to make
   them compatible with XLC
 - implemented most of the missing conversion intrinsics in gcc and clang
 - implemented conversion intrinsics for odd-numbered elements
 - ignored a gcc bug warning caused by -Wunused-but-set-variable in rare cases
 - replaced the right shift with an algebraic right shift for signed vectors,
   so the sign bit is shifted in
 - added new universal intrinsics v_matmuladd, v_rotate_left/right
 - avoided using floating multiply-add in the RNG
---
 .../include/opencv2/core/hal/intrin_vsx.hpp   | 162 ++++--
 .../core/include/opencv2/core/vsx_utils.hpp   | 507 +++++++++++-------
 modules/core/src/rand.cpp                     |  16 +
 3 files changed, 422 insertions(+), 263 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index 3d15945de7..2d9b2021f9 100644
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -521,24 +521,25 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
 OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)
 
 /** Bitwise shifts **/
-#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpuvec, splfunc) \
-inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
-{ return _Tpuvec(vec_sl(a.val, splfunc(imm))); } \
-inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
-{ return _Tpuvec(vec_sr(a.val, splfunc(imm))); } \
-template<int imm> inline _Tpuvec v_shl(const _Tpuvec& a) \
-{ return _Tpuvec(vec_sl(a.val, splfunc(imm))); } \
-template<int imm> inline _Tpuvec v_shr(const _Tpuvec& a) \
-{ return _Tpuvec(vec_sr(a.val, splfunc(imm))); }
+#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
+inline _Tpvec operator << (const _Tpvec& a, int imm) \
+{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
+inline _Tpvec operator >> (const _Tpvec& a, int imm) \
+{ return _Tpvec(shr(a.val, splfunc(imm))); } \
+template<int imm> inline _Tpvec v_shl(const _Tpvec& a) \
+{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
+template<int imm> inline _Tpvec v_shr(const _Tpvec& a) \
+{ return _Tpvec(shr(a.val, splfunc(imm))); }
 
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_uchar16_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16, vec_uchar16_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_ushort8_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8, vec_ushort8_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_uint4_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4, vec_uint4_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_udword2_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_udword2_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_sr, vec_uchar16_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_sr, vec_ushort8_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_sr, vec_uint4_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_sr, vec_udword2_sp)
+// algebraic right shift
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16, vec_sra, vec_uchar16_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8, vec_sra, vec_ushort8_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4, vec_sra, vec_uint4_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp)
 
 /** Bitwise logic **/
 #define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \
@@ -603,6 +604,64 @@ OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
 OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
 OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)
 
+/** Rotate **/
+#define OPENCV_IMPL_VSX_ROTATE(_Tpvec, suffix, shf, cast) \
+template<int imm> \
+inline _Tpvec v_rotate_##suffix(const _Tpvec& a) \
+{ \
+    const int wd = imm * sizeof(typename _Tpvec::lane_type); \
+    if (wd > 15) \
+        return _Tpvec(); \
+    return _Tpvec((cast)shf(vec_uchar16_c(a.val), vec_uchar16_sp(wd << 3))); \
+}
+
+#define OPENCV_IMPL_VSX_ROTATE_LR(_Tpvec, cast) \
+OPENCV_IMPL_VSX_ROTATE(_Tpvec, left, vec_slo, cast) \
+OPENCV_IMPL_VSX_ROTATE(_Tpvec, right, vec_sro, cast)
+
+OPENCV_IMPL_VSX_ROTATE_LR(v_uint8x16, vec_uchar16)
+OPENCV_IMPL_VSX_ROTATE_LR(v_int8x16, vec_char16)
+OPENCV_IMPL_VSX_ROTATE_LR(v_uint16x8, vec_ushort8)
+OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8, vec_short8)
+OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4)
+OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4, vec_int4)
+OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2)
+OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2, vec_dword2)
+
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
+{
+    const int wd = imm * sizeof(typename _Tpvec::lane_type);
+    if (wd == 0)
+        return a;
+    return _Tpvec(vec_sld(b.val, a.val, 16 - wd));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
+{
+    const int wd = imm * sizeof(typename _Tpvec::lane_type);
+    if (wd == 16)
+        return b;
+    return _Tpvec(vec_sld(a.val, b.val, wd));
+}
+
+#define OPENCV_IMPL_VSX_ROTATE_64(_Tpvec, suffix, rg1, rg2) \
+template<int imm> \
+inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    if (imm == 1) \
+        return _Tpvec(vec_permi(rg1.val, rg2.val, 2)); \
+    return imm ? b : a; \
+}
+
+OPENCV_IMPL_VSX_ROTATE_64(v_int64x2, right, a, b)
+OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, right, a, b)
+
+OPENCV_IMPL_VSX_ROTATE_64(v_int64x2, left, b, a)
+OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, left, b, a)
+
 ////////// Reduce and mask /////////
 
 /** Reduce **/
@@ -724,7 +783,7 @@ inline int v_signmask(const v_float32x4& a)
 inline int v_signmask(const v_int64x2& a)
 {
-    const vec_dword2 sv = vec_sr(a.val, vec_udword2_sp(63));
+    VSX_UNUSED(const vec_dword2) sv = vec_sr(a.val, vec_udword2_sp(63));
     return (int)vec_extract(sv, 0) | (int)vec_extract(sv, 1) << 1;
 }
 inline int v_signmask(const v_uint64x2& a)
@@ -810,66 +869,47 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int64x2, v_uint64x2, vec_udword2_c, v_absdiff, v
 
 /** Rounding **/
 inline v_int32x4 v_round(const v_float32x4& a)
-{ return v_int32x4(vec_cts(vec_round(a.val), 0)); }
+{ return v_int32x4(vec_cts(vec_round(a.val))); }
 
 inline v_int32x4 v_round(const v_float64x2& a)
-{
-    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
-    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_round(a.val)), perm));
-}
+{ return v_int32x4(vec_mergesqo(vec_cts(vec_round(a.val)), vec_int4_z)); }
 
 inline v_int32x4 v_floor(const v_float32x4& a)
-{ return v_int32x4(vec_cts(vec_floor(a.val), 0)); }
+{ return v_int32x4(vec_cts(vec_floor(a.val))); }
 
 inline v_int32x4 v_floor(const v_float64x2& a)
-{
-    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
-    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_floor(a.val)), perm));
-}
+{ return v_int32x4(vec_mergesqo(vec_cts(vec_floor(a.val)), vec_int4_z)); }
 
 inline v_int32x4 v_ceil(const v_float32x4& a)
-{ return v_int32x4(vec_cts(vec_ceil(a.val), 0)); }
+{ return v_int32x4(vec_cts(vec_ceil(a.val))); }
 
 inline v_int32x4 v_ceil(const v_float64x2& a)
-{
-    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
-    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_ceil(a.val)), perm));
-}
+{ return
v_int32x4(vec_mergesqo(vec_cts(vec_ceil(a.val)), vec_int4_z)); } inline v_int32x4 v_trunc(const v_float32x4& a) -{ return v_int32x4(vec_cts(a.val, 0)); } +{ return v_int32x4(vec_cts(a.val)); } inline v_int32x4 v_trunc(const v_float64x2& a) -{ - static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0}; - return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(a.val), perm)); -} +{ return v_int32x4(vec_mergesqo(vec_cts(a.val), vec_int4_z)); } /** To float **/ inline v_float32x4 v_cvt_f32(const v_int32x4& a) -{ return v_float32x4(vec_ctf(a.val, 0)); } +{ return v_float32x4(vec_ctf(a.val)); } inline v_float32x4 v_cvt_f32(const v_float64x2& a) -{ - static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0}; - return v_float32x4(vec_perm(vec_float4_z, vec_cvf(a.val), perm)); -} +{ return v_float32x4(vec_mergesqo(vec_cvf(a.val), vec_float4_z)); } + inline v_float64x2 v_cvt_f64(const v_int32x4& a) -{ - return v_float64x2(vec_ctd(vec_mergeh(a.val, a.val), 0)); -} +{ return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); } + inline v_float64x2 v_cvt_f64_high(const v_int32x4& a) -{ - return v_float64x2(vec_ctd(vec_mergel(a.val, a.val), 0)); -} +{ return v_float64x2(vec_ctdo(vec_mergel(a.val, a.val))); } + inline v_float64x2 v_cvt_f64(const v_float32x4& a) -{ - return v_float64x2(vec_cvf(vec_mergeh(a.val, a.val))); -} +{ return v_float64x2(vec_cvfo(vec_mergeh(a.val, a.val))); } + inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) -{ - return v_float64x2(vec_cvf(vec_mergel(a.val, a.val))); -} +{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); } /** Reinterpret **/ /** its up there with load and store operations **/ @@ -886,10 +926,20 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, const vec_float4 v0 = vec_splat(v.val, 0); const vec_float4 v1 = vec_splat(v.val, 1); const vec_float4 v2 = vec_splat(v.val, 2); - const vec_float4 v3 = vec_splat(v.val, 3); + VSX_UNUSED(const vec_float4) v3 = vec_splat(v.val, 3); return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, vec_mul(v3, m3.val))))); } +inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& a) +{ + const vec_float4 v0 = vec_splat(v.val, 0); + const vec_float4 v1 = vec_splat(v.val, 1); + const vec_float4 v2 = vec_splat(v.val, 2); + return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, a.val)))); +} + #define OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(_Tpvec, _Tpvec2) \ inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ const _Tpvec& a2, const _Tpvec& a3, \ diff --git a/modules/core/include/opencv2/core/vsx_utils.hpp b/modules/core/include/opencv2/core/vsx_utils.hpp index 3ce190b9b6..af962c0cb6 100644 --- a/modules/core/include/opencv2/core/vsx_utils.hpp +++ b/modules/core/include/opencv2/core/vsx_utils.hpp @@ -51,18 +51,6 @@ //! @{ #if CV_VSX -#define FORCE_INLINE(tp) extern inline tp __attribute__((always_inline)) - -#define VSX_REDIRECT_1RG(rt, rg, fnm, fn2) \ -FORCE_INLINE(rt) fnm(const rg& a) { return fn2(a); } - -#define VSX_REDIRECT_2RG(rt, rg, fnm, fn2) \ -FORCE_INLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); } - -#define VSX_IMPL_PERM(rt, fnm, ...) 
\ -FORCE_INLINE(rt) fnm(const rt& a, const rt& b) \ - { static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); } - #define __VSX_S16__(c, v) (c){v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v} #define __VSX_S8__(c, v) (c){v, v, v, v, v, v, v, v} #define __VSX_S4__(c, v) (c){v, v, v, v} @@ -172,10 +160,19 @@ typedef __vector double vec_double2; #define vec_bdword2_f (__VSX_S2__(vec_bdword2, 0)) #define vec_bdword2_t (__VSX_S2__(vec_bdword2, 1)) + +#define FORCE_INLINE(tp) extern inline tp __attribute__((always_inline)) + +#define VSX_REDIRECT_1RG(rt, rg, fnm, fn2) \ +FORCE_INLINE(rt) fnm(const rg& a) { return fn2(a); } + +#define VSX_REDIRECT_2RG(rt, rg, fnm, fn2) \ +FORCE_INLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); } + /* * GCC VSX compatibility **/ -#if defined(__GNUG__) && !defined(__IBMCPP__) && !defined(__clang__) +#if defined(__GNUG__) && !defined(__clang__) // inline asm helper #define VSX_IMPL_1RG(rt, rto, rg, rgo, opc, fnm) \ @@ -193,7 +190,7 @@ FORCE_INLINE(rt) fnm(const rg& a, const rg& b) \ #define VSX_IMPL_2VRG(rt, rg, opc, fnm) VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%1,%2", fnm) #if __GNUG__ < 7 -/* up to GCC 6 vec_mul only supports precisions and llong */ +// up to GCC 6 vec_mul only supports precisions and llong # ifdef vec_mul # undef vec_mul # endif @@ -209,15 +206,15 @@ FORCE_INLINE(rt) fnm(const rg& a, const rg& b) \ 8, 9, 24, 25, 12, 13, 28, 29}; \ return vec_perm(Tcast(vec_mule(a, b)), Tcast(vec_mulo(a, b)), even_perm); \ } - VSX_IMPL_MULH(vec_short8, vec_short8_c) + VSX_IMPL_MULH(vec_short8, vec_short8_c) VSX_IMPL_MULH(vec_ushort8, vec_ushort8_c) - /* vmuluwm can be used for unsigned or signed integers, that's what they said */ - VSX_IMPL_2VRG(vec_int4, vec_int4, vmuluwm, vec_mul) + // vmuluwm can be used for unsigned or signed integers, that's what they said + VSX_IMPL_2VRG(vec_int4, vec_int4, vmuluwm, vec_mul) VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul) - /* redirect to GCC builtin vec_mul, since it already supports precisions and llong */ - VSX_REDIRECT_2RG(vec_float4, vec_float4, vec_mul, __builtin_vec_mul) + // redirect to GCC builtin vec_mul, since it already supports precisions and llong + VSX_REDIRECT_2RG(vec_float4, vec_float4, vec_mul, __builtin_vec_mul) VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mul, __builtin_vec_mul) - VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mul, __builtin_vec_mul) + VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mul, __builtin_vec_mul) VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mul, __builtin_vec_mul) #endif // __GNUG__ < 7 @@ -237,75 +234,121 @@ FORCE_INLINE(rt) fnm(const rg& a, const rg& b) \ # define VSX_IMPL_CMPGE(rt, rg, opc, fnm) \ VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%2,%1\n\t xxlnor %x0,%x0,%x0", fnm) - VSX_IMPL_CMPGE(vec_bchar16, vec_char16, vcmpgtsb, vec_cmpge) + VSX_IMPL_CMPGE(vec_bchar16, vec_char16, vcmpgtsb, vec_cmpge) VSX_IMPL_CMPGE(vec_bchar16, vec_uchar16, vcmpgtub, vec_cmpge) - VSX_IMPL_CMPGE(vec_bshort8, vec_short8, vcmpgtsh, vec_cmpge) + VSX_IMPL_CMPGE(vec_bshort8, vec_short8, vcmpgtsh, vec_cmpge) VSX_IMPL_CMPGE(vec_bshort8, vec_ushort8, vcmpgtuh, vec_cmpge) - VSX_IMPL_CMPGE(vec_bint4, vec_int4, vcmpgtsw, vec_cmpge) - VSX_IMPL_CMPGE(vec_bint4, vec_uint4, vcmpgtuw, vec_cmpge) - VSX_IMPL_CMPGE(vec_bdword2, vec_dword2, vcmpgtsd, vec_cmpge) + VSX_IMPL_CMPGE(vec_bint4, vec_int4, vcmpgtsw, vec_cmpge) + VSX_IMPL_CMPGE(vec_bint4, vec_uint4, vcmpgtuw, vec_cmpge) + VSX_IMPL_CMPGE(vec_bdword2, vec_dword2, vcmpgtsd, vec_cmpge) VSX_IMPL_CMPGE(vec_bdword2, vec_udword2, vcmpgtud, 
vec_cmpge) - /* redirect to GCC builtin cmpge, since it already supports precisions */ - VSX_REDIRECT_2RG(vec_bint4, vec_float4, vec_cmpge, __builtin_vec_cmpge) + +// redirect to GCC builtin cmpge, since it already supports precisions + VSX_REDIRECT_2RG(vec_bint4, vec_float4, vec_cmpge, __builtin_vec_cmpge) VSX_REDIRECT_2RG(vec_bdword2, vec_double2, vec_cmpge, __builtin_vec_cmpge) // up to gcc5 vec_nor doesn't support bool long long # undef vec_nor -template -VSX_REDIRECT_2RG(T, T, vec_nor, __builtin_vec_nor) + template + VSX_REDIRECT_2RG(T, T, vec_nor, __builtin_vec_nor) -FORCE_INLINE(vec_bdword2) vec_nor(const vec_bdword2& a, const vec_bdword2& b) -{ return vec_bdword2_c(__builtin_vec_nor(vec_dword2_c(a), vec_dword2_c(b))); } + FORCE_INLINE(vec_bdword2) vec_nor(const vec_bdword2& a, const vec_bdword2& b) + { return vec_bdword2_c(__builtin_vec_nor(vec_dword2_c(a), vec_dword2_c(b))); } #endif // __GNUG__ < 6 -// vector population count -#ifndef vec_popcnt - VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vec_popcnt) - VSX_IMPL_1VRG(vec_uchar16, vec_char16, vpopcntb, vec_popcnt) - VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcnt) - VSX_IMPL_1VRG(vec_ushort8, vec_short8, vpopcnth, vec_popcnt) - VSX_IMPL_1VRG(vec_uint4, vec_uint4, vpopcntw, vec_popcnt) - VSX_IMPL_1VRG(vec_uint4, vec_int4, vpopcntw, vec_popcnt) - VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcnt) - VSX_IMPL_1VRG(vec_udword2, vec_dword2, vpopcntd, vec_popcnt) -#endif // vec_popcnt - #if __GNUG__ < 5 // vec_xxpermdi in gcc4 missing little-endian supports just like clang # define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ ((c & 1) << 1 | c >> 1))) // vec_packs doesn't support double words in gcc4 -# undef vec_packs -VSX_REDIRECT_2RG(vec_char16, vec_short8, vec_packs, __builtin_vec_packs) -VSX_REDIRECT_2RG(vec_uchar16, vec_ushort8, vec_packs, __builtin_vec_packs) -VSX_REDIRECT_2RG(vec_short8, vec_int4, vec_packs, __builtin_vec_packs) -VSX_REDIRECT_2RG(vec_ushort8, vec_uint4, vec_packs, __builtin_vec_packs) -VSX_IMPL_2VRG_F(vec_int4, vec_dword2, "vpksdss %0,%2,%1", vec_packs) -VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs) +# undef vec_packs + VSX_REDIRECT_2RG(vec_char16, vec_short8, vec_packs, __builtin_vec_packs) + VSX_REDIRECT_2RG(vec_uchar16, vec_ushort8, vec_packs, __builtin_vec_packs) + VSX_REDIRECT_2RG(vec_short8, vec_int4, vec_packs, __builtin_vec_packs) + VSX_REDIRECT_2RG(vec_ushort8, vec_uint4, vec_packs, __builtin_vec_packs) + + VSX_IMPL_2VRG_F(vec_int4, vec_dword2, "vpksdss %0,%2,%1", vec_packs) + VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs) #else # define vec_permi vec_xxpermdi -#endif - -// converts between single and double-precision -#ifndef vec_cvf - VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvf, __builtin_vsx_xvcvdpsp) - FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a) - { return __builtin_vsx_xvcvspdp(vec_sld(a, a, 4)); } -#endif - -// converts 32 and 64 bit integers to double-precision -#ifndef vec_ctd -# define vec_ctd(a, b) __vec_ctd(a) - VSX_IMPL_1RG(vec_double2, wd, vec_int4, wa, xvcvsxwdp, __vec_ctd) - VSX_IMPL_1RG(vec_double2, wd, vec_uint4, wa, xvcvuxwdp, __vec_ctd) - VSX_IMPL_1RG(vec_double2, wd, vec_dword2, wi, xvcvsxddp, __vec_ctd) - VSX_IMPL_1RG(vec_double2, wd, vec_udword2, wi, xvcvuxddp, __vec_ctd) -#endif +#endif // __GNUG__ < 5 // shift left double by word immediate #ifndef vec_sldw # define vec_sldw __builtin_vsx_xxsldwi #endif +// vector population count +VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, 
vec_popcntu) +VSX_IMPL_1VRG(vec_uchar16, vec_char16, vpopcntb, vec_popcntu) +VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcntu) +VSX_IMPL_1VRG(vec_ushort8, vec_short8, vpopcnth, vec_popcntu) +VSX_IMPL_1VRG(vec_uint4, vec_uint4, vpopcntw, vec_popcntu) +VSX_IMPL_1VRG(vec_uint4, vec_int4, vpopcntw, vec_popcntu) +VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcntu) +VSX_IMPL_1VRG(vec_udword2, vec_dword2, vpopcntd, vec_popcntu) + +// converts between single and double-precision +#ifdef vec_cvf +# undef vec_cvf +#endif +VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvf, __builtin_vsx_xvcvdpsp) +VSX_REDIRECT_1RG(vec_double2, vec_float4, vec_cvfo, __builtin_vsx_xvcvspdp) + +FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a) +{ return vec_cvfo(vec_sldw(a, a, 1)); } + +// converts word and doubleword to double-precision +#ifdef vec_ctd +# undef vec_ctd +#endif +VSX_IMPL_1RG(vec_double2, wd, vec_int4, wa, xvcvsxwdp, vec_ctdo) +VSX_IMPL_1RG(vec_double2, wd, vec_uint4, wa, xvcvuxwdp, vec_ctdo) +VSX_IMPL_1RG(vec_double2, wd, vec_dword2, wi, xvcvsxddp, vec_ctd) +VSX_IMPL_1RG(vec_double2, wd, vec_udword2, wi, xvcvuxddp, vec_ctd) + +FORCE_INLINE(vec_double2) vec_ctd(const vec_int4& a) +{ return vec_ctdo(vec_sldw(a, a, 1)); } + +FORCE_INLINE(vec_double2) vec_ctd(const vec_uint4& a) +{ return vec_ctdo(vec_sldw(a, a, 1)); } + +// converts word and doubleword to single-precision +#undef vec_ctf +VSX_IMPL_1RG(vec_float4, wf, vec_int4, wa, xvcvsxwsp, vec_ctf) +VSX_IMPL_1RG(vec_float4, wf, vec_uint4, wa, xvcvuxwsp, vec_ctf) +VSX_IMPL_1RG(vec_float4, wf, vec_dword2, wi, xvcvsxdsp, vec_ctf) +VSX_IMPL_1RG(vec_float4, wf, vec_udword2, wi, xvcvuxdsp, vec_ctf) + +// converts single and double precision to signed word +#undef vec_cts +VSX_IMPL_1RG(vec_int4, wa, vec_double2, wd, xvcvdpsxws, vec_cts) +VSX_IMPL_1RG(vec_int4, wa, vec_float4, wf, xvcvspsxws, vec_cts) + +// converts single and double precision to unsigned word +#undef vec_ctu +VSX_IMPL_1RG(vec_uint4, wa, vec_double2, wd, xvcvdpuxws, vec_ctu) +VSX_IMPL_1RG(vec_uint4, wa, vec_float4, wf, xvcvspuxws, vec_ctu) + +// converts single and double precision to signed doubleword +#ifdef vec_ctsl +# undef vec_ctsl +#endif +VSX_IMPL_1RG(vec_dword2, wi, vec_double2, wd, xvcvdpsxds, vec_ctsl) +VSX_IMPL_1RG(vec_dword2, wi, vec_float4, wf, xvcvspsxds, vec_ctslo) + +FORCE_INLINE(vec_dword2) vec_ctsl(const vec_float4& a) +{ return vec_ctslo(vec_sldw(a, a, 1)); } + +// converts single and double precision to unsigned doubleword +#ifdef vec_ctul +# undef vec_ctul +#endif +VSX_IMPL_1RG(vec_udword2, wi, vec_double2, wd, xvcvdpuxds, vec_ctul) +VSX_IMPL_1RG(vec_udword2, wi, vec_float4, wf, xvcvspuxds, vec_ctulo) + +FORCE_INLINE(vec_udword2) vec_ctul(const vec_float4& a) +{ return vec_ctulo(vec_sldw(a, a, 1)); } + // just in case if GCC doesn't define it #ifndef vec_xl # define vec_xl vec_vsx_ld @@ -327,8 +370,13 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs) * Also there's already an open bug https://bugs.llvm.org/show_bug.cgi?id=31837 * * So we're not able to use inline asm and only use built-in functions that CLANG supports + * and use __builtin_convertvector if clang missng any of vector conversions built-in functions */ +// convert vector helper +#define VSX_IMPL_CONVERT(rt, rg, fnm) \ +FORCE_INLINE(rt) fnm(const rg& a) { return __builtin_convertvector(a, rt); } + #if __clang_major__ < 5 // implement vec_permi in a dirty way # define VSX_IMPL_CLANG_4_PERMI(Tvec) \ @@ -362,26 +410,6 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, 
"vpkudus %0,%2,%1", vec_packs) # define vec_sldw vec_xxsldwi #endif -/* converts between single and double precision */ -#ifndef vec_cvf - VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvf, __builtin_vsx_xvcvdpsp) - FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a) - { return __builtin_vsx_xvcvspdp(vec_sld(a, a, 4)); } -#endif - -/* converts 32 and 64 bit integers to double-precision */ -#ifndef vec_ctd -# define vec_ctd(a, b) __vec_ctd(a) - VSX_REDIRECT_1RG(vec_double2, vec_int4, __vec_ctd, __builtin_vsx_xvcvsxwdp) - VSX_REDIRECT_1RG(vec_double2, vec_uint4, __vec_ctd, __builtin_vsx_xvcvuxwdp) - // implement vec_ctd for double word in a dirty way since we are missing builtin xvcvsxddp, xvcvuxddp - // please try to avoid using it for double words - FORCE_INLINE(vec_double2) __vec_ctd(const vec_dword2& a) - { return vec_double2_set((double)vec_extract(a, 0), (double)vec_extract(a, 1)); } - FORCE_INLINE(vec_double2) __vec_ctd(const vec_udword2& a) - { return vec_double2_set((double)vec_extract(a, 0), (double)vec_extract(a, 1)); } -#endif - // Implement vec_rsqrt since clang only supports vec_rsqrte #ifndef vec_rsqrt FORCE_INLINE(vec_float4) vec_rsqrt(const vec_float4& a) @@ -391,27 +419,157 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs) { return vec_div(vec_double2_sp(1), vec_sqrt(a)); } #endif +// vec_popcnt should return unsigned but clang has different thought just like gcc in vec_vpopcnt +#define VSX_IMPL_POPCNTU(Tvec, Tvec2, ucast) \ +FORCE_INLINE(Tvec) vec_popcntu(const Tvec2& a) \ +{ return ucast(vec_popcnt(a)); } +VSX_IMPL_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c); +VSX_IMPL_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c); +VSX_IMPL_POPCNTU(vec_uint4, vec_int4, vec_uint4_c); +// redirect unsigned types +VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt) +VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt) +VSX_REDIRECT_1RG(vec_uint4, vec_uint4, vec_popcntu, vec_popcnt) -/* - * __builtin_altivec_vctsxs in clang 5 and 6 causes ambiguous which used by vec_cts - * so we just redefine it and cast it -*/ +// converts between single and double precision +#ifdef vec_cvf +# undef vec_cvf +#endif +VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvf, __builtin_vsx_xvcvdpsp) +VSX_REDIRECT_1RG(vec_double2, vec_float4, vec_cvfo, __builtin_vsx_xvcvspdp) + +FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a) +{ return vec_cvfo(vec_sldw(a, a, 1)); } + +// converts word and doubleword to double-precision +#ifdef vec_ctd +# undef vec_ctd +#endif +VSX_REDIRECT_1RG(vec_double2, vec_int4, vec_ctdo, __builtin_vsx_xvcvsxwdp) +VSX_REDIRECT_1RG(vec_double2, vec_uint4, vec_ctdo, __builtin_vsx_xvcvuxwdp) + +VSX_IMPL_CONVERT(vec_double2, vec_dword2, vec_ctd) +VSX_IMPL_CONVERT(vec_double2, vec_udword2, vec_ctd) + +FORCE_INLINE(vec_double2) vec_ctd(const vec_int4& a) +{ return vec_ctdo(vec_sldw(a, a, 1)); } + +FORCE_INLINE(vec_double2) vec_ctd(const vec_uint4& a) +{ return vec_ctdo(vec_sldw(a, a, 1)); } + +// converts word and doubleword to single-precision +#if __clang_major__ > 4 +# undef vec_ctf +#endif +VSX_IMPL_CONVERT(vec_float4, vec_int4, vec_ctf) +VSX_IMPL_CONVERT(vec_float4, vec_uint4, vec_ctf) +VSX_REDIRECT_1RG(vec_float4, vec_dword2, vec_ctf, __builtin_vsx_xvcvsxdsp) +VSX_REDIRECT_1RG(vec_float4, vec_udword2, vec_ctf, __builtin_vsx_xvcvuxdsp) + +// converts single and double precision to signed word #if __clang_major__ > 4 # undef vec_cts -# define vec_cts(__a, __b) \ - _Generic((__a), vector float \ - : (vector signed 
int)__builtin_altivec_vctsxs((__a), (__b)), vector double \ - : __extension__({ \ - vector double __ret = \ - (__a) * \ - (vector double)(vector unsigned long long)((0x3ffULL + (__b)) \ - << 52); \ - __builtin_convertvector(__ret, vector signed long long); \ - })) -#endif // __clang_major__ > 4 +#endif +VSX_REDIRECT_1RG(vec_int4, vec_double2, vec_cts, __builtin_vsx_xvcvdpsxws) +VSX_IMPL_CONVERT(vec_int4, vec_float4, vec_cts) + +// converts single and double precision to unsigned word +#if __clang_major__ > 4 +# undef vec_ctu +#endif +VSX_REDIRECT_1RG(vec_uint4, vec_double2, vec_ctu, __builtin_vsx_xvcvdpuxws) +VSX_IMPL_CONVERT(vec_uint4, vec_float4, vec_ctu) + +// converts single and double precision to signed doubleword +#ifdef vec_ctsl +# undef vec_ctsl +#endif +VSX_IMPL_CONVERT(vec_dword2, vec_double2, vec_ctsl) +// __builtin_convertvector unable to convert, xvcvspsxds is missing on it +FORCE_INLINE(vec_dword2) vec_ctslo(const vec_float4& a) +{ return vec_ctsl(vec_cvfo(a)); } + +FORCE_INLINE(vec_dword2) vec_ctsl(const vec_float4& a) +{ return vec_ctsl(vec_cvf(a)); } + +// converts single and double precision to unsigned doubleword +#ifdef vec_ctul +# undef vec_ctul +#endif +VSX_IMPL_CONVERT(vec_udword2, vec_double2, vec_ctul) +// __builtin_convertvector unable to convert, xvcvspuxds is missing on it +FORCE_INLINE(vec_udword2) vec_ctulo(const vec_float4& a) +{ return vec_ctul(vec_cvfo(a)); } + +FORCE_INLINE(vec_udword2) vec_ctul(const vec_float4& a) +{ return vec_ctul(vec_cvf(a)); } #endif // CLANG VSX compatibility +/* + * XLC VSX compatibility +**/ +#if defined(__IBMCPP__) + +// vector population count +#define vec_popcntu vec_popcnt + +// overload and redirect wih setting second arg to zero +// since we only support conversions without the second arg +#define VSX_IMPL_OVERLOAD_Z2(rt, rg, fnm) \ +FORCE_INLINE(rt) fnm(const rg& a) { return fnm(a, 0); } + +VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_int4, vec_ctd) +VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_uint4, vec_ctd) +VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_dword2, vec_ctd) +VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_udword2, vec_ctd) + +VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_int4, vec_ctf) +VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_uint4, vec_ctf) +VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_dword2, vec_ctf) +VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_udword2, vec_ctf) + +VSX_IMPL_OVERLOAD_Z2(vec_int4, vec_double2, vec_cts) +VSX_IMPL_OVERLOAD_Z2(vec_int4, vec_float4, vec_cts) + +VSX_IMPL_OVERLOAD_Z2(vec_uint4, vec_double2, vec_ctu) +VSX_IMPL_OVERLOAD_Z2(vec_uint4, vec_float4, vec_ctu) + +VSX_IMPL_OVERLOAD_Z2(vec_dword2, vec_double2, vec_ctsl) +VSX_IMPL_OVERLOAD_Z2(vec_dword2, vec_float4, vec_ctsl) + +VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_double2, vec_ctul) +VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_float4, vec_ctul) + +// fixme: implement conversions of odd-numbered elements in a dirty way +// since xlc doesn't support VSX registers operand in inline asm. 
+#define VSX_IMPL_DIRTY_ODD(rt, rg, fnm, fn2) \ +FORCE_INLINE(rt) fnm(const rg& a) { return fn2(vec_sldw(a, a, 3)); } + +VSX_IMPL_DIRTY_ODD(vec_double2, vec_float4, vec_cvfo, vec_cvf) +VSX_IMPL_DIRTY_ODD(vec_double2, vec_int4, vec_ctdo, vec_ctd) +VSX_IMPL_DIRTY_ODD(vec_double2, vec_uint4, vec_ctdo, vec_ctd) +VSX_IMPL_DIRTY_ODD(vec_dword2, vec_float4, vec_ctslo, vec_ctsl) +VSX_IMPL_DIRTY_ODD(vec_udword2, vec_float4, vec_ctulo, vec_ctul) + +#endif // XLC VSX compatibility + +// ignore GCC warning that casued by -Wunused-but-set-variable in rare cases +#if defined(__GNUG__) && !defined(__clang__) +# define VSX_UNUSED(Tvec) Tvec __attribute__((__unused__)) +#else // CLANG, XLC +# define VSX_UNUSED(Tvec) Tvec +#endif + +// gcc can find his way in casting log int and XLC, CLANG ambiguous +#if defined(__clang__) || defined(__IBMCPP__) + FORCE_INLINE(vec_udword2) vec_splats(uint64 v) + { return vec_splats((unsigned long long) v); } + + FORCE_INLINE(vec_dword2) vec_splats(int64 v) + { return vec_splats((long long) v); } +#endif + /* * implement vsx_ld(offset, pointer), vsx_st(vector, offset, pointer) * load and set using offset depend on the pointer type @@ -468,75 +626,6 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs) { vsx_stf(vec, VSX_OFFSET(o, p), (long long*)p); } #endif -#if defined(__clang__) || defined(__IBMCPP__) - // gcc can find his way in casting log int and XLC, CLANG ambiguous - FORCE_INLINE(vec_udword2) vec_splats(uint64 v) - { return vec_splats((unsigned long long) v); } - - FORCE_INLINE(vec_dword2) vec_splats(int64 v) - { return vec_splats((long long) v); } -#endif - -// Implement store vector bool char for XLC -#if defined(__IBMCPP__) && defined(__clang__) - FORCE_INLINE(void) vec_xst(const vec_bchar16 &vec, long o, uchar* p) - { vec_xst(vec_uchar16_c(vec), VSX_OFFSET(o, p), p); } -#endif - -// Working around vec_popcnt compatibility -/* - * vec_popcnt should return unsigned but clang has different thought just like gcc in vec_vpopcnt - * - * use vec_popcntu instead to deal with it -*/ -#if defined(__clang__) && !defined(__IBMCPP__) -# define VSX_IMPL_CLANG_POPCNTU(Tvec, Tvec2, ucast) \ - FORCE_INLINE(Tvec) vec_popcntu(const Tvec2& a) \ - { return ucast(vec_popcnt(a)); } - - VSX_IMPL_CLANG_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c); - VSX_IMPL_CLANG_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c); - VSX_IMPL_CLANG_POPCNTU(vec_uint4, vec_int4, vec_uint4_c); - // redirect unsigned types - VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt) - VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt) - VSX_REDIRECT_1RG(vec_uint4, vec_uint4, vec_popcntu, vec_popcnt) -#else -# define vec_popcntu vec_popcnt -#endif - -// Working around vec_cts compatibility -/* - * vec_cts in gcc and clang converts single-precision to signed fixed-point word - * and from double-precision to signed doubleword, also there's no implement for vec_ctsl - * - * vec_cts in xlc converts single and double precision to signed fixed-point word - * and xlc has vec_ctsl which converts single and double precision to signed doubleword - * - * so to deal with this situation, use vec_cts only if you want to convert single-precision to signed fixed-point word - * and use vec_ctsl when you want to convert double-precision to signed doubleword - * - * Also we implemented vec_ctsw(a) to convert double-precision to signed fixed-point word -*/ - -// converts double-precision to signed doubleword for GCC and CLANG -#if !defined(vec_ctsl) && !defined(__IBMCPP__) && 
(defined(__GNUG__) || defined(__clang__)) -// GCC4 has incorrect results in convert to signed doubleword -# if !defined(__clang__) && __GNUG__ < 5 -# define vec_ctsl(a, b) __vec_ctsl(a) - VSX_IMPL_1RG(vec_dword2, wi, vec_double2, wd, xvcvdpsxds, __vec_ctsl) -# else // GCC > 4 , CLANG -# define vec_ctsl vec_cts -# endif -#endif - -// converts double-precision to signed fixed-point word -#if defined(__IBMCPP__) -# define vec_ctsw(a) vec_cts(a, 0) -#else // GCC, CLANG -# define vec_ctsw(a) vec_int4_c(__builtin_vsx_xvcvdpsxws(a)) -#endif - // load 4 unsigned bytes into uint4 vector #define vec_ld_buw(p) vec_uint4_set((p)[0], (p)[1], (p)[2], (p)[3]) @@ -571,14 +660,14 @@ FORCE_INLINE(Tvec) vec_ldz_l8(const Tp *p) return vec_and(vec_ld_l8(p), (Tvec)mask); \ } VSX_IMPL_LOAD_L8(vec_uchar16, uchar) -VSX_IMPL_LOAD_L8(vec_char16, schar) +VSX_IMPL_LOAD_L8(vec_char16, schar) VSX_IMPL_LOAD_L8(vec_ushort8, ushort) -VSX_IMPL_LOAD_L8(vec_short8, short) -VSX_IMPL_LOAD_L8(vec_uint4, uint) -VSX_IMPL_LOAD_L8(vec_int4, int) -VSX_IMPL_LOAD_L8(vec_float4, float) +VSX_IMPL_LOAD_L8(vec_short8, short) +VSX_IMPL_LOAD_L8(vec_uint4, uint) +VSX_IMPL_LOAD_L8(vec_int4, int) +VSX_IMPL_LOAD_L8(vec_float4, float) VSX_IMPL_LOAD_L8(vec_udword2, uint64) -VSX_IMPL_LOAD_L8(vec_dword2, int64) +VSX_IMPL_LOAD_L8(vec_dword2, int64) VSX_IMPL_LOAD_L8(vec_double2, double) // logical not @@ -606,41 +695,45 @@ FORCE_INLINE(rt) vec_unpackhu(const rg& a) \ { return reinterpret_cast(vec_mergeh(a, zero)); } VSX_IMPL_UNPACKU(vec_ushort8, vec_uchar16, vec_uchar16_z) -VSX_IMPL_UNPACKU(vec_uint4, vec_ushort8, vec_ushort8_z) -VSX_IMPL_UNPACKU(vec_udword2, vec_uint4, vec_uint4_z) +VSX_IMPL_UNPACKU(vec_uint4, vec_ushort8, vec_ushort8_z) +VSX_IMPL_UNPACKU(vec_udword2, vec_uint4, vec_uint4_z) /* * Implement vec_mergesqe and vec_mergesqo * Merges the sequence values of even and odd elements of two vectors */ +#define VSX_IMPL_PERM(rt, fnm, ...) 
\ +FORCE_INLINE(rt) fnm(const rt& a, const rt& b) \ +{ static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); } + // 16 #define perm16_mergesqe 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 #define perm16_mergesqo 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 VSX_IMPL_PERM(vec_uchar16, vec_mergesqe, perm16_mergesqe) VSX_IMPL_PERM(vec_uchar16, vec_mergesqo, perm16_mergesqo) -VSX_IMPL_PERM(vec_char16, vec_mergesqe, perm16_mergesqe) -VSX_IMPL_PERM(vec_char16, vec_mergesqo, perm16_mergesqo) +VSX_IMPL_PERM(vec_char16, vec_mergesqe, perm16_mergesqe) +VSX_IMPL_PERM(vec_char16, vec_mergesqo, perm16_mergesqo) // 8 #define perm8_mergesqe 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 #define perm8_mergesqo 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 VSX_IMPL_PERM(vec_ushort8, vec_mergesqe, perm8_mergesqe) VSX_IMPL_PERM(vec_ushort8, vec_mergesqo, perm8_mergesqo) -VSX_IMPL_PERM(vec_short8, vec_mergesqe, perm8_mergesqe) -VSX_IMPL_PERM(vec_short8, vec_mergesqo, perm8_mergesqo) +VSX_IMPL_PERM(vec_short8, vec_mergesqe, perm8_mergesqe) +VSX_IMPL_PERM(vec_short8, vec_mergesqo, perm8_mergesqo) // 4 #define perm4_mergesqe 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 #define perm4_mergesqo 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 -VSX_IMPL_PERM(vec_uint4, vec_mergesqe, perm4_mergesqe) -VSX_IMPL_PERM(vec_uint4, vec_mergesqo, perm4_mergesqo) -VSX_IMPL_PERM(vec_int4, vec_mergesqe, perm4_mergesqe) -VSX_IMPL_PERM(vec_int4, vec_mergesqo, perm4_mergesqo) +VSX_IMPL_PERM(vec_uint4, vec_mergesqe, perm4_mergesqe) +VSX_IMPL_PERM(vec_uint4, vec_mergesqo, perm4_mergesqo) +VSX_IMPL_PERM(vec_int4, vec_mergesqe, perm4_mergesqe) +VSX_IMPL_PERM(vec_int4, vec_mergesqo, perm4_mergesqo) VSX_IMPL_PERM(vec_float4, vec_mergesqe, perm4_mergesqe) VSX_IMPL_PERM(vec_float4, vec_mergesqo, perm4_mergesqo) // 2 VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqe, vec_mergeh) VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqo, vec_mergel) -VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqe, vec_mergeh) -VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqo, vec_mergel) +VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqe, vec_mergeh) +VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqo, vec_mergel) VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqe, vec_mergeh) VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqo, vec_mergel) @@ -662,8 +755,8 @@ VSX_IMPL_MERGESQHL(vec_int4) VSX_IMPL_MERGESQHL(vec_float4) VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqh, vec_mergeh) VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesql, vec_mergel) -VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqh, vec_mergeh) -VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesql, vec_mergel) +VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqh, vec_mergeh) +VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesql, vec_mergel) VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqh, vec_mergeh) VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesql, vec_mergel) @@ -687,13 +780,13 @@ FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \ vsx_stf(vec_mergeh(ac, bd), 32, ptr); \ vsx_stf(vec_mergel(ac, bd), 48, ptr); \ } -VSX_IMPL_ST_INTERLEAVE(uchar, vec_uchar16) -VSX_IMPL_ST_INTERLEAVE(schar, vec_char16) +VSX_IMPL_ST_INTERLEAVE(uchar, vec_uchar16) +VSX_IMPL_ST_INTERLEAVE(schar, vec_char16) VSX_IMPL_ST_INTERLEAVE(ushort, vec_ushort8) -VSX_IMPL_ST_INTERLEAVE(short, vec_short8) -VSX_IMPL_ST_INTERLEAVE(uint, vec_uint4) 
-VSX_IMPL_ST_INTERLEAVE(int, vec_int4) -VSX_IMPL_ST_INTERLEAVE(float, vec_float4) +VSX_IMPL_ST_INTERLEAVE(short, vec_short8) +VSX_IMPL_ST_INTERLEAVE(uint, vec_uint4) +VSX_IMPL_ST_INTERLEAVE(int, vec_int4) +VSX_IMPL_ST_INTERLEAVE(float, vec_float4) // 2 and 4 channels deinterleave for 16 lanes #define VSX_IMPL_ST_DINTERLEAVE_8(Tp, Tvec) \ @@ -753,7 +846,7 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \ d = vec_mergesql(cd0, cd1); \ } VSX_IMPL_ST_DINTERLEAVE_16(ushort, vec_ushort8) -VSX_IMPL_ST_DINTERLEAVE_16(short, vec_short8) +VSX_IMPL_ST_DINTERLEAVE_16(short, vec_short8) // 2 and 4 channels deinterleave for 4 lanes #define VSX_IMPL_ST_DINTERLEAVE_32(Tp, Tvec) \ @@ -782,8 +875,8 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \ c = vec_mergeh(m0, m1); \ d = vec_mergel(m0, m1); \ } -VSX_IMPL_ST_DINTERLEAVE_32(uint, vec_uint4) -VSX_IMPL_ST_DINTERLEAVE_32(int, vec_int4) +VSX_IMPL_ST_DINTERLEAVE_32(uint, vec_uint4) +VSX_IMPL_ST_DINTERLEAVE_32(int, vec_int4) VSX_IMPL_ST_DINTERLEAVE_32(float, vec_float4) // 2 and 4 channels interleave and deinterleave for 2 lanes @@ -820,9 +913,9 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \ c = vec_mergeh(v0, v1); \ d = vec_mergel(v0, v1); \ } -VSX_IMPL_ST_D_INTERLEAVE_64(int64, vec_dword2, vsx_ld2, vsx_st2) +VSX_IMPL_ST_D_INTERLEAVE_64(int64, vec_dword2, vsx_ld2, vsx_st2) VSX_IMPL_ST_D_INTERLEAVE_64(uint64, vec_udword2, vsx_ld2, vsx_st2) -VSX_IMPL_ST_D_INTERLEAVE_64(double, vec_double2, vsx_ld, vsx_st) +VSX_IMPL_ST_D_INTERLEAVE_64(double, vec_double2, vsx_ld, vsx_st) /* 3 channels */ #define VSX_IMPL_ST_INTERLEAVE_3CH_16(Tp, Tvec) \ @@ -887,7 +980,7 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c) c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm); \ } VSX_IMPL_ST_INTERLEAVE_3CH_8(ushort, vec_ushort8) -VSX_IMPL_ST_INTERLEAVE_3CH_8(short, vec_short8) +VSX_IMPL_ST_INTERLEAVE_3CH_8(short, vec_short8) #define VSX_IMPL_ST_INTERLEAVE_3CH_4(Tp, Tvec) \ FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \ @@ -912,8 +1005,8 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c) b = vec_perm(v2, vec_sld(v1, v3, 8), flp2); \ c = vec_perm(vec_sld(v2, v1, 8), v3, flp); \ } -VSX_IMPL_ST_INTERLEAVE_3CH_4(uint, vec_uint4) -VSX_IMPL_ST_INTERLEAVE_3CH_4(int, vec_int4) +VSX_IMPL_ST_INTERLEAVE_3CH_4(uint, vec_uint4) +VSX_IMPL_ST_INTERLEAVE_3CH_4(int, vec_int4) VSX_IMPL_ST_INTERLEAVE_3CH_4(float, vec_float4) #define VSX_IMPL_ST_INTERLEAVE_3CH_2(Tp, Tvec, ld_func, st_func) \ @@ -934,9 +1027,9 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, \ b = vec_permi(v1, v3, 2); \ c = vec_permi(v2, v3, 1); \ } -VSX_IMPL_ST_INTERLEAVE_3CH_2(int64, vec_dword2, vsx_ld2, vsx_st2) +VSX_IMPL_ST_INTERLEAVE_3CH_2(int64, vec_dword2, vsx_ld2, vsx_st2) VSX_IMPL_ST_INTERLEAVE_3CH_2(uint64, vec_udword2, vsx_ld2, vsx_st2) -VSX_IMPL_ST_INTERLEAVE_3CH_2(double, vec_double2, vsx_ld, vsx_st) +VSX_IMPL_ST_INTERLEAVE_3CH_2(double, vec_double2, vsx_ld, vsx_st) #endif // CV_VSX diff --git a/modules/core/src/rand.cpp b/modules/core/src/rand.cpp index feb94690c5..cac5166ff6 100644 --- a/modules/core/src/rand.cpp +++ b/modules/core/src/rand.cpp @@ -74,6 +74,12 @@ namespace cv #define RNG_NEXT(x) ((uint64)(unsigned)(x)*CV_RNG_COEFF + ((x) >> 32)) +#ifdef __PPC64__ + #define PPC_MUL_ADD(ret, tmp, p0, p1) \ + asm volatile("fmuls %0,%1,%2\n\t fadds %0,%0,%3" : "=&f" (ret) \ + : "f" (tmp), "f" (p0), "f" (p1)) +#endif + 
/***************************************************************************************\ * Pseudo-Random Number Generators (PRNGs) * \***************************************************************************************/ @@ -248,6 +254,14 @@ static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, bool volatile float32x4_t v0 = vmulq_f32(vld1q_f32(f), p0); vst1q_f32(arr+i, vaddq_f32(v0, p1)); +#elif defined __PPC64__ + // inline asm is required for numerical stability! + // compilers tends to use floating multiply-add single(fmadds) + // instead of separate multiply and add + PPC_MUL_ADD(arr[i+0], f[0], p[i+0][0], p[i+0][1]); + PPC_MUL_ADD(arr[i+1], f[1], p[i+1][0], p[i+1][1]); + PPC_MUL_ADD(arr[i+2], f[2], p[i+2][0], p[i+2][1]); + PPC_MUL_ADD(arr[i+3], f[3], p[i+3][0], p[i+3][1]); #else arr[i+0] = f[0]*p[i+0][0] + p[i+0][1]; arr[i+1] = f[1]*p[i+1][0] + p[i+1][1]; @@ -269,6 +283,8 @@ static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, bool vdup_n_f32((float)(int)temp), vdup_n_f32(p[i][0])), vdup_n_f32(p[i][1])); arr[i] = vget_lane_f32(t, 0); +#elif defined __PPC64__ + PPC_MUL_ADD(arr[i], (float)(int)temp, p[i][0], p[i][1]); #else arr[i] = (int)temp*p[i][0] + p[i][1]; #endif From 2dc76d50092d912ce5a97f3c1ecaeb11168b5cda Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Sun, 22 Oct 2017 05:38:15 +0200 Subject: [PATCH 2/2] cmake: Added Power toolchain --- platforms/linux/ppc64-gnu.toolchain.cmake | 3 + platforms/linux/ppc64le-gnu.toolchain.cmake | 3 + platforms/linux/ppcat.toolchain.cmake | 129 ++++++++++++++++++++ 3 files changed, 135 insertions(+) create mode 100644 platforms/linux/ppc64-gnu.toolchain.cmake create mode 100644 platforms/linux/ppc64le-gnu.toolchain.cmake create mode 100644 platforms/linux/ppcat.toolchain.cmake diff --git a/platforms/linux/ppc64-gnu.toolchain.cmake b/platforms/linux/ppc64-gnu.toolchain.cmake new file mode 100644 index 0000000000..f6177c4f50 --- /dev/null +++ b/platforms/linux/ppc64-gnu.toolchain.cmake @@ -0,0 +1,3 @@ +set(CMAKE_SYSTEM_PROCESSOR ppc64) +set(GNU_MACHINE "powerpc64-linux-gnu" CACHE STRING "GNU compiler triple") +include("${CMAKE_CURRENT_LIST_DIR}/ppcat.toolchain.cmake") diff --git a/platforms/linux/ppc64le-gnu.toolchain.cmake b/platforms/linux/ppc64le-gnu.toolchain.cmake new file mode 100644 index 0000000000..2266c345c8 --- /dev/null +++ b/platforms/linux/ppc64le-gnu.toolchain.cmake @@ -0,0 +1,3 @@ +set(CMAKE_SYSTEM_PROCESSOR ppc64le) +set(GNU_MACHINE "powerpc64le-linux-gnu" CACHE STRING "GNU compiler triple") +include("${CMAKE_CURRENT_LIST_DIR}/ppcat.toolchain.cmake") diff --git a/platforms/linux/ppcat.toolchain.cmake b/platforms/linux/ppcat.toolchain.cmake new file mode 100644 index 0000000000..1703baa435 --- /dev/null +++ b/platforms/linux/ppcat.toolchain.cmake @@ -0,0 +1,129 @@ +if(COMMAND toolchain_save_config) + return() # prevent recursive call +endif() + +option(AT_PATH "Advance Toolchain directory" "") +option(AT_RPATH "Add new directories to runtime search path" "") +option(AT_HOST_LINK "Enable/disable Link against host advance toolchain runtime" OFF) +option(AT_NO_AUTOVEC "Disable/enable Auto Vectorizer optimization" OFF) + +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_VERSION 1) + +include("${CMAKE_CURRENT_LIST_DIR}/gnu.toolchain.cmake") + +if(NOT DEFINED CMAKE_C_COMPILER) + string(REGEX REPLACE "/+$" "" AT_PATH "${AT_PATH}") + + if(NOT AT_PATH) + message(FATAL_ERROR "'AT_PATH' option is required. 
Please set it to the Advance Toolchain path to make the toolchain work")
+  endif()
+
+  if(NOT EXISTS ${AT_PATH})
+    message(FATAL_ERROR "Advance Toolchain path '${AT_PATH}' does not exist")
+  endif()
+
+  set(CMAKE_C_COMPILER "${AT_PATH}/bin/${GNU_MACHINE}-gcc")
+
+  if(NOT EXISTS ${CMAKE_C_COMPILER})
+    message(FATAL_ERROR "GNU C compiler does not exist at path '${CMAKE_C_COMPILER}'. Please install Advance Toolchain with ${CMAKE_SYSTEM_PROCESSOR} support")
+  endif()
+endif()
+
+if(NOT DEFINED CMAKE_CXX_COMPILER)
+  set(CMAKE_CXX_COMPILER "${AT_PATH}/bin/${GNU_MACHINE}-g++")
+
+  if(NOT EXISTS ${CMAKE_CXX_COMPILER})
+    message(FATAL_ERROR "GNU C++ compiler does not exist. Invalid Advance Toolchain installation")
+  endif()
+endif()
+
+if(NOT DEFINED AT_GCCROOT_PATH)
+  set(AT_GCCROOT_PATH "${AT_PATH}/${GNU_MACHINE}")
+
+  if(NOT EXISTS ${AT_GCCROOT_PATH})
+    message(FATAL_ERROR "GCC root path '${AT_GCCROOT_PATH}' does not exist. Invalid Advance Toolchain installation")
+  endif()
+endif()
+
+if(NOT DEFINED AT_SYSROOT_PATH)
+  if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "ppc64")
+    set(AT_SYSROOT_PATH "${AT_PATH}/ppc")
+  else()
+    set(AT_SYSROOT_PATH "${AT_PATH}/${CMAKE_SYSTEM_PROCESSOR}")
+  endif()
+
+  if(NOT EXISTS ${AT_SYSROOT_PATH})
+    message(FATAL_ERROR "System root path '${AT_SYSROOT_PATH}' does not exist. Invalid Advance Toolchain installation")
+  endif()
+endif()
+
+if(NOT DEFINED CMAKE_EXE_LINKER_FLAGS)
+  set(CMAKE_CXX_FLAGS "" CACHE INTERNAL "")
+  set(CMAKE_C_FLAGS "" CACHE INTERNAL "")
+  set(CMAKE_EXE_LINKER_FLAGS "" CACHE INTERNAL "")
+  set(CMAKE_SHARED_LINKER_FLAGS "" CACHE INTERNAL "")
+  set(CMAKE_MODULE_LINKER_FLAGS "" CACHE INTERNAL "")
+
+  if(AT_RPATH)
+    string(REPLACE "," ";" RPATH_LIST ${AT_RPATH})
+  endif()
+
+  if(AT_HOST_LINK)
+    # get the 64-bit dynamic linker path
+    file(STRINGS "${AT_SYSROOT_PATH}/usr/bin/ldd" RTLDLIST LIMIT_COUNT 1 REGEX "^RTLDLIST=[\"*\"]")
+    string(REGEX REPLACE "RTLDLIST=|\"" "" RTLDLIST "${RTLDLIST}")
+    string(REPLACE " " ";" RTLDLIST "${RTLDLIST}")
+
+    # RTLDLIST must contain the 32-bit and 64-bit paths
+    list(LENGTH RTLDLIST RTLDLIST_LEN)
+    if(NOT RTLDLIST_LEN GREATER 1)
+      message(FATAL_ERROR "Could not fetch the dynamic linker path. Invalid Advance Toolchain installation")
+    endif()
+
+    list(GET RTLDLIST 1 LINKER_PATH)
+    set(CMAKE_EXE_LINKER_FLAGS "-Wl,--dynamic-linker=${AT_SYSROOT_PATH}${LINKER_PATH}")
+
+    list(APPEND RPATH_LIST "${AT_GCCROOT_PATH}/lib64/")
+    list(APPEND RPATH_LIST "${AT_SYSROOT_PATH}/lib64/")
+    list(APPEND RPATH_LIST "${AT_SYSROOT_PATH}/usr/lib64/")
+    list(APPEND RPATH_LIST "${PROJECT_BINARY_DIR}/lib/")
+  endif()
+
+  list(LENGTH RPATH_LIST RPATH_LEN)
+  if(RPATH_LEN GREATER 0)
+    set(AT_LINKER_FLAGS "${AT_LINKER_FLAGS} -Wl")
+    foreach(RPATH ${RPATH_LIST})
+      set(AT_LINKER_FLAGS "${AT_LINKER_FLAGS},-rpath,${RPATH}")
+    endforeach()
+  endif()
+
+  set(CMAKE_SHARED_LINKER_FLAGS "${AT_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}")
+  set(CMAKE_MODULE_LINKER_FLAGS "${AT_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FLAGS}")
+  set(CMAKE_EXE_LINKER_FLAGS "${AT_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}")
+
+  if(AT_NO_AUTOVEC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-tree-vectorize -fno-tree-slp-vectorize")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-tree-vectorize -fno-tree-slp-vectorize")
+  endif()
+
+endif()
+
+set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${AT_SYSROOT_PATH} ${AT_GCCROOT_PATH})
+set(CMAKE_SYSROOT ${AT_SYSROOT_PATH})
+
+# what about ld.gold?
+if(NOT DEFINED CMAKE_LINKER) + find_program(CMAKE_LINKER NAMES ld) +endif() + +if(NOT DEFINED CMAKE_AR) + find_program(CMAKE_AR NAMES ar) +endif() + +set(TOOLCHAIN_CONFIG_VARS ${TOOLCHAIN_CONFIG_VARS} + CMAKE_SYSROOT + AT_SYSROOT_PATH + AT_GCCROOT_PATH +) +toolchain_save_config()
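
Illustrative usage sketch (not part of the patches above): a minimal example of how the universal intrinsics touched by PATCH 1/2 — v_matmuladd and the immediate v_rotate_left/v_rotate_right forms — are meant to be called from portable code through opencv2/core/hal/intrin.hpp. It assumes the generic universal-intrinsics API (cv::v_float32x4, v_load, v_store, the CV_SIMD128 guard); the function name and buffers are hypothetical.

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// Hypothetical helper: applies a 3-column matrix transform plus a bias vector
// to one float32x4 register, then mixes lanes with the immediate rotates.
static void vsx_demo(const float src[4], const float col0[4], const float col1[4],
                     const float col2[4], const float bias[4], float dst[4])
{
    cv::v_float32x4 v  = cv::v_load(src);
    cv::v_float32x4 m0 = cv::v_load(col0);
    cv::v_float32x4 m1 = cv::v_load(col1);
    cv::v_float32x4 m2 = cv::v_load(col2);
    cv::v_float32x4 a  = cv::v_load(bias);

    // v_matmuladd: splats the first three lanes of v, multiplies them by the
    // matrix columns m0..m2 and adds the bias vector a.
    cv::v_float32x4 t = cv::v_matmuladd(v, m0, m1, m2, a);

    // The one-argument rotate shifts lanes by the immediate and fills with
    // zeros; the two-argument form fills the vacated lanes from the second
    // operand.
    cv::v_float32x4 r = cv::v_rotate_right<1>(t);
    r = r + cv::v_rotate_left<1>(t, v);

    cv::v_store(dst, r);
}
#endif // CV_SIMD128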