core: Several improvements to Power/VSX
- changed the behavior of vec_ctf, vec_ctu, and vec_cts in GCC and Clang to make them compatible with XLC
- implemented most of the missing conversion intrinsics in GCC and Clang
- implemented conversion intrinsics for odd-numbered elements
- silenced a GCC warning caused by -Wunused-but-set-variable in rare cases
- replaced the logical right shift with an algebraic (arithmetic) right shift for signed vectors so the sign bit is shifted in
- added new universal intrinsics: v_matmuladd, v_rotate_left/right
- avoided floating multiply-add in the RNG
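As a quick aside (illustration only, not part of the commit): a logical right shift fills the vacated bits with zeros, while an algebraic/arithmetic shift replicates the sign bit, which is the behavior signed vector lanes need. A minimal scalar C++ sketch of the difference:

#include <cstdint>
#include <cstdio>

int main()
{
    int32_t x = -16;
    // logical shift on the unsigned bit pattern: zeros are shifted in
    uint32_t logical = static_cast<uint32_t>(x) >> 2;   // 0x3FFFFFFC
    // arithmetic shift: the sign bit is replicated (what vec_sra does per lane),
    // on typical two's-complement targets
    int32_t arithmetic = x >> 2;                        // -4
    std::printf("logical: 0x%08X  arithmetic: %d\n", (unsigned)logical, (int)arithmetic);
    return 0;
}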
This commit is contained in:
parent 5ed354221c
commit def444d99f
@@ -521,24 +521,25 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)

/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpuvec, splfunc) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ return _Tpuvec(vec_sl(a.val, splfunc(imm))); } \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ return _Tpuvec(vec_sr(a.val, splfunc(imm))); } \
template<int imm> inline _Tpuvec v_shl(const _Tpuvec& a) \
{ return _Tpuvec(vec_sl(a.val, splfunc(imm))); } \
template<int imm> inline _Tpuvec v_shr(const _Tpuvec& a) \
{ return _Tpuvec(vec_sr(a.val, splfunc(imm))); }
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
inline _Tpvec operator << (const _Tpvec& a, int imm) \
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
inline _Tpvec operator >> (const _Tpvec& a, int imm) \
{ return _Tpvec(shr(a.val, splfunc(imm))); } \
template<int imm> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
template<int imm> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(shr(a.val, splfunc(imm))); }

OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_udword2_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_udword2_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_sr, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_sr, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_sr, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_sr, vec_udword2_sp)
// algebraic right shift
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16, vec_sra, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8, vec_sra, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4, vec_sra, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp)

/** Bitwise logic **/
#define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \
@@ -603,6 +604,64 @@ OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)

/** Rotate **/
#define OPENCV_IMPL_VSX_ROTATE(_Tpvec, suffix, shf, cast) \
template<int imm> \
inline _Tpvec v_rotate_##suffix(const _Tpvec& a) \
{ \
const int wd = imm * sizeof(typename _Tpvec::lane_type); \
if (wd > 15) \
return _Tpvec(); \
return _Tpvec((cast)shf(vec_uchar16_c(a.val), vec_uchar16_sp(wd << 3))); \
}

#define OPENCV_IMPL_VSX_ROTATE_LR(_Tpvec, cast) \
OPENCV_IMPL_VSX_ROTATE(_Tpvec, left, vec_slo, cast) \
OPENCV_IMPL_VSX_ROTATE(_Tpvec, right, vec_sro, cast)

OPENCV_IMPL_VSX_ROTATE_LR(v_uint8x16, vec_uchar16)
OPENCV_IMPL_VSX_ROTATE_LR(v_int8x16, vec_char16)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint16x8, vec_ushort8)
OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8, vec_short8)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4)
OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4, vec_int4)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2)
OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2, vec_dword2)

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
{
const int wd = imm * sizeof(typename _Tpvec::lane_type);
if (wd == 0)
return a;
return _Tpvec(vec_sld(b.val, a.val, 16 - wd));
}

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
{
const int wd = imm * sizeof(typename _Tpvec::lane_type);
if (wd == 16)
return b;
return _Tpvec(vec_sld(a.val, b.val, wd));
}

#define OPENCV_IMPL_VSX_ROTATE_64(_Tpvec, suffix, rg1, rg2) \
template<int imm> \
inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
{ \
if (imm == 1) \
return _Tpvec(vec_permi(rg1.val, rg2.val, 2)); \
return imm ? b : a; \
}

OPENCV_IMPL_VSX_ROTATE_64(v_int64x2, right, a, b)
OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, right, a, b)

OPENCV_IMPL_VSX_ROTATE_64(v_int64x2, left, b, a)
OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, left, b, a)

////////// Reduce and mask /////////

/** Reduce **/
@@ -724,7 +783,7 @@ inline int v_signmask(const v_float32x4& a)

inline int v_signmask(const v_int64x2& a)
{
const vec_dword2 sv = vec_sr(a.val, vec_udword2_sp(63));
VSX_UNUSED(const vec_dword2) sv = vec_sr(a.val, vec_udword2_sp(63));
return (int)vec_extract(sv, 0) | (int)vec_extract(sv, 1) << 1;
}
inline int v_signmask(const v_uint64x2& a)
@@ -810,66 +869,47 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int64x2, v_uint64x2, vec_udword2_c, v_absdiff, v

/** Rounding **/
inline v_int32x4 v_round(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_round(a.val), 0)); }
{ return v_int32x4(vec_cts(vec_round(a.val))); }

inline v_int32x4 v_round(const v_float64x2& a)
{
static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_round(a.val)), perm));
}
{ return v_int32x4(vec_mergesqo(vec_cts(vec_round(a.val)), vec_int4_z)); }

inline v_int32x4 v_floor(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_floor(a.val), 0)); }
{ return v_int32x4(vec_cts(vec_floor(a.val))); }

inline v_int32x4 v_floor(const v_float64x2& a)
{
static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_floor(a.val)), perm));
}
{ return v_int32x4(vec_mergesqo(vec_cts(vec_floor(a.val)), vec_int4_z)); }

inline v_int32x4 v_ceil(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_ceil(a.val), 0)); }
{ return v_int32x4(vec_cts(vec_ceil(a.val))); }

inline v_int32x4 v_ceil(const v_float64x2& a)
{
static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_ceil(a.val)), perm));
}
{ return v_int32x4(vec_mergesqo(vec_cts(vec_ceil(a.val)), vec_int4_z)); }

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(vec_cts(a.val, 0)); }
{ return v_int32x4(vec_cts(a.val)); }

inline v_int32x4 v_trunc(const v_float64x2& a)
{
static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(a.val), perm));
}
{ return v_int32x4(vec_mergesqo(vec_cts(a.val), vec_int4_z)); }

/** To float **/
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{ return v_float32x4(vec_ctf(a.val, 0)); }
{ return v_float32x4(vec_ctf(a.val)); }

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
return v_float32x4(vec_perm(vec_float4_z, vec_cvf(a.val), perm));
}
{ return v_float32x4(vec_mergesqo(vec_cvf(a.val), vec_float4_z)); }

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
return v_float64x2(vec_ctd(vec_mergeh(a.val, a.val), 0));
}
{ return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); }

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
return v_float64x2(vec_ctd(vec_mergel(a.val, a.val), 0));
}
{ return v_float64x2(vec_ctdo(vec_mergel(a.val, a.val))); }

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
return v_float64x2(vec_cvf(vec_mergeh(a.val, a.val)));
}
{ return v_float64x2(vec_cvfo(vec_mergeh(a.val, a.val))); }

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
return v_float64x2(vec_cvf(vec_mergel(a.val, a.val)));
}
{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }

/** Reinterpret **/
/** its up there with load and store operations **/
@@ -886,10 +926,20 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
const vec_float4 v0 = vec_splat(v.val, 0);
const vec_float4 v1 = vec_splat(v.val, 1);
const vec_float4 v2 = vec_splat(v.val, 2);
const vec_float4 v3 = vec_splat(v.val, 3);
VSX_UNUSED(const vec_float4) v3 = vec_splat(v.val, 3);
return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, vec_mul(v3, m3.val)))));
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
const v_float32x4& m1, const v_float32x4& m2,
const v_float32x4& a)
{
const vec_float4 v0 = vec_splat(v.val, 0);
const vec_float4 v1 = vec_splat(v.val, 1);
const vec_float4 v2 = vec_splat(v.val, 2);
return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, a.val))));
}

#define OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(_Tpvec, _Tpvec2) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
const _Tpvec& a2, const _Tpvec& a3, \
@@ -51,18 +51,6 @@
//! @{
#if CV_VSX

#define FORCE_INLINE(tp) extern inline tp __attribute__((always_inline))

#define VSX_REDIRECT_1RG(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a) { return fn2(a); }

#define VSX_REDIRECT_2RG(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); }

#define VSX_IMPL_PERM(rt, fnm, ...) \
FORCE_INLINE(rt) fnm(const rt& a, const rt& b) \
{ static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); }

#define __VSX_S16__(c, v) (c){v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}
#define __VSX_S8__(c, v) (c){v, v, v, v, v, v, v, v}
#define __VSX_S4__(c, v) (c){v, v, v, v}
@@ -172,10 +160,19 @@ typedef __vector double vec_double2;
#define vec_bdword2_f (__VSX_S2__(vec_bdword2, 0))
#define vec_bdword2_t (__VSX_S2__(vec_bdword2, 1))

#define FORCE_INLINE(tp) extern inline tp __attribute__((always_inline))

#define VSX_REDIRECT_1RG(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a) { return fn2(a); }

#define VSX_REDIRECT_2RG(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); }

/*
* GCC VSX compatibility
**/
#if defined(__GNUG__) && !defined(__IBMCPP__) && !defined(__clang__)
#if defined(__GNUG__) && !defined(__clang__)

// inline asm helper
#define VSX_IMPL_1RG(rt, rto, rg, rgo, opc, fnm) \
@@ -193,7 +190,7 @@ FORCE_INLINE(rt) fnm(const rg& a, const rg& b) \
#define VSX_IMPL_2VRG(rt, rg, opc, fnm) VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%1,%2", fnm)

#if __GNUG__ < 7
/* up to GCC 6 vec_mul only supports precisions and llong */
// up to GCC 6 vec_mul only supports precisions and llong
# ifdef vec_mul
# undef vec_mul
# endif
@@ -211,10 +208,10 @@ FORCE_INLINE(rt) fnm(const rg& a, const rg& b) \
}
VSX_IMPL_MULH(vec_short8, vec_short8_c)
VSX_IMPL_MULH(vec_ushort8, vec_ushort8_c)
/* vmuluwm can be used for unsigned or signed integers, that's what they said */
// vmuluwm can be used for unsigned or signed integers, that's what they said
VSX_IMPL_2VRG(vec_int4, vec_int4, vmuluwm, vec_mul)
VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul)
/* redirect to GCC builtin vec_mul, since it already supports precisions and llong */
// redirect to GCC builtin vec_mul, since it already supports precisions and llong
VSX_REDIRECT_2RG(vec_float4, vec_float4, vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mul, __builtin_vec_mul)
@@ -245,7 +242,8 @@ FORCE_INLINE(rt) fnm(const rg& a, const rg& b) \
VSX_IMPL_CMPGE(vec_bint4, vec_uint4, vcmpgtuw, vec_cmpge)
VSX_IMPL_CMPGE(vec_bdword2, vec_dword2, vcmpgtsd, vec_cmpge)
VSX_IMPL_CMPGE(vec_bdword2, vec_udword2, vcmpgtud, vec_cmpge)
/* redirect to GCC builtin cmpge, since it already supports precisions */

// redirect to GCC builtin cmpge, since it already supports precisions
VSX_REDIRECT_2RG(vec_bint4, vec_float4, vec_cmpge, __builtin_vec_cmpge)
VSX_REDIRECT_2RG(vec_bdword2, vec_double2, vec_cmpge, __builtin_vec_cmpge)

@@ -258,18 +256,6 @@ FORCE_INLINE(vec_bdword2) vec_nor(const vec_bdword2& a, const vec_bdword2& b)
{ return vec_bdword2_c(__builtin_vec_nor(vec_dword2_c(a), vec_dword2_c(b))); }
#endif // __GNUG__ < 6

// vector population count
#ifndef vec_popcnt
VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vec_popcnt)
VSX_IMPL_1VRG(vec_uchar16, vec_char16, vpopcntb, vec_popcnt)
VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcnt)
VSX_IMPL_1VRG(vec_ushort8, vec_short8, vpopcnth, vec_popcnt)
VSX_IMPL_1VRG(vec_uint4, vec_uint4, vpopcntw, vec_popcnt)
VSX_IMPL_1VRG(vec_uint4, vec_int4, vpopcntw, vec_popcnt)
VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcnt)
VSX_IMPL_1VRG(vec_udword2, vec_dword2, vpopcntd, vec_popcnt)
#endif // vec_popcnt

#if __GNUG__ < 5
// vec_xxpermdi in gcc4 missing little-endian supports just like clang
# define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ ((c & 1) << 1 | c >> 1)))
@@ -279,33 +265,90 @@ VSX_REDIRECT_2RG(vec_char16, vec_short8, vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_uchar16, vec_ushort8, vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_short8, vec_int4, vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_ushort8, vec_uint4, vec_packs, __builtin_vec_packs)

VSX_IMPL_2VRG_F(vec_int4, vec_dword2, "vpksdss %0,%2,%1", vec_packs)
VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
#else
# define vec_permi vec_xxpermdi
#endif

// converts between single and double-precision
#ifndef vec_cvf
VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvf, __builtin_vsx_xvcvdpsp)
FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a)
{ return __builtin_vsx_xvcvspdp(vec_sld(a, a, 4)); }
#endif

// converts 32 and 64 bit integers to double-precision
#ifndef vec_ctd
# define vec_ctd(a, b) __vec_ctd(a)
VSX_IMPL_1RG(vec_double2, wd, vec_int4, wa, xvcvsxwdp, __vec_ctd)
VSX_IMPL_1RG(vec_double2, wd, vec_uint4, wa, xvcvuxwdp, __vec_ctd)
VSX_IMPL_1RG(vec_double2, wd, vec_dword2, wi, xvcvsxddp, __vec_ctd)
VSX_IMPL_1RG(vec_double2, wd, vec_udword2, wi, xvcvuxddp, __vec_ctd)
#endif
#endif // __GNUG__ < 5

// shift left double by word immediate
#ifndef vec_sldw
# define vec_sldw __builtin_vsx_xxsldwi
#endif

// vector population count
VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vec_popcntu)
VSX_IMPL_1VRG(vec_uchar16, vec_char16, vpopcntb, vec_popcntu)
VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcntu)
VSX_IMPL_1VRG(vec_ushort8, vec_short8, vpopcnth, vec_popcntu)
VSX_IMPL_1VRG(vec_uint4, vec_uint4, vpopcntw, vec_popcntu)
VSX_IMPL_1VRG(vec_uint4, vec_int4, vpopcntw, vec_popcntu)
VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcntu)
VSX_IMPL_1VRG(vec_udword2, vec_dword2, vpopcntd, vec_popcntu)

// converts between single and double-precision
#ifdef vec_cvf
# undef vec_cvf
#endif
VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvf, __builtin_vsx_xvcvdpsp)
VSX_REDIRECT_1RG(vec_double2, vec_float4, vec_cvfo, __builtin_vsx_xvcvspdp)

FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a)
{ return vec_cvfo(vec_sldw(a, a, 1)); }

// converts word and doubleword to double-precision
#ifdef vec_ctd
# undef vec_ctd
#endif
VSX_IMPL_1RG(vec_double2, wd, vec_int4, wa, xvcvsxwdp, vec_ctdo)
VSX_IMPL_1RG(vec_double2, wd, vec_uint4, wa, xvcvuxwdp, vec_ctdo)
VSX_IMPL_1RG(vec_double2, wd, vec_dword2, wi, xvcvsxddp, vec_ctd)
VSX_IMPL_1RG(vec_double2, wd, vec_udword2, wi, xvcvuxddp, vec_ctd)

FORCE_INLINE(vec_double2) vec_ctd(const vec_int4& a)
{ return vec_ctdo(vec_sldw(a, a, 1)); }

FORCE_INLINE(vec_double2) vec_ctd(const vec_uint4& a)
{ return vec_ctdo(vec_sldw(a, a, 1)); }

// converts word and doubleword to single-precision
#undef vec_ctf
VSX_IMPL_1RG(vec_float4, wf, vec_int4, wa, xvcvsxwsp, vec_ctf)
VSX_IMPL_1RG(vec_float4, wf, vec_uint4, wa, xvcvuxwsp, vec_ctf)
VSX_IMPL_1RG(vec_float4, wf, vec_dword2, wi, xvcvsxdsp, vec_ctf)
VSX_IMPL_1RG(vec_float4, wf, vec_udword2, wi, xvcvuxdsp, vec_ctf)

// converts single and double precision to signed word
#undef vec_cts
VSX_IMPL_1RG(vec_int4, wa, vec_double2, wd, xvcvdpsxws, vec_cts)
VSX_IMPL_1RG(vec_int4, wa, vec_float4, wf, xvcvspsxws, vec_cts)

// converts single and double precision to unsigned word
#undef vec_ctu
VSX_IMPL_1RG(vec_uint4, wa, vec_double2, wd, xvcvdpuxws, vec_ctu)
VSX_IMPL_1RG(vec_uint4, wa, vec_float4, wf, xvcvspuxws, vec_ctu)

// converts single and double precision to signed doubleword
#ifdef vec_ctsl
# undef vec_ctsl
#endif
VSX_IMPL_1RG(vec_dword2, wi, vec_double2, wd, xvcvdpsxds, vec_ctsl)
VSX_IMPL_1RG(vec_dword2, wi, vec_float4, wf, xvcvspsxds, vec_ctslo)

FORCE_INLINE(vec_dword2) vec_ctsl(const vec_float4& a)
{ return vec_ctslo(vec_sldw(a, a, 1)); }

// converts single and double precision to unsigned doubleword
#ifdef vec_ctul
# undef vec_ctul
#endif
VSX_IMPL_1RG(vec_udword2, wi, vec_double2, wd, xvcvdpuxds, vec_ctul)
VSX_IMPL_1RG(vec_udword2, wi, vec_float4, wf, xvcvspuxds, vec_ctulo)

FORCE_INLINE(vec_udword2) vec_ctul(const vec_float4& a)
{ return vec_ctulo(vec_sldw(a, a, 1)); }

// just in case if GCC doesn't define it
#ifndef vec_xl
# define vec_xl vec_vsx_ld
@@ -327,8 +370,13 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
* Also there's already an open bug https://bugs.llvm.org/show_bug.cgi?id=31837
*
* So we're not able to use inline asm and only use built-in functions that CLANG supports
* and use __builtin_convertvector if clang missng any of vector conversions built-in functions
*/

// convert vector helper
#define VSX_IMPL_CONVERT(rt, rg, fnm) \
FORCE_INLINE(rt) fnm(const rg& a) { return __builtin_convertvector(a, rt); }

#if __clang_major__ < 5
// implement vec_permi in a dirty way
# define VSX_IMPL_CLANG_4_PERMI(Tvec) \
@@ -362,26 +410,6 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
# define vec_sldw vec_xxsldwi
#endif

/* converts between single and double precision */
#ifndef vec_cvf
VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvf, __builtin_vsx_xvcvdpsp)
FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a)
{ return __builtin_vsx_xvcvspdp(vec_sld(a, a, 4)); }
#endif

/* converts 32 and 64 bit integers to double-precision */
#ifndef vec_ctd
# define vec_ctd(a, b) __vec_ctd(a)
VSX_REDIRECT_1RG(vec_double2, vec_int4, __vec_ctd, __builtin_vsx_xvcvsxwdp)
VSX_REDIRECT_1RG(vec_double2, vec_uint4, __vec_ctd, __builtin_vsx_xvcvuxwdp)
// implement vec_ctd for double word in a dirty way since we are missing builtin xvcvsxddp, xvcvuxddp
// please try to avoid using it for double words
FORCE_INLINE(vec_double2) __vec_ctd(const vec_dword2& a)
{ return vec_double2_set((double)vec_extract(a, 0), (double)vec_extract(a, 1)); }
FORCE_INLINE(vec_double2) __vec_ctd(const vec_udword2& a)
{ return vec_double2_set((double)vec_extract(a, 0), (double)vec_extract(a, 1)); }
#endif

// Implement vec_rsqrt since clang only supports vec_rsqrte
#ifndef vec_rsqrt
FORCE_INLINE(vec_float4) vec_rsqrt(const vec_float4& a)
@@ -391,27 +419,157 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
{ return vec_div(vec_double2_sp(1), vec_sqrt(a)); }
#endif

// vec_popcnt should return unsigned but clang has different thought just like gcc in vec_vpopcnt
#define VSX_IMPL_POPCNTU(Tvec, Tvec2, ucast) \
FORCE_INLINE(Tvec) vec_popcntu(const Tvec2& a) \
{ return ucast(vec_popcnt(a)); }
VSX_IMPL_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c);
VSX_IMPL_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c);
VSX_IMPL_POPCNTU(vec_uint4, vec_int4, vec_uint4_c);
// redirect unsigned types
VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_uint4, vec_uint4, vec_popcntu, vec_popcnt)

/*
* __builtin_altivec_vctsxs in clang 5 and 6 causes ambiguous which used by vec_cts
* so we just redefine it and cast it
*/
// converts between single and double precision
#ifdef vec_cvf
# undef vec_cvf
#endif
VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvf, __builtin_vsx_xvcvdpsp)
VSX_REDIRECT_1RG(vec_double2, vec_float4, vec_cvfo, __builtin_vsx_xvcvspdp)

FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a)
{ return vec_cvfo(vec_sldw(a, a, 1)); }

// converts word and doubleword to double-precision
#ifdef vec_ctd
# undef vec_ctd
#endif
VSX_REDIRECT_1RG(vec_double2, vec_int4, vec_ctdo, __builtin_vsx_xvcvsxwdp)
VSX_REDIRECT_1RG(vec_double2, vec_uint4, vec_ctdo, __builtin_vsx_xvcvuxwdp)

VSX_IMPL_CONVERT(vec_double2, vec_dword2, vec_ctd)
VSX_IMPL_CONVERT(vec_double2, vec_udword2, vec_ctd)

FORCE_INLINE(vec_double2) vec_ctd(const vec_int4& a)
{ return vec_ctdo(vec_sldw(a, a, 1)); }

FORCE_INLINE(vec_double2) vec_ctd(const vec_uint4& a)
{ return vec_ctdo(vec_sldw(a, a, 1)); }

// converts word and doubleword to single-precision
#if __clang_major__ > 4
# undef vec_ctf
#endif
VSX_IMPL_CONVERT(vec_float4, vec_int4, vec_ctf)
VSX_IMPL_CONVERT(vec_float4, vec_uint4, vec_ctf)
VSX_REDIRECT_1RG(vec_float4, vec_dword2, vec_ctf, __builtin_vsx_xvcvsxdsp)
VSX_REDIRECT_1RG(vec_float4, vec_udword2, vec_ctf, __builtin_vsx_xvcvuxdsp)

// converts single and double precision to signed word
#if __clang_major__ > 4
# undef vec_cts
# define vec_cts(__a, __b) \
_Generic((__a), vector float \
: (vector signed int)__builtin_altivec_vctsxs((__a), (__b)), vector double \
: __extension__({ \
vector double __ret = \
(__a) * \
(vector double)(vector unsigned long long)((0x3ffULL + (__b)) \
<< 52); \
__builtin_convertvector(__ret, vector signed long long); \
}))
#endif // __clang_major__ > 4
#endif
VSX_REDIRECT_1RG(vec_int4, vec_double2, vec_cts, __builtin_vsx_xvcvdpsxws)
VSX_IMPL_CONVERT(vec_int4, vec_float4, vec_cts)

// converts single and double precision to unsigned word
#if __clang_major__ > 4
# undef vec_ctu
#endif
VSX_REDIRECT_1RG(vec_uint4, vec_double2, vec_ctu, __builtin_vsx_xvcvdpuxws)
VSX_IMPL_CONVERT(vec_uint4, vec_float4, vec_ctu)

// converts single and double precision to signed doubleword
#ifdef vec_ctsl
# undef vec_ctsl
#endif
VSX_IMPL_CONVERT(vec_dword2, vec_double2, vec_ctsl)
// __builtin_convertvector unable to convert, xvcvspsxds is missing on it
FORCE_INLINE(vec_dword2) vec_ctslo(const vec_float4& a)
{ return vec_ctsl(vec_cvfo(a)); }

FORCE_INLINE(vec_dword2) vec_ctsl(const vec_float4& a)
{ return vec_ctsl(vec_cvf(a)); }

// converts single and double precision to unsigned doubleword
#ifdef vec_ctul
# undef vec_ctul
#endif
VSX_IMPL_CONVERT(vec_udword2, vec_double2, vec_ctul)
// __builtin_convertvector unable to convert, xvcvspuxds is missing on it
FORCE_INLINE(vec_udword2) vec_ctulo(const vec_float4& a)
{ return vec_ctul(vec_cvfo(a)); }

FORCE_INLINE(vec_udword2) vec_ctul(const vec_float4& a)
{ return vec_ctul(vec_cvf(a)); }

#endif // CLANG VSX compatibility

/*
* XLC VSX compatibility
**/
#if defined(__IBMCPP__)

// vector population count
#define vec_popcntu vec_popcnt

// overload and redirect wih setting second arg to zero
// since we only support conversions without the second arg
#define VSX_IMPL_OVERLOAD_Z2(rt, rg, fnm) \
FORCE_INLINE(rt) fnm(const rg& a) { return fnm(a, 0); }

VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_int4, vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_uint4, vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_dword2, vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_udword2, vec_ctd)

VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_int4, vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_uint4, vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_dword2, vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_udword2, vec_ctf)

VSX_IMPL_OVERLOAD_Z2(vec_int4, vec_double2, vec_cts)
VSX_IMPL_OVERLOAD_Z2(vec_int4, vec_float4, vec_cts)

VSX_IMPL_OVERLOAD_Z2(vec_uint4, vec_double2, vec_ctu)
VSX_IMPL_OVERLOAD_Z2(vec_uint4, vec_float4, vec_ctu)

VSX_IMPL_OVERLOAD_Z2(vec_dword2, vec_double2, vec_ctsl)
VSX_IMPL_OVERLOAD_Z2(vec_dword2, vec_float4, vec_ctsl)

VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_double2, vec_ctul)
VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_float4, vec_ctul)

// fixme: implement conversions of odd-numbered elements in a dirty way
// since xlc doesn't support VSX registers operand in inline asm.
#define VSX_IMPL_DIRTY_ODD(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a) { return fn2(vec_sldw(a, a, 3)); }

VSX_IMPL_DIRTY_ODD(vec_double2, vec_float4, vec_cvfo, vec_cvf)
VSX_IMPL_DIRTY_ODD(vec_double2, vec_int4, vec_ctdo, vec_ctd)
VSX_IMPL_DIRTY_ODD(vec_double2, vec_uint4, vec_ctdo, vec_ctd)
VSX_IMPL_DIRTY_ODD(vec_dword2, vec_float4, vec_ctslo, vec_ctsl)
VSX_IMPL_DIRTY_ODD(vec_udword2, vec_float4, vec_ctulo, vec_ctul)

#endif // XLC VSX compatibility

// ignore GCC warning that casued by -Wunused-but-set-variable in rare cases
#if defined(__GNUG__) && !defined(__clang__)
# define VSX_UNUSED(Tvec) Tvec __attribute__((__unused__))
#else // CLANG, XLC
# define VSX_UNUSED(Tvec) Tvec
#endif

// gcc can find his way in casting log int and XLC, CLANG ambiguous
#if defined(__clang__) || defined(__IBMCPP__)
FORCE_INLINE(vec_udword2) vec_splats(uint64 v)
{ return vec_splats((unsigned long long) v); }

FORCE_INLINE(vec_dword2) vec_splats(int64 v)
{ return vec_splats((long long) v); }
#endif

/*
* implement vsx_ld(offset, pointer), vsx_st(vector, offset, pointer)
* load and set using offset depend on the pointer type
@@ -468,75 +626,6 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
{ vsx_stf(vec, VSX_OFFSET(o, p), (long long*)p); }
#endif

#if defined(__clang__) || defined(__IBMCPP__)
// gcc can find his way in casting log int and XLC, CLANG ambiguous
FORCE_INLINE(vec_udword2) vec_splats(uint64 v)
{ return vec_splats((unsigned long long) v); }

FORCE_INLINE(vec_dword2) vec_splats(int64 v)
{ return vec_splats((long long) v); }
#endif

// Implement store vector bool char for XLC
#if defined(__IBMCPP__) && defined(__clang__)
FORCE_INLINE(void) vec_xst(const vec_bchar16 &vec, long o, uchar* p)
{ vec_xst(vec_uchar16_c(vec), VSX_OFFSET(o, p), p); }
#endif

// Working around vec_popcnt compatibility
/*
* vec_popcnt should return unsigned but clang has different thought just like gcc in vec_vpopcnt
*
* use vec_popcntu instead to deal with it
*/
#if defined(__clang__) && !defined(__IBMCPP__)
# define VSX_IMPL_CLANG_POPCNTU(Tvec, Tvec2, ucast) \
FORCE_INLINE(Tvec) vec_popcntu(const Tvec2& a) \
{ return ucast(vec_popcnt(a)); }

VSX_IMPL_CLANG_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c);
VSX_IMPL_CLANG_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c);
VSX_IMPL_CLANG_POPCNTU(vec_uint4, vec_int4, vec_uint4_c);
// redirect unsigned types
VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_uint4, vec_uint4, vec_popcntu, vec_popcnt)
#else
# define vec_popcntu vec_popcnt
#endif

// Working around vec_cts compatibility
/*
* vec_cts in gcc and clang converts single-precision to signed fixed-point word
* and from double-precision to signed doubleword, also there's no implement for vec_ctsl
*
* vec_cts in xlc converts single and double precision to signed fixed-point word
* and xlc has vec_ctsl which converts single and double precision to signed doubleword
*
* so to deal with this situation, use vec_cts only if you want to convert single-precision to signed fixed-point word
* and use vec_ctsl when you want to convert double-precision to signed doubleword
*
* Also we implemented vec_ctsw(a) to convert double-precision to signed fixed-point word
*/

// converts double-precision to signed doubleword for GCC and CLANG
#if !defined(vec_ctsl) && !defined(__IBMCPP__) && (defined(__GNUG__) || defined(__clang__))
// GCC4 has incorrect results in convert to signed doubleword
# if !defined(__clang__) && __GNUG__ < 5
# define vec_ctsl(a, b) __vec_ctsl(a)
VSX_IMPL_1RG(vec_dword2, wi, vec_double2, wd, xvcvdpsxds, __vec_ctsl)
# else // GCC > 4 , CLANG
# define vec_ctsl vec_cts
# endif
#endif

// converts double-precision to signed fixed-point word
#if defined(__IBMCPP__)
# define vec_ctsw(a) vec_cts(a, 0)
#else // GCC, CLANG
# define vec_ctsw(a) vec_int4_c(__builtin_vsx_xvcvdpsxws(a))
#endif

// load 4 unsigned bytes into uint4 vector
#define vec_ld_buw(p) vec_uint4_set((p)[0], (p)[1], (p)[2], (p)[3])

@@ -613,6 +702,10 @@ VSX_IMPL_UNPACKU(vec_udword2, vec_uint4, vec_uint4_z)
* Implement vec_mergesqe and vec_mergesqo
* Merges the sequence values of even and odd elements of two vectors
*/
#define VSX_IMPL_PERM(rt, fnm, ...) \
FORCE_INLINE(rt) fnm(const rt& a, const rt& b) \
{ static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); }

// 16
#define perm16_mergesqe 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
#define perm16_mergesqo 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
@@ -74,6 +74,12 @@ namespace cv

#define RNG_NEXT(x) ((uint64)(unsigned)(x)*CV_RNG_COEFF + ((x) >> 32))

#ifdef __PPC64__
#define PPC_MUL_ADD(ret, tmp, p0, p1) \
asm volatile("fmuls %0,%1,%2\n\t fadds %0,%0,%3" : "=&f" (ret) \
: "f" (tmp), "f" (p0), "f" (p1))
#endif

/***************************************************************************************\
* Pseudo-Random Number Generators (PRNGs) *
\***************************************************************************************/
@@ -248,6 +254,14 @@ static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, bool

volatile float32x4_t v0 = vmulq_f32(vld1q_f32(f), p0);
vst1q_f32(arr+i, vaddq_f32(v0, p1));
#elif defined __PPC64__
// inline asm is required for numerical stability!
// compilers tends to use floating multiply-add single(fmadds)
// instead of separate multiply and add
PPC_MUL_ADD(arr[i+0], f[0], p[i+0][0], p[i+0][1]);
PPC_MUL_ADD(arr[i+1], f[1], p[i+1][0], p[i+1][1]);
PPC_MUL_ADD(arr[i+2], f[2], p[i+2][0], p[i+2][1]);
PPC_MUL_ADD(arr[i+3], f[3], p[i+3][0], p[i+3][1]);
#else
arr[i+0] = f[0]*p[i+0][0] + p[i+0][1];
arr[i+1] = f[1]*p[i+1][0] + p[i+1][1];
@@ -269,6 +283,8 @@ static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, bool
vdup_n_f32((float)(int)temp), vdup_n_f32(p[i][0])),
vdup_n_f32(p[i][1]));
arr[i] = vget_lane_f32(t, 0);
#elif defined __PPC64__
PPC_MUL_ADD(arr[i], (float)(int)temp, p[i][0], p[i][1]);
#else
arr[i] = (int)temp*p[i][0] + p[i][1];
#endif
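For context on the RNG hunk above (an illustration only, not part of the commit): a fused multiply-add rounds once, while an explicit multiply followed by an add rounds twice, so the two forms can produce different bits. A small standalone C++ sketch:

#include <cmath>
#include <cstdio>

int main()
{
    float a = 1.0000001f, b = 1.0000001f, c = -1.0000002f;
    volatile float prod = a * b;          // force the product to be rounded to float
    float separate = prod + c;            // second rounding
    float fused    = std::fmaf(a, b, c);  // single rounding at the end
    // separate typically prints 0 here, fused prints roughly 1.4e-14
    std::printf("separate: %.9g  fused: %.9g\n", separate, fused);
    return 0;
}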