hal: vsx: further optimize v_signmask

Use the quadword bit permutation instruction to creatively move
the sign bits to create the mask. Note that values above 127 will
result in 0.
This commit is contained in:
Paul E. Murphy 2019-08-02 15:06:47 -05:00
parent 7295983964
commit 1031b7f4bc

View File

@ -845,36 +845,24 @@ inline v_uint64x2 v_popcount(const v_int64x2& a)
/** Mask **/
/**
 * Build an integer sign mask from the sixteen 8-bit lanes of `a`.
 *
 * One quadword bit-permute (vec_vbpermq) gathers the leading bit of every
 * byte in a single step.
 *
 * @param a  vector of sixteen unsigned 8-bit lanes
 * @return   the gathered sign bits, read from 32-bit element 2 of the
 *           permute result
 */
inline int v_signmask(const v_uint8x16& a)
{
    // Each entry is a bit index into the 128-bit source; the multiples of 8
    // address the leading (sign) bit of each byte.
    static const vec_uchar16 qperm = {120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0};
    // NOTE(review): element index 2 presumably matches the little-endian lane
    // layout -- confirm before building for a big-endian target.
    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
}
/** Sign mask of signed 8-bit lanes: forwards to the overload chosen for v_reinterpret_as_u8(a). */
inline int v_signmask(const v_int8x16& a)
{
    return v_signmask(v_reinterpret_as_u8(a));
}
/**
 * Build an integer sign mask from the eight 16-bit lanes of `a`.
 *
 * One quadword bit-permute (vec_vbpermq) gathers the leading bit of every
 * 16-bit lane in a single step.
 *
 * @param a  vector of eight signed 16-bit lanes
 * @return   the gathered sign bits, read from 32-bit element 2 of the
 *           permute result
 */
inline int v_signmask(const v_int16x8& a)
{
    // Each entry is a bit index into the 128-bit source; the multiples of 16
    // address the leading (sign) bit of each 16-bit lane. The 128 entries are
    // out of range (> 127) and therefore contribute 0 bits, keeping the unused
    // upper half of the mask clear.
    static const vec_uchar16 qperm = {112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
    // NOTE(review): element index 2 presumably matches the little-endian lane
    // layout -- confirm before building for a big-endian target.
    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
}
/** Sign mask of unsigned 16-bit lanes: forwards to the overload chosen for v_reinterpret_as_s16(a). */
inline int v_signmask(const v_uint16x8& a)
{
    return v_signmask(v_reinterpret_as_s16(a));
}
/**
 * Build an integer sign mask from the four 32-bit lanes of `a`.
 *
 * One quadword bit-permute (vec_vbpermq) gathers the leading bit of every
 * 32-bit lane in a single step.
 *
 * @param a  vector of four signed 32-bit lanes
 * @return   the gathered sign bits, read from 32-bit element 2 of the
 *           permute result
 */
inline int v_signmask(const v_int32x4& a)
{
    // Each entry is a bit index into the 128-bit source; the multiples of 32
    // address the leading (sign) bit of each 32-bit lane. The 128 entries are
    // out of range (> 127) and therefore contribute 0 bits, keeping the unused
    // upper part of the mask clear.
    static const vec_uchar16 qperm = {96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128};
    // NOTE(review): element index 2 presumably matches the little-endian lane
    // layout -- confirm before building for a big-endian target.
    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
}
/** Sign mask of unsigned 32-bit lanes: forwards to the overload chosen for v_reinterpret_as_s32(a). */
inline int v_signmask(const v_uint32x4& a)
{
    return v_signmask(v_reinterpret_as_s32(a));
}