mirror of
https://github.com/opencv/opencv.git
synced 2025-06-07 17:44:04 +08:00
hal: vsx: further optimize v_signmask
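Use the quadword bit permute instruction (vbpermq) to creatively gather the sign bits of every lane directly into the mask. Note that index bytes above 127 in the permute control vector select a 0 bit, which is why 128 is used to pad the unused index positions.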
This commit is contained in:
parent
7295983964
commit
1031b7f4bc
@ -845,36 +845,24 @@ inline v_uint64x2 v_popcount(const v_int64x2& a)
|
|||||||
/** Mask **/
|
/** Mask **/
|
||||||
inline int v_signmask(const v_uint8x16& a)
|
inline int v_signmask(const v_uint8x16& a)
|
||||||
{
|
{
|
||||||
vec_uchar16 sv = vec_sr(a.val, vec_uchar16_sp(7));
|
static const vec_uchar16 qperm = {120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0};
|
||||||
static const vec_uchar16 slm = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
|
return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
|
||||||
sv = vec_sl(sv, slm);
|
|
||||||
vec_uint4 sv4 = vec_sum4s(sv, vec_uint4_z);
|
|
||||||
static const vec_uint4 slm4 = {0, 0, 8, 8};
|
|
||||||
sv4 = vec_sl(sv4, slm4);
|
|
||||||
return vec_extract(vec_sums((vec_int4) sv4, vec_int4_z), 3);
|
|
||||||
}
|
}
|
||||||
inline int v_signmask(const v_int8x16& a)
|
inline int v_signmask(const v_int8x16& a)
|
||||||
{ return v_signmask(v_reinterpret_as_u8(a)); }
|
{ return v_signmask(v_reinterpret_as_u8(a)); }
|
||||||
|
|
||||||
inline int v_signmask(const v_int16x8& a)
|
inline int v_signmask(const v_int16x8& a)
|
||||||
{
|
{
|
||||||
static const vec_ushort8 slm = {0, 1, 2, 3, 4, 5, 6, 7};
|
static const vec_uchar16 qperm = {112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
|
||||||
vec_short8 sv = vec_sr(a.val, vec_ushort8_sp(15));
|
return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
|
||||||
sv = vec_sl(sv, slm);
|
|
||||||
vec_int4 svi = vec_int4_z;
|
|
||||||
svi = vec_sums(vec_sum4s(sv, svi), svi);
|
|
||||||
return vec_extract(svi, 3);
|
|
||||||
}
|
}
|
||||||
inline int v_signmask(const v_uint16x8& a)
|
inline int v_signmask(const v_uint16x8& a)
|
||||||
{ return v_signmask(v_reinterpret_as_s16(a)); }
|
{ return v_signmask(v_reinterpret_as_s16(a)); }
|
||||||
|
|
||||||
inline int v_signmask(const v_int32x4& a)
|
inline int v_signmask(const v_int32x4& a)
|
||||||
{
|
{
|
||||||
static const vec_uint4 slm = {0, 1, 2, 3};
|
static const vec_uchar16 qperm = {96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128};
|
||||||
vec_int4 sv = vec_sr(a.val, vec_uint4_sp(31));
|
return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
|
||||||
sv = vec_sl(sv, slm);
|
|
||||||
sv = vec_sums(sv, vec_int4_z);
|
|
||||||
return vec_extract(sv, 3);
|
|
||||||
}
|
}
|
||||||
inline int v_signmask(const v_uint32x4& a)
|
inline int v_signmask(const v_uint32x4& a)
|
||||||
{ return v_signmask(v_reinterpret_as_s32(a)); }
|
{ return v_signmask(v_reinterpret_as_s32(a)); }
|
||||||
|
Loading…
Reference in New Issue
Block a user