Merge pull request #10083 from alalek:core_intrinsics_load_low

Alexander Alekhin 2017-11-15 16:08:56 +00:00
commit 1fbdca83f5
6 changed files with 47 additions and 8 deletions
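The new v_load_low intrinsic reads 64 bits from memory into the lower half of a 128-bit register; the upper half is undefined (the SSE and NEON backends happen to zero it). A minimal usage sketch, assuming the OpenCV 3.x header path opencv2/core/hal/intrin.hpp and the cv namespace placement of these types:

    #include <opencv2/core/hal/intrin.hpp>

    int main()
    {
        int lo[2] = { 1, 2 };                  // only 8 bytes of valid input
        cv::v_int32x4 r = cv::v_load_low(lo);  // lanes 0..1 = {1, 2}; lanes 2..3 undefined

        int out[4];
        cv::v_store(out, r);                   // out[0..1] hold {1, 2}; do not rely on out[2..3]
        return 0;
    }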


@@ -99,7 +99,7 @@ block and to save contents of the register to memory block.
@ref v_setall_s8, @ref v_setall_u8, ...,
@ref v_setzero_u8, @ref v_setzero_s8, ...
- Memory operations:
@ref v_load, @ref v_load_aligned, @ref v_load_halves,
@ref v_load, @ref v_load_aligned, @ref v_load_low, @ref v_load_halves,
@ref v_store, @ref v_store_aligned,
@ref v_store_high, @ref v_store_low
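A hedged sketch of how the new load pairs with the existing v_store_low from this list, processing only the valid lower lanes (operator+ on v_int32x4 is assumed to be the one the header already defines):

    #include <opencv2/core/hal/intrin.hpp>

    // Add two pairs of ints; the undefined upper lanes are never stored.
    inline void add_pairs(const int* a, const int* b, int* dst)
    {
        cv::v_int32x4 va = cv::v_load_low(a);  // lanes 0..1 valid
        cv::v_int32x4 vb = cv::v_load_low(b);
        cv::v_store_low(dst, va + vb);         // writes only the low 64 bits
    }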
@@ -1080,6 +1080,26 @@ inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_aligned(const _Tp* ptr)
return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
}
/** @brief Load 64-bits of data to lower part (high part is undefined).
@param ptr memory block containing data for first half (0..n/2)
@code{.cpp}
int lo[2] = { 1, 2 };
v_int32x4 r = v_load_low(lo);
@endcode
*/
template<typename _Tp>
inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_low(const _Tp* ptr)
{
v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c;
for( int i = 0; i < c.nlanes/2; i++ )
{
c.s[i] = ptr[i];
}
return c;
}
/** @brief Load register contents from two memory blocks
@param loptr memory block containing data for first half (0..n/2)

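For contrast with v_load_halves, documented just below in the same header, a small sketch of the lane contents each call produces (values are illustrative, names as in the header above):

    #include <opencv2/core/hal/intrin.hpp>

    void lane_contents_demo()
    {
        int a[2] = { 1, 2 }, b[2] = { 3, 4 };
        cv::v_int32x4 both = cv::v_load_halves(a, b);  // lanes: 1, 2, 3, 4
        cv::v_int32x4 low  = cv::v_load_low(a);        // lanes: 1, 2, ?, ? (upper half undefined)
        (void)both; (void)low;
    }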

@@ -763,6 +763,8 @@ inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \

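A minimal sketch (assuming an ARM target with NEON and the int32 suffix) of what the v_load_low macro line above expands to: a 64-bit vld1 load into a D register for the low half, combined with a zeroed D register for the high half:

    #if defined(__ARM_NEON) || defined(__ARM_NEON__)
    #include <arm_neon.h>

    inline int32x4_t load_low_s32(const int32_t* ptr)
    {
        int32x2_t lo = vld1_s32(ptr);   // 64-bit load of lanes 0..1
        int32x2_t hi = vdup_n_s32(0);   // zero-filled upper half
        return vcombine_s32(lo, hi);    // assemble the full 128-bit register
    }
    #endif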

@@ -1016,6 +1016,8 @@ inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(_mm_loadl_epi64((const __m128i*)ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
@@ -1044,6 +1046,8 @@ inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(_mm_loadu_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(_mm_load_##suffix(ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(_mm_castsi128_##suffix(_mm_loadl_epi64((const __m128i*)ptr))); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
return _Tpvec(_mm_castsi128_##suffix( \

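A minimal sketch (assuming an SSE2 target) of the two patterns used above: _mm_loadl_epi64 loads 64 bits into the low half and zeroes the high half, and the float variant reuses the same load through a bit-cast:

    #if defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
    #include <emmintrin.h>

    inline __m128i load_low_epi32(const int* ptr)
    {
        return _mm_loadl_epi64((const __m128i*)ptr);   // low 64 bits from memory, high 64 bits zero
    }

    inline __m128 load_low_ps(const float* ptr)
    {
        // bit-cast of the same 64-bit integer load; lanes 0..1 = ptr[0..1], lanes 2..3 = 0
        return _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)ptr));
    }
    #endif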

@@ -281,6 +281,8 @@ inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(ld_func(0, ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(ld_func(0, ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vec_ld_l8(ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vec_mergesqh(vec_ld_l8(ptr0), vec_ld_l8(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \


@@ -556,17 +556,12 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
* vec_ld_l8(ptr) -> Load 64-bits of integer data to lower part
* vec_ldz_l8(ptr) -> Load 64-bits of integer data to lower part and zero upper part
**/
#if defined(__clang__) && !defined(__IBMCPP__)
# define __VSX_LOAD_L8(Tvec, p) (Tvec)((vec_udword2)*((uint64*)(p)))
#else
# define __VSX_LOAD_L8(Tvec, p) *((Tvec*)(p))
#endif
#define VSX_IMPL_LOAD_L8(Tvec, Tp) \
FORCE_INLINE(Tvec) vec_ld_l8(const Tp *p) \
{ return __VSX_LOAD_L8(Tvec, p); } \
{ return ((Tvec)vec_promote(*((uint64*)p), 0)); } \
FORCE_INLINE(Tvec) vec_ldz_l8(const Tp *p) \
{ \
/* TODO: try (Tvec)(vec_udword2{*((uint64*)p), 0}) */ \
static const vec_bdword2 mask = {0xFFFFFFFFFFFFFFFF, 0x0000000000000000}; \
return vec_and(vec_ld_l8(p), (Tvec)mask); \
}

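As a plain C++ model (not VSX code) of the two helpers described in the comment above: vec_ld_l8 fills the low 64 bits from memory with no guarantee about the high 64 bits, while vec_ldz_l8 additionally forces the high half to zero, mirroring the vec_and with the {all-ones, zero} mask:

    #include <cstdint>
    #include <cstring>

    struct Model128 { uint64_t lo, hi; };        // stand-in for a 128-bit register

    inline Model128 model_ld_l8(const void* p)
    {
        Model128 v = { 0, 0 };
        std::memcpy(&v.lo, p, 8);                // low 64 bits come from memory
        return v;                                // the real vec_ld_l8 leaves the high half unspecified
    }

    inline Model128 model_ldz_l8(const void* p)
    {
        Model128 v = model_ld_l8(p);
        v.hi = 0;                                // the explicit masking step of vec_ldz_l8
        return v;
    }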

@@ -198,6 +198,22 @@ template<typename R> struct TheTest
EXPECT_EQ(data.a[0], r3.get0());
EXPECT_EQ(data.u[0], r4.get0());
R r_low = v_load_low((LaneType*)data.u.d);
EXPECT_EQ(data.u[0], r_low.get0());
v_store(out.u.d, r_low);
for (int i = 0; i < R::nlanes/2; ++i)
{
EXPECT_EQ((LaneType)data.u[i], (LaneType)out.u[i]);
}
R r_low_align8byte = v_load_low((LaneType*)((char*)data.u.d + 8));
EXPECT_EQ(data.u[R::nlanes/2], r_low_align8byte.get0());
v_store(out.u.d, r_low_align8byte);
for (int i = 0; i < R::nlanes/2; ++i)
{
EXPECT_EQ((LaneType)data.u[i + R::nlanes/2], (LaneType)out.u[i]);
}
// check some store methods
out.u.clear();
out.a.clear();