mirror of
https://github.com/opencv/opencv.git
synced 2025-01-24 03:03:12 +08:00
Merge pull request #10083 from alalek:core_intrinsics_load_low
This commit is contained in:
commit
1fbdca83f5
@ -99,7 +99,7 @@ block and to save contents of the register to memory block.
|
||||
@ref v_setall_s8, @ref v_setall_u8, ...,
|
||||
@ref v_setzero_u8, @ref v_setzero_s8, ...
|
||||
- Memory operations:
|
||||
@ref v_load, @ref v_load_aligned, @ref v_load_halves,
|
||||
@ref v_load, @ref v_load_aligned, @ref v_load_low, @ref v_load_halves,
|
||||
@ref v_store, @ref v_store_aligned,
|
||||
@ref v_store_high, @ref v_store_low
|
||||
|
||||
@ -1080,6 +1080,26 @@ inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_aligned(const _Tp* ptr)
|
||||
return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
|
||||
}
|
||||
|
||||
/** @brief Load 64-bits of data to lower part (high part is undefined).
|
||||
|
||||
@param ptr memory block containing data for first half (0..n/2)
|
||||
|
||||
@code{.cpp}
|
||||
int lo[2] = { 1, 2 };
|
||||
v_int32x4 r = v_load_low(lo);
|
||||
@endcode
|
||||
*/
|
||||
template<typename _Tp>
|
||||
inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_low(const _Tp* ptr)
|
||||
{
|
||||
v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c;
|
||||
for( int i = 0; i < c.nlanes/2; i++ )
|
||||
{
|
||||
c.s[i] = ptr[i];
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
/** @brief Load register contents from two memory blocks
|
||||
|
||||
@param loptr memory block containing data for first half (0..n/2)
|
||||
|
@ -763,6 +763,8 @@ inline _Tpvec v_load(const _Tp* ptr) \
|
||||
{ return _Tpvec(vld1q_##suffix(ptr)); } \
|
||||
inline _Tpvec v_load_aligned(const _Tp* ptr) \
|
||||
{ return _Tpvec(vld1q_##suffix(ptr)); } \
|
||||
inline _Tpvec v_load_low(const _Tp* ptr) \
|
||||
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); } \
|
||||
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
|
||||
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
|
||||
inline void v_store(_Tp* ptr, const _Tpvec& a) \
|
||||
|
@ -1016,6 +1016,8 @@ inline _Tpvec v_load(const _Tp* ptr) \
|
||||
{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
|
||||
inline _Tpvec v_load_aligned(const _Tp* ptr) \
|
||||
{ return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
|
||||
inline _Tpvec v_load_low(const _Tp* ptr) \
|
||||
{ return _Tpvec(_mm_loadl_epi64((const __m128i*)ptr)); } \
|
||||
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
|
||||
{ \
|
||||
return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
|
||||
@ -1044,6 +1046,8 @@ inline _Tpvec v_load(const _Tp* ptr) \
|
||||
{ return _Tpvec(_mm_loadu_##suffix(ptr)); } \
|
||||
inline _Tpvec v_load_aligned(const _Tp* ptr) \
|
||||
{ return _Tpvec(_mm_load_##suffix(ptr)); } \
|
||||
inline _Tpvec v_load_low(const _Tp* ptr) \
|
||||
{ return _Tpvec(_mm_castsi128_##suffix(_mm_loadl_epi64((const __m128i*)ptr))); } \
|
||||
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
|
||||
{ \
|
||||
return _Tpvec(_mm_castsi128_##suffix( \
|
||||
|
@ -281,6 +281,8 @@ inline _Tpvec v_load(const _Tp* ptr) \
|
||||
{ return _Tpvec(ld_func(0, ptr)); } \
|
||||
inline _Tpvec v_load_aligned(const _Tp* ptr) \
|
||||
{ return _Tpvec(ld_func(0, ptr)); } \
|
||||
inline _Tpvec v_load_low(const _Tp* ptr) \
|
||||
{ return _Tpvec(vec_ld_l8(ptr)); } \
|
||||
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
|
||||
{ return _Tpvec(vec_mergesqh(vec_ld_l8(ptr0), vec_ld_l8(ptr1))); } \
|
||||
inline void v_store(_Tp* ptr, const _Tpvec& a) \
|
||||
|
@ -556,17 +556,12 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
|
||||
* vec_ld_l8(ptr) -> Load 64-bits of integer data to lower part
|
||||
* vec_ldz_l8(ptr) -> Load 64-bits of integer data to lower part and zero upper part
|
||||
**/
|
||||
#if defined(__clang__) && !defined(__IBMCPP__)
|
||||
# define __VSX_LOAD_L8(Tvec, p) (Tvec)((vec_udword2)*((uint64*)(p)))
|
||||
#else
|
||||
# define __VSX_LOAD_L8(Tvec, p) *((Tvec*)(p))
|
||||
#endif
|
||||
|
||||
#define VSX_IMPL_LOAD_L8(Tvec, Tp) \
|
||||
FORCE_INLINE(Tvec) vec_ld_l8(const Tp *p) \
|
||||
{ return __VSX_LOAD_L8(Tvec, p); } \
|
||||
{ return ((Tvec)vec_promote(*((uint64*)p), 0)); } \
|
||||
FORCE_INLINE(Tvec) vec_ldz_l8(const Tp *p) \
|
||||
{ \
|
||||
/* TODO: try (Tvec)(vec_udword2{*((uint64*)p), 0}) */ \
|
||||
static const vec_bdword2 mask = {0xFFFFFFFFFFFFFFFF, 0x0000000000000000}; \
|
||||
return vec_and(vec_ld_l8(p), (Tvec)mask); \
|
||||
}
|
||||
|
@ -198,6 +198,22 @@ template<typename R> struct TheTest
|
||||
EXPECT_EQ(data.a[0], r3.get0());
|
||||
EXPECT_EQ(data.u[0], r4.get0());
|
||||
|
||||
R r_low = v_load_low((LaneType*)data.u.d);
|
||||
EXPECT_EQ(data.u[0], r_low.get0());
|
||||
v_store(out.u.d, r_low);
|
||||
for (int i = 0; i < R::nlanes/2; ++i)
|
||||
{
|
||||
EXPECT_EQ((LaneType)data.u[i], (LaneType)out.u[i]);
|
||||
}
|
||||
|
||||
R r_low_align8byte = v_load_low((LaneType*)((char*)data.u.d + 8));
|
||||
EXPECT_EQ(data.u[R::nlanes/2], r_low_align8byte.get0());
|
||||
v_store(out.u.d, r_low_align8byte);
|
||||
for (int i = 0; i < R::nlanes/2; ++i)
|
||||
{
|
||||
EXPECT_EQ((LaneType)data.u[i + R::nlanes/2], (LaneType)out.u[i]);
|
||||
}
|
||||
|
||||
// check some store methods
|
||||
out.u.clear();
|
||||
out.a.clear();
|
||||
|
Loading…
Reference in New Issue
Block a user