mirror of
https://github.com/opencv/opencv.git
synced 2024-11-28 21:20:18 +08:00
Merge pull request #7182 from mself:two_channel_universal_intrinsics
This commit is contained in:
commit
595fd2757c
@ -103,7 +103,7 @@ block and to save contents of the register to memory block.
|
||||
|
||||
These operations allow reordering or recombining elements in one or multiple vectors.
|
||||
|
||||
- Interleave, deinterleave (3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
|
||||
- Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
|
||||
- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
|
||||
- Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
|
||||
@ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
|
||||
@ -1075,12 +1075,31 @@ v_load_expand_q(const _Tp* ptr)
|
||||
return c;
|
||||
}
|
||||
|
||||
/** @brief Load and deinterleave (4 channels)
|
||||
/** @brief Load and deinterleave (2 channels)
|
||||
|
||||
Load data from memory deinterleave and store to 4 registers.
|
||||
Load data from memory, deinterleave it and store the result to 2 registers.
|
||||
Scheme:
|
||||
@code
|
||||
{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
|
||||
{A1 B1 A2 B2 ...} ==> {A1 A2 ...}, {B1 B2 ...}
|
||||
@endcode
|
||||
For all types except 64-bit. */
|
||||
template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
|
||||
v_reg<_Tp, n>& b)
|
||||
{
|
||||
int i, i2;
|
||||
for( i = i2 = 0; i < n; i++, i2 += 2 )
|
||||
{
|
||||
a.s[i] = ptr[i2];
|
||||
b.s[i] = ptr[i2+1];
|
||||
}
|
||||
}
|
||||
|
||||
/** @brief Load and deinterleave (3 channels)
|
||||
|
||||
Load data from memory, deinterleave it and store the result to 3 registers.
|
||||
Scheme:
|
||||
@code
|
||||
{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
|
||||
@endcode
|
||||
For all types except 64-bit. */
|
||||
template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
|
||||
@ -1095,12 +1114,12 @@ template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_
|
||||
}
|
||||
}
|
||||
|
||||
/** @brief Load and deinterleave (3 channels)
|
||||
/** @brief Load and deinterleave (4 channels)
|
||||
|
||||
Load data from memory deinterleave and store to 3 registers.
|
||||
Load data from memory, deinterleave it and store the result to 4 registers.
|
||||
Scheme:
|
||||
@code
|
||||
{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
|
||||
{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
|
||||
@endcode
|
||||
For all types except 64-bit. */
|
||||
template<typename _Tp, int n>
|
||||
@ -1118,12 +1137,32 @@ inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
|
||||
}
|
||||
}
|
||||
|
||||
/** @brief Interleave and store (2 channels)
|
||||
|
||||
Interleave and store data from 2 registers to memory.
|
||||
Scheme:
|
||||
@code
|
||||
{A1 A2 ...}, {B1 B2 ...} ==> {A1 B1 A2 B2 ...}
|
||||
@endcode
|
||||
For all types except 64-bit. */
|
||||
template<typename _Tp, int n>
|
||||
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
|
||||
const v_reg<_Tp, n>& b)
|
||||
{
|
||||
int i, i2;
|
||||
for( i = i2 = 0; i < n; i++, i2 += 2 )
|
||||
{
|
||||
ptr[i2] = a.s[i];
|
||||
ptr[i2+1] = b.s[i];
|
||||
}
|
||||
}
|
||||
|
||||
/** @brief Interleave and store (3 channels)
|
||||
|
||||
Interleave and store data from 3 registers to memory.
|
||||
Scheme:
|
||||
@code
|
||||
{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
|
||||
{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
|
||||
@endcode
|
||||
For all types except 64-bit. */
|
||||
template<typename _Tp, int n>
|
||||
|
@ -809,6 +809,12 @@ OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
|
||||
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
|
||||
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
|
||||
{ \
|
||||
_Tpvec##x2_t v = vld2q_##suffix(ptr); \
|
||||
a.val = v.val[0]; \
|
||||
b.val = v.val[1]; \
|
||||
} \
|
||||
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
|
||||
{ \
|
||||
_Tpvec##x3_t v = vld3q_##suffix(ptr); \
|
||||
@ -825,6 +831,13 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
|
||||
c.val = v.val[2]; \
|
||||
d.val = v.val[3]; \
|
||||
} \
|
||||
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b) \
|
||||
{ \
|
||||
_Tpvec##x2_t v; \
|
||||
v.val[0] = a.val; \
|
||||
v.val[1] = b.val; \
|
||||
vst2q_##suffix(ptr, v); \
|
||||
} \
|
||||
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
|
||||
{ \
|
||||
_Tpvec##x3_t v; \
|
||||
|
@ -1374,6 +1374,18 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4&
|
||||
v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
|
||||
}
|
||||
|
||||
// 2-channel, float only
|
||||
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
|
||||
{
|
||||
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
|
||||
|
||||
__m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
|
||||
__m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
|
||||
|
||||
a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
|
||||
b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3
|
||||
}
|
||||
|
||||
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
|
||||
const v_uint8x16& c )
|
||||
{
|
||||
@ -1529,6 +1541,18 @@ inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint3
|
||||
v_store(ptr + 12, t3);
|
||||
}
|
||||
|
||||
// 2-channel, float only
|
||||
inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b)
|
||||
{
|
||||
// a0 a1 a2 a3 ...
|
||||
// b0 b1 b2 b3 ...
|
||||
__m128 u0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
|
||||
__m128 u1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3
|
||||
|
||||
_mm_storeu_ps(ptr, u0);
|
||||
_mm_storeu_ps((ptr + 4), u1);
|
||||
}
|
||||
|
||||
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
|
||||
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
|
||||
_Tpvec& b0, _Tpvec& c0 ) \
|
||||
|
@ -132,6 +132,32 @@ template<typename R> struct TheTest
|
||||
return *this;
|
||||
}
|
||||
|
||||
// float32x4 only
|
||||
TheTest & test_interleave_2channel()
{
    // Round-trip check for the 2-channel overloads: interleave two registers
    // into a buffer, read them back, and expect the original contents.
    Data<R> data1, data2;
    data2 += 20; // presumably offsets every lane so the two channels differ

    R a = data1, b = data2;

    // Room for both registers' lanes, stored interleaved.
    LaneType buf2[R::nlanes * 2];

    v_store_interleave(buf2, a, b);

    // Overwrite both registers so that stale values cannot satisfy the
    // comparisons below; only the deinterleaving load can restore them.
    Data<R> z(0);
    a = b = z;

    v_load_deinterleave(buf2, a, b);

    // The compared values do not depend on a lane index, so one comparison
    // per register is sufficient (the previous per-lane loop only repeated
    // the very same checks R::nlanes times).
    EXPECT_EQ(data1, Data<R>(a));
    EXPECT_EQ(data2, Data<R>(b));

    return *this;
}
|
||||
|
||||
// v_expand and v_load_expand
|
||||
TheTest & test_expand()
|
||||
{
|
||||
@ -846,6 +872,7 @@ TEST(hal_intrin, float32x4) {
|
||||
TheTest<v_float32x4>()
|
||||
.test_loadstore()
|
||||
.test_interleave()
|
||||
.test_interleave_2channel()
|
||||
.test_addsub()
|
||||
.test_mul()
|
||||
.test_div()
|
||||
|
Loading…
Reference in New Issue
Block a user