Mirror of https://github.com/opencv/opencv.git
Merge pull request #13905 from terfendail:pyr_wintr2
Commit: a9f67c2d1d
@@ -1610,6 +1610,16 @@ inline v_int16x16 v_pack_triplets(const v_int16x16& vec)
 }
+inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
 
+inline v_int32x8 v_pack_triplets(const v_int32x8& vec)
+{
+    return v_int32x8(_mm256_permutevar8x32_epi32(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
+}
+inline v_uint32x8 v_pack_triplets(const v_uint32x8& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); }
+inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
+{
+    return v_float32x8(_mm256_permutevar8x32_ps(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
+}
 
 ////////// Matrix operations /////////
 
 inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
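Note (not part of the patch): read little-endian, the _mm256_set_epi64x constant above encodes the 32-bit lane permutation { 0, 1, 2, 4, 5, 6, 7, 7 }, and _mm256_permutevar8x32_epi32 writes source lane perm[k] to destination lane k. A minimal scalar sketch of that permutation, with made-up names:

    // Sketch only: lane 3 of each group of four is dropped, the tail is repeated filler.
    static const int perm[8] = { 0, 1, 2, 4, 5, 6, 7, 7 };
    void pack_triplets_8x32_model(const int src[8], int dst[8])
    {
        for (int k = 0; k < 8; ++k)
            dst[k] = src[perm[k]];
    }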
@@ -1908,7 +1908,6 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_quads(const v_re
 template<typename _Tp, int n> inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec)
 {
     v_reg<float, n> c;
-    int j = 0;
     for (int i = 0; i < n/4; i++)
     {
         c.s[3*i  ] = vec.s[4*i  ];
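Note (not part of the patch): the loop above is the portable fallback for v_pack_triplets; from every group of four input lanes it keeps the first three and drops the fourth, and the trailing lanes of the result are unspecified. A hedged usage sketch, assuming a CV_SIMD128-enabled build (buffer names are made up):

    #include <opencv2/core/hal/intrin.hpp>

    void pack_triplets_demo()
    {
        short src[8] = { 10, 11, 12, 99,  20, 21, 22, 99 };  // 99 marks the lane that gets dropped
        cv::v_int16x8 v      = cv::v_load(src);
        cv::v_int16x8 packed = cv::v_pack_triplets(v);
        short dst[8];
        cv::v_store(dst, packed);
        // dst now begins { 10, 11, 12, 20, 21, 22, ... }; the last two lanes are unspecified filler.
    }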
@@ -1597,29 +1597,49 @@ inline v_int8x16 v_lut(const schar* tab, const int* idx)
 }
 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
 {
-    short CV_DECL_ALIGNED(32) elems[8] =
+    schar CV_DECL_ALIGNED(32) elems[16] =
     {
-        *(short*)(tab+idx[0]),
-        *(short*)(tab+idx[1]),
-        *(short*)(tab+idx[2]),
-        *(short*)(tab+idx[3]),
-        *(short*)(tab+idx[4]),
-        *(short*)(tab+idx[5]),
-        *(short*)(tab+idx[6]),
-        *(short*)(tab+idx[7])
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[2]],
+        tab[idx[2] + 1],
+        tab[idx[3]],
+        tab[idx[3] + 1],
+        tab[idx[4]],
+        tab[idx[4] + 1],
+        tab[idx[5]],
+        tab[idx[5] + 1],
+        tab[idx[6]],
+        tab[idx[6] + 1],
+        tab[idx[7]],
+        tab[idx[7] + 1]
     };
-    return v_int8x16(vreinterpretq_s8_s16(vld1q_s16(elems)));
+    return v_int8x16(vld1q_s8(elems));
 }
 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
 {
-    int CV_DECL_ALIGNED(32) elems[4] =
+    schar CV_DECL_ALIGNED(32) elems[16] =
     {
-        *(int*)(tab + idx[0]),
-        *(int*)(tab + idx[1]),
-        *(int*)(tab + idx[2]),
-        *(int*)(tab + idx[3])
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[0] + 2],
+        tab[idx[0] + 3],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[1] + 2],
+        tab[idx[1] + 3],
+        tab[idx[2]],
+        tab[idx[2] + 1],
+        tab[idx[2] + 2],
+        tab[idx[2] + 3],
+        tab[idx[3]],
+        tab[idx[3] + 1],
+        tab[idx[3] + 2],
+        tab[idx[3] + 3]
     };
-    return v_int8x16(vreinterpretq_s8_s32(vld1q_s32(elems)));
+    return v_int8x16(vld1q_s8(elems));
 }
 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
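Note (not part of the patch): the hunk above drops the short*/int* type punning of the old temporaries and fills a byte array directly, presumably to avoid misaligned and strict-aliasing-unfriendly wide reads when an index is odd; a single vld1q_s8 then loads the result. A scalar model of the gather being implemented (illustrative names, not OpenCV API):

    // v_lut_pairs gathers 2 consecutive bytes per index, v_lut_quads gathers 4.
    void lut_pairs_model(const signed char* tab, const int* idx, signed char dst[16])
    {
        for (int i = 0; i < 8; ++i)
            for (int j = 0; j < 2; ++j)
                dst[2 * i + j] = tab[idx[i] + j];
    }

    void lut_quads_model(const signed char* tab, const int* idx, signed char dst[16])
    {
        for (int i = 0; i < 4; ++i)
            for (int j = 0; j < 4; ++j)
                dst[4 * i + j] = tab[idx[i] + j];
    }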
@@ -1642,23 +1662,22 @@ inline v_int16x8 v_lut(const short* tab, const int* idx)
 }
 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
 {
-    int CV_DECL_ALIGNED(32) elems[4] =
+    short CV_DECL_ALIGNED(32) elems[8] =
     {
-        *(int*)(tab + idx[0]),
-        *(int*)(tab + idx[1]),
-        *(int*)(tab + idx[2]),
-        *(int*)(tab + idx[3])
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[2]],
+        tab[idx[2] + 1],
+        tab[idx[3]],
+        tab[idx[3] + 1]
     };
-    return v_int16x8(vreinterpretq_s16_s32(vld1q_s32(elems)));
+    return v_int16x8(vld1q_s16(elems));
 }
 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
 {
-    int64 CV_DECL_ALIGNED(32) elems[2] =
-    {
-        *(int64*)(tab + idx[0]),
-        *(int64*)(tab + idx[1])
-    };
-    return v_int16x8(vreinterpretq_s16_s64(vld1q_s64(elems)));
+    return v_int16x8(vcombine_s16(vld1_s16(tab + idx[0]), vld1_s16(tab + idx[1])));
 }
 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
@@ -1677,12 +1696,7 @@ inline v_int32x4 v_lut(const int* tab, const int* idx)
 }
 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
 {
-    int64 CV_DECL_ALIGNED(32) elems[2] =
-    {
-        *(int64*)(tab + idx[0]),
-        *(int64*)(tab + idx[1])
-    };
-    return v_int32x4(vreinterpretq_s32_s64(vld1q_s64(elems)));
+    return v_int32x4(vcombine_s32(vld1_s32(tab + idx[0]), vld1_s32(tab + idx[1])));
 }
 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
 {
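Note (not part of the patch): the two hunks above replace the temporary-array-plus-vreinterpret pattern with direct half-register loads. A standalone NEON sketch of the same idea, outside the OpenCV wrappers:

    #include <arm_neon.h>

    // Each vld1_s32 reads two consecutive 32-bit table entries from an arbitrary,
    // possibly unaligned position; vcombine_s32 joins the halves into one 128-bit vector.
    int32x4_t lut_pairs_sketch(const int32_t* tab, const int* idx)
    {
        int32x2_t lo = vld1_s32(tab + idx[0]);   // tab[idx[0]], tab[idx[0] + 1]
        int32x2_t hi = vld1_s32(tab + idx[1]);   // tab[idx[1]], tab[idx[1] + 1]
        return vcombine_s32(lo, hi);
    }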
@@ -1800,7 +1814,8 @@ inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
 inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
 inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
 {
-    return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0b0a030209080100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0b0a030209080100)))));
+    int16x4x2_t res = vzip_s16(vget_low_s16(vec.val), vget_high_s16(vec.val));
+    return v_int16x8(vcombine_s16(res.val[0], res.val[1]));
 }
 inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
 
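Note (not part of the patch): vzip_s16 interleaves the low and high halves of the register, so the new two-line body turns { a0 .. a7 } into { a0, a4, a1, a5, a2, a6, a3, a7 }, the v_interleave_quads layout. The old one-liner fed vtbl1_s8 byte indices 8..11, which are out of range for an 8-byte vtbl1 table and read as zero, so the rewrite also looks like a correctness fix. A scalar model (illustrative names):

    // Zip the two 4-lane halves of an 8-lane vector.
    void interleave_quads_model(const short src[8], short dst[8])
    {
        for (int k = 0; k < 4; ++k)
        {
            dst[2 * k]     = src[k];        // from the low half
            dst[2 * k + 1] = src[k + 4];    // from the high half
        }
    }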
@@ -1824,6 +1839,10 @@ inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
 }
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
 
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
 
 #if CV_SIMD128_64F
 inline v_float64x2 v_lut(const double* tab, const int* idx)
 {
@@ -2789,7 +2789,7 @@ inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
 }
 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
 {
-    return v_int32x4(_mm_load_si128((const __m128i*)(tab + idx[0])));
+    return v_int32x4(_mm_loadu_si128((const __m128i*)(tab + idx[0])));
 }
 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); }
 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
@@ -2801,7 +2801,7 @@ inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
 }
 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
 {
-    return v_int64x2(_mm_load_si128((const __m128i*)(tab + idx[0])));
+    return v_int64x2(_mm_loadu_si128((const __m128i*)(tab + idx[0])));
 }
 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
@@ -2817,7 +2817,7 @@ inline v_float64x2 v_lut(const double* tab, const int* idx)
 {
     return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
 }
-inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_float64x2(_mm_castsi128_pd(_mm_load_si128((const __m128i*)(tab + idx[0])))); }
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_float64x2(_mm_castsi128_pd(_mm_loadu_si128((const __m128i*)(tab + idx[0])))); }
 
 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
 {
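Note (not part of the patch): the three SSE hunks above make the same change, switching the pair-gather paths from _mm_load_si128 to _mm_loadu_si128. Since tab + idx[0] depends on a runtime index, 16-byte alignment cannot be assumed, and the aligned load would fault on a misaligned address. A standalone sketch:

    #include <emmintrin.h>

    // idx[0] is arbitrary, so tab + idx[0] may not be 16-byte aligned; only the
    // unaligned load is safe here. (Sketch only, not OpenCV code.)
    __m128i gather_two_int64(const long long* tab, const int* idx)
    {
        return _mm_loadu_si128((const __m128i*)(tab + idx[0]));
    }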
@@ -2932,7 +2932,7 @@ inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
     return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100)));
 #else
     __m128i mask = _mm_set1_epi64x(0x00000000FFFFFFFF);
-    __m128i a = _mm_or_si128(_mm_andnot_si128(mask, vec.val), _mm_and_si128(mask, _mm_sll_epi32(vec.val, _mm_set_epi64x(0, 8))));
+    __m128i a = _mm_srli_si128(_mm_or_si128(_mm_andnot_si128(mask, vec.val), _mm_and_si128(mask, _mm_sll_epi32(vec.val, _mm_set_epi64x(0, 8)))), 1);
     return v_int8x16(_mm_srli_si128(_mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 1, 0, 3)), 2));
 #endif
 }
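Note (not part of the patch): in the SSSE3 branch above, the _mm_set_epi64x constant read as 16 little-endian mask bytes is { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 0xff, 0xff, 0xff }; _mm_shuffle_epi8 copies source byte mask[k] to output byte k and writes zero where the mask byte has its high bit set, which drops byte 3 of every group of four and pads the tail. The #else branch additionally shifts the intermediate right by one byte before the 16-bit shuffle; only the SSSE3 path is modelled here:

    // Scalar model of the pshufb above (sketch only).
    static const unsigned char pack3_mask[16] =
        { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 0xff, 0xff, 0xff };

    void pack_triplets_16x8_model(const unsigned char src[16], unsigned char dst[16])
    {
        for (int k = 0; k < 16; ++k)
            dst[k] = (pack3_mask[k] & 0x80) ? 0 : src[pack3_mask[k] & 0x0f];
    }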
@@ -2948,6 +2948,10 @@ inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
 }
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
 
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
 
 ////////////// FP16 support ///////////////////////////
 
 inline v_float32x4 v_load_expand(const float16_t* ptr)
@@ -1160,6 +1160,10 @@ inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
 }
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
 
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
 
 /////// FP16 support ////////
 
 // [TODO] implement these 2 using VSX or universal intrinsics (copy from intrin_sse.cpp and adopt)
File diff suppressed because it is too large.