From 334c4d62b58469c355c99baf686ccde8de90a64c Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Wed, 20 Feb 2019 14:30:28 +0300 Subject: [PATCH] Merge pull request #13781 from terfendail:warp_wintr Resize reworked using wide universal intrinsics (#13781) * Added wide universal intrinsics optimized implementation for 3 channel bit-exact linear resize * Reworked linear resize using new wide LUT intrinsics * Fix for VSX intrinsics --- .../core/include/opencv2/core/hal/intrin.hpp | 21 +- .../include/opencv2/core/hal/intrin_avx.hpp | 134 +++++++++ .../include/opencv2/core/hal/intrin_cpp.hpp | 73 +++++ .../include/opencv2/core/hal/intrin_neon.hpp | 225 +++++++++++++++ .../include/opencv2/core/hal/intrin_sse.hpp | 196 +++++++++++++ .../include/opencv2/core/hal/intrin_vsx.hpp | 130 +++++++++ modules/imgproc/src/resize.cpp | 268 +++++++----------- 7 files changed, 874 insertions(+), 173 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index ef74176f33..460c5c5900 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -234,7 +234,12 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN inline vtyp vx_##loadsfx##_low(const typ* ptr) { return prefix##_##loadsfx##_low(ptr); } \ inline vtyp vx_##loadsfx##_halves(const typ* ptr0, const typ* ptr1) { return prefix##_##loadsfx##_halves(ptr0, ptr1); } \ inline void vx_store(typ* ptr, const vtyp& v) { return v_store(ptr, v); } \ - inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); } + inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); } \ + inline vtyp vx_lut(const typ* ptr, const int* idx) { return prefix##_lut(ptr, idx); } \ + inline vtyp vx_lut_pairs(const typ* ptr, const int* idx) { return prefix##_lut_pairs(ptr, idx); } + +#define CV_INTRIN_DEFINE_WIDE_LUT_QUAD(typ, vtyp, prefix) \ + inline vtyp vx_lut_quads(const typ* ptr, const int* idx) { return prefix##_lut_quads(ptr, idx); } #define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \ inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); } @@ -244,6 +249,7 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN #define CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(typ, vtyp, short_typ, wtyp, qtyp, prefix, loadsfx) \ CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \ + CV_INTRIN_DEFINE_WIDE_LUT_QUAD(typ, vtyp, prefix) \ CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \ CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) @@ -251,14 +257,19 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(uchar, v_uint8, u8, v_uint16, v_uint32, prefix, load) \ CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(schar, v_int8, s8, v_int16, v_int32, prefix, load) \ CV_INTRIN_DEFINE_WIDE_INTRIN(ushort, v_uint16, u16, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_LUT_QUAD(ushort, v_uint16, prefix) \ CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(ushort, v_uint32, prefix) \ CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_int16, s16, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_LUT_QUAD(short, v_int16, prefix) \ CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(short, v_int32, prefix) \ CV_INTRIN_DEFINE_WIDE_INTRIN(int, v_int32, s32, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_LUT_QUAD(int, v_int32, prefix) \ CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(int, v_int64, prefix) \ CV_INTRIN_DEFINE_WIDE_INTRIN(unsigned, v_uint32, u32, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_LUT_QUAD(unsigned, v_uint32, prefix) \ 
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(unsigned, v_uint64, prefix) \ CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_LUT_QUAD(float, v_float32, prefix) \ CV_INTRIN_DEFINE_WIDE_INTRIN(int64, v_int64, s64, prefix, load) \ CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load) \ CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(float16_t, v_float32, prefix) @@ -335,11 +346,11 @@ namespace CV__SIMD_NAMESPACE { typedef v_uint64x4 v_uint64; typedef v_int64x4 v_int64; typedef v_float32x8 v_float32; + CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256) #if CV_SIMD256_64F typedef v_float64x4 v_float64; - #endif - CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256) CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load) + #endif inline void vx_cleanup() { v256_cleanup(); } } // namespace using namespace CV__SIMD_NAMESPACE; @@ -358,11 +369,9 @@ namespace CV__SIMD_NAMESPACE { typedef v_uint64x2 v_uint64; typedef v_int64x2 v_int64; typedef v_float32x4 v_float32; - #if CV_SIMD128_64F - typedef v_float64x2 v_float64; - #endif CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v) #if CV_SIMD128_64F + typedef v_float64x2 v_float64; CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load) #endif inline void vx_cleanup() { v_cleanup(); } diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index c3797d67c1..913c915270 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -1417,6 +1417,97 @@ inline v_float64x4 v_cvt_f64_high(const v_float32x8& a) ////////////// Lookup table access //////////////////// +inline v_int8x32 v256_lut(const schar* tab, const int* idx) +{ + return v_int8x32(_mm256_setr_epi8(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]], + tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]], + tab[idx[16]], tab[idx[17]], tab[idx[18]], tab[idx[19]], tab[idx[20]], tab[idx[21]], tab[idx[22]], tab[idx[23]], + tab[idx[24]], tab[idx[25]], tab[idx[26]], tab[idx[27]], tab[idx[28]], tab[idx[29]], tab[idx[30]], tab[idx[31]])); +} +inline v_int8x32 v256_lut_pairs(const schar* tab, const int* idx) +{ + return v_int8x32(_mm256_setr_epi16(*(const short*)(tab + idx[ 0]), *(const short*)(tab + idx[ 1]), *(const short*)(tab + idx[ 2]), *(const short*)(tab + idx[ 3]), + *(const short*)(tab + idx[ 4]), *(const short*)(tab + idx[ 5]), *(const short*)(tab + idx[ 6]), *(const short*)(tab + idx[ 7]), + *(const short*)(tab + idx[ 8]), *(const short*)(tab + idx[ 9]), *(const short*)(tab + idx[10]), *(const short*)(tab + idx[11]), + *(const short*)(tab + idx[12]), *(const short*)(tab + idx[13]), *(const short*)(tab + idx[14]), *(const short*)(tab + idx[15]))); +} +inline v_int8x32 v256_lut_quads(const schar* tab, const int* idx) +{ + return v_int8x32(_mm256_i32gather_epi32((const int*)tab, _mm256_loadu_si256((const __m256i*)idx), 1)); +} +inline v_uint8x32 v256_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut((const schar *)tab, idx)); } +inline v_uint8x32 v256_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_pairs((const schar *)tab, idx)); } +inline v_uint8x32 v256_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_quads((const schar *)tab, idx)); } + +inline v_int16x16 v256_lut(const short* tab, const int* idx) +{ + return 
v_int16x16(_mm256_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]], + tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]])); +} +inline v_int16x16 v256_lut_pairs(const short* tab, const int* idx) +{ + return v_int16x16(_mm256_i32gather_epi32((const int*)tab, _mm256_loadu_si256((const __m256i*)idx), 2)); +} +inline v_int16x16 v256_lut_quads(const short* tab, const int* idx) +{ +#if defined(__GNUC__) + return v_int16x16(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 2));//Looks like intrinsic has wrong definition +#else + return v_int16x16(_mm256_i32gather_epi64((const int64*)tab, _mm_loadu_si128((const __m128i*)idx), 2)); +#endif +} +inline v_uint16x16 v256_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut((const short *)tab, idx)); } +inline v_uint16x16 v256_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_pairs((const short *)tab, idx)); } +inline v_uint16x16 v256_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_quads((const short *)tab, idx)); } + +inline v_int32x8 v256_lut(const int* tab, const int* idx) +{ + return v_int32x8(_mm256_i32gather_epi32(tab, _mm256_loadu_si256((const __m256i*)idx), 4)); +} +inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx) +{ +#if defined(__GNUC__) + return v_int32x8(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 4)); +#else + return v_int32x8(_mm256_i32gather_epi64((const int64*)tab, _mm_loadu_si128((const __m128i*)idx), 4)); +#endif +} +inline v_int32x8 v256_lut_quads(const int* tab, const int* idx) +{ + return v_int32x8(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(tab + idx[0]))), _mm_loadu_si128((const __m128i*)(tab + idx[1])), 0x1)); +} +inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); } +inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); } +inline v_uint32x8 v256_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_quads((const int *)tab, idx)); } + +inline v_int64x4 v256_lut(const int64* tab, const int* idx) +{ +#if defined(__GNUC__) + return v_int64x4(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 8)); +#else + return v_int64x4(_mm256_i32gather_epi64(tab, _mm_loadu_si128((const __m128i*)idx), 8)); +#endif +} +inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx) +{ + return v_int64x4(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(tab + idx[0]))), _mm_loadu_si128((const __m128i*)(tab + idx[1])), 0x1)); +} +inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); } +inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); } + +inline v_float32x8 v256_lut(const float* tab, const int* idx) +{ + return v_float32x8(_mm256_i32gather_ps(tab, _mm256_loadu_si256((const __m256i*)idx), 4)); +} +inline v_float32x8 v256_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_pairs((const int *)tab, idx)); } +inline v_float32x8 v256_lut_quads(const float* 
tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_quads((const int *)tab, idx)); } + +inline v_float64x4 v256_lut(const double* tab, const int* idx) +{ + return v_float64x4(_mm256_i32gather_pd(tab, _mm_loadu_si128((const __m128i*)idx), 8)); +} +inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) { return v_float64x4(_mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_loadu_pd(tab + idx[0])), _mm_loadu_pd(tab + idx[1]), 0x1)); } + inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec) { return v_int32x8(_mm256_i32gather_epi32(tab, idxvec.val, 4)); @@ -1476,6 +1567,49 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_flo y = v_float64x4(_mm256_unpackhi_pd(xy02, xy13)); } +inline v_int8x32 v_interleave_pairs(const v_int8x32& vec) +{ + return v_int8x32(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200, 0x0f0d0e0c0b090a08, 0x0705060403010200))); +} +inline v_uint8x32 v_interleave_pairs(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } +inline v_int8x32 v_interleave_quads(const v_int8x32& vec) +{ + return v_int8x32(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400, 0x0f0b0e0a0d090c08, 0x0703060205010400))); +} +inline v_uint8x32 v_interleave_quads(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } + +inline v_int16x16 v_interleave_pairs(const v_int16x16& vec) +{ + return v_int16x16(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100, 0x0f0e0b0a0d0c0908, 0x0706030205040100))); +} +inline v_uint16x16 v_interleave_pairs(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } +inline v_int16x16 v_interleave_quads(const v_int16x16& vec) +{ + return v_int16x16(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100, 0x0f0e07060d0c0504, 0x0b0a030209080100))); +} +inline v_uint16x16 v_interleave_quads(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } + +inline v_int32x8 v_interleave_pairs(const v_int32x8& vec) +{ + return v_int32x8(_mm256_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0))); +} +inline v_uint32x8 v_interleave_pairs(const v_uint32x8& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } +inline v_float32x8 v_interleave_pairs(const v_float32x8& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } + +inline v_int8x32 v_pack_triplets(const v_int8x32& vec) +{ + return v_int8x32(_mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(vec.val, _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100))), + _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); +} +inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } + +inline v_int16x16 v_pack_triplets(const v_int16x16& vec) +{ + return v_int16x16(_mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(vec.val, _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100))), + _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); +} +inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } + ////////// Matrix 
operations ///////// inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b) diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 3b419786a9..5cfaea7220 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -1799,6 +1799,28 @@ template inline v_reg v_cvt_f64(const v_reg& a) return c; } +template inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx) +{ + v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; + for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++) + c.s[i] = tab[idx[i]]; + return c; +} +template inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_pairs(const _Tp* tab, const int* idx) +{ + v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; + for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++) + c.s[i] = tab[idx[i / 2] + i % 2]; + return c; +} +template inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_quads(const _Tp* tab, const int* idx) +{ + v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; + for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++) + c.s[i] = tab[idx[i / 4] + i % 4]; + return c; +} + template inline v_reg v_lut(const int* tab, const v_reg& idx) { v_reg c; @@ -1807,6 +1829,14 @@ template inline v_reg v_lut(const int* tab, const v_reg& return c; } +template inline v_reg v_lut(const unsigned* tab, const v_reg& idx) +{ + v_reg c; + for (int i = 0; i < n; i++) + c.s[i] = tab[idx.s[i]]; + return c; +} + template inline v_reg v_lut(const float* tab, const v_reg& idx) { v_reg c; @@ -1845,6 +1875,49 @@ template inline void v_lut_deinterleave(const double* tab, const v_reg inline v_reg<_Tp, n> v_interleave_pairs(const v_reg<_Tp, n>& vec) +{ + v_reg c; + for (int i = 0; i < n/4; i++) + { + c.s[4*i ] = vec.s[4*i ]; + c.s[4*i+1] = vec.s[4*i+2]; + c.s[4*i+2] = vec.s[4*i+1]; + c.s[4*i+3] = vec.s[4*i+3]; + } + return c; +} + +template inline v_reg<_Tp, n> v_interleave_quads(const v_reg<_Tp, n>& vec) +{ + v_reg c; + for (int i = 0; i < n/8; i++) + { + c.s[8*i ] = vec.s[8*i ]; + c.s[8*i+1] = vec.s[8*i+4]; + c.s[8*i+2] = vec.s[8*i+1]; + c.s[8*i+3] = vec.s[8*i+5]; + c.s[8*i+4] = vec.s[8*i+2]; + c.s[8*i+5] = vec.s[8*i+6]; + c.s[8*i+6] = vec.s[8*i+3]; + c.s[8*i+7] = vec.s[8*i+7]; + } + return c; +} + +template inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec) +{ + v_reg c; + int j = 0; + for (int i = 0; i < n/4; i++) + { + c.s[3*i ] = vec.s[4*i ]; + c.s[3*i+1] = vec.s[4*i+1]; + c.s[3*i+2] = vec.s[4*i+2]; + } + return c; +} + /** @brief Transpose 4x4 matrix Scheme: diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 0cbde4d539..f67479171d 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -1572,6 +1572,162 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) ////////////// Lookup table access //////////////////// +inline v_int8x16 v_lut(const schar* tab, const int* idx) +{ + schar CV_DECL_ALIGNED(32) elems[16] = + { + tab[idx[ 0]], + tab[idx[ 1]], + tab[idx[ 2]], + tab[idx[ 3]], + tab[idx[ 4]], + tab[idx[ 5]], + tab[idx[ 6]], + tab[idx[ 7]], + tab[idx[ 8]], + tab[idx[ 9]], + tab[idx[10]], + tab[idx[11]], + tab[idx[12]], + tab[idx[13]], + tab[idx[14]], + tab[idx[15]] + }; + return v_int8x16(vld1q_s8(elems)); +} +inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx) +{ + short CV_DECL_ALIGNED(32) elems[8] = + { + 
*(short*)(tab+idx[0]), + *(short*)(tab+idx[1]), + *(short*)(tab+idx[2]), + *(short*)(tab+idx[3]), + *(short*)(tab+idx[4]), + *(short*)(tab+idx[5]), + *(short*)(tab+idx[6]), + *(short*)(tab+idx[7]) + }; + return v_int8x16(vreinterpretq_s8_s16(vld1q_s16(elems))); +} +inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) +{ + int CV_DECL_ALIGNED(32) elems[4] = + { + *(int*)(tab + idx[0]), + *(int*)(tab + idx[1]), + *(int*)(tab + idx[2]), + *(int*)(tab + idx[3]) + }; + return v_int8x16(vreinterpretq_s8_s32(vld1q_s32(elems))); +} +inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); } +inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); } +inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); } + +inline v_int16x8 v_lut(const short* tab, const int* idx) +{ + short CV_DECL_ALIGNED(32) elems[8] = + { + tab[idx[0]], + tab[idx[1]], + tab[idx[2]], + tab[idx[3]], + tab[idx[4]], + tab[idx[5]], + tab[idx[6]], + tab[idx[7]] + }; + return v_int16x8(vld1q_s16(elems)); +} +inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) +{ + int CV_DECL_ALIGNED(32) elems[4] = + { + *(int*)(tab + idx[0]), + *(int*)(tab + idx[1]), + *(int*)(tab + idx[2]), + *(int*)(tab + idx[3]) + }; + return v_int16x8(vreinterpretq_s16_s32(vld1q_s32(elems))); +} +inline v_int16x8 v_lut_quads(const short* tab, const int* idx) +{ + int64 CV_DECL_ALIGNED(32) elems[2] = + { + *(int64*)(tab + idx[0]), + *(int64*)(tab + idx[1]) + }; + return v_int16x8(vreinterpretq_s16_s64(vld1q_s64(elems))); +} +inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); } +inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); } +inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); } + +inline v_int32x4 v_lut(const int* tab, const int* idx) +{ + int CV_DECL_ALIGNED(32) elems[4] = + { + tab[idx[0]], + tab[idx[1]], + tab[idx[2]], + tab[idx[3]] + }; + return v_int32x4(vld1q_s32(elems)); +} +inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) +{ + int64 CV_DECL_ALIGNED(32) elems[2] = + { + *(int64*)(tab + idx[0]), + *(int64*)(tab + idx[1]) + }; + return v_int32x4(vreinterpretq_s32_s64(vld1q_s64(elems))); +} +inline v_int32x4 v_lut_quads(const int* tab, const int* idx) +{ + return v_int32x4(vld1q_s32(tab + idx[0])); +} +inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); } +inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); } +inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); } + +inline v_int64x2 v_lut(const int64_t* tab, const int* idx) +{ + return v_int64x2(vcombine_s64(vcreate_s64(tab[idx[0]]), vcreate_s64(tab[idx[1]]))); +} +inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx) +{ + return v_int64x2(vld1q_s64(tab + idx[0])); +} +inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } +inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } + +inline 
v_float32x4 v_lut(const float* tab, const int* idx) +{ + float CV_DECL_ALIGNED(32) elems[4] = + { + tab[idx[0]], + tab[idx[1]], + tab[idx[2]], + tab[idx[3]] + }; + return v_float32x4(vld1q_f32(elems)); +} +inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) +{ + uint64 CV_DECL_ALIGNED(32) elems[2] = + { + *(uint64*)(tab + idx[0]), + *(uint64*)(tab + idx[1]) + }; + return v_float32x4(vreinterpretq_f32_u64(vld1q_u64(elems))); +} +inline v_float32x4 v_lut_quads(const float* tab, const int* idx) +{ + return v_float32x4(vld1q_f32(tab + idx[0])); +} + inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) { int CV_DECL_ALIGNED(32) elems[4] = @@ -1584,6 +1740,18 @@ inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) return v_int32x4(vld1q_s32(elems)); } +inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) +{ + unsigned CV_DECL_ALIGNED(32) elems[4] = + { + tab[vgetq_lane_s32(idxvec.val, 0)], + tab[vgetq_lane_s32(idxvec.val, 1)], + tab[vgetq_lane_s32(idxvec.val, 2)], + tab[vgetq_lane_s32(idxvec.val, 3)] + }; + return v_uint32x4(vld1q_u32(elems)); +} + inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) { float CV_DECL_ALIGNED(32) elems[4] = @@ -1614,7 +1782,64 @@ inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_floa y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]); } +inline v_int8x16 v_interleave_pairs(const v_int8x16& vec) +{ + return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0705060403010200)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0705060403010200)))); +} +inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } +inline v_int8x16 v_interleave_quads(const v_int8x16& vec) +{ + return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0703060205010400)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0703060205010400)))); +} +inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } + +inline v_int16x8 v_interleave_pairs(const v_int16x8& vec) +{ + return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100))))); +} +inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } +inline v_int16x8 v_interleave_quads(const v_int16x8& vec) +{ + return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0b0a030209080100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0b0a030209080100))))); +} +inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } + +inline v_int32x4 v_interleave_pairs(const v_int32x4& vec) +{ + int32x2x2_t res = vzip_s32(vget_low_s32(vec.val), vget_high_s32(vec.val)); + return v_int32x4(vcombine_s32(res.val[0], res.val[1])); +} +inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } +inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } + +inline v_int8x16 v_pack_triplets(const v_int8x16& vec) +{ + 
return v_int8x16(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0605040201000000)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0807060504020100))), vdupq_n_s8(0), 2)); +} +inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } + +inline v_int16x8 v_pack_triplets(const v_int16x8& vec) +{ + return v_int16x8(vreinterpretq_s16_s8(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0504030201000000)), vget_high_s8(vreinterpretq_s8_s16(vec.val))), vdupq_n_s8(0), 2))); +} +inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } + #if CV_SIMD128_64F +inline v_float64x2 v_lut(const double* tab, const int* idx) +{ + double CV_DECL_ALIGNED(32) elems[2] = + { + tab[idx[0]], + tab[idx[1]] + }; + return v_float64x2(vld1q_f64(elems)); +} + +inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) +{ + return v_float64x2(vld1q_f64(tab + idx[0])); +} + inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) { double CV_DECL_ALIGNED(32) elems[2] = diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index f7a67da1a5..dcfae9a3a8 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -2699,6 +2699,126 @@ inline void v_store_fp16(short* ptr, const v_float32x4& a) ////////////// Lookup table access //////////////////// +inline v_int8x16 v_lut(const schar* tab, const int* idx) +{ +#if defined(_MSC_VER) + return v_int8x16(_mm_setr_epi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]], + tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]])); +#else + return v_int8x16(_mm_setr_epi64( + _mm_setr_pi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]]), + _mm_setr_pi8(tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]) + )); +#endif +} +inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx) +{ +#if defined(_MSC_VER) + return v_int8x16(_mm_setr_epi16(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3]), + *(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7]))); +#else + return v_int8x16(_mm_setr_epi64( + _mm_setr_pi16(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3])), + _mm_setr_pi16(*(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7])) + )); +#endif +} +inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) +{ +#if defined(_MSC_VER) + return v_int8x16(_mm_setr_epi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), + *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))); +#else + return v_int8x16(_mm_setr_epi64( + _mm_setr_pi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1])), + _mm_setr_pi32(*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])) + )); +#endif +} +inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar *)tab, idx)); } +inline v_uint8x16 
v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar *)tab, idx)); } +inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar *)tab, idx)); } + +inline v_int16x8 v_lut(const short* tab, const int* idx) +{ +#if defined(_MSC_VER) + return v_int16x8(_mm_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], + tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])); +#else + return v_int16x8(_mm_setr_epi64( + _mm_setr_pi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]), + _mm_setr_pi16(tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]) + )); +#endif +} +inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) +{ +#if defined(_MSC_VER) + return v_int16x8(_mm_setr_epi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), + *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))); +#else + return v_int16x8(_mm_setr_epi64( + _mm_setr_pi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1])), + _mm_setr_pi32(*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])) + )); +#endif +} +inline v_int16x8 v_lut_quads(const short* tab, const int* idx) +{ + return v_int16x8(_mm_set_epi64x(*(const int64_t*)(tab + idx[1]), *(const int64_t*)(tab + idx[0]))); +} +inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); } +inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); } +inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); } + +inline v_int32x4 v_lut(const int* tab, const int* idx) +{ +#if defined(_MSC_VER) + return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]], + tab[idx[2]], tab[idx[3]])); +#else + return v_int32x4(_mm_setr_epi64( + _mm_setr_pi32(tab[idx[0]], tab[idx[1]]), + _mm_setr_pi32(tab[idx[2]], tab[idx[3]]) + )); +#endif +} +inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) +{ + return v_int32x4(_mm_set_epi64x(*(const int64_t*)(tab + idx[1]), *(const int64_t*)(tab + idx[0]))); +} +inline v_int32x4 v_lut_quads(const int* tab, const int* idx) +{ + return v_int32x4(_mm_load_si128((const __m128i*)(tab + idx[0]))); +} +inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); } +inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); } +inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); } + +inline v_int64x2 v_lut(const int64_t* tab, const int* idx) +{ + return v_int64x2(_mm_set_epi64x(tab[idx[1]], tab[idx[0]])); +} +inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx) +{ + return v_int64x2(_mm_load_si128((const __m128i*)(tab + idx[0]))); +} +inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } +inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } + +inline v_float32x4 v_lut(const float* tab, const int* idx) +{ + return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]])); +} +inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int *)tab, idx)); } 
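+// The lut family mirrors the scalar reference implementation added to
+// intrin_cpp.hpp in this patch: lane i of v_lut is tab[idx[i]], of
+// v_lut_pairs is tab[idx[i/2] + i%2], and of v_lut_quads is tab[idx[i/4] + i%4].
+// Worked example for the float overloads on a 4-lane vector (sketch only;
+// tab and idx here are illustrative values, not part of the API):
+//   float tab[8] = { 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f };
+//   int idx2[2]  = { 4, 0 };
+//   v_float32x4 p = v_lut_pairs(tab, idx2); // lanes: 4.f 5.f 0.f 1.f
+//   int idx1[1]  = { 2 };
+//   v_float32x4 q = v_lut_quads(tab, idx1); // lanes: 2.f 3.f 4.f 5.f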
+inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_quads((const int *)tab, idx)); } + +inline v_float64x2 v_lut(const double* tab, const int* idx) +{ + return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]])); +} +inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_float64x2(_mm_castsi128_pd(_mm_load_si128((const __m128i*)(tab + idx[0])))); } + inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) { int CV_DECL_ALIGNED(32) idx[4]; @@ -2706,6 +2826,11 @@ inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]])); } +inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) +{ + return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec)); +} + inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) { int CV_DECL_ALIGNED(32) idx[4]; @@ -2751,6 +2876,77 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo y = v_float64x2(_mm_unpackhi_pd(xy0, xy1)); } +inline v_int8x16 v_interleave_pairs(const v_int8x16& vec) +{ +#if CV_SSSE3 + return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200))); +#else + __m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0)); + a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0)); + a = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0)); + return v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a))); +#endif +} +inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } +inline v_int8x16 v_interleave_quads(const v_int8x16& vec) +{ +#if CV_SSSE3 + return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400))); +#else + __m128i a = _mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0)); + return v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a))); +#endif +} +inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } + +inline v_int16x8 v_interleave_pairs(const v_int16x8& vec) +{ +#if CV_SSSE3 + return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100))); +#else + __m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0)); + return v_int16x8(_mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0))); +#endif +} +inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } +inline v_int16x8 v_interleave_quads(const v_int16x8& vec) +{ +#if CV_SSSE3 + return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100))); +#else + return v_int16x8(_mm_unpacklo_epi16(vec.val, _mm_unpackhi_epi64(vec.val, vec.val))); +#endif +} +inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } + +inline v_int32x4 v_interleave_pairs(const v_int32x4& vec) +{ + return v_int32x4(_mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0))); +} +inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } +inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } + +inline v_int8x16 v_pack_triplets(const v_int8x16& vec) +{ +#if CV_SSSE3 + return 
v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100))); +#else + __m128i mask = _mm_set1_epi64x(0x00000000FFFFFFFF); + __m128i a = _mm_or_si128(_mm_andnot_si128(mask, vec.val), _mm_and_si128(mask, _mm_sll_epi32(vec.val, _mm_set_epi64x(0, 8)))); + return v_int8x16(_mm_srli_si128(_mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 1, 0, 3)), 2)); +#endif +} +inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } + +inline v_int16x8 v_pack_triplets(const v_int16x8& vec) +{ +#if CV_SSSE3 + return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100))); +#else + return v_int16x8(_mm_srli_si128(_mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(2, 1, 0, 3)), 2)); +#endif +} +inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } ////////////// FP16 support /////////////////////////// diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index fce5c444ed..ddda1d10d0 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -993,6 +993,80 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) ////////////// Lookup table access //////////////////// +inline v_int8x16 v_lut(const schar* tab, const int* idx) +{ + return v_int8x16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]], + tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]); +} +inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx) +{ + return v_reinterpret_as_s8(v_int16x8(*(const short*)(tab+idx[0]), *(const short*)(tab+idx[1]), *(const short*)(tab+idx[2]), *(const short*)(tab+idx[3]), + *(const short*)(tab+idx[4]), *(const short*)(tab+idx[5]), *(const short*)(tab+idx[6]), *(const short*)(tab+idx[7]))); +} +inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) +{ + return v_reinterpret_as_s8(v_int32x4(*(const int*)(tab+idx[0]), *(const int*)(tab+idx[1]), *(const int*)(tab+idx[2]), *(const int*)(tab+idx[3]))); +} +inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); } +inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar*)tab, idx)); } +inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar*)tab, idx)); } + +inline v_int16x8 v_lut(const short* tab, const int* idx) +{ + return v_int16x8(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]); +} +inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) +{ + return v_reinterpret_as_s16(v_int32x4(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))); +} +inline v_int16x8 v_lut_quads(const short* tab, const int* idx) +{ + return v_reinterpret_as_s16(v_int64x2(*(const int64*)(tab + idx[0]), *(const int64*)(tab + idx[1]))); +} +inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short*)tab, idx)); } +inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short*)tab, idx)); } +inline v_uint16x8 v_lut_quads(const ushort* 
tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short*)tab, idx)); } + +inline v_int32x4 v_lut(const int* tab, const int* idx) +{ + return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); +} +inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) +{ + return v_reinterpret_as_s32(v_int64x2(*(const int64*)(tab + idx[0]), *(const int64*)(tab + idx[1]))); +} +inline v_int32x4 v_lut_quads(const int* tab, const int* idx) +{ + return v_int32x4(vsx_ld(0, tab + idx[0])); +} +inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int*)tab, idx)); } +inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int*)tab, idx)); } +inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int*)tab, idx)); } + +inline v_int64x2 v_lut(const int64_t* tab, const int* idx) +{ + return v_int64x2(tab[idx[0]], tab[idx[1]]); +} +inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx) +{ + return v_int64x2(vsx_ld2(0, tab + idx[0])); +} +inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } +inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } + +inline v_float32x4 v_lut(const float* tab, const int* idx) +{ + return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); +} +inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int*)tab, idx)); } +inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_load(tab + *idx); } + +inline v_float64x2 v_lut(const double* tab, const int* idx) +{ + return v_float64x2(tab[idx[0]], tab[idx[1]]); +} +inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_load(tab + *idx); } + inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) { int CV_DECL_ALIGNED(32) idx[4]; @@ -1000,6 +1074,13 @@ inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); } +inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + return v_uint32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); +} + inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) { int CV_DECL_ALIGNED(32) idx[4]; @@ -1030,6 +1111,55 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]); } +inline v_int8x16 v_interleave_pairs(const v_int8x16& vec) +{ + vec_short8 vec0 = vec_mergeh((vec_short8)vec.val, (vec_short8)vec_mergesql(vec.val, vec.val)); + vec0 = vec_mergeh(vec0, vec_mergesql(vec0, vec0)); + return v_int8x16(vec_mergeh((vec_char16)vec0, (vec_char16)vec_mergesql(vec0, vec0))); +} +inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } +inline v_int8x16 v_interleave_quads(const v_int8x16& vec) +{ + vec_char16 vec0 = (vec_char16)vec_mergeh((vec_int4)vec.val, (vec_int4)vec_mergesql(vec.val, vec.val)); + return v_int8x16(vec_mergeh(vec0, vec_mergesql(vec0, vec0))); +} +inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } + +inline 
v_int16x8 v_interleave_pairs(const v_int16x8& vec) +{ + vec_short8 vec0 = (vec_short8)vec_mergeh((vec_int4)vec.val, (vec_int4)vec_mergesql(vec.val, vec.val)); + return v_int16x8(vec_mergeh(vec0, vec_mergesql(vec0, vec0))); +} +inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } +inline v_int16x8 v_interleave_quads(const v_int16x8& vec) +{ + return v_int16x8(vec_mergeh(vec.val, vec_mergesql(vec.val, vec.val))); +} +inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } + +inline v_int32x4 v_interleave_pairs(const v_int32x4& vec) +{ + return v_int32x4(vec_mergeh(vec.val, vec_mergesql(vec.val, vec.val))); +} +inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } +inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } + +inline v_int8x16 v_pack_triplets(const v_int8x16& vec) +{ + schar CV_DECL_ALIGNED(32) val[16]; + v_store_aligned(val, vec); + return v_int8x16(val[0], val[1], val[2], val[4], val[5], val[6], val[8], val[9], val[10], val[12], val[13], val[14], val[15], val[15], val[15], val[15]); +} +inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } + +inline v_int16x8 v_pack_triplets(const v_int16x8& vec) +{ + short CV_DECL_ALIGNED(32) val[8]; + v_store_aligned(val, vec); + return v_int16x8(val[0], val[1], val[2], val[4], val[5], val[6], val[7], val[7]); +} +inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } + /////// FP16 support //////// // [TODO] implement these 2 using VSX or universal intrinsics (copy from intrin_sse.cpp and adopt) diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp index f182b77d0c..996f6977b1 100644 --- a/modules/imgproc/src/resize.cpp +++ b/modules/imgproc/src/resize.cpp @@ -340,155 +340,6 @@ static void hlineResizeCn(ET* src, int cn, int *ofst, FT* m, FT* dst, int dst_mi hline::ResizeCn(src, cn, ofst, m, dst, dst_min, dst_max, dst_width); }; -#if CV_SIMD512 -inline void v_load_indexed1(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1) -{ - v_expand(v_reinterpret_as_u8(v_uint16( - *((uint16_t*)(src + ofst[ 0])), *((uint16_t*)(src + ofst[ 1])), *((uint16_t*)(src + ofst[ 2])), *((uint16_t*)(src + ofst[ 3])), - *((uint16_t*)(src + ofst[ 4])), *((uint16_t*)(src + ofst[ 5])), *((uint16_t*)(src + ofst[ 6])), *((uint16_t*)(src + ofst[ 7])), - *((uint16_t*)(src + ofst[ 8])), *((uint16_t*)(src + ofst[ 9])), *((uint16_t*)(src + ofst[10])), *((uint16_t*)(src + ofst[11])), - *((uint16_t*)(src + ofst[12])), *((uint16_t*)(src + ofst[13])), *((uint16_t*)(src + ofst[14])), *((uint16_t*)(src + ofst[15])), - *((uint16_t*)(src + ofst[16])), *((uint16_t*)(src + ofst[17])), *((uint16_t*)(src + ofst[14])), *((uint16_t*)(src + ofst[15])), - *((uint16_t*)(src + ofst[20])), *((uint16_t*)(src + ofst[21])), *((uint16_t*)(src + ofst[14])), *((uint16_t*)(src + ofst[15])), - *((uint16_t*)(src + ofst[24])), *((uint16_t*)(src + ofst[25])), *((uint16_t*)(src + ofst[14])), *((uint16_t*)(src + ofst[15])), - *((uint16_t*)(src + ofst[28])), *((uint16_t*)(src + ofst[29])), *((uint16_t*)(src + ofst[14])), *((uint16_t*)(src + ofst[15])))), - v_src0, v_src1); -} -inline 
void v_load_indexed2(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1) -{ - v_expand(v_reinterpret_as_u8(v_uint32( - *((uint32_t*)(src + 2 * ofst[ 0])), *((uint32_t*)(src + 2 * ofst[ 1])), *((uint32_t*)(src + 2 * ofst[ 2])), *((uint32_t*)(src + 2 * ofst[ 3])), - *((uint32_t*)(src + 2 * ofst[ 4])), *((uint32_t*)(src + 2 * ofst[ 5])), *((uint32_t*)(src + 2 * ofst[ 6])), *((uint32_t*)(src + 2 * ofst[ 7])), - *((uint32_t*)(src + 2 * ofst[ 8])), *((uint32_t*)(src + 2 * ofst[ 9])), *((uint32_t*)(src + 2 * ofst[10])), *((uint32_t*)(src + 2 * ofst[11])), - *((uint32_t*)(src + 2 * ofst[12])), *((uint32_t*)(src + 2 * ofst[13])), *((uint32_t*)(src + 2 * ofst[14])), *((uint32_t*)(src + 2 * ofst[15])))), - v_src0, v_src1); - v_uint32 v_tmp0, v_tmp1, v_tmp2, v_tmp3; - v_zip(v_reinterpret_as_u32(v_src0), v_reinterpret_as_u32(v_src1), v_tmp2, v_tmp3); - v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1); - v_zip(v_tmp0, v_tmp1, v_tmp2, v_tmp3); - v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1); - v_zip(v_reinterpret_as_u16(v_tmp0), v_reinterpret_as_u16(v_tmp1), v_src0, v_src1); -} -inline void v_load_indexed4(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1) -{ - v_expand(v_reinterpret_as_u8(v_uint64( - *((uint64_t*)(src + 4 * ofst[0])), *((uint64_t*)(src + 4 * ofst[1])), *((uint64_t*)(src + 4 * ofst[2])), *((uint64_t*)(src + 4 * ofst[3])), - *((uint64_t*)(src + 4 * ofst[4])), *((uint64_t*)(src + 4 * ofst[5])), *((uint64_t*)(src + 4 * ofst[6])), *((uint64_t*)(src + 4 * ofst[7])))), - v_src0, v_src1); - v_uint64 v_tmp0, v_tmp1, v_tmp2, v_tmp3; - v_zip(v_reinterpret_as_u64(v_src0), v_reinterpret_as_u64(v_src1), v_tmp2, v_tmp3); - v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1); - v_zip(v_tmp0, v_tmp1, v_tmp2, v_tmp3); - v_zip(v_reinterpret_as_u16(v_tmp2), v_reinterpret_as_u16(v_tmp3), v_src0, v_src1); -} -inline void v_load_indexed_deinterleave(uint16_t* src, int *ofst, v_uint32 &v_src0, v_uint32 &v_src1) -{ - v_expand(v_reinterpret_as_u16(v_uint32( - *((uint32_t*)(src + ofst[ 0])), *((uint32_t*)(src + ofst[ 1])), *((uint32_t*)(src + ofst[ 2])), *((uint32_t*)(src + ofst[ 3])), - *((uint32_t*)(src + ofst[ 4])), *((uint32_t*)(src + ofst[ 5])), *((uint32_t*)(src + ofst[ 6])), *((uint32_t*)(src + ofst[ 7])), - *((uint32_t*)(src + ofst[ 8])), *((uint32_t*)(src + ofst[ 9])), *((uint32_t*)(src + ofst[10])), *((uint32_t*)(src + ofst[11])), - *((uint32_t*)(src + ofst[12])), *((uint32_t*)(src + ofst[13])), *((uint32_t*)(src + ofst[14])), *((uint32_t*)(src + ofst[15])))), - v_src0, v_src1); - v_uint32 v_tmp0, v_tmp1; - v_zip(v_src0, v_src1, v_tmp0, v_tmp1); - v_zip(v_tmp0, v_tmp1, v_src0, v_src1); - v_zip(v_src0, v_src1, v_tmp0, v_tmp1); - v_zip(v_tmp0, v_tmp1, v_src0, v_src1); -} -#elif CV_SIMD256 -inline void v_load_indexed1(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1) -{ - v_expand(v_reinterpret_as_u8(v_uint16( - *((uint16_t*)(src + ofst[ 0])), *((uint16_t*)(src + ofst[ 1])), *((uint16_t*)(src + ofst[ 2])), *((uint16_t*)(src + ofst[ 3])), - *((uint16_t*)(src + ofst[ 4])), *((uint16_t*)(src + ofst[ 5])), *((uint16_t*)(src + ofst[ 6])), *((uint16_t*)(src + ofst[ 7])), - *((uint16_t*)(src + ofst[ 8])), *((uint16_t*)(src + ofst[ 9])), *((uint16_t*)(src + ofst[10])), *((uint16_t*)(src + ofst[11])), - *((uint16_t*)(src + ofst[12])), *((uint16_t*)(src + ofst[13])), *((uint16_t*)(src + ofst[14])), *((uint16_t*)(src + ofst[15])))), - v_src0, v_src1); -} -inline void v_load_indexed2(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1) -{ - v_expand(v_reinterpret_as_u8(v_uint32( - *((uint32_t*)(src + 2 * 
ofst[0])), *((uint32_t*)(src + 2 * ofst[1])), *((uint32_t*)(src + 2 * ofst[2])), *((uint32_t*)(src + 2 * ofst[3])), - *((uint32_t*)(src + 2 * ofst[4])), *((uint32_t*)(src + 2 * ofst[5])), *((uint32_t*)(src + 2 * ofst[6])), *((uint32_t*)(src + 2 * ofst[7])))), - v_src0, v_src1); - v_uint32 v_tmp0, v_tmp1, v_tmp2, v_tmp3; - v_zip(v_reinterpret_as_u32(v_src0), v_reinterpret_as_u32(v_src1), v_tmp2, v_tmp3); - v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1); - v_zip(v_tmp0, v_tmp1, v_tmp2, v_tmp3); - v_zip(v_reinterpret_as_u16(v_tmp2), v_reinterpret_as_u16(v_tmp3), v_src0, v_src1); -} -inline void v_load_indexed4(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1) -{ - v_expand(v_reinterpret_as_u8(v_uint64( - *((uint64_t*)(src + 4 * ofst[0])), *((uint64_t*)(src + 4 * ofst[1])), *((uint64_t*)(src + 4 * ofst[2])), *((uint64_t*)(src + 4 * ofst[3])))), - v_src0, v_src1); - v_uint64 v_tmp0, v_tmp1, v_tmp2, v_tmp3; - v_zip(v_reinterpret_as_u64(v_src0), v_reinterpret_as_u64(v_src1), v_tmp2, v_tmp3); - v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1); - v_zip(v_reinterpret_as_u16(v_tmp0), v_reinterpret_as_u16(v_tmp1), v_src0, v_src1); -} -inline void v_load_indexed_deinterleave(uint16_t* src, int *ofst, v_uint32 &v_src0, v_uint32 &v_src1) -{ - v_uint32 v_tmp0, v_tmp1; - v_expand(v_reinterpret_as_u16(v_uint32( - *((uint32_t*)(src + ofst[0])), *((uint32_t*)(src + ofst[1])), *((uint32_t*)(src + ofst[2])), *((uint32_t*)(src + ofst[3])), - *((uint32_t*)(src + ofst[4])), *((uint32_t*)(src + ofst[5])), *((uint32_t*)(src + ofst[6])), *((uint32_t*)(src + ofst[7])))), - v_tmp0, v_tmp1); - v_zip(v_tmp0, v_tmp1, v_src0, v_src1); - v_zip(v_src0, v_src1, v_tmp0, v_tmp1); - v_zip(v_tmp0, v_tmp1, v_src0, v_src1); -} -#elif CV_SIMD128 -inline void v_load_indexed1(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1) -{ - uint16_t buf[8]; - buf[0] = *((uint16_t*)(src + ofst[0])); - buf[1] = *((uint16_t*)(src + ofst[1])); - buf[2] = *((uint16_t*)(src + ofst[2])); - buf[3] = *((uint16_t*)(src + ofst[3])); - buf[4] = *((uint16_t*)(src + ofst[4])); - buf[5] = *((uint16_t*)(src + ofst[5])); - buf[6] = *((uint16_t*)(src + ofst[6])); - buf[7] = *((uint16_t*)(src + ofst[7])); - v_src0 = vx_load_expand((uint8_t*)buf); - v_src1 = vx_load_expand((uint8_t*)buf + 8); -} -inline void v_load_indexed2(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1) -{ - uint32_t buf[4]; - buf[0] = *((uint32_t*)(src + 2 * ofst[0])); - buf[1] = *((uint32_t*)(src + 2 * ofst[1])); - buf[2] = *((uint32_t*)(src + 2 * ofst[2])); - buf[3] = *((uint32_t*)(src + 2 * ofst[3])); - v_uint32 v_tmp0, v_tmp1, v_tmp2, v_tmp3; - v_tmp0 = v_reinterpret_as_u32(vx_load_expand((uint8_t*)buf)); - v_tmp1 = v_reinterpret_as_u32(vx_load_expand((uint8_t*)buf + 8)); - v_zip(v_tmp0, v_tmp1, v_tmp2, v_tmp3); - v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1); - v_zip(v_reinterpret_as_u16(v_tmp0), v_reinterpret_as_u16(v_tmp1), v_src0, v_src1); -} -inline void v_load_indexed4(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1) -{ - v_uint16 v_tmp0, v_tmp1; - v_src0 = vx_load_expand(src + 4 * ofst[0]); - v_src1 = vx_load_expand(src + 4 * ofst[1]); - v_recombine(v_src0, v_src1, v_tmp0, v_tmp1); - v_zip(v_tmp0, v_tmp1, v_src0, v_src1); -} -inline void v_load_indexed_deinterleave(uint16_t* src, int *ofst, v_uint32 &v_src0, v_uint32 &v_src1) -{ - uint32_t buf[4]; - buf[0] = *((uint32_t*)(src + ofst[0])); - buf[1] = *((uint32_t*)(src + ofst[1])); - buf[2] = *((uint32_t*)(src + ofst[2])); - buf[3] = *((uint32_t*)(src + ofst[3])); - v_src0 = vx_load_expand((uint16_t*)buf); - v_src1 = 
vx_load_expand((uint16_t*)buf + 4); - v_uint32 v_tmp0, v_tmp1; - v_zip(v_src0, v_src1, v_tmp0, v_tmp1); - v_zip(v_tmp0, v_tmp1, v_src0, v_src1); -} -#endif template <> void hlineResizeCn(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width) { @@ -507,16 +358,23 @@ void hlineResizeCn(uint8_t* src, int, int *o *(dst++) = src_0; } #if CV_SIMD - for (; i <= dst_max - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ) + for (; i <= dst_max - 2*VECSZ; i += 2*VECSZ, m += 4*VECSZ, dst += 2*VECSZ) { v_uint16 v_src0, v_src1; - v_load_indexed1(src, ofst + i, v_src0, v_src1); - - v_int16 v_mul0 = vx_load((int16_t*)m); - v_int16 v_mul1 = vx_load((int16_t*)m + VECSZ); - v_uint32 v_res0 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), v_mul0)); - v_uint32 v_res1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), v_mul1)); - v_store((uint16_t*)dst, v_pack(v_res0, v_res1)); + v_expand(vx_lut_pairs(src, ofst + i), v_src0, v_src1); + v_store((uint16_t*)dst , v_pack(v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), vx_load((int16_t*)m))), + v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), vx_load((int16_t*)m + VECSZ))))); + v_expand(vx_lut_pairs(src, ofst + i + VECSZ), v_src0, v_src1); + v_store((uint16_t*)dst+VECSZ, v_pack(v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), vx_load((int16_t*)m + 2*VECSZ))), + v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), vx_load((int16_t*)m + 3*VECSZ))))); + } + if (i <= dst_max - VECSZ) + { + v_uint16 v_src0, v_src1; + v_expand(vx_lut_pairs(src, ofst + i), v_src0, v_src1); + v_store((uint16_t*)dst, v_pack(v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), vx_load((int16_t*)m))), + v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), vx_load((int16_t*)m + VECSZ))))); + i += VECSZ; m += 2*VECSZ; dst += VECSZ; } #endif for (; i < dst_max; i += 1, m += 2) @@ -564,7 +422,7 @@ void hlineResizeCn(uint8_t* src, int, int *o for (; i <= dst_max - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += VECSZ) { v_uint16 v_src0, v_src1; - v_load_indexed2(src, ofst + i, v_src0, v_src1); + v_expand(v_interleave_pairs(v_reinterpret_as_u8(vx_lut_pairs((uint16_t*)src, ofst + i))), v_src0, v_src1); v_uint32 v_mul = vx_load((uint32_t*)m);//AaBbCcDd v_uint32 v_zip0, v_zip1; @@ -595,6 +453,81 @@ void hlineResizeCn(uint8_t* src, int, int *o } } template <> +void hlineResizeCn(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width) +{ + int i = 0; + union { + uint64_t q; + uint16_t w[4]; + } srccn; + ((ufixedpoint16*)(srccn.w))[0] = src[0]; + ((ufixedpoint16*)(srccn.w))[1] = src[1]; + ((ufixedpoint16*)(srccn.w))[2] = src[2]; + ((ufixedpoint16*)(srccn.w))[3] = 0; +#if CV_SIMD + const int VECSZ = v_uint16::nlanes; + v_uint16 v_srccn = v_pack_triplets(v_reinterpret_as_u16(vx_setall_u64(srccn.q))); + for (; i <= dst_min - (VECSZ+2)/3; i += VECSZ/4, m += VECSZ/2, dst += 3*VECSZ/4) // Points that fall left from src image so became equal to leftmost src point + { + v_store((uint16_t*)dst, v_srccn); + } +#endif + for (; i < dst_min; i++, m += 2) + { + *(dst++) = ((ufixedpoint16*)(srccn.w))[0]; + *(dst++) = ((ufixedpoint16*)(srccn.w))[1]; + *(dst++) = ((ufixedpoint16*)(srccn.w))[2]; + } +#if CV_SIMD + CV_DECL_ALIGNED(CV_SIMD_WIDTH) int ofst3[VECSZ/2]; + for (; i <= dst_max - (3*VECSZ/4 + (VECSZ+2)/3); i += VECSZ/2, m += VECSZ, dst += 3*VECSZ/2) + { + v_store(ofst3, vx_load(ofst + i) * vx_setall_s32(3)); + v_uint8 v_src01, 
v_src23; + v_uint16 v_src0, v_src1, v_src2, v_src3; + v_zip(vx_lut_quads(src, ofst3), vx_lut_quads(src+3, ofst3), v_src01, v_src23); + v_expand(v_src01, v_src0, v_src1); + v_expand(v_src23, v_src2, v_src3); + + v_uint32 v_mul0, v_mul1, v_mul2, v_mul3, v_tmp; + v_mul0 = vx_load((uint32_t*)m);//AaBbCcDd + v_zip(v_mul0, v_mul0, v_mul3, v_tmp );//AaAaBbBb CcCcDdDd + v_zip(v_mul3, v_mul3, v_mul0, v_mul1);//AaAaAaAa BbBbBbBb + v_zip(v_tmp , v_tmp , v_mul2, v_mul3);//CcCcCcCc DdDdDdDd + + v_uint32 v_res0 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), v_reinterpret_as_s16(v_mul0))); + v_uint32 v_res1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), v_reinterpret_as_s16(v_mul1))); + v_uint32 v_res2 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src2), v_reinterpret_as_s16(v_mul2))); + v_uint32 v_res3 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src3), v_reinterpret_as_s16(v_mul3))); + v_store((uint16_t*)dst , v_pack_triplets(v_pack(v_res0, v_res1))); + v_store((uint16_t*)dst + 3*VECSZ/4, v_pack_triplets(v_pack(v_res2, v_res3))); + } +#endif + for (; i < dst_max; i += 1, m += 2) + { + uint8_t* px = src + 3 * ofst[i]; + *(dst++) = m[0] * px[0] + m[1] * px[3]; + *(dst++) = m[0] * px[1] + m[1] * px[4]; + *(dst++) = m[0] * px[2] + m[1] * px[5]; + } + ((ufixedpoint16*)(srccn.w))[0] = (src + 3*ofst[dst_width - 1])[0]; + ((ufixedpoint16*)(srccn.w))[1] = (src + 3*ofst[dst_width - 1])[1]; + ((ufixedpoint16*)(srccn.w))[2] = (src + 3*ofst[dst_width - 1])[2]; +#if CV_SIMD + v_srccn = v_pack_triplets(v_reinterpret_as_u16(vx_setall_u64(srccn.q))); + for (; i <= dst_width - (VECSZ+2)/3; i += VECSZ/4, dst += 3*VECSZ/4) // Points that fall right from src image so became equal to rightmost src point + { + v_store((uint16_t*)dst, v_srccn); + } +#endif + for (; i < dst_width; i++) + { + *(dst++) = ((ufixedpoint16*)(srccn.w))[0]; + *(dst++) = ((ufixedpoint16*)(srccn.w))[1]; + *(dst++) = ((ufixedpoint16*)(srccn.w))[2]; + } +} +template <> void hlineResizeCn(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width) { int i = 0; @@ -614,20 +547,19 @@ void hlineResizeCn(uint8_t* src, int, int *o v_store((uint16_t*)dst, v_srccn); } #endif - if (i < dst_min) // Points that fall left from src image so became equal to leftmost src point + for (; i < dst_min; i++, m += 2) { *(dst++) = ((ufixedpoint16*)(srccn.w))[0]; *(dst++) = ((ufixedpoint16*)(srccn.w))[1]; *(dst++) = ((ufixedpoint16*)(srccn.w))[2]; *(dst++) = ((ufixedpoint16*)(srccn.w))[3]; - i++; m += 2; } #if CV_SIMD for (; i <= dst_max - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += 2*VECSZ) { v_uint16 v_src0, v_src1, v_src2, v_src3; - v_load_indexed4(src, ofst + i, v_src0, v_src1); - v_load_indexed4(src, ofst + i + VECSZ/4, v_src2, v_src3); + v_expand(v_interleave_quads(v_reinterpret_as_u8(vx_lut_pairs((uint32_t*)src, ofst + i))), v_src0, v_src1); + v_expand(v_interleave_quads(v_reinterpret_as_u8(vx_lut_pairs((uint32_t*)src, ofst + i + VECSZ/4))), v_src2, v_src3); v_uint32 v_mul0, v_mul1, v_mul2, v_mul3, v_tmp; v_mul0 = vx_load((uint32_t*)m);//AaBbCcDd @@ -660,7 +592,7 @@ void hlineResizeCn(uint8_t* src, int, int *o v_store((uint16_t*)dst, v_srccn); } #endif - if (i < dst_width) + for (; i < dst_width; i++) { *(dst++) = ((ufixedpoint16*)(srccn.w))[0]; *(dst++) = ((ufixedpoint16*)(srccn.w))[1]; @@ -689,10 +621,12 @@ void hlineResizeCn(uint16_t* src, int, int for (; i <= dst_max - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ) { v_uint32 v_src0, v_src1; - 
v_load_indexed_deinterleave(src, ofst + i, v_src0, v_src1); - v_uint32 v_mul0, v_mul1; - v_load_deinterleave((uint32_t*)m, v_mul0, v_mul1); - v_store((uint32_t*)dst, v_src0 * v_mul0 + v_src1 * v_mul1);//abcd + v_expand(vx_lut_pairs(src, ofst + i), v_src0, v_src1); + + v_uint64 v_res0 = v_reinterpret_as_u64(v_src0 * vx_load((uint32_t*)m)); + v_uint64 v_res1 = v_reinterpret_as_u64(v_src1 * vx_load((uint32_t*)m + VECSZ)); + v_store((uint32_t*)dst, v_pack((v_res0 & vx_setall_u64(0xFFFFFFFF)) + (v_res0 >> 32), + (v_res1 & vx_setall_u64(0xFFFFFFFF)) + (v_res1 >> 32))); } #endif for (; i < dst_max; i += 1, m += 2)
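
For context on this last hunk: the 16-bit path now gathers neighbour pairs with vx_lut_pairs instead of the removed v_load_indexed_deinterleave helper, and the mask/shift over the reinterpreted u64 lanes sums each pair of 32x32-bit products. Every output sample is therefore the same two-tap fixed-point filter as in the scalar tail whose loop header closes this excerpt. A minimal scalar model follows (a sketch with illustrative names; the real code uses ufixedpoint32 arithmetic, which the plain uint32_t multiply reproduces mod 2^32):

    #include <cstdint>
    // One horizontal pass over n output samples: dst[j] is a 2-tap dot product
    // of the two source pixels selected by ofst[j], with fixed-point weights m
    // (two weights per output sample, as in hlineResizeCn above).
    static void hline_u16_c1_scalar(const uint16_t* src, const int* ofst,
                                    const uint32_t* m, uint32_t* dst, int n)
    {
        for (int j = 0; j < n; j++, m += 2)
            dst[j] = m[0] * src[ofst[j]] + m[1] * src[ofst[j] + 1];
    }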