diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index fbd6f470cd..6fc03b7274 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -90,6 +90,50 @@ inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b) return _mm256_packus_epi32(am, bm); } +template +inline int _v256_extract_epi8(const __m256i& a) +{ +#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/)) + return _mm256_extract_epi8(a, i); +#else + __m128i b = _mm256_extractf128_si256(a, ((i) >> 4)); + return _mm_extract_epi8(b, i & 15); // SSE4.1 +#endif +} + +template +inline int _v256_extract_epi16(const __m256i& a) +{ +#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/)) + return _mm256_extract_epi16(a, i); +#else + __m128i b = _mm256_extractf128_si256(a, ((i) >> 3)); + return _mm_extract_epi16(b, i & 7); // SSE2 +#endif +} + +template +inline int _v256_extract_epi32(const __m256i& a) +{ +#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/)) + return _mm256_extract_epi32(a, i); +#else + __m128i b = _mm256_extractf128_si256(a, ((i) >> 2)); + return _mm_extract_epi32(b, i & 3); // SSE4.1 +#endif +} + +template +inline int64 _v256_extract_epi64(const __m256i& a) +{ +#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/)) + return _mm256_extract_epi64(a, i); +#else + __m128i b = _mm256_extractf128_si256(a, ((i) >> 1)); + return _mm_extract_epi64(b, i & 1); // SSE4.1 +#endif +} + ///////// Types //////////// struct v_uint8x32 @@ -2195,6 +2239,85 @@ OPENCV_HAL_IMPL_AVX_EXTRACT(v_int64x4) OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8) OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4) +template +inline uchar v_extract_n(v_uint8x32 a) +{ + return (uchar)_v256_extract_epi8(a.val); +} + +template +inline schar v_extract_n(v_int8x32 a) +{ + return (schar)v_extract_n(v_reinterpret_as_u8(a)); +} + +template +inline ushort v_extract_n(v_uint16x16 a) +{ + return (ushort)_v256_extract_epi16(a.val); +} + +template +inline short v_extract_n(v_int16x16 a) +{ + return (short)v_extract_n(v_reinterpret_as_u16(a)); +} + +template +inline uint v_extract_n(v_uint32x8 a) +{ + return (uint)_v256_extract_epi32(a.val); +} + +template +inline int v_extract_n(v_int32x8 a) +{ + return (int)v_extract_n(v_reinterpret_as_u32(a)); +} + +template +inline uint64 v_extract_n(v_uint64x4 a) +{ + return (uint64)_v256_extract_epi64(a.val); +} + +template +inline int64 v_extract_n(v_int64x4 v) +{ + return (int64)v_extract_n(v_reinterpret_as_u64(v)); +} + +template +inline float v_extract_n(v_float32x8 v) +{ + union { uint iv; float fv; } d; + d.iv = v_extract_n(v_reinterpret_as_u32(v)); + return d.fv; +} + +template +inline double v_extract_n(v_float64x4 v) +{ + union { uint64 iv; double dv; } d; + d.iv = v_extract_n(v_reinterpret_as_u64(v)); + return d.dv; +} + +template +inline v_uint32x8 v_broadcast_element(v_uint32x8 a) +{ + static const __m256i perm = _mm256_set1_epi32((char)i); + return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm)); +} + +template +inline v_int32x8 v_broadcast_element(const v_int32x8 &a) +{ return v_reinterpret_as_s32(v_broadcast_element(v_reinterpret_as_u32(a))); } + +template +inline v_float32x8 v_broadcast_element(const v_float32x8 &a) +{ return v_reinterpret_as_f32(v_broadcast_element(v_reinterpret_as_u32(a))); } + ///////////////////// load deinterleave ///////////////////////////// diff --git a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp index 2c31a8d014..3fa9027c04 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp @@ -2228,6 +2228,35 @@ OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int64x8) OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float32x16) OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float64x8) +#define OPENCV_HAL_IMPL_AVX512_EXTRACT_N(_Tpvec, _Tp) \ +template inline _Tp v_extract_n(_Tpvec v) { return v_rotate_right(v).get0(); } + +OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint8x64, uchar) +OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int8x64, schar) +OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint16x32, ushort) +OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int16x32, short) +OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint32x16, uint) +OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int32x16, int) +OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint64x8, uint64) +OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int64x8, int64) +OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float32x16, float) +OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float64x8, double) + +template +inline v_uint32x16 v_broadcast_element(v_uint32x16 a) +{ + static const __m512i perm = _mm512_set1_epi32((char)i); + return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val)); +} + +template +inline v_int32x16 v_broadcast_element(const v_int32x16 &a) +{ return v_reinterpret_as_s32(v_broadcast_element(v_reinterpret_as_u32(a))); } + +template +inline v_float32x16 v_broadcast_element(const v_float32x16 &a) +{ return v_reinterpret_as_f32(v_broadcast_element(v_reinterpret_as_u32(a))); } + ///////////////////// load deinterleave ///////////////////////////// diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index ed5396e49e..67d3155f00 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -217,6 +217,8 @@ Regular integers: |cvt_flt64 | | | | | | x | |transpose4x4 | | | | | x | x | |reverse | x | x | x | x | x | x | +|extract_n | x | x | x | x | x | x | +|broadcast_element | | | | | x | x | Big integers: @@ -230,6 +232,7 @@ Big integers: |extract | x | x | |rotate (lanes) | x | x | |cvt_flt64 | | x | +|extract_n | x | x | Floating point: @@ -254,6 +257,8 @@ Floating point: |extract | x | x | |rotate (lanes) | x | x | |reverse | x | x | +|extract_n | x | x | +|broadcast_element | x | | @{ */ @@ -1784,6 +1789,42 @@ inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) return r; } +/** @brief Vector extract + +Scheme: +Return the s-th element of v. +Restriction: 0 <= s < nlanes + +Usage: +@code +v_int32x4 a; +int r; +r = v_extract_n<2>(a); +@endcode +For all types. */ +template +inline _Tp v_extract_n(const v_reg<_Tp, n>& v) +{ + CV_DbgAssert(s >= 0 && s < n); + return v.s[s]; +} + +/** @brief Broadcast i-th element of vector + +Scheme: +@code +{ v[0] v[1] v[2] ... v[SZ] } => { v[i], v[i], v[i] ... v[i] } +@endcode +Restriction: 0 <= i < nlanes +Supported types: 32-bit integers and floats (s32/u32/f32) + */ +template +inline v_reg<_Tp, n> v_broadcast_element(const v_reg<_Tp, n>& a) +{ + CV_DbgAssert(i >= 0 && i < n); + return v_reg<_Tp, n>::all(a.s[i]); +} + /** @brief Round Rounds each value. Input type is float vector ==> output type is int vector.*/ diff --git a/modules/core/include/opencv2/core/hal/intrin_msa.hpp b/modules/core/include/opencv2/core/hal/intrin_msa.hpp index 4dbdfef49d..ca2cba725d 100755 --- a/modules/core/include/opencv2/core/hal/intrin_msa.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_msa.hpp @@ -1783,6 +1783,18 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo y = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvodq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0)))); } +template +inline typename _Tp::lane_type v_extract_n(const _Tp& a) +{ + return v_rotate_right(a).get0(); +} + +template +inline _Tp v_broadcast_element(const _Tp& a) +{ + return _Tp::all(v_extract_n(a)); +} + ////// FP16 suport /////// #if CV_FP16 inline v_float32x4 v_load_expand(const float16_t* ptr) diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index abbd635fac..4da389f48b 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -1651,6 +1651,38 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32) OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64) #endif +#define OPENCV_HAL_IMPL_NEON_EXTRACT_N(_Tpvec, _Tp, suffix) \ +template inline _Tp v_extract_n(_Tpvec v) { return vgetq_lane_##suffix(v.val, i); } + +OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int8x16, schar, s8) +OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16) +OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint32x4, uint, u32) +OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32) +OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint64x2, uint64, u64) +OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int64x2, int64, s64) +OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float32x4, float, f32) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float64x2, double, f64) +#endif + +#define OPENCV_HAL_IMPL_NEON_BROADCAST(_Tpvec, _Tp, suffix) \ +template inline _Tpvec v_broadcast_element(_Tpvec v) { _Tp t = v_extract_n(v); return v_setall_##suffix(t); } + +OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_NEON_BROADCAST(v_int8x16, schar, s8) +OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16) +OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint32x4, uint, u32) +OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32) +OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint64x2, uint64, u64) +OPENCV_HAL_IMPL_NEON_BROADCAST(v_int64x2, int64, s64) +OPENCV_HAL_IMPL_NEON_BROADCAST(v_float32x4, float, f32) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_NEON_BROADCAST(v_float64x2, double, f64) +#endif + #if CV_SIMD128_64F inline v_int32x4 v_round(const v_float32x4& a) { diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index e7370504ef..867ff55340 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -57,6 +57,14 @@ namespace cv //! @cond IGNORED +// +// Compilation troubleshooting: +// - MSVC: error C2719: 'a': formal parameter with requested alignment of 16 won't be aligned +// Replace parameter declaration to const reference: +// -v_int32x4 a +// +const v_int32x4& a +// + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN ///////// Types //////////// @@ -3270,6 +3278,100 @@ inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; } inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } +template +inline uchar v_extract_n(const v_uint8x16& v) +{ +#if CV_SSE4_1 + return (uchar)_mm_extract_epi8(v.val, i); +#else + return v_rotate_right(v).get0(); +#endif +} + +template +inline schar v_extract_n(const v_int8x16& v) +{ + return (schar)v_extract_n(v_reinterpret_as_u8(v)); +} + +template +inline ushort v_extract_n(const v_uint16x8& v) +{ + return (ushort)_mm_extract_epi16(v.val, i); +} + +template +inline short v_extract_n(const v_int16x8& v) +{ + return (short)v_extract_n(v_reinterpret_as_u16(v)); +} + +template +inline uint v_extract_n(const v_uint32x4& v) +{ +#if CV_SSE4_1 + return (uint)_mm_extract_epi32(v.val, i); +#else + return v_rotate_right(v).get0(); +#endif +} + +template +inline int v_extract_n(const v_int32x4& v) +{ + return (int)v_extract_n(v_reinterpret_as_u32(v)); +} + +template +inline uint64 v_extract_n(const v_uint64x2& v) +{ +#ifdef CV__SIMD_NATIVE_mm_extract_epi64 + return (uint64)_v128_extract_epi64(v.val); +#else + return v_rotate_right(v).get0(); +#endif +} + +template +inline int64 v_extract_n(const v_int64x2& v) +{ + return (int64)v_extract_n(v_reinterpret_as_u64(v)); +} + +template +inline float v_extract_n(const v_float32x4& v) +{ + union { uint iv; float fv; } d; + d.iv = v_extract_n(v_reinterpret_as_u32(v)); + return d.fv; +} + +template +inline double v_extract_n(const v_float64x2& v) +{ + union { uint64 iv; double dv; } d; + d.iv = v_extract_n(v_reinterpret_as_u64(v)); + return d.dv; +} + +template +inline v_int32x4 v_broadcast_element(const v_int32x4& v) +{ + return v_int32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i))); +} + +template +inline v_uint32x4 v_broadcast_element(const v_uint32x4& v) +{ + return v_uint32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i))); +} + +template +inline v_float32x4 v_broadcast_element(const v_float32x4& v) +{ + return v_float32x4(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE((char)i,(char)i,(char)i,(char)i))); +} + ////////////// FP16 support /////////////////////////// inline v_float32x4 v_load_expand(const float16_t* ptr) diff --git a/modules/core/include/opencv2/core/hal/intrin_sse_em.hpp b/modules/core/include/opencv2/core/hal/intrin_sse_em.hpp index be2766847c..6fb088161a 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse_em.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse_em.hpp @@ -158,10 +158,23 @@ inline __m128i _v128_packs_epu32(const __m128i& a, const __m128i& b) #endif } +template +inline int64 _v128_extract_epi64(const __m128i& a) +{ +#if defined(CV__SIMD_HAVE_mm_extract_epi64) || (CV_SSE4_1 && (defined(__x86_64__)/*GCC*/ || defined(_M_X64)/*MSVC*/)) +#define CV__SIMD_NATIVE_mm_extract_epi64 1 + return _mm_extract_epi64(a, i); +#else + CV_DECL_ALIGNED(16) int64 tmp[2]; + _mm_store_si128((__m128i*)tmp, a); + return tmp[i]; +#endif +} + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! @endcond } // cv:: -#endif // OPENCV_HAL_INTRIN_SSE_EM_HPP \ No newline at end of file +#endif // OPENCV_HAL_INTRIN_SSE_EM_HPP diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 5b4a0d4137..e4d13af1a2 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -206,6 +206,20 @@ struct v_float64x2 { return vec_extract(val, 0); } }; +#define OPENCV_HAL_IMPL_VSX_EXTRACT_N(_Tpvec, _Tp) \ +template inline _Tp v_extract_n(VSX_UNUSED(_Tpvec v)) { return vec_extract(v.val, i); } + +OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint8x16, uchar) +OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int8x16, schar) +OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint16x8, ushort) +OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int16x8, short) +OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint32x4, uint) +OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int32x4, int) +OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint64x2, uint64) +OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int64x2, int64) +OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float32x4, float) +OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float64x2, double) + //////////////// Load and store operations /////////////// /* @@ -1524,6 +1538,82 @@ OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_uint32x4, vec_uint4) OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_int32x4, vec_int4) OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_float32x4, vec_float4) +template +inline v_int8x16 v_broadcast_element(v_int8x16 v) +{ + return v_int8x16(vec_perm(v.val, v.val, vec_splats((unsigned char)i))); +} + +template +inline v_uint8x16 v_broadcast_element(v_uint8x16 v) +{ + return v_uint8x16(vec_perm(v.val, v.val, vec_splats((unsigned char)i))); +} + +template +inline v_int16x8 v_broadcast_element(v_int16x8 v) +{ + unsigned char t0 = 2*i, t1 = 2*i + 1; + vec_uchar16 p = {t0, t1, t0, t1, t0, t1, t0, t1, t0, t1, t0, t1, t0, t1, t0, t1}; + return v_int16x8(vec_perm(v.val, v.val, p)); +} + +template +inline v_uint16x8 v_broadcast_element(v_uint16x8 v) +{ + unsigned char t0 = 2*i, t1 = 2*i + 1; + vec_uchar16 p = {t0, t1, t0, t1, t0, t1, t0, t1, t0, t1, t0, t1, t0, t1, t0, t1}; + return v_uint16x8(vec_perm(v.val, v.val, p)); +} + +template +inline v_int32x4 v_broadcast_element(v_int32x4 v) +{ + unsigned char t0 = 4*i, t1 = 4*i + 1, t2 = 4*i + 2, t3 = 4*i + 3; + vec_uchar16 p = {t0, t1, t2, t3, t0, t1, t2, t3, t0, t1, t2, t3, t0, t1, t2, t3}; + return v_int32x4(vec_perm(v.val, v.val, p)); +} + +template +inline v_uint32x4 v_broadcast_element(v_uint32x4 v) +{ + unsigned char t0 = 4*i, t1 = 4*i + 1, t2 = 4*i + 2, t3 = 4*i + 3; + vec_uchar16 p = {t0, t1, t2, t3, t0, t1, t2, t3, t0, t1, t2, t3, t0, t1, t2, t3}; + return v_uint32x4(vec_perm(v.val, v.val, p)); +} + +template +inline v_int64x2 v_broadcast_element(v_int64x2 v) +{ + unsigned char t0 = 8*i, t1 = 8*i + 1, t2 = 8*i + 2, t3 = 8*i + 3, t4 = 8*i + 4, t5 = 8*i + 5, t6 = 8*i + 6, t7 = 8*i + 7; + vec_uchar16 p = {t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7}; + return v_int64x2(vec_perm(v.val, v.val, p)); +} + +template +inline v_uint64x2 v_broadcast_element(v_uint64x2 v) +{ + unsigned char t0 = 8*i, t1 = 8*i + 1, t2 = 8*i + 2, t3 = 8*i + 3, t4 = 8*i + 4, t5 = 8*i + 5, t6 = 8*i + 6, t7 = 8*i + 7; + vec_uchar16 p = {t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7}; + return v_uint64x2(vec_perm(v.val, v.val, p)); +} + +template +inline v_float32x4 v_broadcast_element(v_float32x4 v) +{ + unsigned char t0 = 4*i, t1 = 4*i + 1, t2 = 4*i + 2, t3 = 4*i + 3; + vec_uchar16 p = {t0, t1, t2, t3, t0, t1, t2, t3, t0, t1, t2, t3, t0, t1, t2, t3}; + return v_float32x4(vec_perm(v.val, v.val, p)); +} + +template +inline v_float64x2 v_broadcast_element(v_float64x2 v) +{ + unsigned char t0 = 8*i, t1 = 8*i + 1, t2 = 8*i + 2, t3 = 8*i + 3, t4 = 8*i + 4, t5 = 8*i + 5, t6 = 8*i + 6, t7 = 8*i + 7; + vec_uchar16 p = {t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7}; + return v_float64x2(vec_perm(v.val, v.val, p)); +} + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! @endcond diff --git a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp index 4b8cd61dd2..7b3259f4c0 100644 --- a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp @@ -4213,6 +4213,29 @@ inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; } inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } +template +inline typename _Tp::lane_type v_extract_n(const _Tp& a) +{ + return v_rotate_right(a).get0(); +} + +template +inline v_uint32x4 v_broadcast_element(const v_uint32x4& a) +{ + return v_setall_u32(v_extract_n(a)); +} +template +inline v_int32x4 v_broadcast_element(const v_int32x4& a) +{ + return v_setall_s32(v_extract_n(a)); +} +template +inline v_float32x4 v_broadcast_element(const v_float32x4& a) +{ + return v_setall_f32(v_extract_n(a)); +} + + ////////////// FP16 support /////////////////////////// inline v_float32x4 v_load_expand(const float16_t* ptr) diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index bd1e24722c..633279b5de 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -134,17 +134,21 @@ template struct Data } const LaneType & operator[](int i) const { +#if 0 // TODO: strange bug - AVX2 tests are failed with this + CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)R::nlanes, ""); +#else CV_Assert(i >= 0 && i < R::nlanes); +#endif return d[i]; } LaneType & operator[](int i) { - CV_Assert(i >= 0 && i < R::nlanes); + CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)R::nlanes, ""); return d[i]; } int_type as_int(int i) const { - CV_Assert(i >= 0 && i < R::nlanes); + CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)R::nlanes, ""); union { LaneType l; @@ -1190,6 +1194,40 @@ template struct TheTest return *this; } + template + TheTest & test_extract_n() + { + SCOPED_TRACE(s); + Data dataA; + LaneType test_value = (LaneType)(s + 50); + dataA[s] = test_value; + R a = dataA; + + LaneType res = v_extract_n(a); + EXPECT_EQ(test_value, res); + + return *this; + } + + template + TheTest & test_broadcast_element() + { + SCOPED_TRACE(s); + Data dataA; + LaneType test_value = (LaneType)(s + 50); + dataA[s] = test_value; + R a = dataA; + + Data res = v_broadcast_element(a); + + for (int i = 0; i < R::nlanes; ++i) + { + SCOPED_TRACE(i); + EXPECT_EQ(test_value, res[i]); + } + return *this; + } + TheTest & test_float_math() { typedef typename V_RegTraits::round_reg Ri; @@ -1498,6 +1536,7 @@ template struct TheTest void test_hal_intrin_uint8() { DUMP_ENTRY(v_uint8); + typedef v_uint8 R; TheTest() .test_loadstore() .test_interleave() @@ -1522,21 +1561,21 @@ void test_hal_intrin_uint8() .test_reverse() .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>() .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>() - ; - + .test_extract_n<0>().test_extract_n<1>().test_extract_n() + //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element() #if CV_SIMD_WIDTH == 32 - TheTest() .test_pack<9>().test_pack<10>().test_pack<13>().test_pack<15>() .test_pack_u<9>().test_pack_u<10>().test_pack_u<13>().test_pack_u<15>() .test_extract<16>().test_extract<17>().test_extract<23>().test_extract<31>() .test_rotate<16>().test_rotate<17>().test_rotate<23>().test_rotate<31>() - ; #endif + ; } void test_hal_intrin_int8() { DUMP_ENTRY(v_int8); + typedef v_int8 R; TheTest() .test_loadstore() .test_interleave() @@ -1561,6 +1600,8 @@ void test_hal_intrin_int8() .test_reverse() .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>() .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>() + .test_extract_n<0>().test_extract_n<1>().test_extract_n() + //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element() ; } @@ -1569,6 +1610,7 @@ void test_hal_intrin_int8() void test_hal_intrin_uint16() { DUMP_ENTRY(v_uint16); + typedef v_uint16 R; TheTest() .test_loadstore() .test_interleave() @@ -1594,12 +1636,15 @@ void test_hal_intrin_uint16() .test_reverse() .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>() .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>() + .test_extract_n<0>().test_extract_n<1>().test_extract_n() + //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element() ; } void test_hal_intrin_int16() { DUMP_ENTRY(v_int16); + typedef v_int16 R; TheTest() .test_loadstore() .test_interleave() @@ -1627,6 +1672,8 @@ void test_hal_intrin_int16() .test_reverse() .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>() .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>() + .test_extract_n<0>().test_extract_n<1>().test_extract_n() + //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element() ; } @@ -1635,6 +1682,7 @@ void test_hal_intrin_int16() void test_hal_intrin_uint32() { DUMP_ENTRY(v_uint32); + typedef v_uint32 R; TheTest() .test_loadstore() .test_interleave() @@ -1657,6 +1705,8 @@ void test_hal_intrin_uint32() .test_reverse() .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>() .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>() + .test_extract_n<0>().test_extract_n<1>().test_extract_n() + .test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element() .test_transpose() ; } @@ -1664,6 +1714,7 @@ void test_hal_intrin_uint32() void test_hal_intrin_int32() { DUMP_ENTRY(v_int32); + typedef v_int32 R; TheTest() .test_loadstore() .test_interleave() @@ -1687,6 +1738,8 @@ void test_hal_intrin_int32() .test_reverse() .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>() .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>() + .test_extract_n<0>().test_extract_n<1>().test_extract_n() + .test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element() .test_float_cvt32() .test_float_cvt64() .test_transpose() @@ -1698,6 +1751,7 @@ void test_hal_intrin_int32() void test_hal_intrin_uint64() { DUMP_ENTRY(v_uint64); + typedef v_uint64 R; TheTest() .test_loadstore() .test_addsub() @@ -1709,12 +1763,15 @@ void test_hal_intrin_uint64() .test_reverse() .test_extract<0>().test_extract<1>() .test_rotate<0>().test_rotate<1>() + .test_extract_n<0>().test_extract_n<1>().test_extract_n() + //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element() ; } void test_hal_intrin_int64() { DUMP_ENTRY(v_int64); + typedef v_int64 R; TheTest() .test_loadstore() .test_addsub() @@ -1726,6 +1783,8 @@ void test_hal_intrin_int64() .test_reverse() .test_extract<0>().test_extract<1>() .test_rotate<0>().test_rotate<1>() + .test_extract_n<0>().test_extract_n<1>().test_extract_n() + //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element() .test_cvt64_double() ; } @@ -1734,6 +1793,7 @@ void test_hal_intrin_int64() void test_hal_intrin_float32() { DUMP_ENTRY(v_float32); + typedef v_float32 R; TheTest() .test_loadstore() .test_interleave() @@ -1757,20 +1817,20 @@ void test_hal_intrin_float32() .test_reverse() .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>() .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>() - ; - + .test_extract_n<0>().test_extract_n<1>().test_extract_n() + .test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element() #if CV_SIMD_WIDTH == 32 - TheTest() .test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>() .test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>() - ; #endif + ; } void test_hal_intrin_float64() { DUMP_ENTRY(v_float64); #if CV_SIMD_64F + typedef v_float64 R; TheTest() .test_loadstore() .test_addsub() @@ -1787,14 +1847,13 @@ void test_hal_intrin_float64() .test_reverse() .test_extract<0>().test_extract<1>() .test_rotate<0>().test_rotate<1>() - ; - + .test_extract_n<0>().test_extract_n<1>().test_extract_n() + //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element() #if CV_SIMD_WIDTH == 32 - TheTest() .test_extract<2>().test_extract<3>() .test_rotate<2>().test_rotate<3>() +#endif ; -#endif //CV_SIMD256 #endif } @@ -1804,14 +1863,14 @@ void test_hal_intrin_float16() { DUMP_ENTRY(v_float16); #if CV_FP16 - TheTest().test_loadstore_fp16_f32(); + TheTest() + .test_loadstore_fp16_f32() #endif #if CV_SIMD_FP16 - TheTest() .test_loadstore_fp16() .test_float_cvt_fp16() - ; #endif + ; } #endif diff --git a/modules/imgproc/src/sumpixels.cpp b/modules/imgproc/src/sumpixels.cpp index 2052b02e41..89337f3507 100755 --- a/modules/imgproc/src/sumpixels.cpp +++ b/modules/imgproc/src/sumpixels.cpp @@ -147,7 +147,8 @@ struct Integral_SIMD v_expand(el8, el4l, el4h); el4l += prev; el4h += el4l; - prev = vx_setall_s32(v_rotate_right(el4h).get0()); + + prev = v_broadcast_element(el4h); #endif v_store(sum_row + j , el4l + vx_load(prev_sum_row + j )); v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes)); @@ -215,7 +216,8 @@ struct Integral_SIMD v_expand(el8, el4li, el4hi); el4l = v_cvt_f32(el4li) + prev; el4h = v_cvt_f32(el4hi) + el4l; - prev = vx_setall_f32(v_rotate_right(el4h).get0()); + + prev = v_broadcast_element(el4h); #endif v_store(sum_row + j , el4l + vx_load(prev_sum_row + j )); v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes));