From 6f675ae75bbf16d92d1f0c379d13cb76a2bf8c4d Mon Sep 17 00:00:00 2001 From: Ryan Wong Date: Fri, 20 Apr 2018 08:43:47 -0700 Subject: [PATCH] Merge pull request #11304 from kinchungwong:issue_11242_intrin_cv34x_nocpp11 * Issue 11242 intrinsics v_extract, v_rotate improvement, branch 3.4, without C++11 (remove type restrictions for SSE2, use PALIGNR on SSSE3, compile to no-op when imm is 0 or nlanes). * fix whitespace * Fix #11242 (NEON intrinsics v_rotate...) branch 3.4 Separate macro expansion OPENCV_HAL_IMPL_NEON_SHIFT_OP for bitwise shifts for integers, from macro expansion OPENCV_HAL_IMPL_NEON_ROTATE for lane rotations. Bitwise shifts do not apply to floats, but lane-rotations can apply to both. * fix whitespace * Fix #11242 compile error (VSX intrinsics v_rotate(a)) branch 3.4 no-c++11 --- .../include/opencv2/core/hal/intrin_cpp.hpp | 7 +- .../include/opencv2/core/hal/intrin_neon.hpp | 33 ++-- .../include/opencv2/core/hal/intrin_sse.hpp | 144 ++++++++++++++++-- .../include/opencv2/core/hal/intrin_vsx.hpp | 3 +- modules/core/test/test_intrin.cpp | 4 + 5 files changed, 163 insertions(+), 28 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 5518eace9b..b619f1a0c2 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -202,6 +202,7 @@ Regular integers: |pack_u | x | | x | | | | |unpack | x | x | x | x | x | x | |extract | x | x | x | x | x | x | +|rotate (lanes) | x | x | x | x | x | x | |cvt_flt32 | | | | | | x | |cvt_flt64 | | | | | | x | |transpose4x4 | | | | | x | x | @@ -215,6 +216,7 @@ Big integers: |shift | x | x | |logical | x | x | |extract | x | x | +|rotate (lanes) | x | x | Floating point: @@ -236,7 +238,8 @@ Floating point: |sqrt, abs | x | x | |float math | x | x | |transpose4x4 | x | | - +|extract | x | x | +|rotate (lanes) | x | x | @{ */ @@ -1499,7 +1502,7 @@ Usage: v_int32x4 a, b, c; c = v_extract<2>(a, b); @endcode -For integer types only. */ +For all types. */ template inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) { diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 033cf0f2dc..95c9bfb1fe 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -770,15 +770,7 @@ template inline _Tpvec v_shl(const _Tpvec& a) \ template inline _Tpvec v_shr(const _Tpvec& a) \ { return _Tpvec(vshrq_n_##suffix(a.val, n)); } \ template inline _Tpvec v_rshr(const _Tpvec& a) \ -{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); } \ -template inline _Tpvec v_rotate_right(const _Tpvec& a) \ -{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \ -template inline _Tpvec v_rotate_left(const _Tpvec& a) \ -{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \ -template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \ -template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } +{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); } OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8) OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8) @@ -789,6 +781,29 @@ OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32) OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64) OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64) +#define OPENCV_HAL_IMPL_NEON_ROTATE_OP(_Tpvec, suffix) \ +template inline _Tpvec v_rotate_right(const _Tpvec& a) \ +{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \ +template inline _Tpvec v_rotate_left(const _Tpvec& a) \ +{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \ +template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \ +template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } + +OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8) +OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8) +OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16) +OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16) +OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32) +OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32) +OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32) +OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64) +OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64) +#endif + #define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \ inline _Tpvec v_load(const _Tp* ptr) \ { return _Tpvec(vld1q_##suffix(ptr)); } \ diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 7c1c103a19..c91b05de93 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -61,6 +61,7 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN struct v_uint8x16 { typedef uchar lane_type; + typedef __m128i vector_type; enum { nlanes = 16 }; v_uint8x16() : val(_mm_setzero_si128()) {} @@ -84,6 +85,7 @@ struct v_uint8x16 struct v_int8x16 { typedef schar lane_type; + typedef __m128i vector_type; enum { nlanes = 16 }; v_int8x16() : val(_mm_setzero_si128()) {} @@ -107,6 +109,7 @@ struct v_int8x16 struct v_uint16x8 { typedef ushort lane_type; + typedef __m128i vector_type; enum { nlanes = 8 }; v_uint16x8() : val(_mm_setzero_si128()) {} @@ -127,6 +130,7 @@ struct v_uint16x8 struct v_int16x8 { typedef short lane_type; + typedef __m128i vector_type; enum { nlanes = 8 }; v_int16x8() : val(_mm_setzero_si128()) {} @@ -146,6 +150,7 @@ struct v_int16x8 struct v_uint32x4 { typedef unsigned lane_type; + typedef __m128i vector_type; enum { nlanes = 4 }; v_uint32x4() : val(_mm_setzero_si128()) {} @@ -164,6 +169,7 @@ struct v_uint32x4 struct v_int32x4 { typedef int lane_type; + typedef __m128i vector_type; enum { nlanes = 4 }; v_int32x4() : val(_mm_setzero_si128()) {} @@ -182,6 +188,7 @@ struct v_int32x4 struct v_float32x4 { typedef float lane_type; + typedef __m128 vector_type; enum { nlanes = 4 }; v_float32x4() : val(_mm_setzero_ps()) {} @@ -200,6 +207,7 @@ struct v_float32x4 struct v_uint64x2 { typedef uint64 lane_type; + typedef __m128i vector_type; enum { nlanes = 2 }; v_uint64x2() : val(_mm_setzero_si128()) {} @@ -220,6 +228,7 @@ struct v_uint64x2 struct v_int64x2 { typedef int64 lane_type; + typedef __m128i vector_type; enum { nlanes = 2 }; v_int64x2() : val(_mm_setzero_si128()) {} @@ -240,6 +249,7 @@ struct v_int64x2 struct v_float64x2 { typedef double lane_type; + typedef __m128d vector_type; enum { nlanes = 2 }; v_float64x2() : val(_mm_setzero_pd()) {} @@ -259,6 +269,7 @@ struct v_float64x2 struct v_float16x4 { typedef short lane_type; + typedef __m128i vector_type; enum { nlanes = 4 }; v_float16x4() : val(_mm_setzero_si128()) {} @@ -275,6 +286,27 @@ struct v_float16x4 }; #endif +namespace hal_sse_internal +{ + template + to_sse_type v_sse_reinterpret_as(const from_sse_type& val); + +#define OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(to_sse_type, from_sse_type, sse_cast_intrin) \ + template<> inline \ + to_sse_type v_sse_reinterpret_as(const from_sse_type& a) \ + { return sse_cast_intrin(a); } + + OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128i, OPENCV_HAL_NOP); + OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128, _mm_castps_si128); + OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128d, _mm_castpd_si128); + OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128i, _mm_castsi128_ps); + OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128, OPENCV_HAL_NOP); + OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128d, _mm_castpd_ps); + OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128i, _mm_castsi128_pd); + OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128, _mm_castps_pd); + OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128d, OPENCV_HAL_NOP); +} + #define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \ inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \ inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \ @@ -1062,31 +1094,116 @@ OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16) OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32) OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64) +namespace hal_sse_internal +{ + template 16)), + bool is_first = (imm == 0), + bool is_half = (imm == 8), + bool is_second = (imm == 16), + bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))> + class v_sse_palignr_u8_class; + + template + class v_sse_palignr_u8_class; + + template + class v_sse_palignr_u8_class + { + public: + inline __m128i operator()(const __m128i& a, const __m128i&) const + { + return a; + } + }; + + template + class v_sse_palignr_u8_class + { + public: + inline __m128i operator()(const __m128i& a, const __m128i& b) const + { + return _mm_unpacklo_epi64(_mm_unpackhi_epi64(a, a), b); + } + }; + + template + class v_sse_palignr_u8_class + { + public: + inline __m128i operator()(const __m128i&, const __m128i& b) const + { + return b; + } + }; + + template + class v_sse_palignr_u8_class + { +#if CV_SSSE3 + public: + inline __m128i operator()(const __m128i& a, const __m128i& b) const + { + return _mm_alignr_epi8(b, a, imm); + } +#else + public: + inline __m128i operator()(const __m128i& a, const __m128i& b) const + { + enum { imm2 = (sizeof(__m128i) - imm) }; + return _mm_or_si128(_mm_srli_si128(a, imm), _mm_slli_si128(b, imm2)); + } +#endif + }; + + template + inline __m128i v_sse_palignr_u8(const __m128i& a, const __m128i& b) + { + CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_sse_palignr_u8."); + return v_sse_palignr_u8_class()(a, b); + } +} + template inline _Tpvec v_rotate_right(const _Tpvec &a) { - enum { CV_SHIFT = imm*(sizeof(typename _Tpvec::lane_type)) }; - return _Tpvec(_mm_srli_si128(a.val, CV_SHIFT)); + using namespace hal_sse_internal; + enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) }; + return _Tpvec(v_sse_reinterpret_as( + _mm_srli_si128( + v_sse_reinterpret_as<__m128i>(a.val), imm2))); } + template inline _Tpvec v_rotate_left(const _Tpvec &a) { - enum { CV_SHIFT = imm*(sizeof(typename _Tpvec::lane_type)) }; - return _Tpvec(_mm_slli_si128(a.val, CV_SHIFT)); + using namespace hal_sse_internal; + enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) }; + return _Tpvec(v_sse_reinterpret_as( + _mm_slli_si128( + v_sse_reinterpret_as<__m128i>(a.val), imm2))); } + template inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b) { - enum { CV_SHIFT1 = imm*(sizeof(typename _Tpvec::lane_type)) }; - enum { CV_SHIFT2 = 16 - imm*(sizeof(typename _Tpvec::lane_type)) }; - return _Tpvec(_mm_or_si128(_mm_srli_si128(a.val, CV_SHIFT1), _mm_slli_si128(b.val, CV_SHIFT2))); + using namespace hal_sse_internal; + enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) }; + return _Tpvec(v_sse_reinterpret_as( + v_sse_palignr_u8( + v_sse_reinterpret_as<__m128i>(a.val), + v_sse_reinterpret_as<__m128i>(b.val)))); } + template inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b) { - enum { CV_SHIFT1 = imm*(sizeof(typename _Tpvec::lane_type)) }; - enum { CV_SHIFT2 = 16 - imm*(sizeof(typename _Tpvec::lane_type)) }; - return _Tpvec(_mm_or_si128(_mm_slli_si128(a.val, CV_SHIFT1), _mm_srli_si128(b.val, CV_SHIFT2))); + using namespace hal_sse_internal; + enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) }; + return _Tpvec(v_sse_reinterpret_as( + v_sse_palignr_u8( + v_sse_reinterpret_as<__m128i>(b.val), + v_sse_reinterpret_as<__m128i>(a.val)))); } #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \ @@ -1403,12 +1520,7 @@ OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd) template inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) { - const int w = sizeof(typename _Tpvec::lane_type); - const int n = _Tpvec::nlanes; - __m128i ra, rb; - ra = _mm_srli_si128(a.val, s*w); - rb = _mm_slli_si128(b.val, (n-s)*w); - return _Tpvec(_mm_or_si128(ra, rb)); + return v_rotate_right(a, b); } inline v_int32x4 v_round(const v_float32x4& a) diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 85cef469c3..3fff6651e3 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -562,9 +562,10 @@ OPENCV_IMPL_VSX_ROTATE_LR(v_uint16x8, vec_ushort8) OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8, vec_short8) OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4) OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4, vec_int4) +OPENCV_IMPL_VSX_ROTATE_LR(v_float32x4, vec_float4) OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2) OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2, vec_dword2) - +OPENCV_IMPL_VSX_ROTATE_LR(v_float64x2, vec_double2) template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) diff --git a/modules/core/test/test_intrin.cpp b/modules/core/test/test_intrin.cpp index 7fa38130c9..4171babc03 100644 --- a/modules/core/test/test_intrin.cpp +++ b/modules/core/test/test_intrin.cpp @@ -215,6 +215,8 @@ TEST(hal_intrin, float32x4) { .test_matmul() .test_transpose() .test_reduce_sum4() + .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>() + .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>() ; } @@ -233,6 +235,8 @@ TEST(hal_intrin, float64x2) { .test_unpack() .test_float_math() .test_float_cvt32() + .test_extract<0>().test_extract<1>() + .test_rotate<0>().test_rotate<1>() ; } #endif