Merge pull request #11304 from kinchungwong:issue_11242_intrin_cv34x_nocpp11

* Issue 11242 intrinsics v_extract, v_rotate improvement, branch 3.4, without C++11 (remove type restrictions for SSE2, use PALIGNR on SSSE3, compile to no-op when imm is 0 or nlanes). * fix whitespace * Fix #11242 (NEON intrinsics v_rotate...) branch 3.4 Separate macro expansion OPENCV_HAL_IMPL_NEON_SHIFT_OP for bitwise shifts for integers, from macro expansion OPENCV_HAL_IMPL_NEON_ROTATE for lane rotations. Bitwise shifts do not apply to floats, but lane-rotations can apply to both. * fix whitespace * Fix #11242 compile error (VSX intrinsics v_rotate(a)) branch 3.4 no-c++11
2025-07-01 01:10:52 +08:00 · 2018-04-20 08:43:47 -07:00 · 2018-04-20 08:43:47 -07:00 · 6f675ae75b
commit 6f675ae75b
parent c8b515ea69
5 changed files with 163 additions and 28 deletions
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@ -202,6 +202,7 @@ Regular integers:
 |pack_u             | x |   | x |   |   |   |
 |unpack             | x | x | x | x | x | x |
 |extract            | x | x | x | x | x | x |
 |rotate (lanes)     | x | x | x | x | x | x |
 |cvt_flt32          |   |   |   |   |   | x |
 |cvt_flt64          |   |   |   |   |   | x |
 |transpose4x4       |   |   |   |   | x | x |
@ -215,6 +216,7 @@ Big integers:
 |shift              | x | x |
 |logical            | x | x |
 |extract            | x | x |
 |rotate (lanes)     | x | x |
 Floating point:
@ -236,7 +238,8 @@ Floating point:
 |sqrt, abs          | x | x |
 |float math         | x | x |
 |transpose4x4       | x |   |
-
+|extract            | x | x |
 |rotate (lanes)     | x | x |
 @{ */
@ -1499,7 +1502,7 @@ Usage:
 v_int32x4 a, b, c;
 c = v_extract<2>(a, b);
@endcode
-For integer types only. */
+For all types. */
 template<int s, typename _Tp, int n>
 inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
 {
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@ -770,15 +770,7 @@ template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
 { return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
-{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); } \
+{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
 { return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
 { return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); }
 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
@ -789,6 +781,29 @@ OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
 #define OPENCV_HAL_IMPL_NEON_ROTATE_OP(_Tpvec, suffix) \
 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
 { return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
 { return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); }
 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32)
 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
 #if CV_SIMD128_64F
 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
 #endif
 #define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
 inline _Tpvec v_load(const _Tp* ptr) \
 { return _Tpvec(vld1q_##suffix(ptr)); } \
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@ -61,6 +61,7 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 struct v_uint8x16
 {
    typedef uchar lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 16 };
    v_uint8x16() : val(_mm_setzero_si128()) {}
@ -84,6 +85,7 @@ struct v_uint8x16
 struct v_int8x16
 {
    typedef schar lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 16 };
    v_int8x16() : val(_mm_setzero_si128()) {}
@ -107,6 +109,7 @@ struct v_int8x16
 struct v_uint16x8
 {
    typedef ushort lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 8 };
    v_uint16x8() : val(_mm_setzero_si128()) {}
@ -127,6 +130,7 @@ struct v_uint16x8
 struct v_int16x8
 {
    typedef short lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 8 };
    v_int16x8() : val(_mm_setzero_si128()) {}
@ -146,6 +150,7 @@ struct v_int16x8
 struct v_uint32x4
 {
    typedef unsigned lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 4 };
    v_uint32x4() : val(_mm_setzero_si128()) {}
@ -164,6 +169,7 @@ struct v_uint32x4
 struct v_int32x4
 {
    typedef int lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 4 };
    v_int32x4() : val(_mm_setzero_si128()) {}
@ -182,6 +188,7 @@ struct v_int32x4
 struct v_float32x4
 {
    typedef float lane_type;
    typedef __m128 vector_type;
    enum { nlanes = 4 };
    v_float32x4() : val(_mm_setzero_ps()) {}
@ -200,6 +207,7 @@ struct v_float32x4
 struct v_uint64x2
 {
    typedef uint64 lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 2 };
    v_uint64x2() : val(_mm_setzero_si128()) {}
@ -220,6 +228,7 @@ struct v_uint64x2
 struct v_int64x2
 {
    typedef int64 lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 2 };
    v_int64x2() : val(_mm_setzero_si128()) {}
@ -240,6 +249,7 @@ struct v_int64x2
 struct v_float64x2
 {
    typedef double lane_type;
    typedef __m128d vector_type;
    enum { nlanes = 2 };
    v_float64x2() : val(_mm_setzero_pd()) {}
@ -259,6 +269,7 @@ struct v_float64x2
 struct v_float16x4
 {
    typedef short lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 4 };
    v_float16x4() : val(_mm_setzero_si128()) {}
@ -275,6 +286,27 @@ struct v_float16x4
 };
 #endif
 namespace hal_sse_internal
 {
    template <typename to_sse_type, typename from_sse_type>
    to_sse_type v_sse_reinterpret_as(const from_sse_type& val);
 #define OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(to_sse_type, from_sse_type, sse_cast_intrin) \
    template<> inline \
    to_sse_type v_sse_reinterpret_as(const from_sse_type& a) \
    { return sse_cast_intrin(a); }
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128i, OPENCV_HAL_NOP);
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128, _mm_castps_si128);
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128d, _mm_castpd_si128);
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128i, _mm_castsi128_ps);
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128, OPENCV_HAL_NOP);
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128d, _mm_castpd_ps);
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128i, _mm_castsi128_pd);
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128, _mm_castps_pd);
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128d, OPENCV_HAL_NOP);
 }
 #define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
 inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
@ -1062,31 +1094,116 @@ OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)
 namespace hal_sse_internal
 {
    template <int imm,
        bool is_invalid = ((imm < 0) || (imm > 16)),
        bool is_first = (imm == 0),
        bool is_half = (imm == 8),
        bool is_second = (imm == 16),
        bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
    class v_sse_palignr_u8_class;
    template <int imm>
    class v_sse_palignr_u8_class<imm, true, false, false, false, false>;
    template <int imm>
    class v_sse_palignr_u8_class<imm, false, true, false, false, false>
    {
    public:
        inline __m128i operator()(const __m128i& a, const __m128i&) const
        {
            return a;
        }
    };
    template <int imm>
    class v_sse_palignr_u8_class<imm, false, false, true, false, false>
    {
    public:
        inline __m128i operator()(const __m128i& a, const __m128i& b) const
        {
            return _mm_unpacklo_epi64(_mm_unpackhi_epi64(a, a), b);
        }
    };
    template <int imm>
    class v_sse_palignr_u8_class<imm, false, false, false, true, false>
    {
    public:
        inline __m128i operator()(const __m128i&, const __m128i& b) const
        {
            return b;
        }
    };
    template <int imm>
    class v_sse_palignr_u8_class<imm, false, false, false, false, true>
    {
 #if CV_SSSE3
    public:
        inline __m128i operator()(const __m128i& a, const __m128i& b) const
        {
            return _mm_alignr_epi8(b, a, imm);
        }
 #else
    public:
        inline __m128i operator()(const __m128i& a, const __m128i& b) const
        {
            enum { imm2 = (sizeof(__m128i) - imm) };
            return _mm_or_si128(_mm_srli_si128(a, imm), _mm_slli_si128(b, imm2));
        }
 #endif
    };
    template <int imm>
    inline __m128i v_sse_palignr_u8(const __m128i& a, const __m128i& b)
    {
        CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_sse_palignr_u8.");
        return v_sse_palignr_u8_class<imm>()(a, b);
    }
 }
 template<int imm, typename _Tpvec>
 inline _Tpvec v_rotate_right(const _Tpvec &a)
 {
-    enum { CV_SHIFT = imm*(sizeof(typename _Tpvec::lane_type)) };
+    using namespace hal_sse_internal;
-    return _Tpvec(_mm_srli_si128(a.val, CV_SHIFT));
+    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
        _mm_srli_si128(
            v_sse_reinterpret_as<__m128i>(a.val), imm2)));
 }
 template<int imm, typename _Tpvec>
 inline _Tpvec v_rotate_left(const _Tpvec &a)
 {
-    enum { CV_SHIFT = imm*(sizeof(typename _Tpvec::lane_type)) };
+    using namespace hal_sse_internal;
-    return _Tpvec(_mm_slli_si128(a.val, CV_SHIFT));
+    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
        _mm_slli_si128(
            v_sse_reinterpret_as<__m128i>(a.val), imm2)));
 }
 template<int imm, typename _Tpvec>
 inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
 {
-    enum { CV_SHIFT1 = imm*(sizeof(typename _Tpvec::lane_type)) };
+    using namespace hal_sse_internal;
-    enum { CV_SHIFT2 = 16 - imm*(sizeof(typename _Tpvec::lane_type)) };
+    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
-    return _Tpvec(_mm_or_si128(_mm_srli_si128(a.val, CV_SHIFT1), _mm_slli_si128(b.val, CV_SHIFT2)));
+    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
        v_sse_palignr_u8<imm2>(
            v_sse_reinterpret_as<__m128i>(a.val),
            v_sse_reinterpret_as<__m128i>(b.val))));
 }
 template<int imm, typename _Tpvec>
 inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
 {
-    enum { CV_SHIFT1 = imm*(sizeof(typename _Tpvec::lane_type)) };
+    using namespace hal_sse_internal;
-    enum { CV_SHIFT2 = 16 - imm*(sizeof(typename _Tpvec::lane_type)) };
+    enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
-    return _Tpvec(_mm_or_si128(_mm_slli_si128(a.val, CV_SHIFT1), _mm_srli_si128(b.val, CV_SHIFT2)));
+    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
        v_sse_palignr_u8<imm2>(
            v_sse_reinterpret_as<__m128i>(b.val),
            v_sse_reinterpret_as<__m128i>(a.val))));
 }
 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
@ -1403,12 +1520,7 @@ OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
 template<int s, typename _Tpvec>
 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
 {
-    const int w = sizeof(typename _Tpvec::lane_type);
+    return v_rotate_right<s>(a, b);
    const int n = _Tpvec::nlanes;
    __m128i ra, rb;
    ra = _mm_srli_si128(a.val, s*w);
    rb = _mm_slli_si128(b.val, (n-s)*w);
    return _Tpvec(_mm_or_si128(ra, rb));
 }
 inline v_int32x4 v_round(const v_float32x4& a)
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@ -562,9 +562,10 @@ OPENCV_IMPL_VSX_ROTATE_LR(v_uint16x8, vec_ushort8)
 OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8,  vec_short8)
 OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4)
 OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4,  vec_int4)
 OPENCV_IMPL_VSX_ROTATE_LR(v_float32x4, vec_float4)
 OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2)
 OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2,  vec_dword2)
-
+OPENCV_IMPL_VSX_ROTATE_LR(v_float64x2, vec_double2)
 template<int imm, typename _Tpvec>
 inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
--- a/modules/core/test/test_intrin.cpp
+++ b/modules/core/test/test_intrin.cpp
@ -215,6 +215,8 @@ TEST(hal_intrin, float32x4) {
        .test_matmul()
        .test_transpose()
        .test_reduce_sum4()
        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
        ;
 }
@ -233,6 +235,8 @@ TEST(hal_intrin, float64x2) {
        .test_unpack()
        .test_float_math()
        .test_float_cvt32()
        .test_extract<0>().test_extract<1>()
        .test_rotate<0>().test_rotate<1>()
        ;
 }
 #endif