Merge pull request #11304 from kinchungwong:issue_11242_intrin_cv34x_nocpp11

* Issue 11242 intrinsics v_extract, v_rotate improvement, branch 3.4, without C++11 (remove type restrictions for SSE2, use PALIGNR on SSSE3, compile to no-op when imm is 0 or nlanes).

* fix whitespace

* Fix #11242 (NEON intrinsics v_rotate...) branch 3.4
Separate macro expansion OPENCV_HAL_IMPL_NEON_SHIFT_OP for bitwise shifts for integers, from macro expansion OPENCV_HAL_IMPL_NEON_ROTATE for lane rotations. Bitwise shifts do not apply to floats, but lane-rotations can apply to both.

* fix whitespace

* Fix #11242 compile error (VSX intrinsics v_rotate(a)) branch 3.4 no-c++11
This commit is contained in:
Ryan Wong 2018-04-20 08:43:47 -07:00 committed by Alexander Alekhin
parent c8b515ea69
commit 6f675ae75b
5 changed files with 163 additions and 28 deletions

View File

@ -202,6 +202,7 @@ Regular integers:
|pack_u | x | | x | | | | |pack_u | x | | x | | | |
|unpack | x | x | x | x | x | x | |unpack | x | x | x | x | x | x |
|extract | x | x | x | x | x | x | |extract | x | x | x | x | x | x |
|rotate (lanes) | x | x | x | x | x | x |
|cvt_flt32 | | | | | | x | |cvt_flt32 | | | | | | x |
|cvt_flt64 | | | | | | x | |cvt_flt64 | | | | | | x |
|transpose4x4 | | | | | x | x | |transpose4x4 | | | | | x | x |
@ -215,6 +216,7 @@ Big integers:
|shift | x | x | |shift | x | x |
|logical | x | x | |logical | x | x |
|extract | x | x | |extract | x | x |
|rotate (lanes) | x | x |
Floating point: Floating point:
@ -236,7 +238,8 @@ Floating point:
|sqrt, abs | x | x | |sqrt, abs | x | x |
|float math | x | x | |float math | x | x |
|transpose4x4 | x | | |transpose4x4 | x | |
|extract | x | x |
|rotate (lanes) | x | x |
@{ */ @{ */
@ -1499,7 +1502,7 @@ Usage:
v_int32x4 a, b, c; v_int32x4 a, b, c;
c = v_extract<2>(a, b); c = v_extract<2>(a, b);
@endcode @endcode
For integer types only. */ For all types. */
template<int s, typename _Tp, int n> template<int s, typename _Tp, int n>
inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{ {

View File

@ -770,15 +770,7 @@ template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \ template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \ { return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \ template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); } \ { return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); }
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8) OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8) OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
@ -789,6 +781,29 @@ OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64) OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64) OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
#define OPENCV_HAL_IMPL_NEON_ROTATE_OP(_Tpvec, suffix) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); }
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
#endif
#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \ #define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \ inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \ { return _Tpvec(vld1q_##suffix(ptr)); } \

View File

@ -61,6 +61,7 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
struct v_uint8x16 struct v_uint8x16
{ {
typedef uchar lane_type; typedef uchar lane_type;
typedef __m128i vector_type;
enum { nlanes = 16 }; enum { nlanes = 16 };
v_uint8x16() : val(_mm_setzero_si128()) {} v_uint8x16() : val(_mm_setzero_si128()) {}
@ -84,6 +85,7 @@ struct v_uint8x16
struct v_int8x16 struct v_int8x16
{ {
typedef schar lane_type; typedef schar lane_type;
typedef __m128i vector_type;
enum { nlanes = 16 }; enum { nlanes = 16 };
v_int8x16() : val(_mm_setzero_si128()) {} v_int8x16() : val(_mm_setzero_si128()) {}
@ -107,6 +109,7 @@ struct v_int8x16
struct v_uint16x8 struct v_uint16x8
{ {
typedef ushort lane_type; typedef ushort lane_type;
typedef __m128i vector_type;
enum { nlanes = 8 }; enum { nlanes = 8 };
v_uint16x8() : val(_mm_setzero_si128()) {} v_uint16x8() : val(_mm_setzero_si128()) {}
@ -127,6 +130,7 @@ struct v_uint16x8
struct v_int16x8 struct v_int16x8
{ {
typedef short lane_type; typedef short lane_type;
typedef __m128i vector_type;
enum { nlanes = 8 }; enum { nlanes = 8 };
v_int16x8() : val(_mm_setzero_si128()) {} v_int16x8() : val(_mm_setzero_si128()) {}
@ -146,6 +150,7 @@ struct v_int16x8
struct v_uint32x4 struct v_uint32x4
{ {
typedef unsigned lane_type; typedef unsigned lane_type;
typedef __m128i vector_type;
enum { nlanes = 4 }; enum { nlanes = 4 };
v_uint32x4() : val(_mm_setzero_si128()) {} v_uint32x4() : val(_mm_setzero_si128()) {}
@ -164,6 +169,7 @@ struct v_uint32x4
struct v_int32x4 struct v_int32x4
{ {
typedef int lane_type; typedef int lane_type;
typedef __m128i vector_type;
enum { nlanes = 4 }; enum { nlanes = 4 };
v_int32x4() : val(_mm_setzero_si128()) {} v_int32x4() : val(_mm_setzero_si128()) {}
@ -182,6 +188,7 @@ struct v_int32x4
struct v_float32x4 struct v_float32x4
{ {
typedef float lane_type; typedef float lane_type;
typedef __m128 vector_type;
enum { nlanes = 4 }; enum { nlanes = 4 };
v_float32x4() : val(_mm_setzero_ps()) {} v_float32x4() : val(_mm_setzero_ps()) {}
@ -200,6 +207,7 @@ struct v_float32x4
struct v_uint64x2 struct v_uint64x2
{ {
typedef uint64 lane_type; typedef uint64 lane_type;
typedef __m128i vector_type;
enum { nlanes = 2 }; enum { nlanes = 2 };
v_uint64x2() : val(_mm_setzero_si128()) {} v_uint64x2() : val(_mm_setzero_si128()) {}
@ -220,6 +228,7 @@ struct v_uint64x2
struct v_int64x2 struct v_int64x2
{ {
typedef int64 lane_type; typedef int64 lane_type;
typedef __m128i vector_type;
enum { nlanes = 2 }; enum { nlanes = 2 };
v_int64x2() : val(_mm_setzero_si128()) {} v_int64x2() : val(_mm_setzero_si128()) {}
@ -240,6 +249,7 @@ struct v_int64x2
struct v_float64x2 struct v_float64x2
{ {
typedef double lane_type; typedef double lane_type;
typedef __m128d vector_type;
enum { nlanes = 2 }; enum { nlanes = 2 };
v_float64x2() : val(_mm_setzero_pd()) {} v_float64x2() : val(_mm_setzero_pd()) {}
@ -259,6 +269,7 @@ struct v_float64x2
struct v_float16x4 struct v_float16x4
{ {
typedef short lane_type; typedef short lane_type;
typedef __m128i vector_type;
enum { nlanes = 4 }; enum { nlanes = 4 };
v_float16x4() : val(_mm_setzero_si128()) {} v_float16x4() : val(_mm_setzero_si128()) {}
@ -275,6 +286,27 @@ struct v_float16x4
}; };
#endif #endif
namespace hal_sse_internal
{
template <typename to_sse_type, typename from_sse_type>
to_sse_type v_sse_reinterpret_as(const from_sse_type& val);
#define OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(to_sse_type, from_sse_type, sse_cast_intrin) \
template<> inline \
to_sse_type v_sse_reinterpret_as(const from_sse_type& a) \
{ return sse_cast_intrin(a); }
OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128i, OPENCV_HAL_NOP);
OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128, _mm_castps_si128);
OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128d, _mm_castpd_si128);
OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128i, _mm_castsi128_ps);
OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128, OPENCV_HAL_NOP);
OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128d, _mm_castpd_ps);
OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128i, _mm_castsi128_pd);
OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128, _mm_castps_pd);
OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128d, OPENCV_HAL_NOP);
}
#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \ #define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \ inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \ inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
@ -1062,31 +1094,116 @@ OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32) OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64) OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)
namespace hal_sse_internal
{
template <int imm,
bool is_invalid = ((imm < 0) || (imm > 16)),
bool is_first = (imm == 0),
bool is_half = (imm == 8),
bool is_second = (imm == 16),
bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
class v_sse_palignr_u8_class;
template <int imm>
class v_sse_palignr_u8_class<imm, true, false, false, false, false>;
template <int imm>
class v_sse_palignr_u8_class<imm, false, true, false, false, false>
{
public:
inline __m128i operator()(const __m128i& a, const __m128i&) const
{
return a;
}
};
template <int imm>
class v_sse_palignr_u8_class<imm, false, false, true, false, false>
{
public:
inline __m128i operator()(const __m128i& a, const __m128i& b) const
{
return _mm_unpacklo_epi64(_mm_unpackhi_epi64(a, a), b);
}
};
template <int imm>
class v_sse_palignr_u8_class<imm, false, false, false, true, false>
{
public:
inline __m128i operator()(const __m128i&, const __m128i& b) const
{
return b;
}
};
template <int imm>
class v_sse_palignr_u8_class<imm, false, false, false, false, true>
{
#if CV_SSSE3
public:
inline __m128i operator()(const __m128i& a, const __m128i& b) const
{
return _mm_alignr_epi8(b, a, imm);
}
#else
public:
inline __m128i operator()(const __m128i& a, const __m128i& b) const
{
enum { imm2 = (sizeof(__m128i) - imm) };
return _mm_or_si128(_mm_srli_si128(a, imm), _mm_slli_si128(b, imm2));
}
#endif
};
template <int imm>
inline __m128i v_sse_palignr_u8(const __m128i& a, const __m128i& b)
{
CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_sse_palignr_u8.");
return v_sse_palignr_u8_class<imm>()(a, b);
}
}
template<int imm, typename _Tpvec> template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec &a) inline _Tpvec v_rotate_right(const _Tpvec &a)
{ {
enum { CV_SHIFT = imm*(sizeof(typename _Tpvec::lane_type)) }; using namespace hal_sse_internal;
return _Tpvec(_mm_srli_si128(a.val, CV_SHIFT)); enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
_mm_srli_si128(
v_sse_reinterpret_as<__m128i>(a.val), imm2)));
} }
template<int imm, typename _Tpvec> template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec &a) inline _Tpvec v_rotate_left(const _Tpvec &a)
{ {
enum { CV_SHIFT = imm*(sizeof(typename _Tpvec::lane_type)) }; using namespace hal_sse_internal;
return _Tpvec(_mm_slli_si128(a.val, CV_SHIFT)); enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
_mm_slli_si128(
v_sse_reinterpret_as<__m128i>(a.val), imm2)));
} }
template<int imm, typename _Tpvec> template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b) inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
{ {
enum { CV_SHIFT1 = imm*(sizeof(typename _Tpvec::lane_type)) }; using namespace hal_sse_internal;
enum { CV_SHIFT2 = 16 - imm*(sizeof(typename _Tpvec::lane_type)) }; enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
return _Tpvec(_mm_or_si128(_mm_srli_si128(a.val, CV_SHIFT1), _mm_slli_si128(b.val, CV_SHIFT2))); return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
v_sse_palignr_u8<imm2>(
v_sse_reinterpret_as<__m128i>(a.val),
v_sse_reinterpret_as<__m128i>(b.val))));
} }
template<int imm, typename _Tpvec> template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b) inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
{ {
enum { CV_SHIFT1 = imm*(sizeof(typename _Tpvec::lane_type)) }; using namespace hal_sse_internal;
enum { CV_SHIFT2 = 16 - imm*(sizeof(typename _Tpvec::lane_type)) }; enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
return _Tpvec(_mm_or_si128(_mm_slli_si128(a.val, CV_SHIFT1), _mm_srli_si128(b.val, CV_SHIFT2))); return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
v_sse_palignr_u8<imm2>(
v_sse_reinterpret_as<__m128i>(b.val),
v_sse_reinterpret_as<__m128i>(a.val))));
} }
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \ #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
@ -1403,12 +1520,7 @@ OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
template<int s, typename _Tpvec> template<int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
{ {
const int w = sizeof(typename _Tpvec::lane_type); return v_rotate_right<s>(a, b);
const int n = _Tpvec::nlanes;
__m128i ra, rb;
ra = _mm_srli_si128(a.val, s*w);
rb = _mm_slli_si128(b.val, (n-s)*w);
return _Tpvec(_mm_or_si128(ra, rb));
} }
inline v_int32x4 v_round(const v_float32x4& a) inline v_int32x4 v_round(const v_float32x4& a)

View File

@ -562,9 +562,10 @@ OPENCV_IMPL_VSX_ROTATE_LR(v_uint16x8, vec_ushort8)
OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8, vec_short8) OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8, vec_short8)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4) OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4)
OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4, vec_int4) OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4, vec_int4)
OPENCV_IMPL_VSX_ROTATE_LR(v_float32x4, vec_float4)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2) OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2)
OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2, vec_dword2) OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2, vec_dword2)
OPENCV_IMPL_VSX_ROTATE_LR(v_float64x2, vec_double2)
template<int imm, typename _Tpvec> template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)

View File

@ -215,6 +215,8 @@ TEST(hal_intrin, float32x4) {
.test_matmul() .test_matmul()
.test_transpose() .test_transpose()
.test_reduce_sum4() .test_reduce_sum4()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
; ;
} }
@ -233,6 +235,8 @@ TEST(hal_intrin, float64x2) {
.test_unpack() .test_unpack()
.test_float_math() .test_float_math()
.test_float_cvt32() .test_float_cvt32()
.test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>()
; ;
} }
#endif #endif