From 65726e4244594a6eaf5218b26d43e7f52e980841 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Tue, 17 Apr 2018 17:50:23 +0300 Subject: [PATCH] core(hal): improve v_select() SSE4.1+ v_select 'mask' is restricted to these values only: 0 or ~0 (0xff/0xffff/etc) mask in accuracy test is updated. --- .../include/opencv2/core/hal/intrin_cpp.hpp | 19 +++++++------ .../include/opencv2/core/hal/intrin_sse.hpp | 27 ++++++++++++++++++- modules/core/perf/perf_mat.cpp | 4 +-- modules/core/src/copy.cpp | 21 ++------------- modules/core/test/test_intrin_utils.hpp | 14 ++++++---- 5 files changed, 50 insertions(+), 35 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 5518eace9b..269bf5616c 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -1039,13 +1039,16 @@ template inline bool v_check_any(const v_reg<_Tp, n>& a) return false; } -/** @brief Bitwise select +/** @brief Per-element select (blend operation) -Return value will be built by combining values a and b using the following scheme: -If the i-th bit in _mask_ is 1 - select i-th bit from _a_ -else - select i-th bit from _b_ */ +Return value will be built by combining values _a_ and _b_ using the following scheme: + result[i] = mask[i] ? 
a[i] : b[i]; + +@note _mask_ element values are restricted to these values: +- 0: select element from _b_ +- 0xff/0xffff/etc: select element from _a_ +(fully compatible with bitwise-based operator) +*/ template inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) { @@ -1055,8 +1058,8 @@ template inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& for( int i = 0; i < n; i++ ) { int_type m = Traits::reinterpret_int(mask.s[i]); - c.s[i] = Traits::reinterpret_from_int((Traits::reinterpret_int(a.s[i]) & m) - | (Traits::reinterpret_int(b.s[i]) & ~m)); + CV_DbgAssert(m == 0 || m == (~(int_type)0)); // restrict mask values: 0 or 0xff/0xffff/etc + c.s[i] = m ? a.s[i] : b.s[i]; } return c; } diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 62ffa3ec88..4a129d09c5 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -406,10 +406,14 @@ void v_rshr_pack_store(schar* ptr, const v_int16x8& a) } -// bit-wise "mask ? +// byte-wise "mask ? 
a : b" inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b) { +#if CV_SSE4_1 + return _mm_blendv_epi8(b, a, mask); +#else return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask)); +#endif } inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b) @@ -1254,6 +1258,26 @@ OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15) OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3) +#if CV_SSE4_1 +#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) \ +inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(cast_ret(_mm_blendv_##suffix(cast(b.val), cast(a.val), cast(mask.val)))); \ +} + +OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8) +OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8) +OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8) +OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8) +OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, _mm_castps_si128, _mm_castsi128_ps, ps) +OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, _mm_castps_si128, _mm_castsi128_ps, ps) +// OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, TBD, TBD, pd) +// OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, TBD, TBD, ps) +OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP, ps) +OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, OPENCV_HAL_NOP, OPENCV_HAL_NOP, pd) + +#else // CV_SSE4_1 + #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \ inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ { \ @@ -1270,6 +1294,7 @@ OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128) // OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128) OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps) OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd) +#endif #define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, 
_Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \ inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \ diff --git a/modules/core/perf/perf_mat.cpp b/modules/core/perf/perf_mat.cpp index 83f10e1add..325ef5fb7c 100644 --- a/modules/core/perf/perf_mat.cpp +++ b/modules/core/perf/perf_mat.cpp @@ -96,8 +96,8 @@ PERF_TEST_P(Size_MatType, Mat_Clone_Roi, } PERF_TEST_P(Size_MatType, Mat_CopyToWithMask, - testing::Combine(testing::Values(TYPICAL_MAT_SIZES), - testing::Values(CV_8UC1, CV_8UC2)) + testing::Combine(testing::Values(::perf::sz1080p, ::perf::szODD), + testing::Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_16UC1, CV_32SC1, CV_32FC4)) ) { const Size_MatType_t params = GetParam(); diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp index 9531270903..f4f18cb740 100644 --- a/modules/core/src/copy.cpp +++ b/modules/core/src/copy.cpp @@ -91,11 +91,7 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mste uchar* dst = (uchar*)_dst; int x = 0; #if CV_SIMD128 - if( hasSIMD128() - #if CV_SSE4_2 - && USE_SSE4_2 - #endif - ) { + { v_uint8x16 v_zero = v_setzero_u8(); for( ; x <= size.width - 16; x += 16 ) @@ -104,11 +100,7 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mste v_dst = v_load(dst + x), v_nmask = v_load(mask + x) == v_zero; - #if CV_SSE4_2 - v_dst = v_uint8x16(_mm_blendv_epi8(v_src.val, v_dst.val, v_nmask.val)); - #else v_dst = v_select(v_nmask, v_dst, v_src); - #endif v_store(dst + x, v_dst); } } @@ -130,11 +122,7 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mst ushort* dst = (ushort*)_dst; int x = 0; #if CV_SIMD128 - if( hasSIMD128() - #if CV_SSE4_2 - && USE_SSE4_2 - #endif - ) { + { v_uint8x16 v_zero = v_setzero_u8(); for( ; x <= size.width - 16; x += 16 ) @@ -146,13 +134,8 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mst v_uint8x16 v_nmask = v_load(mask + x) == v_zero; v_zip(v_nmask, v_nmask, v_nmask1, v_nmask2); - #if CV_SSE4_2 - v_dst1 = 
v_uint16x8(_mm_blendv_epi8(v_src1.val, v_dst1.val, v_nmask1.val)); - v_dst2 = v_uint16x8(_mm_blendv_epi8(v_src2.val, v_dst2.val, v_nmask2.val)); - #else v_dst1 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst1, v_src1); v_dst2 = v_select(v_reinterpret_as_u16(v_nmask2), v_dst2, v_src2); - #endif v_store(dst + x, v_dst1); v_store(dst + x + 8, v_dst2); } diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 43d8aaff4d..7a21c9eb56 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -657,8 +657,15 @@ template struct TheTest TheTest & test_mask() { - Data dataA, dataB, dataC, dataD(1), dataE(2); + typedef V_TypeTraits Traits; + typedef typename Traits::int_type int_type; + + Data dataA, dataB(0), dataC, dataD(1), dataE(2); dataA[1] *= (LaneType)-1; + const LaneType mask_one = Traits::reinterpret_from_int(~(typename Traits::uint_type)(0)); + dataB[1] = mask_one; + dataB[R::nlanes / 2] = mask_one; + dataB[R::nlanes - 1] = mask_one; dataC *= (LaneType)-1; R a = dataA, b = dataB, c = dataC, d = dataD, e = dataE; @@ -670,12 +677,9 @@ template struct TheTest EXPECT_EQ(true, v_check_all(c)); EXPECT_EQ(true, v_check_any(a)); - EXPECT_EQ(false, v_check_any(b)); + EXPECT_EQ(true, v_check_any(b)); EXPECT_EQ(true, v_check_any(c)); - typedef V_TypeTraits Traits; - typedef typename Traits::int_type int_type; - R f = v_select(b, d, e); Data resF = f; for (int i = 0; i < R::nlanes; ++i)