From 65726e4244594a6eaf5218b26d43e7f52e980841 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.alekhin@intel.com>
Date: Tue, 17 Apr 2018 17:50:23 +0300
Subject: [PATCH] core(hal): improve v_select() SSE4.1+

v_select 'mask' is restricted to these values only: 0 or ~0 (0xff/0xffff/etc)
mask in accuracy test is updated.
---
 .../include/opencv2/core/hal/intrin_cpp.hpp   | 19 +++++++------
 .../include/opencv2/core/hal/intrin_sse.hpp   | 27 ++++++++++++++++++-
 modules/core/perf/perf_mat.cpp                |  4 +--
 modules/core/src/copy.cpp                     | 21 ++-------------
 modules/core/test/test_intrin_utils.hpp       | 14 ++++++----
 5 files changed, 50 insertions(+), 35 deletions(-)
diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
index 5518eace9b..269bf5616c 100644
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -1039,13 +1039,16 @@ template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
     return false;
 }
 
-/** @brief Bitwise select
+/** @brief Per-element select (blend operation)
 
-Return value will be built by combining values a and b using the following scheme:
-If the i-th bit in _mask_ is 1
-    select i-th bit from _a_
-else
-    select i-th bit from _b_ */
+Return value will be built by combining values _a_ and _b_ using the following scheme:
+    result[i] = mask[i] ? a[i] : b[i];
+
+@Note: _mask_ element values are restricted to these values:
+- 0: select element from _b_
+- 0xff/0xffff/etc: select element from _a_
+(fully compatible with bitwise-based operator)
+*/
 template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
                                                            const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
 {
@@ -1055,8 +1058,8 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>&
     for( int i = 0; i < n; i++ )
     {
         int_type m = Traits::reinterpret_int(mask.s[i]);
-        c.s[i] =  Traits::reinterpret_from_int((Traits::reinterpret_int(a.s[i]) & m)
-                                             | (Traits::reinterpret_int(b.s[i]) & ~m));
+        CV_DbgAssert(m == 0 || m == (~(int_type)0));  // restrict mask values: 0 or 0xff/0xffff/etc
+        c.s[i] = m ? a.s[i] : b.s[i];
     }
     return c;
 }
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index 62ffa3ec88..4a129d09c5 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -406,10 +406,14 @@ void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
 }
 
 
-// bit-wise "mask ? a : b"
+// byte-wise "mask ? a : b"
 inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
 {
+#if CV_SSE4_1
+    return _mm_blendv_epi8(b, a, mask);
+#else
     return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask));
+#endif
 }
 
 inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
@@ -1254,6 +1258,26 @@ OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND,
 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15)
 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3)
 
+#if CV_SSE4_1
+#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(cast_ret(_mm_blendv_##suffix(cast(b.val), cast(a.val), cast(mask.val)))); \
+}
+
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
+// OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, TBD, TBD, pd)
+// OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, TBD, TBD, ps)
+OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP, ps)
+OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, OPENCV_HAL_NOP, OPENCV_HAL_NOP, pd)
+
+#else // CV_SSE4_1
+
 #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \
 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
 { \
@@ -1270,6 +1294,7 @@ OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
 // OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
 OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
 OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
+#endif
 
 #define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \
 inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \
diff --git a/modules/core/perf/perf_mat.cpp b/modules/core/perf/perf_mat.cpp
index 83f10e1add..325ef5fb7c 100644
--- a/modules/core/perf/perf_mat.cpp
+++ b/modules/core/perf/perf_mat.cpp
@@ -96,8 +96,8 @@ PERF_TEST_P(Size_MatType, Mat_Clone_Roi,
 }
 
 PERF_TEST_P(Size_MatType, Mat_CopyToWithMask,
-            testing::Combine(testing::Values(TYPICAL_MAT_SIZES),
-                             testing::Values(CV_8UC1, CV_8UC2))
+            testing::Combine(testing::Values(::perf::sz1080p, ::perf::szODD),
+                             testing::Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_16UC1, CV_32SC1, CV_32FC4))
             )
 {
     const Size_MatType_t params = GetParam();
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 9531270903..f4f18cb740 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -91,11 +91,7 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
         uchar* dst = (uchar*)_dst;
         int x = 0;
         #if CV_SIMD128
-        if( hasSIMD128()
-           #if CV_SSE4_2
-           && USE_SSE4_2
-           #endif
-        ) {
+        {
             v_uint8x16 v_zero = v_setzero_u8();
 
             for( ; x <= size.width - 16; x += 16 )
@@ -104,11 +100,7 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
                            v_dst   = v_load(dst  + x),
                            v_nmask = v_load(mask + x) == v_zero;
 
-                #if CV_SSE4_2
-                v_dst = v_uint8x16(_mm_blendv_epi8(v_src.val, v_dst.val, v_nmask.val));
-                #else
                 v_dst = v_select(v_nmask, v_dst, v_src);
-                #endif
                 v_store(dst + x, v_dst);
             }
         }
@@ -130,11 +122,7 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst
         ushort* dst = (ushort*)_dst;
         int x = 0;
         #if CV_SIMD128
-        if( hasSIMD128()
-          #if CV_SSE4_2
-          && USE_SSE4_2
-          #endif
-        ) {
+        {
             v_uint8x16 v_zero = v_setzero_u8();
 
             for( ; x <= size.width - 16; x += 16 )
@@ -146,13 +134,8 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst
                 v_uint8x16 v_nmask = v_load(mask + x) == v_zero;
                 v_zip(v_nmask, v_nmask, v_nmask1, v_nmask2);
 
-                #if CV_SSE4_2
-                v_dst1 = v_uint16x8(_mm_blendv_epi8(v_src1.val, v_dst1.val, v_nmask1.val));
-                v_dst2 = v_uint16x8(_mm_blendv_epi8(v_src2.val, v_dst2.val, v_nmask2.val));
-                #else
                 v_dst1 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst1, v_src1);
                 v_dst2 = v_select(v_reinterpret_as_u16(v_nmask2), v_dst2, v_src2);
-                #endif
                 v_store(dst + x, v_dst1);
                 v_store(dst + x + 8, v_dst2);
             }
diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
index 43d8aaff4d..7a21c9eb56 100644
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -657,8 +657,15 @@ template<typename R> struct TheTest
 
     TheTest & test_mask()
     {
-        Data<R> dataA, dataB, dataC, dataD(1), dataE(2);
+        typedef V_TypeTraits<LaneType> Traits;
+        typedef typename Traits::int_type int_type;
+
+        Data<R> dataA, dataB(0), dataC, dataD(1), dataE(2);
         dataA[1] *= (LaneType)-1;
+        const LaneType mask_one = Traits::reinterpret_from_int(~(typename Traits::uint_type)(0));
+        dataB[1] = mask_one;
+        dataB[R::nlanes / 2] = mask_one;
+        dataB[R::nlanes - 1] = mask_one;
         dataC *= (LaneType)-1;
         R a = dataA, b = dataB, c = dataC, d = dataD, e = dataE;
 
@@ -670,12 +677,9 @@ template<typename R> struct TheTest
         EXPECT_EQ(true, v_check_all(c));
 
         EXPECT_EQ(true, v_check_any(a));
-        EXPECT_EQ(false, v_check_any(b));
+        EXPECT_EQ(true, v_check_any(b));
         EXPECT_EQ(true, v_check_any(c));
 
-        typedef V_TypeTraits<LaneType> Traits;
-        typedef typename Traits::int_type int_type;
-
         R f = v_select(b, d, e);
         Data<R> resF = f;
         for (int i = 0; i < R::nlanes; ++i)