From fe59a5695f9afd9cbf02fd20a1551ed0d4dfeac8 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Mon, 27 Feb 2023 03:17:46 +0000
Subject: [PATCH] core(simd): 64-bit integer EQ/NE without misused 64F guard

---
 .../include/opencv2/core/hal/intrin_cpp.hpp   |  8 +-
 .../include/opencv2/core/hal/intrin_neon.hpp  | 54 +++++++++----
 modules/core/test/test_intrin_utils.hpp       | 78 +++++++++++--------
 3 files changed, 86 insertions(+), 54 deletions(-)
diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
index 46222140e6..9a97376898 100644
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -879,14 +879,10 @@ OPENCV_HAL_IMPL_CMP_OP(<=)
 For all types except 64-bit integer values. */
 OPENCV_HAL_IMPL_CMP_OP(>=)
 
-/** @brief Equal comparison
-
-For all types except 64-bit integer values. */
+/** @brief Equal comparison */
 OPENCV_HAL_IMPL_CMP_OP(==)
 
-/** @brief Not equal comparison
-
-For all types except 64-bit integer values. */
+/** @brief Not equal comparison */
 OPENCV_HAL_IMPL_CMP_OP(!=)
 
 template<int n>
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index 5792694a40..3897cee12b 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -1038,18 +1038,6 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
 #endif
 
-#if CV_SIMD128_64F
-inline int64x2_t vmvnq_s64(int64x2_t a)
-{
-    int64x2_t vx = vreinterpretq_s64_u32(vdupq_n_u32(0xFFFFFFFF));
-    return veorq_s64(a, vx);
-}
-inline uint64x2_t vmvnq_u64(uint64x2_t a)
-{
-    uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
-    return veorq_u64(a, vx);
-}
-#endif
 #define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
@@ -1071,9 +1059,47 @@ OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
+#if defined(__aarch64__) || defined(_M_ARM64)
+static inline uint64x2_t vmvnq_u64(uint64x2_t a)
+{
+    uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
+    return veorq_u64(a, vx);
+}
+// 64-bit integers get only == and != here, written out by hand rather than via
+// OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2 / v_int64x2, ...), which was used before this change.
+static inline v_uint64x2 operator == (const v_uint64x2& a, const v_uint64x2& b)
+{ return v_uint64x2(vceqq_u64(a.val, b.val)); }
+static inline v_uint64x2 operator != (const v_uint64x2& a, const v_uint64x2& b)
+{ return v_uint64x2(vmvnq_u64(vceqq_u64(a.val, b.val))); }
+static inline v_int64x2 operator == (const v_int64x2& a, const v_int64x2& b)
+{ return v_int64x2(vreinterpretq_s64_u64(vceqq_s64(a.val, b.val))); }
+static inline v_int64x2 operator != (const v_int64x2& a, const v_int64x2& b)
+{ return v_int64x2(vreinterpretq_s64_u64(vmvnq_u64(vceqq_s64(a.val, b.val)))); }
+#else
+static inline v_uint64x2 operator == (const v_uint64x2& a, const v_uint64x2& b)
+{
+    uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return v_uint64x2(vreinterpretq_u64_u32(vandq_u32(cmp, swapped)));
+}
+static inline v_uint64x2 operator != (const v_uint64x2& a, const v_uint64x2& b)
+{
+    uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    uint64x2_t v_eq = vreinterpretq_u64_u32(vandq_u32(cmp, swapped));
+    uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
+    return v_uint64x2(veorq_u64(v_eq, vx));
+}
+static inline v_int64x2 operator == (const v_int64x2& a, const v_int64x2& b)
+{
+    return v_reinterpret_as_s64(v_reinterpret_as_u64(a) == v_reinterpret_as_u64(b));
+}
+static inline v_int64x2 operator != (const v_int64x2& a, const v_int64x2& b)
+{
+    return v_reinterpret_as_s64(v_reinterpret_as_u64(a) != v_reinterpret_as_u64(b));
+}
+#endif
 #if CV_SIMD128_64F
-OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
-OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
 #endif
 
diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
index 3f196f1342..da1f26790c 100644
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -97,7 +97,7 @@ template <typename R> struct Data
     {
         *this = r;
     }
-    operator R ()
+    operator R () const
     {
         return initializer<R::nlanes>().init(*this);
     }
@@ -1559,11 +1559,34 @@ template<typename R> struct TheTest
     }
 #endif
 
-#if CV_SIMD_64F
+    void do_check_cmp64(const Data<R>& dataA, const Data<R>& dataB)
+    {
+        R a = dataA;
+        R b = dataB;
+
+        Data<R> dataEQ = (a == b);
+        Data<R> dataNE = (a != b);
+
+        for (int i = 0; i < R::nlanes; ++i)
+        {
+            SCOPED_TRACE(cv::format("i=%d", i));
+            if (cvtest::debugLevel > 0) cout << "i=" << i << " ( " << dataA[i] << " vs " << dataB[i] << " ): eq=" << dataEQ[i] << " ne=" << dataNE[i] << endl;
+            EXPECT_NE((LaneType)dataEQ[i], (LaneType)dataNE[i]);
+            if (dataA[i] == dataB[i])
+                EXPECT_EQ((LaneType)-1, (LaneType)dataEQ[i]);
+            else
+                EXPECT_EQ((LaneType)0, (LaneType)dataEQ[i]);
+            if (dataA[i] != dataB[i])
+                EXPECT_EQ((LaneType)-1, (LaneType)dataNE[i]);
+            else
+                EXPECT_EQ((LaneType)0, (LaneType)dataNE[i]);
+        }
+    }
+
     TheTest & test_cmp64()
     {
-        Data<R> dataA, dataB;
-        R a = dataA, b = dataB;
+        Data<R> dataA;
+        Data<R> dataB;
 
         for (int i = 0; i < R::nlanes; ++i)
         {
@@ -1571,37 +1594,25 @@ template<typename R> struct TheTest
         }
         dataA[0]++;
 
-        a = dataA, b = dataB;
+        do_check_cmp64(dataA, dataB);
+        do_check_cmp64(dataB, dataA);
 
-        Data<R> resC = (a == b);
-        Data<R> resD = (a != b);
+        dataA[0] = dataB[0];
+        dataA[1] += (((LaneType)1) << 32);
+        do_check_cmp64(dataA, dataB);
+        do_check_cmp64(dataB, dataA);
 
-        for (int i = 0; i < R::nlanes; ++i)
-        {
-            SCOPED_TRACE(cv::format("i=%d", i));
-            EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0);
-            EXPECT_EQ(dataA[i] != dataB[i], resD[i] != 0);
-        }
+        dataA[0] = (LaneType)-1;
+        dataB[0] = (LaneType)-1;
+        dataA[1] = (LaneType)-1;
+        dataB[1] = (LaneType)2;
 
-        for (int i = 0; i < R::nlanes; ++i)
-        {
-            dataA[i] = dataB[i] = (LaneType)-1;
-        }
+        do_check_cmp64(dataA, dataB);
+        do_check_cmp64(dataB, dataA);
 
-        a = dataA, b = dataB;
-
-        resC = (a == b);
-        resD = (a != b);
-
-        for (int i = 0; i < R::nlanes; ++i)
-        {
-            SCOPED_TRACE(cv::format("i=%d", i));
-            EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0);
-            EXPECT_EQ(dataA[i] != dataB[i], resD[i] != 0);
-        }
         return *this;
     }
-#endif
+
 };
 
 
@@ -1837,9 +1848,8 @@ void test_hal_intrin_uint64()
     TheTest<v_uint64>()
         .test_loadstore()
         .test_addsub()
-#if CV_SIMD_64F
         .test_cmp64()
-#endif
+        // test_cmp() is intentionally not run: only ==/!= (covered by test_cmp64) are declared as supported for 64-bit integers
         .test_shift<1>().test_shift<8>()
         .test_logic()
         .test_reverse()
@@ -1857,9 +1867,8 @@ void test_hal_intrin_int64()
     TheTest<v_int64>()
         .test_loadstore()
         .test_addsub()
-#if CV_SIMD_64F
         .test_cmp64()
-#endif
+        // test_cmp() is intentionally not run: only ==/!= (covered by test_cmp64) are declared as supported for 64-bit integers
         .test_shift<1>().test_shift<8>()
         .test_logic()
         .test_reverse()
@@ -1936,7 +1945,8 @@ void test_hal_intrin_float64()
         .test_rotate<2>().test_rotate<3>()
 #endif
         ;
-
+#else
+    std::cout << "SKIP: CV_SIMD_64F is not available" << std::endl;
 #endif
 }