From 1b3db545a30f9b2df7331bdee9570f913c1217a9 Mon Sep 17 00:00:00 2001
From: Yuantao Feng
Date: Mon, 7 Apr 2025 15:56:02 +0800
Subject: [PATCH] Merge pull request #27145 from
 fengyuentau:4x/core/copyMask-simd

core: further vectorize copyTo with mask #27145

Merge with https://github.com/opencv/opencv_extra/pull/1247.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
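For context: the code paths touched by this patch back `cv::Mat::copyTo` with a mask, which copies only the pixels whose mask byte is non-zero. A minimal usage sketch of the operation being accelerated (illustrative only, not part of the committed patch):

```cpp
#include <opencv2/core.hpp>

int main()
{
    // CV_8UC3 is one of the layouts this patch vectorizes
    cv::Mat src(1080, 1920, CV_8UC3, cv::Scalar(10, 20, 30));
    cv::Mat dst = cv::Mat::zeros(1080, 1920, CV_8UC3);

    // One mask byte per pixel; non-zero selects the pixel for copying
    cv::Mat mask = cv::Mat::zeros(1080, 1920, CV_8UC1);
    mask(cv::Rect(400, 200, 800, 600)).setTo(255);

    src.copyTo(dst, mask);  // dst keeps its old values wherever mask == 0
    return 0;
}
```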
---
 modules/core/perf/perf_mat.cpp |   2 +-
 modules/core/src/copy.cpp      | 122 +++++++++++++++++++++++++++++++++
 2 files changed, 123 insertions(+), 1 deletion(-)

diff --git a/modules/core/perf/perf_mat.cpp b/modules/core/perf/perf_mat.cpp
index a179483503..277eb92c21 100644
--- a/modules/core/perf/perf_mat.cpp
+++ b/modules/core/perf/perf_mat.cpp
@@ -99,7 +99,7 @@ PERF_TEST_P(Size_MatType, Mat_Clone_Roi,
 
 PERF_TEST_P(Size_MatType, Mat_CopyToWithMask,
             testing::Combine(testing::Values(::perf::sz1080p, ::perf::szODD),
-                             testing::Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_16UC1, CV_32SC1, CV_32FC4))
+                             testing::Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_16UC1, CV_16UC3, CV_32SC1, CV_32SC2, CV_32FC4))
             )
 {
     const Size_MatType_t params = GetParam();
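All three new specializations in the copy.cpp diff below share one pattern: compare the per-pixel mask bytes against zero to build a negated lane mask (`v_nmask`), widen it with `v_zip` (each zip duplicates every byte, so one zip matches 16-bit lanes and two zips match 32-bit lanes), then blend with `v_select(v_nmask, v_dst, v_src)`, which keeps the destination lane wherever the mask byte was zero. A scalar model of the 16-bit 3-channel case, with a hypothetical helper name, makes the bitwise blend explicit:

```cpp
#include <cstdint>

// Scalar model (hypothetical helper, for illustration): what the SIMD body
// of the 16-bit 3-channel specialization below computes per pixel.
static void copy_masked_16uc3_ref(const uint16_t* src, const uint8_t* mask,
                                  uint16_t* dst, int width)
{
    for (int x = 0; x < width; x++)
    {
        // v_eq(mask, 0): all-ones where the pixel must NOT be copied;
        // one v_zip duplicates that byte so it fills a whole 16-bit lane
        const uint16_t nmask = mask[x] ? (uint16_t)0 : (uint16_t)0xFFFF;
        for (int c = 0; c < 3; c++)
        {
            const uint16_t s = src[3 * x + c], d = dst[3 * x + c];
            // v_select(nmask, d, s) per lane is this bitwise blend
            dst[3 * x + c] = (uint16_t)((nmask & d) | ((uint16_t)~nmask & s));
        }
    }
}
```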
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 9f4fa8604a..68d6b938e7 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -12,6 +12,7 @@
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2014, Itseez Inc., all rights reserved.
+// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -178,6 +179,41 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
     }
 }
 
+template<> void
+copyMask_<Vec3b>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
+{
+    for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
+    {
+        const uchar* src = (const uchar*)_src;
+        uchar* dst = (uchar*)_dst;
+        int x = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        for( ; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes() )
+        {
+            v_uint8 v_nmask = v_eq(vx_load(mask + x), vx_setzero_u8());
+
+            v_uint8 v_src0, v_src1, v_src2;
+            v_uint8 v_dst0, v_dst1, v_dst2;
+            v_load_deinterleave(src + 3 * x, v_src0, v_src1, v_src2);
+            v_load_deinterleave(dst + 3 * x, v_dst0, v_dst1, v_dst2);
+
+            v_dst0 = v_select(v_nmask, v_dst0, v_src0);
+            v_dst1 = v_select(v_nmask, v_dst1, v_src1);
+            v_dst2 = v_select(v_nmask, v_dst2, v_src2);
+
+            v_store_interleave(dst + 3 * x, v_dst0, v_dst1, v_dst2);
+        }
+        vx_cleanup();
+#endif
+        for( ; x < size.width; x++ )
+            if( mask[x] ) {
+                dst[3 * x] = src[3 * x];
+                dst[3 * x + 1] = src[3 * x + 1];
+                dst[3 * x + 2] = src[3 * x + 2];
+            }
+    }
+}
+
 template<> void
 copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
 {
@@ -215,6 +251,92 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst
     }
 }
 
+template<> void
+copyMask_<Vec3s>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
+{
+    for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
+    {
+        const ushort* src = (const ushort*)_src;
+        ushort* dst = (ushort*)_dst;
+        int x = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        for( ; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes() )
+        {
+            v_uint8 v_nmask = v_eq(vx_load(mask + x), vx_setzero_u8());
+            v_uint8 v_nmask0, v_nmask1;
+            v_zip(v_nmask, v_nmask, v_nmask0, v_nmask1);
+
+            v_uint16 v_src0, v_src1, v_src2;
+            v_uint16 v_dst0, v_dst1, v_dst2;
+            v_load_deinterleave(src + 3 * x, v_src0, v_src1, v_src2);
+            v_load_deinterleave(dst + 3 * x, v_dst0, v_dst1, v_dst2);
+            v_uint16 v_src3, v_src4, v_src5;
+            v_uint16 v_dst3, v_dst4, v_dst5;
+            v_load_deinterleave(src + 3 * (x + VTraits<v_uint16>::vlanes()), v_src3, v_src4, v_src5);
+            v_load_deinterleave(dst + 3 * (x + VTraits<v_uint16>::vlanes()), v_dst3, v_dst4, v_dst5);
+
+            v_dst0 = v_select(v_reinterpret_as_u16(v_nmask0), v_dst0, v_src0);
+            v_dst1 = v_select(v_reinterpret_as_u16(v_nmask0), v_dst1, v_src1);
+            v_dst2 = v_select(v_reinterpret_as_u16(v_nmask0), v_dst2, v_src2);
+            v_dst3 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst3, v_src3);
+            v_dst4 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst4, v_src4);
+            v_dst5 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst5, v_src5);
+
+            v_store_interleave(dst + 3 * x, v_dst0, v_dst1, v_dst2);
+            v_store_interleave(dst + 3 * (x + VTraits<v_uint16>::vlanes()), v_dst3, v_dst4, v_dst5);
+        }
+        vx_cleanup();
+#endif
+        for( ; x < size.width; x++ )
+            if( mask[x] ) {
+                dst[3 * x] = src[3 * x];
+                dst[3 * x + 1] = src[3 * x + 1];
+                dst[3 * x + 2] = src[3 * x + 2];
+            }
+    }
+}
+
+template<> void
+copyMask_<int>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
+{
+    for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
+    {
+        const int* src = (const int*)_src;
+        int* dst = (int*)_dst;
+        int x = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        for (; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes())
+        {
+            v_int32 v_src0 = vx_load(src + x), v_dst0 = vx_load(dst + x);
+            v_int32 v_src1 = vx_load(src + x + VTraits<v_int32>::vlanes()), v_dst1 = vx_load(dst + x + VTraits<v_int32>::vlanes());
+            v_int32 v_src2 = vx_load(src + x + 2 * VTraits<v_int32>::vlanes()), v_dst2 = vx_load(dst + x + 2 * VTraits<v_int32>::vlanes());
+            v_int32 v_src3 = vx_load(src + x + 3 * VTraits<v_int32>::vlanes()), v_dst3 = vx_load(dst + x + 3 * VTraits<v_int32>::vlanes());
+
+            v_uint8 v_nmask = v_eq(vx_load(mask + x), vx_setzero_u8());
+            v_uint8 v_nmask0, v_nmask1;
+            v_zip(v_nmask, v_nmask, v_nmask0, v_nmask1);
+            v_uint8 v_nmask00, v_nmask01, v_nmask10, v_nmask11;
+            v_zip(v_nmask0, v_nmask0, v_nmask00, v_nmask01);
+            v_zip(v_nmask1, v_nmask1, v_nmask10, v_nmask11);
+
+            v_dst0 = v_select(v_reinterpret_as_s32(v_nmask00), v_dst0, v_src0);
+            v_dst1 = v_select(v_reinterpret_as_s32(v_nmask01), v_dst1, v_src1);
+            v_dst2 = v_select(v_reinterpret_as_s32(v_nmask10), v_dst2, v_src2);
+            v_dst3 = v_select(v_reinterpret_as_s32(v_nmask11), v_dst3, v_src3);
+
+            vx_store(dst + x, v_dst0);
+            vx_store(dst + x + VTraits<v_int32>::vlanes(), v_dst1);
+            vx_store(dst + x + 2 * VTraits<v_int32>::vlanes(), v_dst2);
+            vx_store(dst + x + 3 * VTraits<v_int32>::vlanes(), v_dst3);
+        }
+        vx_cleanup();
+#endif
+        for (; x < size.width; x++)
+            if ( mask[x] )
+                dst[x] = src[x];
+    }
+}
+
 static void
 copyMaskGeneric(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size, void* _esz)
 {
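The checklist above points to accuracy and performance tests in opencv_extra (same branch name). Independently of those, a quick standalone sanity check for the newly vectorized layouts might look like this (a sketch, assuming a build that includes this patch; odd widths are chosen so both the SIMD body and the scalar tail loop run):

```cpp
#include <opencv2/core.hpp>
#include <cstring>
#include <iostream>

int main()
{
    cv::RNG rng(0x12345);
    for (int type : {CV_8UC3, CV_16UC3, CV_32SC1})
    {
        // Odd width: the SIMD loop handles the bulk, the scalar tail the rest
        cv::Mat src(123, 1001, type), dst = cv::Mat::zeros(123, 1001, type);
        cv::Mat mask(123, 1001, CV_8UC1), ref = dst.clone();
        rng.fill(src, cv::RNG::UNIFORM, 0, 256);
        rng.fill(mask, cv::RNG::UNIFORM, 0, 2);  // each pixel: 0 or 1

        // Reference result: plain byte-wise per-pixel copy where mask is non-zero
        const size_t esz = src.elemSize();
        for (int y = 0; y < src.rows; y++)
        {
            const uchar* s = src.ptr(y);
            const uchar* m = mask.ptr(y);
            uchar* r = ref.ptr(y);
            for (int x = 0; x < src.cols; x++)
                if (m[x])
                    std::memcpy(r + x * esz, s + x * esz, esz);
        }

        src.copyTo(dst, mask);  // exercises the vectorized copyMask_ paths
        std::cout << "type " << type << ": "
                  << (cv::norm(dst, ref, cv::NORM_INF) == 0 ? "OK" : "FAIL")
                  << std::endl;
    }
    return 0;
}
```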