Merge pull request #27145 from fengyuentau:4x/core/copyMask-simd

core: further vectorize copyTo with mask #27145

Merge with https://github.com/opencv/opencv_extra/pull/1247.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
Yuantao Feng 2025-04-07 15:56:02 +08:00 committed by GitHub
parent 81859255ca
commit 1b3db545a3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 123 additions and 1 deletions

View File

@ -99,7 +99,7 @@ PERF_TEST_P(Size_MatType, Mat_Clone_Roi,
PERF_TEST_P(Size_MatType, Mat_CopyToWithMask,
testing::Combine(testing::Values(::perf::sz1080p, ::perf::szODD),
testing::Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_16UC1, CV_32SC1, CV_32FC4))
testing::Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_16UC1, CV_16UC3, CV_32SC1, CV_32SC2, CV_32FC4))
)
{
const Size_MatType_t params = GetParam();

View File

@ -12,6 +12,7 @@
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
@ -178,6 +179,41 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
}
}
// Specialization of copyMask_ for 3-channel 8-bit pixels (Vec3b, e.g. BGR):
// for every pixel whose mask byte is non-zero, all 3 channel values are
// copied from src to dst; pixels with a zero mask byte keep their dst value.
template<> void
copyMask_<Vec3b>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
{
    // sstep/dstep/mstep are byte strides, so the row pointers advance as uchar*.
    for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
    {
        const uchar* srow = _src;
        uchar* drow = _dst;
        int x = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
        const int vstep = VTraits<v_uint8>::vlanes();
        for( ; x + vstep <= size.width; x += vstep )
        {
            // Inverted mask: lanes are all-ones where mask == 0 (keep dst there).
            v_uint8 keep = v_eq(vx_load(mask + x), vx_setzero_u8());
            v_uint8 s0, s1, s2;
            v_uint8 d0, d1, d2;
            v_load_deinterleave(srow + 3 * x, s0, s1, s2);
            v_load_deinterleave(drow + 3 * x, d0, d1, d2);
            // Per channel: keep dst where 'keep' is set, otherwise take src.
            d0 = v_select(keep, d0, s0);
            d1 = v_select(keep, d1, s1);
            d2 = v_select(keep, d2, s2);
            v_store_interleave(drow + 3 * x, d0, d1, d2);
        }
        vx_cleanup();
#endif
        // Scalar tail for the remaining (width % vlanes) pixels.
        for( ; x < size.width; x++ )
        {
            if( !mask[x] )
                continue;
            const int i = 3 * x;
            drow[i]     = srow[i];
            drow[i + 1] = srow[i + 1];
            drow[i + 2] = srow[i + 2];
        }
    }
}
template<> void
copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
{
@ -215,6 +251,92 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst
}
}
// Specialization of copyMask_ for 3-channel 16-bit elements (Vec3s; the copy
// is bit-level so it serves CV_16UC3 data as well): for every pixel whose
// mask byte is non-zero, the 3 channel values are copied from src to dst;
// masked-out dst pixels are left untouched.
template<> void
copyMask_<Vec3s>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
{
// Strides are in bytes, so rows advance as uchar* and are reinterpreted
// as ushort* only for element access.
for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
{
const ushort* src = (const ushort*)_src;
ushort* dst = (ushort*)_dst;
int x = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
// Process VTraits<v_uint8>::vlanes() pixels per iteration: one u8 mask
// vector covers two u16 vectors' worth of pixels, hence two deinterleaved
// channel triples are handled per step.
for( ; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes() )
{
// Inverted mask: lanes become all-ones where mask == 0 (i.e. keep dst).
v_uint8 v_nmask = v_eq(vx_load(mask + x), vx_setzero_u8());
v_uint8 v_nmask0, v_nmask1;
// Self-zip duplicates each mask byte, widening an 8-bit lane mask into
// a 16-bit lane mask (low half -> v_nmask0, high half -> v_nmask1).
v_zip(v_nmask, v_nmask, v_nmask0, v_nmask1);
v_uint16 v_src0, v_src1, v_src2;
v_uint16 v_dst0, v_dst1, v_dst2;
// First u16-vector's worth of pixels...
v_load_deinterleave(src + 3 * x, v_src0, v_src1, v_src2);
v_load_deinterleave(dst + 3 * x, v_dst0, v_dst1, v_dst2);
v_uint16 v_src3, v_src4, v_src5;
v_uint16 v_dst3, v_dst4, v_dst5;
// ...and the second, offset by one u16 vector of pixels (x3 channels).
v_load_deinterleave(src + 3 * (x + VTraits<v_uint16>::vlanes()), v_src3, v_src4, v_src5);
v_load_deinterleave(dst + 3 * (x + VTraits<v_uint16>::vlanes()), v_dst3, v_dst4, v_dst5);
// v_select keeps the dst lane where the (inverted) mask is set, else src.
v_dst0 = v_select(v_reinterpret_as_u16(v_nmask0), v_dst0, v_src0);
v_dst1 = v_select(v_reinterpret_as_u16(v_nmask0), v_dst1, v_src1);
v_dst2 = v_select(v_reinterpret_as_u16(v_nmask0), v_dst2, v_src2);
v_dst3 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst3, v_src3);
v_dst4 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst4, v_src4);
v_dst5 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst5, v_src5);
v_store_interleave(dst + 3 * x, v_dst0, v_dst1, v_dst2);
v_store_interleave(dst + 3 * (x + VTraits<v_uint16>::vlanes()), v_dst3, v_dst4, v_dst5);
}
vx_cleanup();
#endif
// Scalar tail for the remaining (width % vlanes) pixels.
for( ; x < size.width; x++ )
if( mask[x] ) {
dst[3 * x] = src[3 * x];
dst[3 * x + 1] = src[3 * x + 1];
dst[3 * x + 2] = src[3 * x + 2];
}
}
}
// Specialization of copyMask_ for 1-channel 32-bit elements (int; bit-level
// copy, so it serves CV_32S and CV_32F single-channel data alike): copies
// src[x] to dst[x] wherever the mask byte is non-zero.
template<> void
copyMask_<int>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
{
// Strides are in bytes; rows advance as uchar* and are reinterpreted as
// int* for element access.
for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
{
const int* src = (const int*)_src;
int* dst = (int*)_dst;
int x = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
// Process VTraits<v_uint8>::vlanes() elements per iteration: one u8 mask
// vector covers four s32 vectors' worth of elements.
for (; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes())
{
v_int32 v_src0 = vx_load(src + x), v_dst0 = vx_load(dst + x);
v_int32 v_src1 = vx_load(src + x + VTraits<v_int32>::vlanes()), v_dst1 = vx_load(dst + x + VTraits<v_int32>::vlanes());
v_int32 v_src2 = vx_load(src + x + 2 * VTraits<v_int32>::vlanes()), v_dst2 = vx_load(dst + x + 2 * VTraits<v_int32>::vlanes());
v_int32 v_src3 = vx_load(src + x + 3 * VTraits<v_int32>::vlanes()), v_dst3 = vx_load(dst + x + 3 * VTraits<v_int32>::vlanes());
// Inverted mask: lanes become all-ones where mask == 0 (i.e. keep dst).
v_uint8 v_nmask = v_eq(vx_load(mask + x), vx_setzero_u8());
v_uint8 v_nmask0, v_nmask1;
// Two rounds of self-zip duplicate each mask byte 4x, widening an 8-bit
// lane mask into a 32-bit lane mask (nmask00..nmask11 cover the four
// consecutive s32 vectors in order).
v_zip(v_nmask, v_nmask, v_nmask0, v_nmask1);
v_uint8 v_nmask00, v_nmask01, v_nmask10, v_nmask11;
v_zip(v_nmask0, v_nmask0, v_nmask00, v_nmask01);
v_zip(v_nmask1, v_nmask1, v_nmask10, v_nmask11);
// v_select keeps the dst lane where the (inverted) mask is set, else src.
v_dst0 = v_select(v_reinterpret_as_s32(v_nmask00), v_dst0, v_src0);
v_dst1 = v_select(v_reinterpret_as_s32(v_nmask01), v_dst1, v_src1);
v_dst2 = v_select(v_reinterpret_as_s32(v_nmask10), v_dst2, v_src2);
v_dst3 = v_select(v_reinterpret_as_s32(v_nmask11), v_dst3, v_src3);
vx_store(dst + x, v_dst0);
vx_store(dst + x + VTraits<v_int32>::vlanes(), v_dst1);
vx_store(dst + x + 2 * VTraits<v_int32>::vlanes(), v_dst2);
vx_store(dst + x + 3 * VTraits<v_int32>::vlanes(), v_dst3);
}
vx_cleanup();
#endif
// Scalar tail for the remaining (width % vlanes) elements.
for (; x < size.width; x++)
if ( mask[x] )
dst[x] = src[x];
}
}
static void
copyMaskGeneric(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size, void* _esz)
{