From 1b3db545a30f9b2df7331bdee9570f913c1217a9 Mon Sep 17 00:00:00 2001
From: Yuantao Feng
Date: Mon, 7 Apr 2025 15:56:02 +0800
Subject: [PATCH] Merge pull request #27145 from
 fengyuentau:4x/core/copyMask-simd

core: further vectorize copyTo with mask #27145

Merge with https://github.com/opencv/opencv_extra/pull/1247.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
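For context: the code paths touched by this patch back `cv::Mat::copyTo` with a mask, which copies only the pixels whose mask byte is non-zero. A minimal usage sketch of the operation being accelerated (illustrative only, not part of the committed patch):

```cpp
#include <opencv2/core.hpp>

int main()
{
    // CV_8UC3 is one of the layouts this patch vectorizes
    cv::Mat src(1080, 1920, CV_8UC3, cv::Scalar(10, 20, 30));
    cv::Mat dst = cv::Mat::zeros(1080, 1920, CV_8UC3);

    // One mask byte per pixel; non-zero selects the pixel for copying
    cv::Mat mask = cv::Mat::zeros(1080, 1920, CV_8UC1);
    mask(cv::Rect(400, 200, 800, 600)).setTo(255);

    src.copyTo(dst, mask);  // dst keeps its old values wherever mask == 0
    return 0;
}
```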
---
 modules/core/perf/perf_mat.cpp |   2 +-
 modules/core/src/copy.cpp      | 122 +++++++++++++++++++++++++++++++++
 2 files changed, 123 insertions(+), 1 deletion(-)

diff --git a/modules/core/perf/perf_mat.cpp b/modules/core/perf/perf_mat.cpp
index a179483503..277eb92c21 100644
--- a/modules/core/perf/perf_mat.cpp
+++ b/modules/core/perf/perf_mat.cpp
@@ -99,7 +99,7 @@ PERF_TEST_P(Size_MatType, Mat_Clone_Roi,
 
 PERF_TEST_P(Size_MatType, Mat_CopyToWithMask,
             testing::Combine(testing::Values(::perf::sz1080p, ::perf::szODD),
-                             testing::Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_16UC1, CV_32SC1, CV_32FC4))
+                             testing::Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_16UC1, CV_16UC3, CV_32SC1, CV_32SC2, CV_32FC4))
             )
 {
     const Size_MatType_t params = GetParam();
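All three new specializations in the copy.cpp diff below share one pattern: compare the per-pixel mask bytes against zero to build a negated lane mask (`v_nmask`), widen it with `v_zip` (each zip duplicates every byte, so one zip matches 16-bit lanes and two zips match 32-bit lanes), then blend with `v_select(v_nmask, v_dst, v_src)`, which keeps the destination lane wherever the mask byte was zero. A scalar model of the 16-bit 3-channel case, with a hypothetical helper name, makes the bitwise blend explicit:

```cpp
#include <cstdint>

// Scalar model (hypothetical helper, for illustration): what the SIMD body
// of the 16-bit 3-channel specialization below computes per pixel.
static void copy_masked_16uc3_ref(const uint16_t* src, const uint8_t* mask,
                                  uint16_t* dst, int width)
{
    for (int x = 0; x < width; x++)
    {
        // v_eq(mask, 0): all-ones where the pixel must NOT be copied;
        // one v_zip duplicates that byte so it fills a whole 16-bit lane
        const uint16_t nmask = mask[x] ? (uint16_t)0 : (uint16_t)0xFFFF;
        for (int c = 0; c < 3; c++)
        {
            const uint16_t s = src[3 * x + c], d = dst[3 * x + c];
            // v_select(nmask, d, s) per lane is this bitwise blend
            dst[3 * x + c] = (uint16_t)((nmask & d) | ((uint16_t)~nmask & s));
        }
    }
}
```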
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 9f4fa8604a..68d6b938e7 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -12,6 +12,7 @@
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2014, Itseez Inc., all rights reserved.
+// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -178,6 +179,41 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
     }
 }
 
+template<> void
+copyMask_<Vec3b>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
+{
+    for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
+    {
+        const uchar* src = (const uchar*)_src;
+        uchar* dst = (uchar*)_dst;
+        int x = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        for( ; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes() )
+        {
+            v_uint8 v_nmask = v_eq(vx_load(mask + x), vx_setzero_u8());
+
+            v_uint8 v_src0, v_src1, v_src2;
+            v_uint8 v_dst0, v_dst1, v_dst2;
+            v_load_deinterleave(src + 3 * x, v_src0, v_src1, v_src2);
+            v_load_deinterleave(dst + 3 * x, v_dst0, v_dst1, v_dst2);
+
+            v_dst0 = v_select(v_nmask, v_dst0, v_src0);
+            v_dst1 = v_select(v_nmask, v_dst1, v_src1);
+            v_dst2 = v_select(v_nmask, v_dst2, v_src2);
+
+            v_store_interleave(dst + 3 * x, v_dst0, v_dst1, v_dst2);
+        }
+        vx_cleanup();
+#endif
+        for( ; x < size.width; x++ )
+            if( mask[x] ) {
+                dst[3 * x] = src[3 * x];
+                dst[3 * x + 1] = src[3 * x + 1];
+                dst[3 * x + 2] = src[3 * x + 2];
+            }
+    }
+}
+
 template<> void
 copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
 {
@@ -215,6 +251,92 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst
     }
 }
 
+template<> void
+copyMask_<Vec3s>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
+{
+    for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
+    {
+        const ushort* src = (const ushort*)_src;
+        ushort* dst = (ushort*)_dst;
+        int x = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        for( ; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes() )
+        {
+            v_uint8 v_nmask = v_eq(vx_load(mask + x), vx_setzero_u8());
+            v_uint8 v_nmask0, v_nmask1;
+            v_zip(v_nmask, v_nmask, v_nmask0, v_nmask1);
+
+            v_uint16 v_src0, v_src1, v_src2;
+            v_uint16 v_dst0, v_dst1, v_dst2;
+            v_load_deinterleave(src + 3 * x, v_src0, v_src1, v_src2);
+            v_load_deinterleave(dst + 3 * x, v_dst0, v_dst1, v_dst2);
+            v_uint16 v_src3, v_src4, v_src5;
+            v_uint16 v_dst3, v_dst4, v_dst5;
+            v_load_deinterleave(src + 3 * (x + VTraits<v_uint16>::vlanes()), v_src3, v_src4, v_src5);
+            v_load_deinterleave(dst + 3 * (x + VTraits<v_uint16>::vlanes()), v_dst3, v_dst4, v_dst5);
+
+            v_dst0 = v_select(v_reinterpret_as_u16(v_nmask0), v_dst0, v_src0);
+            v_dst1 = v_select(v_reinterpret_as_u16(v_nmask0), v_dst1, v_src1);
+            v_dst2 = v_select(v_reinterpret_as_u16(v_nmask0), v_dst2, v_src2);
+            v_dst3 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst3, v_src3);
+            v_dst4 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst4, v_src4);
+            v_dst5 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst5, v_src5);
+
+            v_store_interleave(dst + 3 * x, v_dst0, v_dst1, v_dst2);
+            v_store_interleave(dst + 3 * (x + VTraits<v_uint16>::vlanes()), v_dst3, v_dst4, v_dst5);
+        }
+        vx_cleanup();
+#endif
+        for( ; x < size.width; x++ )
+            if( mask[x] ) {
+                dst[3 * x] = src[3 * x];
+                dst[3 * x + 1] = src[3 * x + 1];
+                dst[3 * x + 2] = src[3 * x + 2];
+            }
+    }
+}
+
+template<> void
+copyMask_<int>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
+{
+    for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
+    {
+        const int* src = (const int*)_src;
+        int* dst = (int*)_dst;
+        int x = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        for (; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes())
+        {
+            v_int32 v_src0 = vx_load(src + x), v_dst0 = vx_load(dst + x);
+            v_int32 v_src1 = vx_load(src + x + VTraits<v_int32>::vlanes()), v_dst1 = vx_load(dst + x + VTraits<v_int32>::vlanes());
+            v_int32 v_src2 = vx_load(src + x + 2 * VTraits<v_int32>::vlanes()), v_dst2 = vx_load(dst + x + 2 * VTraits<v_int32>::vlanes());
+            v_int32 v_src3 = vx_load(src + x + 3 * VTraits<v_int32>::vlanes()), v_dst3 = vx_load(dst + x + 3 * VTraits<v_int32>::vlanes());
+
+            v_uint8 v_nmask = v_eq(vx_load(mask + x), vx_setzero_u8());
+            v_uint8 v_nmask0, v_nmask1;
+            v_zip(v_nmask, v_nmask, v_nmask0, v_nmask1);
+            v_uint8 v_nmask00, v_nmask01, v_nmask10, v_nmask11;
+            v_zip(v_nmask0, v_nmask0, v_nmask00, v_nmask01);
+            v_zip(v_nmask1, v_nmask1, v_nmask10, v_nmask11);
+
+            v_dst0 = v_select(v_reinterpret_as_s32(v_nmask00), v_dst0, v_src0);
+            v_dst1 = v_select(v_reinterpret_as_s32(v_nmask01), v_dst1, v_src1);
+            v_dst2 = v_select(v_reinterpret_as_s32(v_nmask10), v_dst2, v_src2);
+            v_dst3 = v_select(v_reinterpret_as_s32(v_nmask11), v_dst3, v_src3);
+
+            vx_store(dst + x, v_dst0);
+            vx_store(dst + x + VTraits<v_int32>::vlanes(), v_dst1);
+            vx_store(dst + x + 2 * VTraits<v_int32>::vlanes(), v_dst2);
+            vx_store(dst + x + 3 * VTraits<v_int32>::vlanes(), v_dst3);
+        }
+        vx_cleanup();
+#endif
+        for (; x < size.width; x++)
+            if ( mask[x] )
+                dst[x] = src[x];
+    }
+}
+
 static void
 copyMaskGeneric(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size, void* _esz)
 {
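The checklist above points to accuracy and performance tests in opencv_extra (same branch name). Independently of those, a quick standalone sanity check for the newly vectorized layouts might look like this (a sketch, assuming a build that includes this patch; odd widths are chosen so both the SIMD body and the scalar tail loop run):

```cpp
#include <opencv2/core.hpp>
#include <cstring>
#include <iostream>

int main()
{
    cv::RNG rng(0x12345);
    for (int type : {CV_8UC3, CV_16UC3, CV_32SC1})
    {
        // Odd width: the SIMD loop handles the bulk, the scalar tail the rest
        cv::Mat src(123, 1001, type), dst = cv::Mat::zeros(123, 1001, type);
        cv::Mat mask(123, 1001, CV_8UC1), ref = dst.clone();
        rng.fill(src, cv::RNG::UNIFORM, 0, 256);
        rng.fill(mask, cv::RNG::UNIFORM, 0, 2);  // each pixel: 0 or 1

        // Reference result: plain byte-wise per-pixel copy where mask is non-zero
        const size_t esz = src.elemSize();
        for (int y = 0; y < src.rows; y++)
        {
            const uchar* s = src.ptr(y);
            const uchar* m = mask.ptr(y);
            uchar* r = ref.ptr(y);
            for (int x = 0; x < src.cols; x++)
                if (m[x])
                    std::memcpy(r + x * esz, s + x * esz, esz);
        }

        src.copyTo(dst, mask);  // exercises the vectorized copyMask_ paths
        std::cout << "type " << type << ": "
                  << (cv::norm(dst, ref, cv::NORM_INF) == 0 ? "OK" : "FAIL")
                  << std::endl;
    }
    return 0;
}
```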