mirror of https://github.com/opencv/opencv.git
Merge pull request #27145 from fengyuentau:4x/core/copyMask-simd

core: further vectorize copyTo with mask

Merge with https://github.com/opencv/opencv_extra/pull/1247.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable. Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
parent 81859255ca
commit 1b3db545a3
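For context: the `copyMask_` kernels touched below back `cv::Mat::copyTo` when a mask argument is supplied, so each destination pixel is overwritten only where the corresponding mask byte is non-zero. A minimal usage sketch (sizes and values are illustrative, not taken from the patch):

```cpp
#include <opencv2/core.hpp>

int main()
{
    // a 3-channel 8-bit source, a zeroed destination, and a single-channel
    // 8-bit mask; sizes and values here are arbitrary illustration
    cv::Mat src(480, 640, CV_8UC3, cv::Scalar(255, 128, 0));
    cv::Mat dst(480, 640, CV_8UC3, cv::Scalar::all(0));
    cv::Mat mask = cv::Mat::zeros(480, 640, CV_8UC1);
    mask(cv::Rect(100, 100, 200, 200)).setTo(255);

    // copies src into dst only where mask != 0; with CV_8UC3 data this is
    // the copyMask_<Vec3b> path vectorized by this PR
    src.copyTo(dst, mask);
    return 0;
}
```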
```diff
@@ -99,7 +99,7 @@ PERF_TEST_P(Size_MatType, Mat_Clone_Roi,
 
 PERF_TEST_P(Size_MatType, Mat_CopyToWithMask,
             testing::Combine(testing::Values(::perf::sz1080p, ::perf::szODD),
-                             testing::Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_16UC1, CV_32SC1, CV_32FC4))
+                             testing::Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_16UC1, CV_16UC3, CV_32SC1, CV_32SC2, CV_32FC4))
             )
 {
     const Size_MatType_t params = GetParam();
```
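This hunk only widens the benchmark's type list so `Mat_CopyToWithMask` also covers `CV_16UC3` and `CV_32SC2` inputs, matching the newly vectorized element layouts. The test body itself is unchanged and not part of this diff; for readers unfamiliar with OpenCV's perf framework, a sketch of what such a body conventionally looks like (assumed, reconstructed from the framework's idioms, not copied from the repository):

```cpp
#include "perf_precomp.hpp"  // assumes OpenCV's perf framework headers

namespace opencv_test {

// hypothetical benchmark with the same shape as Mat_CopyToWithMask:
// fill src and mask with random data, then time copyTo with the mask
PERF_TEST_P(Size_MatType, Mat_CopyToWithMask_Sketch,
            testing::Combine(testing::Values(::perf::sz1080p),
                             testing::Values(CV_8UC3)))
{
    const Size_MatType_t params = GetParam();
    const Size size = get<0>(params);
    const int type = get<1>(params);

    Mat src(size, type), dst(size, type), mask(size, CV_8UC1);
    declare.in(src, mask, WARMUP_RNG).out(dst);

    TEST_CYCLE() src.copyTo(dst, mask);

    SANITY_CHECK_NOTHING();
}

} // namespace opencv_test
```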
```diff
@@ -12,6 +12,7 @@
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2014, Itseez Inc., all rights reserved.
+// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
```
```diff
@@ -178,6 +179,41 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
     }
 }
 
+template<> void
+copyMask_<Vec3b>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
+{
+    for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
+    {
+        const uchar* src = (const uchar*)_src;
+        uchar* dst = (uchar*)_dst;
+        int x = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        for( ; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes() )
+        {
+            v_uint8 v_nmask = v_eq(vx_load(mask + x), vx_setzero_u8());
+
+            v_uint8 v_src0, v_src1, v_src2;
+            v_uint8 v_dst0, v_dst1, v_dst2;
+            v_load_deinterleave(src + 3 * x, v_src0, v_src1, v_src2);
+            v_load_deinterleave(dst + 3 * x, v_dst0, v_dst1, v_dst2);
+
+            v_dst0 = v_select(v_nmask, v_dst0, v_src0);
+            v_dst1 = v_select(v_nmask, v_dst1, v_src1);
+            v_dst2 = v_select(v_nmask, v_dst2, v_src2);
+
+            v_store_interleave(dst + 3 * x, v_dst0, v_dst1, v_dst2);
+        }
+        vx_cleanup();
+#endif
+        for( ; x < size.width; x++ )
+            if( mask[x] ) {
+                dst[3 * x] = src[3 * x];
+                dst[3 * x + 1] = src[3 * x + 1];
+                dst[3 * x + 2] = src[3 * x + 2];
+            }
+    }
+}
+
 template<> void
 copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
 {
```
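The new kernels are branch-free inside the SIMD loop: `v_eq(vx_load(mask + x), vx_setzero_u8())` builds an *inverted* mask whose lanes are all-ones exactly where the mask byte is zero, and `v_select(v_nmask, v_dst, v_src)` keeps the destination lane there while taking the source lane everywhere else. A scalar model of that per-lane logic (hypothetical helper, illustration only, not OpenCV code):

```cpp
#include <cstdint>

// scalar model of one lane of the SIMD loop above:
// v_eq(mask, 0) -> 0xFF where mask == 0, then v_select(nmask, dst, src)
static inline uint8_t masked_copy_lane(uint8_t mask_byte, uint8_t dst, uint8_t src)
{
    const uint8_t nmask = (mask_byte == 0) ? 0xFF : 0x00;  // inverted mask
    // bitwise select: keep dst where nmask is all-ones, take src elsewhere
    return (uint8_t)((dst & nmask) | (src & (uint8_t)~nmask));
}
```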
```diff
@@ -215,6 +251,92 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst
     }
 }
 
+template<> void
+copyMask_<Vec3s>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
+{
+    for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
+    {
+        const ushort* src = (const ushort*)_src;
+        ushort* dst = (ushort*)_dst;
+        int x = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        for( ; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes() )
+        {
+            v_uint8 v_nmask = v_eq(vx_load(mask + x), vx_setzero_u8());
+            v_uint8 v_nmask0, v_nmask1;
+            v_zip(v_nmask, v_nmask, v_nmask0, v_nmask1);
+
+            v_uint16 v_src0, v_src1, v_src2;
+            v_uint16 v_dst0, v_dst1, v_dst2;
+            v_load_deinterleave(src + 3 * x, v_src0, v_src1, v_src2);
+            v_load_deinterleave(dst + 3 * x, v_dst0, v_dst1, v_dst2);
+            v_uint16 v_src3, v_src4, v_src5;
+            v_uint16 v_dst3, v_dst4, v_dst5;
+            v_load_deinterleave(src + 3 * (x + VTraits<v_uint16>::vlanes()), v_src3, v_src4, v_src5);
+            v_load_deinterleave(dst + 3 * (x + VTraits<v_uint16>::vlanes()), v_dst3, v_dst4, v_dst5);
+
+            v_dst0 = v_select(v_reinterpret_as_u16(v_nmask0), v_dst0, v_src0);
+            v_dst1 = v_select(v_reinterpret_as_u16(v_nmask0), v_dst1, v_src1);
+            v_dst2 = v_select(v_reinterpret_as_u16(v_nmask0), v_dst2, v_src2);
+            v_dst3 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst3, v_src3);
+            v_dst4 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst4, v_src4);
+            v_dst5 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst5, v_src5);
+
+            v_store_interleave(dst + 3 * x, v_dst0, v_dst1, v_dst2);
+            v_store_interleave(dst + 3 * (x + VTraits<v_uint16>::vlanes()), v_dst3, v_dst4, v_dst5);
+        }
+        vx_cleanup();
+#endif
+        for( ; x < size.width; x++ )
+            if( mask[x] ) {
+                dst[3 * x] = src[3 * x];
+                dst[3 * x + 1] = src[3 * x + 1];
+                dst[3 * x + 2] = src[3 * x + 2];
+            }
+    }
+}
+
+template<> void
+copyMask_<int>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
+{
+    for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
+    {
+        const int* src = (const int*)_src;
+        int* dst = (int*)_dst;
+        int x = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        for (; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes())
+        {
+            v_int32 v_src0 = vx_load(src + x), v_dst0 = vx_load(dst + x);
+            v_int32 v_src1 = vx_load(src + x + VTraits<v_int32>::vlanes()), v_dst1 = vx_load(dst + x + VTraits<v_int32>::vlanes());
+            v_int32 v_src2 = vx_load(src + x + 2 * VTraits<v_int32>::vlanes()), v_dst2 = vx_load(dst + x + 2 * VTraits<v_int32>::vlanes());
+            v_int32 v_src3 = vx_load(src + x + 3 * VTraits<v_int32>::vlanes()), v_dst3 = vx_load(dst + x + 3 * VTraits<v_int32>::vlanes());
+
+            v_uint8 v_nmask = v_eq(vx_load(mask + x), vx_setzero_u8());
+            v_uint8 v_nmask0, v_nmask1;
+            v_zip(v_nmask, v_nmask, v_nmask0, v_nmask1);
+            v_uint8 v_nmask00, v_nmask01, v_nmask10, v_nmask11;
+            v_zip(v_nmask0, v_nmask0, v_nmask00, v_nmask01);
+            v_zip(v_nmask1, v_nmask1, v_nmask10, v_nmask11);
+
+            v_dst0 = v_select(v_reinterpret_as_s32(v_nmask00), v_dst0, v_src0);
+            v_dst1 = v_select(v_reinterpret_as_s32(v_nmask01), v_dst1, v_src1);
+            v_dst2 = v_select(v_reinterpret_as_s32(v_nmask10), v_dst2, v_src2);
+            v_dst3 = v_select(v_reinterpret_as_s32(v_nmask11), v_dst3, v_src3);
+
+            vx_store(dst + x, v_dst0);
+            vx_store(dst + x + VTraits<v_int32>::vlanes(), v_dst1);
+            vx_store(dst + x + 2 * VTraits<v_int32>::vlanes(), v_dst2);
+            vx_store(dst + x + 3 * VTraits<v_int32>::vlanes(), v_dst3);
+        }
+        vx_cleanup();
+#endif
+        for (; x < size.width; x++)
+            if ( mask[x] )
+                dst[x] = src[x];
+    }
+}
+
 static void
 copyMaskGeneric(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size, void* _esz)
 {
```
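For the wider element types, the 8-bit inverted mask has to be widened so that every 16- or 32-bit lane carries a full-width all-ones/all-zeros value before `v_select` can be applied. Interleaving the mask vector with itself via `v_zip` does exactly that: one round duplicates each byte (0xFF becomes 0xFFFF per 16-bit lane, used by the `Vec3s` kernel), and a second round quadruples it (0xFFFFFFFF per 32-bit lane, used by the `int` kernel). A scalar model of the widening, assuming the mask bytes are already 0x00 or 0xFF as produced by `v_eq` (hypothetical helpers, illustration only):

```cpp
#include <cstddef>
#include <cstdint>

// one v_zip(m, m) round in scalar form: each 0x00/0xFF byte is duplicated,
// giving a 0x0000/0xFFFF value per 16-bit lane (Vec3s kernel)
static void widen_mask_8_to_16(const uint8_t* m8, uint16_t* m16, size_t n)
{
    for (size_t i = 0; i < n; i++)
        m16[i] = (uint16_t)(m8[i] * 0x0101u);      // 0xFF -> 0xFFFF
}

// two rounds in scalar form: four copies of each byte, giving a
// 0x00000000/0xFFFFFFFF value per 32-bit lane (int kernel)
static void widen_mask_8_to_32(const uint8_t* m8, uint32_t* m32, size_t n)
{
    for (size_t i = 0; i < n; i++)
        m32[i] = m8[i] * 0x01010101u;              // 0xFF -> 0xFFFFFFFF
}
```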