mirror of
https://github.com/opencv/opencv.git
synced 2025-08-06 14:36:36 +08:00
Merge pull request #24166 from hanliutong:rewrite-remaining
Rewrite Universal Intrinsic code: ImgProc (CV_SIMD_WIDTH related Part) #24166

Related PRs: #24058, #24132. The goal of this series of PRs is to migrate the SIMD code blocks in the opencv/modules/imgproc folder to the new Universal Intrinsic API.

This PR focuses on the code that uses the `CV_SIMD_WIDTH` macro. The macro is sometimes used for loop-tail processing, as in `box_filter.simd.hpp` and `morph.simd.hpp`:

```cpp
#if CV_SIMD
int i = 0;
for (; i <= n - v_uint16::nlanes; i += v_uint16::nlanes)
{
    // some universal intrinsic code, e.g. v_uint16 ...
}
#if CV_SIMD_WIDTH > 16
for (; i <= n - v_uint16x8::nlanes; i += v_uint16x8::nlanes)
{
    // handle the loop tail with 128-bit SIMD, e.g. v_uint16x8 ...
}
#endif // CV_SIMD_WIDTH
#endif // CV_SIMD
```

The main obstacle is that variable-length Universal Intrinsic backends cannot use 128-bit fixed-length data structures, so this PR handles the loop tail with a scalar loop instead.

This PR is marked as draft because the modification of `box_filter.simd.hpp` causes a compilation failure, which initially appears to be an internal compiler error in GCC:

```bash
box_filter.simd.hpp:1162:5: internal compiler error: Segmentation fault
 1162 | }
      | ^
0xe03883 crash_signal
        /wafer/share/gcc/gcc/toplev.cc:314
0x7ff261c4251f ???
        ./signal/../sysdeps/unix/sysv/linux/x86_64/libc_sigaction.c:0
0x6bde48 hash_set<rtl_ssa::set_info*, false, default_hash_traits<rtl_ssa::set_info*> >::iterator::operator*()
        /wafer/share/gcc/gcc/hash-set.h:125
0x6bde48 extract_single_source
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:1184
0x6bde48 extract_single_source
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:1174
0x119ad9e pass_vsetvl::propagate_avl() const
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:4087
0x119ceaf pass_vsetvl::execute(function*)
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:4344
0x119ceaf pass_vsetvl::execute(function*)
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:4325
Please submit a full bug report, with preprocessed source (by using -freport-bug).
Please include the complete backtrace with any bug report.
```

The branch compiles with Clang 16, and `opencv_test_imgproc` passes on QEMU.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in the opencv_extra repository, if applicable (patch to opencv_extra has the same branch name)
- [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
parent b870ad46bf
commit 320c0bf419
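The rewrite pattern applied throughout the diff below can be summarized in a few lines. This is a minimal sketch, not code from the PR: the function name and buffers are invented for illustration, but the intrinsics (`v_add`, `vx_load`, `VTraits<v_int32>::vlanes()`) and the `(CV_SIMD || CV_SIMD_SCALABLE)` guard are exactly the ones the diff introduces.

```cpp
#include <opencv2/core/hal/intrin.hpp>  // universal intrinsics, namespace cv

using namespace cv;

// Illustrative only: accumulate `b` into `acc`, written the way this PR
// rewrites the hot loops.
// Old style: v_store(acc + i, vx_load(acc + i) + vx_load(b + i)),
//            stepping by v_int32::nlanes (a compile-time constant).
// New style: explicit v_add(), lane count queried via VTraits<>::vlanes(),
//            which stays valid for variable-length (scalable) backends.
static void accumulate(int* acc, const int* b, int n)
{
    int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int step = VTraits<v_int32>::vlanes();
    for (; i <= n - step; i += step)
        v_store(acc + i, v_add(vx_load(acc + i), vx_load(b + i)));
#endif
    for (; i < n; i++)      // scalar loop replaces the old 128-bit tail
        acc[i] += b[i];
}
```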
box_filter.simd.hpp:

```diff
@@ -309,15 +309,15 @@ struct ColumnSum<int, uchar> :
         {
             const int* Sp = (const int*)src[0];
             int i = 0;
-#if CV_SIMD
-            for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for (; i <= width - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
             {
-                v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
             {
-                v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
             }
 #endif
 #endif
```
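For orientation, all the `ColumnSum` hunks vectorize the same sliding-window update. Below is a scalar outline of that step, an illustrative sketch rather than code from the file; the names follow the diff (`SUM` running column sums, `Sp` incoming row, `Sm` outgoing row, `D` output).

```cpp
#include <opencv2/core/saturate.hpp>

// Scalar outline of the ColumnSum<int, uchar> step that the hunks above and
// below vectorize: add the incoming row Sp, emit the scaled sum, then drop
// the outgoing row Sm from the window.
static void column_sum_step(int* SUM, const int* Sp, const int* Sm,
                            unsigned char* D, int width, double scale)
{
    for (int i = 0; i < width; i++)
    {
        int s0 = SUM[i] + Sp[i];
        D[i] = cv::saturate_cast<unsigned char>(s0 * scale);
        SUM[i] = s0 - Sm[i];
    }
}
```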
```diff
@@ -339,37 +339,37 @@ struct ColumnSum<int, uchar> :
         if( haveScale )
         {
             int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_float32 _v_scale = vx_setall_f32((float)_scale);
-            for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes )
+            for( ; i <= width - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));

-                v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale));
-                v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale));
+                v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), _v_scale)));
+                v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), _v_scale)));

                 v_uint16 v_dst = v_pack(v_s0d, v_s01d);
                 v_pack_store(D + i, v_dst);

-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             v_float32x4 v_scale = v_setall_f32((float)_scale);
             for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));

-                v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale));
-                v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale));
+                v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), v_scale)));
+                v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), v_scale)));

                 v_uint16x8 v_dst = v_pack(v_s0d, v_s01d);
                 v_pack_store(D + i, v_dst);

-                v_store(SUM + i, v_s0 - v_load(Sm + i));
-                v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
             }
 #endif
 #endif
@@ -383,29 +383,29 @@ struct ColumnSum<int, uchar> :
         else
         {
             int i = 0;
-#if CV_SIMD
-            for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width-VTraits<v_uint16>::vlanes(); i+=VTraits<v_uint16>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));

                 v_uint16 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01));
                 v_pack_store(D + i, v_dst);

-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));

                 v_uint16x8 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01));
                 v_pack_store(D + i, v_dst);

-                v_store(SUM + i, v_s0 - v_load(Sm + i));
-                v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
             }
 #endif
 #endif
@@ -480,15 +480,15 @@ public BaseColumnFilter
         {
             const ushort* Sp = (const ushort*)src[0];
             int i = 0;
-#if CV_SIMD
-            for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes() )
             {
-                v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width - v_uint16x8::nlanes; i += v_uint16x8::nlanes )
             {
-                v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
             }
 #endif
 #endif
@@ -510,27 +510,27 @@ public BaseColumnFilter
         if( haveScale )
         {
             int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_uint32 _ds4 = vx_setall_u32((unsigned)ds);
             v_uint16 _dd8 = vx_setall_u16((ushort)dd);

-            for( ; i <= width-v_uint8::nlanes; i+=v_uint8::nlanes )
+            for( ; i <= width-VTraits<v_uint8>::vlanes(); i+=VTraits<v_uint8>::vlanes() )
             {
                 v_uint16 _sm0 = vx_load(Sm + i);
-                v_uint16 _sm1 = vx_load(Sm + i + v_uint16::nlanes);
+                v_uint16 _sm1 = vx_load(Sm + i + VTraits<v_uint16>::vlanes());

                 v_uint16 _s0 = v_add_wrap(vx_load(SUM + i), vx_load(Sp + i));
-                v_uint16 _s1 = v_add_wrap(vx_load(SUM + i + v_uint16::nlanes), vx_load(Sp + i + v_uint16::nlanes));
+                v_uint16 _s1 = v_add_wrap(vx_load(SUM + i + VTraits<v_uint16>::vlanes()), vx_load(Sp + i + VTraits<v_uint16>::vlanes()));

                 v_uint32 _s00, _s01, _s10, _s11;

-                v_expand(_s0 + _dd8, _s00, _s01);
-                v_expand(_s1 + _dd8, _s10, _s11);
+                v_expand(v_add(_s0, _dd8), _s00, _s01);
+                v_expand(v_add(_s1, _dd8), _s10, _s11);

-                _s00 = v_shr<SHIFT>(_s00*_ds4);
-                _s01 = v_shr<SHIFT>(_s01*_ds4);
-                _s10 = v_shr<SHIFT>(_s10*_ds4);
-                _s11 = v_shr<SHIFT>(_s11*_ds4);
+                _s00 = v_shr<SHIFT>(v_mul(_s00, _ds4));
+                _s01 = v_shr<SHIFT>(v_mul(_s01, _ds4));
+                _s10 = v_shr<SHIFT>(v_mul(_s10, _ds4));
+                _s11 = v_shr<SHIFT>(v_mul(_s11, _ds4));

                 v_int16 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01));
                 v_int16 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11));
@@ -540,9 +540,9 @@ public BaseColumnFilter

                 v_store(D + i, v_pack_u(r0, r1));
                 v_store(SUM + i, _s0);
-                v_store(SUM + i + v_uint16::nlanes, _s1);
+                v_store(SUM + i + VTraits<v_uint16>::vlanes(), _s1);
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             v_uint32x4 ds4 = v_setall_u32((unsigned)ds);
             v_uint16x8 dd8 = v_setall_u16((ushort)dd);

@@ -556,13 +556,13 @@ public BaseColumnFilter

                 v_uint32x4 _s00, _s01, _s10, _s11;

-                v_expand(_s0 + dd8, _s00, _s01);
-                v_expand(_s1 + dd8, _s10, _s11);
+                v_expand(v_add(_s0, dd8), _s00, _s01);
+                v_expand(v_add(_s1, dd8), _s10, _s11);

-                _s00 = v_shr<SHIFT>(_s00*ds4);
-                _s01 = v_shr<SHIFT>(_s01*ds4);
-                _s10 = v_shr<SHIFT>(_s10*ds4);
-                _s11 = v_shr<SHIFT>(_s11*ds4);
+                _s00 = v_shr<SHIFT>(v_mul(_s00, ds4));
+                _s01 = v_shr<SHIFT>(v_mul(_s01, ds4));
+                _s10 = v_shr<SHIFT>(v_mul(_s10, ds4));
+                _s11 = v_shr<SHIFT>(v_mul(_s11, ds4));

                 v_int16x8 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01));
                 v_int16x8 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11));
```
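The `ushort` accumulator path above scales with integer arithmetic rather than floats: the window sum is offset by `dd`, multiplied by a precomputed factor `ds`, and shifted right by `SHIFT`. The scalar helper below is only an illustration of what `v_shr<SHIFT>(v_mul(...))` computes per lane; the exact derivation of `ds` and `dd` lives in the surrounding filter setup in box_filter.simd.hpp and is not reproduced here.

```cpp
// Per-lane meaning of v_shr<SHIFT>(v_mul(x, ds4)) in the hunks above:
// fixed-point scaling, (x * ds) >> SHIFT. Illustrative scalar form only.
static inline unsigned scale_fixed(unsigned x, unsigned ds, int shift)
{
    return (x * ds) >> shift;
}
```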
```diff
@@ -643,15 +643,15 @@ struct ColumnSum<int, short> :
         {
             const int* Sp = (const int*)src[0];
             i = 0;
-#if CV_SIMD
-            for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width - VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
             {
-                v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
             {
-                v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
             }
 #endif
 #endif
@@ -673,33 +673,33 @@ struct ColumnSum<int, short> :
         if( haveScale )
         {
             i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_float32 _v_scale = vx_setall_f32((float)_scale);
-            for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes )
+            for( ; i <= width-VTraits<v_int16>::vlanes(); i+=VTraits<v_int16>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));

-                v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale);
-                v_int32 v_s01d = v_round(v_cvt_f32(v_s01) * _v_scale);
+                v_int32 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), _v_scale));
+                v_int32 v_s01d = v_round(v_mul(v_cvt_f32(v_s01), _v_scale));
                 v_store(D + i, v_pack(v_s0d, v_s01d));

-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             v_float32x4 v_scale = v_setall_f32((float)_scale);
             for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));

-                v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale);
-                v_int32x4 v_s01d = v_round(v_cvt_f32(v_s01) * v_scale);
+                v_int32x4 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), v_scale));
+                v_int32x4 v_s01d = v_round(v_mul(v_cvt_f32(v_s01), v_scale));
                 v_store(D + i, v_pack(v_s0d, v_s01d));

-                v_store(SUM + i, v_s0 - v_load(Sm + i));
-                v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
             }
 #endif
 #endif
@@ -713,27 +713,27 @@ struct ColumnSum<int, short> :
         else
         {
             i = 0;
-#if CV_SIMD
-            for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes )
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, GCC 13 related
+            for( ; i <= width-VTraits<v_int16>::vlanes(); i+=VTraits<v_int16>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));

                 v_store(D + i, v_pack(v_s0, v_s01));

-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));

                 v_store(D + i, v_pack(v_s0, v_s01));

-                v_store(SUM + i, v_s0 - v_load(Sm + i));
-                v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
             }
 #endif
 #endif
@@ -792,15 +792,15 @@ struct ColumnSum<int, ushort> :
         {
             const int* Sp = (const int*)src[0];
             int i = 0;
-#if CV_SIMD
-            for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for (; i <= width - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
             {
-                v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
            for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
             {
-                v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
             }
 #endif
 #endif
@@ -822,33 +822,33 @@ struct ColumnSum<int, ushort> :
         if( haveScale )
         {
             int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_float32 _v_scale = vx_setall_f32((float)_scale);
-            for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+            for( ; i <= width-VTraits<v_uint16>::vlanes(); i+=VTraits<v_uint16>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));

-                v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale));
-                v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale));
+                v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), _v_scale)));
+                v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), _v_scale)));
                 v_store(D + i, v_pack(v_s0d, v_s01d));

-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             v_float32x4 v_scale = v_setall_f32((float)_scale);
             for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));

-                v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale));
-                v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale));
+                v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), v_scale)));
+                v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), v_scale)));
                 v_store(D + i, v_pack(v_s0d, v_s01d));

-                v_store(SUM + i, v_s0 - v_load(Sm + i));
-                v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
             }
 #endif
 #endif
@@ -862,27 +862,27 @@ struct ColumnSum<int, ushort> :
         else
         {
             int i = 0;
-#if CV_SIMD
-            for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width-VTraits<v_uint16>::vlanes(); i+=VTraits<v_uint16>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));

                 v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)));

-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));

                 v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)));

-                v_store(SUM + i, v_s0 - v_load(Sm + i));
-                v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
             }
 #endif
 #endif
@@ -939,15 +939,15 @@ struct ColumnSum<int, int> :
         {
             const int* Sp = (const int*)src[0];
             int i = 0;
-#if CV_SIMD
-            for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width - VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
             {
-                v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
             {
-                v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
             }
 #endif
 #endif
@@ -969,25 +969,25 @@ struct ColumnSum<int, int> :
         if( haveScale )
         {
             int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_float32 _v_scale = vx_setall_f32((float)_scale);
-            for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+            for( ; i <= width-VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), _v_scale));

                 v_store(D + i, v_s0d);
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             v_float32x4 v_scale = v_setall_f32((float)_scale);
             for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), v_scale));

                 v_store(D + i, v_s0d);
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
             }
 #endif
 #endif
@@ -1001,21 +1001,21 @@ struct ColumnSum<int, int> :
         else
         {
             int i = 0;
-#if CV_SIMD
-            for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width-VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));

                 v_store(D + i, v_s0);
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));

                 v_store(D + i, v_s0);
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
             }
 #endif
 #endif
@@ -1073,15 +1073,15 @@ struct ColumnSum<int, float> :
         {
             const int* Sp = (const int*)src[0];
             int i = 0;
-#if CV_SIMD
-            for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width - VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
             {
-                v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
             {
-                v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
             }
 #endif
 #endif
@@ -1105,21 +1105,21 @@ struct ColumnSum<int, float> :
         {
             int i = 0;

-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_float32 _v_scale = vx_setall_f32((float)_scale);
-            for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+            for (; i <= width - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_store(D + i, v_cvt_f32(v_s0) * _v_scale);
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_store(D + i, v_mul(v_cvt_f32(v_s0), _v_scale));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             v_float32x4 v_scale = v_setall_f32((float)_scale);
             for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_store(D + i, v_cvt_f32(v_s0) * v_scale);
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_store(D + i, v_mul(v_cvt_f32(v_s0), v_scale));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
             }
 #endif
 #endif
@@ -1134,19 +1134,19 @@ struct ColumnSum<int, float> :
         {
             int i = 0;

-#if CV_SIMD
-            for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width-VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
                 v_store(D + i, v_cvt_f32(v_s0));
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
                 v_store(D + i, v_cvt_f32(v_s0));
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
             }
 #endif
 #endif
```
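The `ColumnSum<int, float>` hunks above follow the same recipe with a float output. A minimal sketch of that path in the new API style (the function itself is invented for illustration, not taken from the PR):

```cpp
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

// Illustrative float-output path: convert the integer sums to float and
// scale with v_mul instead of operator*, then finish with a scalar tail.
static void scale_to_float(const int* sum, float* dst, int width, float scale)
{
    int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    v_float32 vscale = vx_setall_f32(scale);
    for (; i <= width - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
        v_store(dst + i, v_mul(v_cvt_f32(vx_load(sum + i)), vscale));
#endif
    for (; i < width; i++)      // scalar tail
        dst[i] = (float)sum[i] * scale;
}
```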
morph.simd.hpp:

```diff
@@ -106,12 +106,12 @@ struct MorphNoVec
     int operator()(uchar**, int, uchar*, int) const { return 0; }
 };

-#if CV_SIMD
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, GCC 13 related

 template<class VecUpdate> struct MorphRowVec
 {
     typedef typename VecUpdate::vtype vtype;
-    typedef typename vtype::lane_type stype;
+    typedef typename VTraits<vtype>::lane_type stype;
     MorphRowVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
     int operator()(const uchar* src, uchar* dst, int width, int cn) const
     {
@@ -121,52 +121,52 @@ template<class VecUpdate> struct MorphRowVec
         width *= cn;
         VecUpdate updateOp;

-        for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes )
+        for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes() )
         {
             vtype s0 = vx_load((const stype*)src + i);
-            vtype s1 = vx_load((const stype*)src + i + vtype::nlanes);
-            vtype s2 = vx_load((const stype*)src + i + 2*vtype::nlanes);
-            vtype s3 = vx_load((const stype*)src + i + 3*vtype::nlanes);
+            vtype s1 = vx_load((const stype*)src + i + VTraits<vtype>::vlanes());
+            vtype s2 = vx_load((const stype*)src + i + 2*VTraits<vtype>::vlanes());
+            vtype s3 = vx_load((const stype*)src + i + 3*VTraits<vtype>::vlanes());
             for (k = cn; k < _ksize; k += cn)
             {
                 s0 = updateOp(s0, vx_load((const stype*)src + i + k));
-                s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes));
-                s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*vtype::nlanes));
-                s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*vtype::nlanes));
+                s1 = updateOp(s1, vx_load((const stype*)src + i + k + VTraits<vtype>::vlanes()));
+                s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*VTraits<vtype>::vlanes()));
+                s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*VTraits<vtype>::vlanes()));
             }
             v_store((stype*)dst + i, s0);
-            v_store((stype*)dst + i + vtype::nlanes, s1);
-            v_store((stype*)dst + i + 2*vtype::nlanes, s2);
-            v_store((stype*)dst + i + 3*vtype::nlanes, s3);
+            v_store((stype*)dst + i + VTraits<vtype>::vlanes(), s1);
+            v_store((stype*)dst + i + 2*VTraits<vtype>::vlanes(), s2);
+            v_store((stype*)dst + i + 3*VTraits<vtype>::vlanes(), s3);
         }
-        if( i <= width - 2*vtype::nlanes )
+        if( i <= width - 2*VTraits<vtype>::vlanes() )
         {
             vtype s0 = vx_load((const stype*)src + i);
-            vtype s1 = vx_load((const stype*)src + i + vtype::nlanes);
+            vtype s1 = vx_load((const stype*)src + i + VTraits<vtype>::vlanes());
             for( k = cn; k < _ksize; k += cn )
             {
                 s0 = updateOp(s0, vx_load((const stype*)src + i + k));
-                s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes));
+                s1 = updateOp(s1, vx_load((const stype*)src + i + k + VTraits<vtype>::vlanes()));
             }
             v_store((stype*)dst + i, s0);
-            v_store((stype*)dst + i + vtype::nlanes, s1);
-            i += 2*vtype::nlanes;
+            v_store((stype*)dst + i + VTraits<vtype>::vlanes(), s1);
+            i += 2*VTraits<vtype>::vlanes();
         }
-        if( i <= width - vtype::nlanes )
+        if( i <= width - VTraits<vtype>::vlanes() )
         {
             vtype s = vx_load((const stype*)src + i);
             for( k = cn; k < _ksize; k += cn )
                 s = updateOp(s, vx_load((const stype*)src + i + k));
             v_store((stype*)dst + i, s);
-            i += vtype::nlanes;
+            i += VTraits<vtype>::vlanes();
         }
-        if( i <= width - vtype::nlanes/2 )
+        if( i <= width - VTraits<vtype>::vlanes()/2 )
         {
             vtype s = vx_load_low((const stype*)src + i);
             for( k = cn; k < _ksize; k += cn )
                 s = updateOp(s, vx_load_low((const stype*)src + i + k));
             v_store_low((stype*)dst + i, s);
-            i += vtype::nlanes/2;
+            i += VTraits<vtype>::vlanes()/2;
         }

         return i - i % cn;
@@ -179,7 +179,7 @@ template<class VecUpdate> struct MorphRowVec
 template<class VecUpdate> struct MorphColumnVec
 {
     typedef typename VecUpdate::vtype vtype;
-    typedef typename vtype::lane_type stype;
+    typedef typename VTraits<vtype>::lane_type stype;
     MorphColumnVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
     int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const
     {
@@ -189,7 +189,7 @@ template<class VecUpdate> struct MorphColumnVec
         VecUpdate updateOp;

         for( i = 0; i < count + ksize - 1; i++ )
-            CV_Assert( ((size_t)_src[i] & (CV_SIMD_WIDTH-1)) == 0 );
+            CV_Assert( ((size_t)_src[i] & (VTraits<v_uint8>::vlanes()-1)) == 0 );

         const stype** src = (const stype**)_src;
         stype* dst = (stype*)_dst;
@@ -197,58 +197,58 @@ template<class VecUpdate> struct MorphColumnVec

         for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 )
         {
-            for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes)
+            for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes())
             {
                 const stype* sptr = src[1] + i;
                 vtype s0 = vx_load_aligned(sptr);
-                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
-                vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes);
-                vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes);
+                vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
+                vtype s2 = vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes());
+                vtype s3 = vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes());

                 for( k = 2; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
                     s0 = updateOp(s0, vx_load_aligned(sptr));
-                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
-                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes));
-                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
+                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes()));
+                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes()));
                 }

                 sptr = src[0] + i;
                 v_store(dst + i, updateOp(s0, vx_load_aligned(sptr)));
-                v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
-                v_store(dst + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)));
-                v_store(dst + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)));
+                v_store(dst + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
+                v_store(dst + i + 2*VTraits<vtype>::vlanes(), updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes())));
+                v_store(dst + i + 3*VTraits<vtype>::vlanes(), updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes())));

                 sptr = src[k] + i;
                 v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr)));
-                v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
-                v_store(dst + dststep + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)));
-                v_store(dst + dststep + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)));
+                v_store(dst + dststep + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
+                v_store(dst + dststep + i + 2*VTraits<vtype>::vlanes(), updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes())));
+                v_store(dst + dststep + i + 3*VTraits<vtype>::vlanes(), updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes())));
             }
-            if( i <= width - 2*vtype::nlanes )
+            if( i <= width - 2*VTraits<vtype>::vlanes() )
             {
                 const stype* sptr = src[1] + i;
                 vtype s0 = vx_load_aligned(sptr);
-                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
+                vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());

                 for( k = 2; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
                     s0 = updateOp(s0, vx_load_aligned(sptr));
-                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
                 }

                 sptr = src[0] + i;
                 v_store(dst + i, updateOp(s0, vx_load_aligned(sptr)));
-                v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
+                v_store(dst + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));

                 sptr = src[k] + i;
                 v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr)));
-                v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
-                i += 2*vtype::nlanes;
+                v_store(dst + dststep + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
+                i += 2*VTraits<vtype>::vlanes();
             }
-            if( i <= width - vtype::nlanes )
+            if( i <= width - VTraits<vtype>::vlanes() )
             {
                 vtype s0 = vx_load_aligned(src[1] + i);

@@ -257,9 +257,9 @@ template<class VecUpdate> struct MorphColumnVec

                 v_store(dst + i, updateOp(s0, vx_load_aligned(src[0] + i)));
                 v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(src[k] + i)));
-                i += vtype::nlanes;
+                i += VTraits<vtype>::vlanes();
             }
-            if( i <= width - vtype::nlanes/2 )
+            if( i <= width - VTraits<vtype>::vlanes()/2 )
             {
                 vtype s0 = vx_load_low(src[1] + i);

@@ -268,66 +268,66 @@ template<class VecUpdate> struct MorphColumnVec

                 v_store_low(dst + i, updateOp(s0, vx_load_low(src[0] + i)));
                 v_store_low(dst + dststep + i, updateOp(s0, vx_load_low(src[k] + i)));
-                i += vtype::nlanes/2;
+                i += VTraits<vtype>::vlanes()/2;
             }
         }

         for( ; count > 0; count--, dst += dststep, src++ )
         {
-            for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes)
+            for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes())
             {
                 const stype* sptr = src[0] + i;
                 vtype s0 = vx_load_aligned(sptr);
-                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
-                vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes);
-                vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes);
+                vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
+                vtype s2 = vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes());
+                vtype s3 = vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes());

                 for( k = 1; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
                     s0 = updateOp(s0, vx_load_aligned(sptr));
-                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
-                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes));
-                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
+                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes()));
+                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes()));
                 }
                 v_store(dst + i, s0);
-                v_store(dst + i + vtype::nlanes, s1);
-                v_store(dst + i + 2*vtype::nlanes, s2);
-                v_store(dst + i + 3*vtype::nlanes, s3);
+                v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+                v_store(dst + i + 2*VTraits<vtype>::vlanes(), s2);
+                v_store(dst + i + 3*VTraits<vtype>::vlanes(), s3);
             }
-            if( i <= width - 2*vtype::nlanes )
+            if( i <= width - 2*VTraits<vtype>::vlanes() )
             {
                 const stype* sptr = src[0] + i;
                 vtype s0 = vx_load_aligned(sptr);
-                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
+                vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());

                 for( k = 1; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
                     s0 = updateOp(s0, vx_load_aligned(sptr));
-                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
                 }
                 v_store(dst + i, s0);
-                v_store(dst + i + vtype::nlanes, s1);
-                i += 2*vtype::nlanes;
+                v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+                i += 2*VTraits<vtype>::vlanes();
             }
-            if( i <= width - vtype::nlanes )
+            if( i <= width - VTraits<vtype>::vlanes() )
             {
                 vtype s0 = vx_load_aligned(src[0] + i);

                 for( k = 1; k < _ksize; k++ )
                     s0 = updateOp(s0, vx_load_aligned(src[k] + i));
                 v_store(dst + i, s0);
-                i += vtype::nlanes;
+                i += VTraits<vtype>::vlanes();
             }
-            if( i <= width - vtype::nlanes/2 )
+            if( i <= width - VTraits<vtype>::vlanes()/2 )
             {
                 vtype s0 = vx_load_low(src[0] + i);

                 for( k = 1; k < _ksize; k++ )
                     s0 = updateOp(s0, vx_load_low(src[k] + i));
                 v_store_low(dst + i, s0);
-                i += vtype::nlanes/2;
+                i += VTraits<vtype>::vlanes()/2;
             }
         }

@@ -341,7 +341,7 @@ template<class VecUpdate> struct MorphColumnVec
 template<class VecUpdate> struct MorphVec
 {
     typedef typename VecUpdate::vtype vtype;
-    typedef typename vtype::lane_type stype;
+    typedef typename VTraits<vtype>::lane_type stype;
     int operator()(uchar** _src, int nz, uchar* _dst, int width) const
     {
         CV_INSTRUMENT_REGION();
@@ -351,56 +351,56 @@ template<class VecUpdate> struct MorphVec
         int i, k;
         VecUpdate updateOp;

-        for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes )
+        for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes() )
         {
             const stype* sptr = src[0] + i;
             vtype s0 = vx_load(sptr);
-            vtype s1 = vx_load(sptr + vtype::nlanes);
-            vtype s2 = vx_load(sptr + 2*vtype::nlanes);
-            vtype s3 = vx_load(sptr + 3*vtype::nlanes);
+            vtype s1 = vx_load(sptr + VTraits<vtype>::vlanes());
+            vtype s2 = vx_load(sptr + 2*VTraits<vtype>::vlanes());
+            vtype s3 = vx_load(sptr + 3*VTraits<vtype>::vlanes());
             for( k = 1; k < nz; k++ )
             {
                 sptr = src[k] + i;
                 s0 = updateOp(s0, vx_load(sptr));
-                s1 = updateOp(s1, vx_load(sptr + vtype::nlanes));
-                s2 = updateOp(s2, vx_load(sptr + 2*vtype::nlanes));
-                s3 = updateOp(s3, vx_load(sptr + 3*vtype::nlanes));
+                s1 = updateOp(s1, vx_load(sptr + VTraits<vtype>::vlanes()));
+                s2 = updateOp(s2, vx_load(sptr + 2*VTraits<vtype>::vlanes()));
+                s3 = updateOp(s3, vx_load(sptr + 3*VTraits<vtype>::vlanes()));
             }
             v_store(dst + i, s0);
-            v_store(dst + i + vtype::nlanes, s1);
-            v_store(dst + i + 2*vtype::nlanes, s2);
-            v_store(dst + i + 3*vtype::nlanes, s3);
+            v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+            v_store(dst + i + 2*VTraits<vtype>::vlanes(), s2);
+            v_store(dst + i + 3*VTraits<vtype>::vlanes(), s3);
         }
-        if( i <= width - 2*vtype::nlanes )
+        if( i <= width - 2*VTraits<vtype>::vlanes() )
         {
             const stype* sptr = src[0] + i;
             vtype s0 = vx_load(sptr);
-            vtype s1 = vx_load(sptr + vtype::nlanes);
+            vtype s1 = vx_load(sptr + VTraits<vtype>::vlanes());
             for( k = 1; k < nz; k++ )
             {
                 sptr = src[k] + i;
                 s0 = updateOp(s0, vx_load(sptr));
-                s1 = updateOp(s1, vx_load(sptr + vtype::nlanes));
+                s1 = updateOp(s1, vx_load(sptr + VTraits<vtype>::vlanes()));
             }
             v_store(dst + i, s0);
-            v_store(dst + i + vtype::nlanes, s1);
-            i += 2*vtype::nlanes;
+            v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+            i += 2*VTraits<vtype>::vlanes();
         }
-        if( i <= width - vtype::nlanes )
+        if( i <= width - VTraits<vtype>::vlanes() )
        {
             vtype s0 = vx_load(src[0] + i);
             for( k = 1; k < nz; k++ )
                 s0 = updateOp(s0, vx_load(src[k] + i));
             v_store(dst + i, s0);
-            i += vtype::nlanes;
+            i += VTraits<vtype>::vlanes();
         }
-        if( i <= width - vtype::nlanes/2 )
+        if( i <= width - VTraits<vtype>::vlanes()/2 )
         {
             vtype s0 = vx_load_low(src[0] + i);
             for( k = 1; k < nz; k++ )
                 s0 = updateOp(s0, vx_load_low(src[k] + i));
             v_store_low(dst + i, s0);
-            i += vtype::nlanes/2;
+            i += VTraits<vtype>::vlanes()/2;
         }
         return i;
     }
```
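In the morphology hunks above, `updateOp` is a min for erosion and a max for dilation; the row filter keeps a running min/max over the kernel taps. Below is a reduced sketch in the new API style; the function and the fixed `uchar` element type are assumptions for illustration, whereas `MorphRowVec` generalizes over element types via `VTraits<vtype>::lane_type` and over min/max via `VecUpdate`.

```cpp
#include <opencv2/core/hal/intrin.hpp>
#include <algorithm>

using namespace cv;

// Illustrative row erosion: running minimum over ksize taps, stepping by the
// runtime lane count, finished with a scalar tail. `src` must hold at least
// width + ksize - 1 elements.
static void erode_row(const unsigned char* src, unsigned char* dst,
                      int width, int ksize)
{
    int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int step = VTraits<v_uint8>::vlanes();
    for (; i <= width - step; i += step)
    {
        v_uint8 s = vx_load(src + i);
        for (int k = 1; k < ksize; k++)
            s = v_min(s, vx_load(src + i + k));
        v_store(dst + i, s);
    }
#endif
    for (; i < width; i++)      // scalar tail
    {
        unsigned char m = src[i];
        for (int k = 1; k < ksize; k++)
            m = std::min(m, src[i + k]);
        dst[i] = m;
    }
}
```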
shapedescr.cpp:

```diff
@@ -879,14 +879,14 @@ static Rect pointSetBoundingRect( const Mat& points )
     if( npoints == 0 )
         return Rect();

-#if CV_SIMD
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, loop tail related.
     const int64_t* pts = points.ptr<int64_t>();

     if( !is_float )
     {
         v_int32 minval, maxval;
         minval = maxval = v_reinterpret_as_s32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y
-        for( i = 1; i <= npoints - v_int32::nlanes/2; i+= v_int32::nlanes/2 )
+        for( i = 1; i <= npoints - VTraits<v_int32>::vlanes()/2; i+= VTraits<v_int32>::vlanes()/2 )
         {
             v_int32 ptXY2 = v_reinterpret_as_s32(vx_load(pts + i));
             minval = v_min(ptXY2, minval);
@@ -894,22 +894,22 @@ static Rect pointSetBoundingRect( const Mat& points )
         }
         minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
         maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
-        if( i <= npoints - v_int32::nlanes/4 )
+        if( i <= npoints - VTraits<v_int32>::vlanes()/4 )
         {
             v_int32 ptXY = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i))));
             minval = v_min(ptXY, minval);
             maxval = v_max(ptXY, maxval);
-            i += v_int64::nlanes/2;
+            i += VTraits<v_int64>::vlanes()/2;
         }
-        for(int j = 16; j < CV_SIMD_WIDTH; j*=2)
+        for(int j = 16; j < VTraits<v_uint8>::vlanes(); j*=2)
         {
             minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
             maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
         }
-        xmin = minval.get0();
-        xmax = maxval.get0();
-        ymin = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))).get0();
-        ymax = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))).get0();
+        xmin = v_get0(minval);
+        xmax = v_get0(maxval);
+        ymin = v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
+        ymax = v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
 #if CV_SIMD_WIDTH > 16
         if( i < npoints )
         {
@@ -921,18 +921,18 @@ static Rect pointSetBoundingRect( const Mat& points )
                 minval2 = v_min(ptXY, minval2);
                 maxval2 = v_max(ptXY, maxval2);
             }
-            xmin = min(xmin, minval2.get0());
-            xmax = max(xmax, maxval2.get0());
-            ymin = min(ymin, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval2))).get0());
-            ymax = max(ymax, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0());
+            xmin = min(xmin, v_get0(minval2));
+            xmax = max(xmax, v_get0(maxval2));
+            ymin = min(ymin, v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval2)))));
+            ymax = max(ymax, v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval2)))));
         }
-#endif
+#endif // CV_SIMD
     }
     else
     {
         v_float32 minval, maxval;
         minval = maxval = v_reinterpret_as_f32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y
-        for( i = 1; i <= npoints - v_float32::nlanes/2; i+= v_float32::nlanes/2 )
+        for( i = 1; i <= npoints - VTraits<v_float32>::vlanes()/2; i+= VTraits<v_float32>::vlanes()/2 )
         {
             v_float32 ptXY2 = v_reinterpret_as_f32(vx_load(pts + i));
             minval = v_min(ptXY2, minval);
@@ -940,22 +940,22 @@ static Rect pointSetBoundingRect( const Mat& points )
         }
         minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))));
         maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))));
-        if( i <= npoints - v_float32::nlanes/4 )
+        if( i <= npoints - VTraits<v_float32>::vlanes()/4 )
         {
             v_float32 ptXY = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i))));
             minval = v_min(ptXY, minval);
             maxval = v_max(ptXY, maxval);
-            i += v_float32::nlanes/4;
+            i += VTraits<v_float32>::vlanes()/4;
         }
-        for(int j = 16; j < CV_SIMD_WIDTH; j*=2)
+        for(int j = 16; j < VTraits<v_uint8>::vlanes(); j*=2)
         {
             minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))));
             maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))));
         }
-        xmin = cvFloor(minval.get0());
-        xmax = cvFloor(maxval.get0());
-        ymin = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))).get0());
-        ymax = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))).get0());
+        xmin = cvFloor(v_get0(minval));
+        xmax = cvFloor(v_get0(maxval));
+        ymin = cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval)))));
+        ymax = cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval)))));
 #if CV_SIMD_WIDTH > 16
         if( i < npoints )
         {
@@ -967,10 +967,10 @@ static Rect pointSetBoundingRect( const Mat& points )
                 minval2 = v_min(ptXY, minval2);
                 maxval2 = v_max(ptXY, maxval2);
             }
-            xmin = min(xmin, cvFloor(minval2.get0()));
-            xmax = max(xmax, cvFloor(maxval2.get0()));
-            ymin = min(ymin, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval2))).get0()));
-            ymax = max(ymax, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0()));
+            xmin = min(xmin, cvFloor(v_get0(minval2)));
+            xmax = max(xmax, cvFloor(v_get0(maxval2)));
+            ymin = min(ymin, cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval2))))));
+            ymax = max(ymax, cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval2))))));
         }
 #endif
     }
```
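The `pointSetBoundingRect` hunks above also replace the `.get0()` member with the free function `v_get0()`, since scalable vector types cannot expose member accessors. A minimal sketch of that accessor change (the helper function is invented for illustration):

```cpp
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

// Lane 0 is read with the free function v_get0() in the new API; scalable
// types (e.g. the RVV backend) provide no .get0() member.
static int first_lane_min(const int* a, const int* b)
{
    v_int32 m = v_min(vx_load(a), vx_load(b));
    return v_get0(m);   // was: m.get0()
}
```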