diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index ee8310b5c5..88a002145a 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -745,7 +745,22 @@ namespace CV__SIMD_NAMESPACE { inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \ return v_add(f1 + f2, vf...); \ } + #define OPENCV_HAL_WRAP_SHIFT_OP(_Tpvec) \ + inline _Tpvec v_shr(const _Tpvec& a, int n) \ + { \ + return a >> n; \ + } \ + inline _Tpvec v_shl(const _Tpvec& a, int n) \ + { \ + return a << n; \ + } + OPENCV_HAL_WRAP_SHIFT_OP(v_uint16) + OPENCV_HAL_WRAP_SHIFT_OP(v_uint32) + OPENCV_HAL_WRAP_SHIFT_OP(v_uint64) + OPENCV_HAL_WRAP_SHIFT_OP(v_int16) + OPENCV_HAL_WRAP_SHIFT_OP(v_int32) + OPENCV_HAL_WRAP_SHIFT_OP(v_int64) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32) @@ -769,6 +784,12 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4) + OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x8) + OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x4) + OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x2) + OPENCV_HAL_WRAP_SHIFT_OP(v_int16x8) + OPENCV_HAL_WRAP_SHIFT_OP(v_int32x4) + OPENCV_HAL_WRAP_SHIFT_OP(v_int64x2) #if CV_SIMD_64F OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2) #endif @@ -784,6 +805,12 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8) + OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x16) + OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x8) + OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x4) + OPENCV_HAL_WRAP_SHIFT_OP(v_int16x16) + OPENCV_HAL_WRAP_SHIFT_OP(v_int32x8) + OPENCV_HAL_WRAP_SHIFT_OP(v_int64x4) #if CV_SIMD_64F OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4) #endif @@ -801,7 +828,9 @@ namespace CV__SIMD_NAMESPACE { inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \ { \ return a ^ b; \ - } \ + } + + #define OPENCV_HAL_WRAP_NOT_OP(_Tpvec) \ inline _Tpvec v_not(const _Tpvec& a) \ { \ return ~a; \ @@ -815,6 +844,18 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16) OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32) OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32) + OPENCV_HAL_WRAP_NOT_OP(v_uint8) + OPENCV_HAL_WRAP_NOT_OP(v_uint16) + OPENCV_HAL_WRAP_NOT_OP(v_uint32) + OPENCV_HAL_WRAP_NOT_OP(v_uint64) + OPENCV_HAL_WRAP_NOT_OP(v_int8) + OPENCV_HAL_WRAP_NOT_OP(v_int16) + OPENCV_HAL_WRAP_NOT_OP(v_int32) + OPENCV_HAL_WRAP_NOT_OP(v_int64) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64) + #endif #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x16) OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x8) @@ -824,6 +865,18 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x8) OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x4) OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x2) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x4) + OPENCV_HAL_WRAP_NOT_OP(v_uint8x16) + OPENCV_HAL_WRAP_NOT_OP(v_uint16x8) + OPENCV_HAL_WRAP_NOT_OP(v_uint32x4) + OPENCV_HAL_WRAP_NOT_OP(v_uint64x2) + OPENCV_HAL_WRAP_NOT_OP(v_int8x16) + OPENCV_HAL_WRAP_NOT_OP(v_int16x8) + OPENCV_HAL_WRAP_NOT_OP(v_int32x4) + OPENCV_HAL_WRAP_NOT_OP(v_int64x2) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x2) + #endif #endif #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x32) @@ -834,6 +887,18 @@ namespace CV__SIMD_NAMESPACE { 
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x16) OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x8) OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x4) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x8) + OPENCV_HAL_WRAP_NOT_OP(v_uint8x32) + OPENCV_HAL_WRAP_NOT_OP(v_uint16x16) + OPENCV_HAL_WRAP_NOT_OP(v_uint32x8) + OPENCV_HAL_WRAP_NOT_OP(v_uint64x4) + OPENCV_HAL_WRAP_NOT_OP(v_int8x32) + OPENCV_HAL_WRAP_NOT_OP(v_int16x16) + OPENCV_HAL_WRAP_NOT_OP(v_int32x8) + OPENCV_HAL_WRAP_NOT_OP(v_int64x4) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x4) + #endif #endif #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \ diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp index 7dd735f99a..914ad28978 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp @@ -45,6 +45,7 @@ OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m2_t, u8m2, vuint8m2_t, i8) OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m4_t, u8m4, vuint8m4_t, i8) OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m8_t, u8m8, vuint8m8_t, i8) OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vfloat32m1_t, f32m1, vuint32m1_t, i32) +OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint32m1_t, u32m1, vuint32m1_t, i32) #if CV_SIMD_SCALABLE_64F OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vfloat64m1_t, f64m1, vuint32mf2_t, i32) #endif diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp index 6c28b44f5b..a45c90cf90 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp @@ -475,6 +475,25 @@ OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1) OPENCV_HAL_IMPL_RVV_LUT(v_float64, double, mf2) #endif +#define OPENCV_HAL_IMPL_RVV_LUT_VEC(_Tpvec, _Tp) \ +inline _Tpvec v_lut(const _Tp* tab, const v_int32& vidx) \ +{ \ + v_uint32 vidx_ = vmul(vreinterpret_u32m1(vidx), sizeof(_Tp), VTraits::vlanes()); \ + return vloxei32(tab, vidx_, VTraits<_Tpvec>::vlanes()); \ +} +OPENCV_HAL_IMPL_RVV_LUT_VEC(v_float32, float) +OPENCV_HAL_IMPL_RVV_LUT_VEC(v_int32, int) +OPENCV_HAL_IMPL_RVV_LUT_VEC(v_uint32, unsigned) + +#if CV_SIMD_SCALABLE_64F +inline v_float64 v_lut(const double* tab, const v_int32& vidx) \ +{ \ + vuint32mf2_t vidx_ = vmul(vlmul_trunc_u32mf2(vreinterpret_u32m1(vidx)), sizeof(double), VTraits::vlanes()); \ + return vloxei32(tab, vidx_, VTraits::vlanes()); \ +} +#endif + + inline v_uint8 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); } inline v_uint8 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); } inline v_uint8 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); } @@ -690,23 +709,27 @@ inline v_float64 v_not (const v_float64& a) \ ////////////// Bitwise shifts ////////////// +/* Usage +1. v_shl(vec); +2. v_shl(vec, N); // instead of vec << N, when N is non-constant. 
+*/
 #define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, vl) \
-template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+template<int s = 0> inline _Tpvec v_shl(const _Tpvec& a, int n = s) \
 { \
     return _Tpvec(vsll(a, uint8_t(n), vl)); \
 } \
-template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+template<int s = 0> inline _Tpvec v_shr(const _Tpvec& a, int n = s) \
 { \
     return _Tpvec(vsrl(a, uint8_t(n), vl)); \
 }
 #define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, vl) \
-template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+template<int s = 0> inline _Tpvec v_shl(const _Tpvec& a, int n = s) \
 { \
     return _Tpvec(vsll(a, uint8_t(n), vl)); \
 } \
-template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+template<int s = 0> inline _Tpvec v_shr(const _Tpvec& a, int n = s) \
 { \
     return _Tpvec(vsra(a, uint8_t(n), vl)); \
 }
diff --git a/modules/imgproc/src/bilateral_filter.simd.hpp b/modules/imgproc/src/bilateral_filter.simd.hpp
index 0d2c394368..332b36646c 100644
--- a/modules/imgproc/src/bilateral_filter.simd.hpp
+++ b/modules/imgproc/src/bilateral_filter.simd.hpp
@@ -99,33 +99,33 @@ public:
             const uchar* ksptr2 = sptr + space_ofs[k+2];
             const uchar* ksptr3 = sptr + space_ofs[k+3];
             j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_float32 kweight0 = vx_setall_f32(space_weight[k]);
             v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
             v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
             v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
-            for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+            for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
             {
                 v_uint32 rval = vx_load_expand_q(sptr + j);
                 v_uint32 val = vx_load_expand_q(ksptr0 + j);
-                v_float32 w = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-                v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
+                v_float32 w = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
+                v_float32 v_wsum = v_add(vx_load_aligned(wsum + j), w);
                 v_float32 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j));
                 val = vx_load_expand_q(ksptr1 + j);
-                w = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-                v_wsum += w;
+                w = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
+                v_wsum = v_add(v_wsum, w);
                 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
                 val = vx_load_expand_q(ksptr2 + j);
-                w = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-                v_wsum += w;
+                w = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
+                v_wsum = v_add(v_wsum, w);
                 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
                 val = vx_load_expand_q(ksptr3 + j);
-                w = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-                v_wsum += w;
+                w = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
+                v_wsum = v_add(v_wsum, w);
                 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
                 v_store_aligned(wsum + j, v_wsum);
@@ -172,13 +172,13 @@ public:
         {
             const uchar* ksptr = sptr + space_ofs[k];
             j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
            v_float32 kweight = vx_setall_f32(space_weight[k]);
-            for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+            for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
             {
                 v_uint32 val = vx_load_expand_q(ksptr + j);
-                v_float32 w = kweight * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, vx_load_expand_q(sptr + j))));
-                v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
+                v_float32 w
= v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, vx_load_expand_q(sptr + j))))); + v_store_aligned(wsum + j, v_add(vx_load_aligned(wsum + j), w)); v_store_aligned(sum + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j))); } #endif @@ -191,10 +191,10 @@ public: } } j = 0; -#if CV_SIMD - for (; j <= size.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes) - v_pack_u_store(dptr + j, v_pack(v_round(vx_load_aligned(sum + j ) / vx_load_aligned(wsum + j )), - v_round(vx_load_aligned(sum + j + v_float32::nlanes) / vx_load_aligned(wsum + j + v_float32::nlanes)))); +#if (CV_SIMD || CV_SIMD_SCALABLE) + for (; j <= size.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes()) + v_pack_u_store(dptr + j, v_pack(v_round(v_div(vx_load_aligned(sum + j), vx_load_aligned(wsum + j))), + v_round(v_div(vx_load_aligned(sum + j + VTraits::vlanes()), vx_load_aligned(wsum + j + VTraits::vlanes()))))); #endif for (; j < size.width; j++) { @@ -221,13 +221,13 @@ public: const uchar* ksptr3 = sptr + space_ofs[k+3]; const uchar* rsptr = sptr; j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 kweight0 = vx_setall_f32(space_weight[k]); v_float32 kweight1 = vx_setall_f32(space_weight[k+1]); v_float32 kweight2 = vx_setall_f32(space_weight[k+2]); v_float32 kweight3 = vx_setall_f32(space_weight[k+3]); - for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, rsptr += 3*v_uint8::nlanes, - ksptr0 += 3*v_uint8::nlanes, ksptr1 += 3*v_uint8::nlanes, ksptr2 += 3*v_uint8::nlanes, ksptr3 += 3*v_uint8::nlanes) + for (; j <= size.width - VTraits::vlanes(); j += VTraits::vlanes(), rsptr += 3*VTraits::vlanes(), + ksptr0 += 3*VTraits::vlanes(), ksptr1 += 3*VTraits::vlanes(), ksptr2 += 3*VTraits::vlanes(), ksptr3 += 3*VTraits::vlanes()) { v_uint8 kb, kg, kr, rb, rg, rr; v_load_deinterleave(rsptr, rb, rg, rr); @@ -236,163 +236,163 @@ public: v_uint16 val0, val1, val2, val3, val4; v_expand(v_absdiff(kb, rb), val0, val1); v_expand(v_absdiff(kg, rg), val2, val3); - val0 += val2; val1 += val3; + val0 = v_add(val0, val2); val1 = v_add(val1, val3); v_expand(v_absdiff(kr, rr), val2, val3); - val0 += val2; val1 += val3; + val0 = v_add(val0, val2); val1 = v_add(val1, val3); v_uint32 vall, valh; v_expand(val0, vall, valh); - v_float32 w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - v_float32 w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); + v_float32 w0 = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(vall))); + v_float32 w1 = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(valh))); + v_store_aligned(wsum + j, v_add(w0, vx_load_aligned(wsum + j))); + v_store_aligned(wsum + j + VTraits::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits::vlanes()))); v_expand(kb, val0, val2); v_expand(val0, vall, valh); v_store_aligned(sum_b + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); + v_store_aligned(sum_b + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + VTraits::vlanes()))); v_expand(kg, val0, val3); v_expand(val0, vall, valh); v_store_aligned(sum_g + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g 
+ j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); + v_store_aligned(sum_g + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + VTraits::vlanes()))); v_expand(kr, val0, val4); v_expand(val0, vall, valh); v_store_aligned(sum_r + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j))); - v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); + v_store_aligned(sum_r + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + VTraits::vlanes()))); v_expand(val1, vall, valh); - w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes)); - v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes)); + w0 = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(vall))); + w1 = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(valh))); + v_store_aligned(wsum + j + 2 * VTraits::vlanes(), v_add(w0, vx_load_aligned(wsum + j + 2 * VTraits::vlanes()))); + v_store_aligned(wsum + j + 3 * VTraits::vlanes(), v_add(w1, vx_load_aligned(wsum + j + 3 * VTraits::vlanes()))); v_expand(val2, vall, valh); - v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_b + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_b + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * VTraits::vlanes()))); v_expand(val3, vall, valh); - v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_g + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_g + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * VTraits::vlanes()))); v_expand(val4, vall, valh); - v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2*v_float32::nlanes))); - v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3*v_float32::nlanes))); + v_store_aligned(sum_r + j + 2*VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2*VTraits::vlanes()))); + v_store_aligned(sum_r + j + 3*VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3*VTraits::vlanes()))); v_load_deinterleave(ksptr1, kb, kg, kr); 
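Every hunk in this file applies the same mechanical rewrite: the compile-time lane count v_float32::nlanes becomes a runtime query VTraits<v_float32>::vlanes(), and overloaded operators become the function-style wrappers v_add/v_mul/v_div, since the sizeless RVV types behind CV_SIMD_SCALABLE provide neither nlanes nor operator overloads. A minimal sketch of that style (the helper name scale_accumulate is hypothetical and not part of this patch, and it assumes the universal-intrinsics header can be included directly):

#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

// dst[j] += k * src[j], written in the scalable-friendly style this patch adopts
static void scale_accumulate(const float* src, float* dst, float k, int n)
{
    int j = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int vl = VTraits<v_float32>::vlanes();   // lane count is a runtime value on RVV
    v_float32 vk = vx_setall_f32(k);
    for (; j <= n - vl; j += vl)
    {
        v_float32 s = vx_load(src + j);
        v_store(dst + j, v_add(vx_load(dst + j), v_mul(s, vk)));  // no operator+ / operator*
    }
#endif
    for (; j < n; j++)   // scalar tail
        dst[j] += k * src[j];
}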
v_expand(v_absdiff(kb, rb), val0, val1); v_expand(v_absdiff(kg, rg), val2, val3); - val0 += val2; val1 += val3; + val0 = v_add(val0, val2); val1 = v_add(val1, val3); v_expand(v_absdiff(kr, rr), val2, val3); - val0 += val2; val1 += val3; + val0 = v_add(val0, val2); val1 = v_add(val1, val3); v_expand(val0, vall, valh); - w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); + w0 = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(vall))); + w1 = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(valh))); + v_store_aligned(wsum + j, v_add(w0, vx_load_aligned(wsum + j))); + v_store_aligned(wsum + j + VTraits::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits::vlanes()))); v_expand(kb, val0, val2); v_expand(val0, vall, valh); v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); + v_store_aligned(sum_b + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + VTraits::vlanes()))); v_expand(kg, val0, val3); v_expand(val0, vall, valh); v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); + v_store_aligned(sum_g + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + VTraits::vlanes()))); v_expand(kr, val0, val4); v_expand(val0, vall, valh); v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j))); - v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); + v_store_aligned(sum_r + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + VTraits::vlanes()))); v_expand(val1, vall, valh); - w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes)); - v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes)); + w0 = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(vall))); + w1 = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(valh))); + v_store_aligned(wsum + j + 2 * VTraits::vlanes(), v_add(w0, vx_load_aligned(wsum + j + 2 * VTraits::vlanes()))); + v_store_aligned(wsum + j + 3 * VTraits::vlanes(), v_add(w1, vx_load_aligned(wsum + j + 3 * VTraits::vlanes()))); v_expand(val2, vall, valh); - v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_b + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, 
vx_load_aligned(sum_b + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_b + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * VTraits::vlanes()))); v_expand(val3, vall, valh); - v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_g + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_g + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * VTraits::vlanes()))); v_expand(val4, vall, valh); - v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_r + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_r + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * VTraits::vlanes()))); v_load_deinterleave(ksptr2, kb, kg, kr); v_expand(v_absdiff(kb, rb), val0, val1); v_expand(v_absdiff(kg, rg), val2, val3); - val0 += val2; val1 += val3; + val0 = v_add(val0, val2); val1 = v_add(val1, val3); v_expand(v_absdiff(kr, rr), val2, val3); - val0 += val2; val1 += val3; + val0 = v_add(val0, val2); val1 = v_add(val1, val3); v_expand(val0, vall, valh); - w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); + w0 = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(vall))); + w1 = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(valh))); + v_store_aligned(wsum + j, v_add(w0, vx_load_aligned(wsum + j))); + v_store_aligned(wsum + j + VTraits::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits::vlanes()))); v_expand(kb, val0, val2); v_expand(val0, vall, valh); v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); + v_store_aligned(sum_b + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + VTraits::vlanes()))); v_expand(kg, val0, val3); v_expand(val0, vall, valh); v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); + v_store_aligned(sum_g + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + VTraits::vlanes()))); v_expand(kr, val0, val4); v_expand(val0, vall, valh); v_store_aligned(sum_r + j, 
v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j))); - v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); + v_store_aligned(sum_r + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + VTraits::vlanes()))); v_expand(val1, vall, valh); - w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes)); - v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes)); + w0 = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(vall))); + w1 = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(valh))); + v_store_aligned(wsum + j + 2 * VTraits::vlanes(), v_add(w0, vx_load_aligned(wsum + j + 2 * VTraits::vlanes()))); + v_store_aligned(wsum + j + 3 * VTraits::vlanes(), v_add(w1, vx_load_aligned(wsum + j + 3 * VTraits::vlanes()))); v_expand(val2, vall, valh); - v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_b + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_b + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * VTraits::vlanes()))); v_expand(val3, vall, valh); - v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_g + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_g + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * VTraits::vlanes()))); v_expand(val4, vall, valh); - v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_r + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_r + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * VTraits::vlanes()))); v_load_deinterleave(ksptr3, kb, kg, kr); v_expand(v_absdiff(kb, rb), val0, val1); v_expand(v_absdiff(kg, rg), val2, val3); - val0 += val2; val1 += val3; + val0 = v_add(val0, val2); val1 = v_add(val1, val3); v_expand(v_absdiff(kr, rr), val2, val3); - val0 += val2; val1 += val3; + val0 = v_add(val0, val2); val1 = v_add(val1, val3); v_expand(val0, vall, valh); - w0 = kweight3 * v_lut(color_weight, 
v_reinterpret_as_s32(vall)); - w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); + w0 = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(vall))); + w1 = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(valh))); + v_store_aligned(wsum + j, v_add(w0, vx_load_aligned(wsum + j))); + v_store_aligned(wsum + j + VTraits::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits::vlanes()))); v_expand(kb, val0, val2); v_expand(val0, vall, valh); v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); + v_store_aligned(sum_b + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + VTraits::vlanes()))); v_expand(kg, val0, val3); v_expand(val0, vall, valh); v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); + v_store_aligned(sum_g + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + VTraits::vlanes()))); v_expand(kr, val0, val4); v_expand(val0, vall, valh); v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j))); - v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); + v_store_aligned(sum_r + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + VTraits::vlanes()))); v_expand(val1, vall, valh); - w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes)); - v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes)); + w0 = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(vall))); + w1 = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(valh))); + v_store_aligned(wsum + j + 2 * VTraits::vlanes(), v_add(w0, vx_load_aligned(wsum + j + 2 * VTraits::vlanes()))); + v_store_aligned(wsum + j + 3 * VTraits::vlanes(), v_add(w1, vx_load_aligned(wsum + j + 3 * VTraits::vlanes()))); v_expand(val2, vall, valh); - v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_b + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_b + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * VTraits::vlanes()))); v_expand(val3, vall, valh); - v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 
2 * v_float32::nlanes))); - v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_g + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_g + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * VTraits::vlanes()))); v_expand(val4, vall, valh); - v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_r + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_r + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * VTraits::vlanes()))); } #endif #if CV_SIMD128 @@ -442,9 +442,9 @@ public: const uchar* ksptr = sptr + space_ofs[k]; const uchar* rsptr = sptr; j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 kweight = vx_setall_f32(space_weight[k]); - for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, ksptr += 3*v_uint8::nlanes, rsptr += 3*v_uint8::nlanes) + for (; j <= size.width - VTraits::vlanes(); j += VTraits::vlanes(), ksptr += 3*VTraits::vlanes(), rsptr += 3*VTraits::vlanes()) { v_uint8 kb, kg, kr, rb, rg, rr; v_load_deinterleave(ksptr, kb, kg, kr); @@ -456,39 +456,39 @@ public: v_expand(v_absdiff(kr, rr), r_l, r_h); v_uint32 val0, val1, val2, val3; - v_expand(b_l + g_l + r_l, val0, val1); - v_expand(b_h + g_h + r_h, val2, val3); + v_expand(v_add(v_add(b_l, g_l), r_l), val0, val1); + v_expand(v_add(v_add(b_h, g_h), r_h), val2, val3); v_expand(kb, b_l, b_h); v_expand(kg, g_l, g_h); v_expand(kr, r_l, r_h); - v_float32 w0 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val0)); - v_float32 w1 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val1)); - v_float32 w2 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val2)); - v_float32 w3 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val3)); - v_store_aligned(wsum + j , w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); - v_store_aligned(wsum + j + 2*v_float32::nlanes, w2 + vx_load_aligned(wsum + j + 2*v_float32::nlanes)); - v_store_aligned(wsum + j + 3*v_float32::nlanes, w3 + vx_load_aligned(wsum + j + 3*v_float32::nlanes)); + v_float32 w0 = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(val0))); + v_float32 w1 = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(val1))); + v_float32 w2 = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(val2))); + v_float32 w3 = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(val3))); + v_store_aligned(wsum + j , v_add(w0, vx_load_aligned(wsum + j))); + v_store_aligned(wsum + j + VTraits::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits::vlanes()))); + v_store_aligned(wsum + j + 2*VTraits::vlanes(), v_add(w2, vx_load_aligned(wsum + j + 2 * VTraits::vlanes()))); + v_store_aligned(wsum + j + 3*VTraits::vlanes(), v_add(w3, vx_load_aligned(wsum + j + 3 * VTraits::vlanes()))); v_expand(b_l, val0, val1); v_expand(b_h, val2, val3); 
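The weight computation recurring in these hunks is a table gather: per-channel absolute differences are widened to 32-bit lane indices and looked up in the precomputed color_weight table through the vector-index v_lut overload, the same overload this patch adds to the RVV backend via OPENCV_HAL_IMPL_RVV_LUT_VEC. A self-contained sketch of that step with hypothetical names (weighted_accumulate, table), not taken from the patch itself:

#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

// acc[j] += k * table[|a[j] - b[j]|] for 8-bit inputs, via the vector-index v_lut
static void weighted_accumulate(const uchar* a, const uchar* b,
                                const float* table, float k, float* acc, int n)
{
    int j = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    v_float32 vk = vx_setall_f32(k);
    for (; j <= n - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
    {
        // quarter-width uchar loads, zero-extended straight to 32-bit lanes
        v_uint32 d = v_absdiff(vx_load_expand_q(a + j), vx_load_expand_q(b + j));
        v_float32 w = v_mul(vk, v_lut(table, v_reinterpret_as_s32(d)));
        v_store(acc + j, v_add(vx_load(acc + j), w));
    }
#endif
    for (; j < n; j++)
        acc[j] += k * table[a[j] > b[j] ? a[j] - b[j] : b[j] - a[j]];
}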
v_store_aligned(sum_b + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); - v_store_aligned(sum_b + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_b + j + 2*v_float32::nlanes))); - v_store_aligned(sum_b + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_b + j + 3*v_float32::nlanes))); + v_store_aligned(sum_b + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_b + j + VTraits::vlanes()))); + v_store_aligned(sum_b + j + 2*VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_b + j + 2*VTraits::vlanes()))); + v_store_aligned(sum_b + j + 3*VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_b + j + 3*VTraits::vlanes()))); v_expand(g_l, val0, val1); v_expand(g_h, val2, val3); v_store_aligned(sum_g + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); - v_store_aligned(sum_g + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_g + j + 2*v_float32::nlanes))); - v_store_aligned(sum_g + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_g + j + 3*v_float32::nlanes))); + v_store_aligned(sum_g + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_g + j + VTraits::vlanes()))); + v_store_aligned(sum_g + j + 2*VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_g + j + 2*VTraits::vlanes()))); + v_store_aligned(sum_g + j + 3*VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_g + j + 3*VTraits::vlanes()))); v_expand(r_l, val0, val1); v_expand(r_h, val2, val3); v_store_aligned(sum_r + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_r + j))); - v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); - v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_r + j + 2*v_float32::nlanes))); - v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_r + j + 3*v_float32::nlanes))); + v_store_aligned(sum_r + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_r + j + VTraits::vlanes()))); + v_store_aligned(sum_r + j + 2*VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_r + j + 2*VTraits::vlanes()))); + v_store_aligned(sum_r + j + 3*VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_r + j + 3*VTraits::vlanes()))); } #endif for(; j < size.width; j++, ksptr += 3, rsptr += 3) @@ -500,27 +500,27 @@ public: } } j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 v_one = vx_setall_f32(1.f); - for(; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, dptr += 3*v_uint8::nlanes) + for(; j <= size.width - VTraits::vlanes(); j += VTraits::vlanes(), 
dptr += 3*VTraits::vlanes()) { - v_float32 w0 = v_one / vx_load_aligned(wsum + j); - v_float32 w1 = v_one / vx_load_aligned(wsum + j + v_float32::nlanes); - v_float32 w2 = v_one / vx_load_aligned(wsum + j + 2*v_float32::nlanes); - v_float32 w3 = v_one / vx_load_aligned(wsum + j + 3*v_float32::nlanes); + v_float32 w0 = v_div(v_one, vx_load_aligned(wsum + j)); + v_float32 w1 = v_div(v_one, vx_load_aligned(wsum + j + VTraits::vlanes())); + v_float32 w2 = v_div(v_one, vx_load_aligned(wsum + j + 2 * VTraits::vlanes())); + v_float32 w3 = v_div(v_one, vx_load_aligned(wsum + j + 3 * VTraits::vlanes())); - v_store_interleave(dptr, v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_b + j)), - v_round(w1 * vx_load_aligned(sum_b + j + v_float32::nlanes))), - v_pack(v_round(w2 * vx_load_aligned(sum_b + j + 2*v_float32::nlanes)), - v_round(w3 * vx_load_aligned(sum_b + j + 3*v_float32::nlanes)))), - v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_g + j)), - v_round(w1 * vx_load_aligned(sum_g + j + v_float32::nlanes))), - v_pack(v_round(w2 * vx_load_aligned(sum_g + j + 2*v_float32::nlanes)), - v_round(w3 * vx_load_aligned(sum_g + j + 3*v_float32::nlanes)))), - v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_r + j)), - v_round(w1 * vx_load_aligned(sum_r + j + v_float32::nlanes))), - v_pack(v_round(w2 * vx_load_aligned(sum_r + j + 2*v_float32::nlanes)), - v_round(w3 * vx_load_aligned(sum_r + j + 3*v_float32::nlanes))))); + v_store_interleave(dptr, v_pack_u(v_pack(v_round(v_mul(w0, vx_load_aligned(sum_b + j))), + v_round(v_mul(w1, vx_load_aligned(sum_b + j + VTraits::vlanes())))), + v_pack(v_round(v_mul(w2, vx_load_aligned(sum_b + j + 2 * VTraits::vlanes()))), + v_round(v_mul(w3, vx_load_aligned(sum_b + j + 3 * VTraits::vlanes()))))), + v_pack_u(v_pack(v_round(v_mul(w0, vx_load_aligned(sum_g + j))), + v_round(v_mul(w1, vx_load_aligned(sum_g + j + VTraits::vlanes())))), + v_pack(v_round(v_mul(w2, vx_load_aligned(sum_g + j + 2 * VTraits::vlanes()))), + v_round(v_mul(w3, vx_load_aligned(sum_g + j + 3 * VTraits::vlanes()))))), + v_pack_u(v_pack(v_round(v_mul(w0, vx_load_aligned(sum_r + j))), + v_round(v_mul(w1, vx_load_aligned(sum_r + j + VTraits::vlanes())))), + v_pack(v_round(v_mul(w2, vx_load_aligned(sum_r + j + 2 * VTraits::vlanes()))), + v_round(v_mul(w3, vx_load_aligned(sum_r + j + 3 * VTraits::vlanes())))))); } #endif for(; j < size.width; j++) @@ -533,7 +533,7 @@ public: } } } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) vx_cleanup(); #endif } @@ -589,7 +589,7 @@ public: memset(buf.data(), 0, buf.size() * sizeof(float)); float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH); float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH); -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 v_one = vx_setall_f32(1.f); v_float32 sindex = vx_setall_f32(scale_index); #endif @@ -601,50 +601,50 @@ public: const float* ksptr2 = sptr + space_ofs[k + 2]; const float* ksptr3 = sptr + space_ofs[k + 3]; j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 kweight0 = vx_setall_f32(space_weight[k]); v_float32 kweight1 = vx_setall_f32(space_weight[k+1]); v_float32 kweight2 = vx_setall_f32(space_weight[k+2]); v_float32 kweight3 = vx_setall_f32(space_weight[k+3]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) + for (; j <= size.width - VTraits::vlanes(); j += VTraits::vlanes()) { v_float32 rval = vx_load(sptr + j); v_float32 val = vx_load(ksptr0 + j); v_float32 knan = v_not_nan(val); - v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; + v_float32 alpha = 
v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan); v_int32 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan; - v_float32 v_wsum = vx_load_aligned(wsum + j) + w; - v_float32 v_sum = v_muladd(val & knan, w, vx_load_aligned(sum + j)); + alpha = v_sub(alpha, v_cvt_f32(idx)); + v_float32 w = v_and(v_mul(kweight0, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan); + v_float32 v_wsum = v_add(vx_load_aligned(wsum + j), w); + v_float32 v_sum = v_muladd(v_and(val, knan), w, vx_load_aligned(sum + j)); val = vx_load(ksptr1 + j); knan = v_not_nan(val); - alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; + alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan); idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum = v_muladd(val & knan, w, v_sum); + alpha = v_sub(alpha, v_cvt_f32(idx)); + w = v_and(v_mul(kweight1, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan); + v_wsum = v_add(v_wsum, w); + v_sum = v_muladd(v_and(val, knan), w, v_sum); val = vx_load(ksptr2 + j); knan = v_not_nan(val); - alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; + alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan); idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum = v_muladd(val & knan, w, v_sum); + alpha = v_sub(alpha, v_cvt_f32(idx)); + w = v_and(v_mul(kweight2, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan); + v_wsum = v_add(v_wsum, w); + v_sum = v_muladd(v_and(val, knan), w, v_sum); val = vx_load(ksptr3 + j); knan = v_not_nan(val); - alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; + alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan); idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum = v_muladd(val & knan, w, v_sum); + alpha = v_sub(alpha, v_cvt_f32(idx)); + w = v_and(v_mul(kweight3, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan); + v_wsum = v_add(v_wsum, w); + v_sum = v_muladd(v_and(val, knan), w, v_sum); v_store_aligned(wsum + j, v_wsum); v_store_aligned(sum + j, v_sum); @@ -720,20 +720,20 @@ public: { const float* ksptr = sptr + space_ofs[k]; j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 kweight = vx_setall_f32(space_weight[k]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) + for (; j <= size.width - VTraits::vlanes(); j += VTraits::vlanes()) { v_float32 val = vx_load(ksptr + j); v_float32 rval = vx_load(sptr + j); v_float32 knan = v_not_nan(val); - v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; + v_float32 alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan); v_int32 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); + alpha = v_sub(alpha, v_cvt_f32(idx)); - v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * 
(v_one-alpha))) & knan; - v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w); - v_store_aligned(sum + j, v_muladd(val & knan, w, vx_load_aligned(sum + j))); + v_float32 w = v_and(v_mul(kweight, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan); + v_store_aligned(wsum + j, v_add(vx_load_aligned(wsum + j), w)); + v_store_aligned(sum + j, v_muladd(v_and(val, knan), w, vx_load_aligned(sum + j))); } #endif for (; j < size.width; j++) @@ -752,11 +752,11 @@ public: } } j = 0; -#if CV_SIMD - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for (; j <= size.width - VTraits::vlanes(); j += VTraits::vlanes()) { v_float32 v_val = vx_load(sptr + j); - v_store(dptr + j, (vx_load_aligned(sum + j) + (v_val & v_not_nan(v_val))) / (vx_load_aligned(wsum + j) + (v_one & v_not_nan(v_val)))); + v_store(dptr + j, v_div(v_add(vx_load_aligned(sum + j), v_and(v_val, v_not_nan(v_val))), v_add(vx_load_aligned(wsum + j), v_and(v_one, v_not_nan(v_val))))); } #endif for (; j < size.width; j++) @@ -774,7 +774,7 @@ public: float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH); float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH); float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH); -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 v_one = vx_setall_f32(1.f); v_float32 sindex = vx_setall_f32(scale_index); #endif @@ -787,60 +787,60 @@ public: const float* ksptr3 = sptr + space_ofs[k+3]; const float* rsptr = sptr; j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 kweight0 = vx_setall_f32(space_weight[k]); v_float32 kweight1 = vx_setall_f32(space_weight[k+1]); v_float32 kweight2 = vx_setall_f32(space_weight[k+2]); v_float32 kweight3 = vx_setall_f32(space_weight[k+3]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, rsptr += 3 * v_float32::nlanes, - ksptr0 += 3 * v_float32::nlanes, ksptr1 += 3 * v_float32::nlanes, ksptr2 += 3 * v_float32::nlanes, ksptr3 += 3 * v_float32::nlanes) + for (; j <= size.width - VTraits::vlanes(); j += VTraits::vlanes(), rsptr += 3 * VTraits::vlanes(), + ksptr0 += 3 * VTraits::vlanes(), ksptr1 += 3 * VTraits::vlanes(), ksptr2 += 3 * VTraits::vlanes(), ksptr3 += 3 * VTraits::vlanes()) { v_float32 kb, kg, kr, rb, rg, rr; v_load_deinterleave(rsptr, rb, rg, rr); v_load_deinterleave(ksptr0, kb, kg, kr); - v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; + v_float32 knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr)); + v_float32 alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan); v_int32 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_float32 v_wsum = vx_load_aligned(wsum + j) + w; - v_float32 v_sum_b = v_muladd(kb & knan, w, vx_load_aligned(sum_b + j)); - v_float32 v_sum_g = v_muladd(kg & knan, w, vx_load_aligned(sum_g + j)); - v_float32 v_sum_r = v_muladd(kr & knan, w, vx_load_aligned(sum_r + j)); + alpha = v_sub(alpha, v_cvt_f32(idx)); + v_float32 w = v_and(v_mul(kweight0, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan); + v_float32 v_wsum = 
v_add(vx_load_aligned(wsum + j), w); + v_float32 v_sum_b = v_muladd(v_and(kb, knan), w, vx_load_aligned(sum_b + j)); + v_float32 v_sum_g = v_muladd(v_and(kg, knan), w, vx_load_aligned(sum_g + j)); + v_float32 v_sum_r = v_muladd(v_and(kr, knan), w, vx_load_aligned(sum_r + j)); v_load_deinterleave(ksptr1, kb, kg, kr); - knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; + knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr)); + alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan); idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum_b = v_muladd(kb & knan, w, v_sum_b); - v_sum_g = v_muladd(kg & knan, w, v_sum_g); - v_sum_r = v_muladd(kr & knan, w, v_sum_r); + alpha = v_sub(alpha, v_cvt_f32(idx)); + w = v_and(v_mul(kweight1, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan); + v_wsum = v_add(v_wsum, w); + v_sum_b = v_muladd(v_and(kb, knan), w, v_sum_b); + v_sum_g = v_muladd(v_and(kg, knan), w, v_sum_g); + v_sum_r = v_muladd(v_and(kr, knan), w, v_sum_r); v_load_deinterleave(ksptr2, kb, kg, kr); - knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; + knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr)); + alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan); idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum_b = v_muladd(kb & knan, w, v_sum_b); - v_sum_g = v_muladd(kg & knan, w, v_sum_g); - v_sum_r = v_muladd(kr & knan, w, v_sum_r); + alpha = v_sub(alpha, v_cvt_f32(idx)); + w = v_and(v_mul(kweight2, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan); + v_wsum = v_add(v_wsum, w); + v_sum_b = v_muladd(v_and(kb, knan), w, v_sum_b); + v_sum_g = v_muladd(v_and(kg, knan), w, v_sum_g); + v_sum_r = v_muladd(v_and(kr, knan), w, v_sum_r); v_load_deinterleave(ksptr3, kb, kg, kr); - knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; + knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr)); + alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan); idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum_b = v_muladd(kb & knan, w, v_sum_b); - v_sum_g = v_muladd(kg & knan, w, v_sum_g); - v_sum_r = v_muladd(kr & knan, w, v_sum_r); + alpha = v_sub(alpha, v_cvt_f32(idx)); + w = v_and(v_mul(kweight3, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan); + v_wsum = v_add(v_wsum, w); + v_sum_b 
= v_muladd(v_and(kb, knan), w, v_sum_b); + v_sum_g = v_muladd(v_and(kg, knan), w, v_sum_g); + v_sum_r = v_muladd(v_and(kr, knan), w, v_sum_r); v_store_aligned(wsum + j, v_wsum); v_store_aligned(sum_b + j, v_sum_b); @@ -938,24 +938,24 @@ public: const float* ksptr = sptr + space_ofs[k]; const float* rsptr = sptr; j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 kweight = vx_setall_f32(space_weight[k]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, ksptr += 3*v_float32::nlanes, rsptr += 3*v_float32::nlanes) + for (; j <= size.width - VTraits::vlanes(); j += VTraits::vlanes(), ksptr += 3*VTraits::vlanes(), rsptr += 3*VTraits::vlanes()) { v_float32 kb, kg, kr, rb, rg, rr; v_load_deinterleave(ksptr, kb, kg, kr); v_load_deinterleave(rsptr, rb, rg, rr); - v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; + v_float32 knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr)); + v_float32 alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan); v_int32 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); + alpha = v_sub(alpha, v_cvt_f32(idx)); - v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w); - v_store_aligned(sum_b + j, v_muladd(kb & knan, w, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_g + j, v_muladd(kg & knan, w, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_r + j, v_muladd(kr & knan, w, vx_load_aligned(sum_r + j))); + v_float32 w = v_and(v_mul(kweight, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan); + v_store_aligned(wsum + j, v_add(vx_load_aligned(wsum + j), w)); + v_store_aligned(sum_b + j, v_muladd(v_and(kb, knan), w, vx_load_aligned(sum_b + j))); + v_store_aligned(sum_g + j, v_muladd(v_and(kg, knan), w, vx_load_aligned(sum_g + j))); + v_store_aligned(sum_r + j, v_muladd(v_and(kr, knan), w, vx_load_aligned(sum_r + j))); } #endif for (; j < size.width; j++, ksptr += 3, rsptr += 3) @@ -978,14 +978,14 @@ public: } } j = 0; -#if CV_SIMD - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, sptr += 3*v_float32::nlanes, dptr += 3*v_float32::nlanes) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for (; j <= size.width - VTraits::vlanes(); j += VTraits::vlanes(), sptr += 3*VTraits::vlanes(), dptr += 3*VTraits::vlanes()) { v_float32 b, g, r; v_load_deinterleave(sptr, b, g, r); - v_float32 mask = v_not_nan(b) & v_not_nan(g) & v_not_nan(r); - v_float32 w = v_one / (vx_load_aligned(wsum + j) + (v_one & mask)); - v_store_interleave(dptr, (vx_load_aligned(sum_b + j) + (b & mask)) * w, (vx_load_aligned(sum_g + j) + (g & mask)) * w, (vx_load_aligned(sum_r + j) + (r & mask)) * w); + v_float32 mask = v_and(v_and(v_not_nan(b), v_not_nan(g)), v_not_nan(r)); + v_float32 w = v_div(v_one, v_add(vx_load_aligned(wsum + j), v_and(v_one, mask))); + v_store_interleave(dptr, v_mul(v_add(vx_load_aligned(sum_b + j), v_and(b, mask)), w), v_mul(v_add(vx_load_aligned(sum_g + j), v_and(g, mask)), w), v_mul(v_add(vx_load_aligned(sum_r + j), v_and(r, mask)), w)); } #endif for (; j < size.width; j++) @@ -1011,7 +1011,7 @@ public: } } } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) vx_cleanup(); #endif } diff 
--git a/modules/imgproc/src/color_lab.cpp b/modules/imgproc/src/color_lab.cpp index 3b18944a0c..d111efdc47 100644 --- a/modules/imgproc/src/color_lab.cpp +++ b/modules/imgproc/src/color_lab.cpp @@ -56,40 +56,38 @@ template static inline _Tp splineInterpolate(_Tp x, const _Tp* tab return ((tab[3]*x + tab[2])*x + tab[1])*x + tab[0]; } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) template static inline cv::v_float32 splineInterpolate(const cv::v_float32& x, const _Tp* tab, int n) { using namespace cv; v_int32 ix = v_min(v_max(v_trunc(x), vx_setzero_s32()), vx_setall_s32(n-1)); - cv::v_float32 xx = x - v_cvt_f32(ix); - ix = ix << 2; + cv::v_float32 xx = v_sub(x, v_cvt_f32(ix)); + ix = v_shl<2>(ix); - v_float32 t[4]; + v_float32 t0, t1, t2, t3; // assume that v_float32::nlanes == v_int32::nlanes - if(v_float32::nlanes == 4) + if(VTraits::vlanes() == 4) { -#if CV_SIMD_WIDTH == 16 int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx[4]; v_store_aligned(idx, ix); - v_float32x4 tt[4]; - tt[0] = v_load(tab + idx[0]); - tt[1] = v_load(tab + idx[1]); - tt[2] = v_load(tab + idx[2]); - tt[3] = v_load(tab + idx[3]); - v_transpose4x4(tt[0], tt[1], tt[2], tt[3], - t[0], t[1], t[2], t[3]); -#endif + v_float32 tt0, tt1, tt2, tt3; + tt0 = vx_load(tab + idx[0]); + tt1 = vx_load(tab + idx[1]); + tt2 = vx_load(tab + idx[2]); + tt3 = vx_load(tab + idx[3]); + v_transpose4x4(tt0, tt1, tt2, tt3, + t0, t1, t2, t3); } else { - t[0] = v_lut(tab + 0, ix); - t[1] = v_lut(tab + 1, ix); - t[2] = v_lut(tab + 2, ix); - t[3] = v_lut(tab + 3, ix); + t0 = v_lut(tab + 0, ix); + t1 = v_lut(tab + 1, ix); + t2 = v_lut(tab + 2, ix); + t3 = v_lut(tab + 3, ix); } - return v_fma(v_fma(v_fma(t[3], xx, t[2]), xx, t[1]), xx, t[0]); + return v_fma(v_fma(v_fma(t3, xx, t2), xx, t1), xx, t0); } #endif @@ -207,8 +205,8 @@ struct RGB2XYZ_f C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; int i = 0; -#if CV_SIMD - const int vsize = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2); v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5); v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8); @@ -226,9 +224,9 @@ struct RGB2XYZ_f } v_float32 x, y, z; - x = v_fma(b, vc0, v_fma(g, vc1, r*vc2)); - y = v_fma(b, vc3, v_fma(g, vc4, r*vc5)); - z = v_fma(b, vc6, v_fma(g, vc7, r*vc8)); + x = v_fma(b, vc0, v_fma(g, vc1, v_mul(r, vc2))); + y = v_fma(b, vc3, v_fma(g, vc4, v_mul(r, vc5))); + z = v_fma(b, vc6, v_fma(g, vc7, v_mul(r, vc8))); v_store_interleave(dst, x, y, z); } @@ -313,8 +311,8 @@ struct RGB2XYZ_i C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); int descaleShift = 1 << (shift-1); v_int16 vdescale = vx_setall_s16((short)descaleShift); v_int16 cxbg, cxr1, cybg, cyr1, czbg, czr1; @@ -349,27 +347,36 @@ struct RGB2XYZ_i sg0 = v_reinterpret_as_s16(g0); sg1 = v_reinterpret_as_s16(g1); sb0 = v_reinterpret_as_s16(b0); sb1 = v_reinterpret_as_s16(b1); - v_int16 bg[4], rd[4]; - v_zip(sb0, sg0, bg[0], bg[1]); - v_zip(sb1, sg1, bg[2], bg[3]); - v_zip(sr0, vdescale, rd[0], rd[1]); - v_zip(sr1, vdescale, rd[2], rd[3]); + v_int16 bg0, bg1, bg2, bg3, rd0, rd1, rd2, rd3; + v_zip(sb0, sg0, bg0, bg1); + v_zip(sb1, sg1, bg2, bg3); + v_zip(sr0, vdescale, rd0, rd1); + v_zip(sr1, 
vdescale, rd2, rd3); - v_uint32 vx[4], vy[4], vz[4]; - for(int j = 0; j < 4; j++) - { - vx[j] = v_reinterpret_as_u32(v_dotprod(bg[j], cxbg) + v_dotprod(rd[j], cxr1)) >> shift; - vy[j] = v_reinterpret_as_u32(v_dotprod(bg[j], cybg) + v_dotprod(rd[j], cyr1)) >> shift; - vz[j] = v_reinterpret_as_u32(v_dotprod(bg[j], czbg) + v_dotprod(rd[j], czr1)) >> shift; - } + v_uint32 vx0, vx1, vx2, vx3; + v_uint32 vy0, vy1, vy2, vy3; + v_uint32 vz0, vz1, vz2, vz3; + + vx0 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg0, cxbg), v_dotprod(rd0, cxr1)))); + vy0 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg0, cybg), v_dotprod(rd0, cyr1)))); + vz0 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg0, czbg), v_dotprod(rd0, czr1)))); + vx1 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg1, cxbg), v_dotprod(rd1, cxr1)))); + vy1 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg1, cybg), v_dotprod(rd1, cyr1)))); + vz1 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg1, czbg), v_dotprod(rd1, czr1)))); + vx2 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg2, cxbg), v_dotprod(rd2, cxr1)))); + vy2 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg2, cybg), v_dotprod(rd2, cyr1)))); + vz2 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg2, czbg), v_dotprod(rd2, czr1)))); + vx3 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg3, cxbg), v_dotprod(rd3, cxr1)))); + vy3 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg3, cybg), v_dotprod(rd3, cyr1)))); + vz3 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg3, czbg), v_dotprod(rd3, czr1)))); v_uint16 x0, x1, y0, y1, z0, z1; - x0 = v_pack(vx[0], vx[1]); - x1 = v_pack(vx[2], vx[3]); - y0 = v_pack(vy[0], vy[1]); - y1 = v_pack(vy[2], vy[3]); - z0 = v_pack(vz[0], vz[1]); - z1 = v_pack(vz[2], vz[3]); + x0 = v_pack(vx0, vx1); + x1 = v_pack(vx2, vx3); + y0 = v_pack(vy0, vy1); + y1 = v_pack(vy2, vy3); + z0 = v_pack(vz0, vz1); + z1 = v_pack(vz2, vz3); v_uint8 x, y, z; x = v_pack(x0, x1); @@ -424,8 +431,8 @@ struct RGB2XYZ_i int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; -#if CV_SIMD - const int vsize = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); const int descaleShift = 1 << (shift-1); v_int16 vdescale = vx_setall_s16(descaleShift); v_int16 vc0 = vx_setall_s16((short)C0), vc1 = vx_setall_s16((short)C1), vc2 = vx_setall_s16((short)C2); @@ -464,29 +471,29 @@ struct RGB2XYZ_i v_int16 ymr, ymg, ymb; v_int16 zmr, zmg, zmb; - v_int16 mr = sr < zero, mg = sg < zero, mb = sb < zero; + v_int16 mr = v_lt(sr, zero), mg = v_lt(sg, zero), mb = v_lt(sb, zero); - xmb = mb & vc0; - xmg = mg & vc1; - xmr = mr & vc2; - ymb = mb & vc3; - ymg = mg & vc4; - ymr = mr & vc5; - zmb = mb & vc6; - zmg = mg & vc7; - zmr = mr & vc8; + xmb = v_and(mb, vc0); + xmg = v_and(mg, vc1); + xmr = v_and(mr, vc2); + ymb = v_and(mb, vc3); + ymg = v_and(mg, vc4); + ymr = v_and(mr, vc5); + zmb = v_and(mb, vc6); + zmg = v_and(mg, vc7); + zmr = v_and(mr, vc8); v_int32 xfix0, xfix1, yfix0, yfix1, zfix0, zfix1; - v_expand(xmr + xmg + xmb, xfix0, xfix1); - v_expand(ymr + ymg + ymb, yfix0, yfix1); - v_expand(zmr + zmg + zmb, zfix0, zfix1); + v_expand(v_add(v_add(xmr, xmg), xmb), xfix0, xfix1); + v_expand(v_add(v_add(ymr, ymg), ymb), yfix0, yfix1); + v_expand(v_add(v_add(zmr, zmg), zmb), zfix0, zfix1); - xfix0 = xfix0 << 16; - xfix1 = xfix1 << 16; - yfix0 = yfix0 << 16; - yfix1 = yfix1 << 16; - zfix0 = zfix0 << 16; - zfix1 = zfix1 << 16; + xfix0 = v_shl<16>(xfix0); + xfix1 = v_shl<16>(xfix1); + yfix0 = 
v_shl<16>(yfix0); + yfix1 = v_shl<16>(yfix1); + zfix0 = v_shl<16>(zfix0); + zfix1 = v_shl<16>(zfix1); v_int16 bg0, bg1, rd0, rd1; v_zip(sb, sg, bg0, bg1); @@ -494,12 +501,12 @@ struct RGB2XYZ_i v_uint32 x0, x1, y0, y1, z0, z1; - x0 = v_reinterpret_as_u32(v_dotprod(bg0, cxbg) + v_dotprod(rd0, cxr1) + xfix0) >> shift; - x1 = v_reinterpret_as_u32(v_dotprod(bg1, cxbg) + v_dotprod(rd1, cxr1) + xfix1) >> shift; - y0 = v_reinterpret_as_u32(v_dotprod(bg0, cybg) + v_dotprod(rd0, cyr1) + yfix0) >> shift; - y1 = v_reinterpret_as_u32(v_dotprod(bg1, cybg) + v_dotprod(rd1, cyr1) + yfix1) >> shift; - z0 = v_reinterpret_as_u32(v_dotprod(bg0, czbg) + v_dotprod(rd0, czr1) + zfix0) >> shift; - z1 = v_reinterpret_as_u32(v_dotprod(bg1, czbg) + v_dotprod(rd1, czr1) + zfix1) >> shift; + x0 = v_shr(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg0, cxbg), v_dotprod(rd0, cxr1)), xfix0))); + x1 = v_shr(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg1, cxbg), v_dotprod(rd1, cxr1)), xfix1))); + y0 = v_shr(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg0, cybg), v_dotprod(rd0, cyr1)), yfix0))); + y1 = v_shr(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg1, cybg), v_dotprod(rd1, cyr1)), yfix1))); + z0 = v_shr(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg0, czbg), v_dotprod(rd0, czr1)), zfix0))); + z1 = v_shr(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg1, czbg), v_dotprod(rd1, czr1)), zfix1))); v_uint16 x, y, z; x = v_pack(x0, x1); @@ -593,8 +600,8 @@ struct XYZ2RGB_f C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; int i = 0; -#if CV_SIMD - const int vsize = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_float32 valpha = vx_setall_f32(alpha); v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2); v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5); @@ -606,9 +613,9 @@ struct XYZ2RGB_f v_load_deinterleave(src, x, y, z); v_float32 b, g, r; - b = v_fma(x, vc0, v_fma(y, vc1, z*vc2)); - g = v_fma(x, vc3, v_fma(y, vc4, z*vc5)); - r = v_fma(x, vc6, v_fma(y, vc7, z*vc8)); + b = v_fma(x, vc0, v_fma(y, vc1, v_mul(z, vc2))); + g = v_fma(x, vc3, v_fma(y, vc4, v_mul(z, vc5))); + r = v_fma(x, vc6, v_fma(y, vc7, v_mul(z, vc8))); if(dcn == 4) { @@ -707,8 +714,8 @@ struct XYZ2RGB_i int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); const int descaleShift = 1 << (shift - 1); v_uint8 valpha = vx_setall_u8(alpha); v_int16 vdescale = vx_setall_s16(descaleShift); @@ -739,25 +746,35 @@ struct XYZ2RGB_i z0 = v_reinterpret_as_s16(uz0); z1 = v_reinterpret_as_s16(uz1); - v_int32 b[4], g[4], r[4]; + v_int32 bb0, bb1, bb2, bb3, + gg0, gg1, gg2, gg3, + rr0, rr1, rr2, rr3; - v_int16 xy[4], zd[4]; - v_zip(x0, y0, xy[0], xy[1]); - v_zip(x1, y1, xy[2], xy[3]); - v_zip(z0, vdescale, zd[0], zd[1]); - v_zip(z1, vdescale, zd[2], zd[3]); + v_int16 xy0, xy1, xy2, xy3; + v_int16 zd0, zd1, zd2, zd3; - for(int j = 0; j < 4; j++) - { - b[j] = (v_dotprod(xy[j], cbxy) + v_dotprod(zd[j], cbz1)) >> shift; - g[j] = (v_dotprod(xy[j], cgxy) + v_dotprod(zd[j], cgz1)) >> shift; - r[j] = (v_dotprod(xy[j], crxy) + v_dotprod(zd[j], crz1)) >> shift; - } + v_zip(x0, y0, xy0, xy1); + v_zip(x1, y1, xy2, xy3); + v_zip(z0, vdescale, zd0, zd1); + v_zip(z1, vdescale, zd2, zd3); + + bb0 = v_shr(v_add(v_dotprod(xy0, 
cbxy), v_dotprod(zd0, cbz1))); + gg0 = v_shr(v_add(v_dotprod(xy0, cgxy), v_dotprod(zd0, cgz1))); + rr0 = v_shr(v_add(v_dotprod(xy0, crxy), v_dotprod(zd0, crz1))); + bb1 = v_shr(v_add(v_dotprod(xy1, cbxy), v_dotprod(zd1, cbz1))); + gg1 = v_shr(v_add(v_dotprod(xy1, cgxy), v_dotprod(zd1, cgz1))); + rr1 = v_shr(v_add(v_dotprod(xy1, crxy), v_dotprod(zd1, crz1))); + bb2 = v_shr(v_add(v_dotprod(xy2, cbxy), v_dotprod(zd2, cbz1))); + gg2 = v_shr(v_add(v_dotprod(xy2, cgxy), v_dotprod(zd2, cgz1))); + rr2 = v_shr(v_add(v_dotprod(xy2, crxy), v_dotprod(zd2, crz1))); + bb3 = v_shr(v_add(v_dotprod(xy3, cbxy), v_dotprod(zd3, cbz1))); + gg3 = v_shr(v_add(v_dotprod(xy3, cgxy), v_dotprod(zd3, cgz1))); + rr3 = v_shr(v_add(v_dotprod(xy3, crxy), v_dotprod(zd3, crz1))); v_uint16 b0, b1, g0, g1, r0, r1; - b0 = v_pack_u(b[0], b[1]); b1 = v_pack_u(b[2], b[3]); - g0 = v_pack_u(g[0], g[1]); g1 = v_pack_u(g[2], g[3]); - r0 = v_pack_u(r[0], r[1]); r1 = v_pack_u(r[2], r[3]); + b0 = v_pack_u(bb0, bb1); b1 = v_pack_u(bb2, bb3); + g0 = v_pack_u(gg0, gg1); g1 = v_pack_u(gg2, gg3); + r0 = v_pack_u(rr0, rr1); r1 = v_pack_u(rr2, rr3); v_uint8 bb, gg, rr; bb = v_pack(b0, b1); @@ -820,8 +837,8 @@ struct XYZ2RGB_i int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; -#if CV_SIMD - const int vsize = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); const int descaleShift = 1 << (shift-1); v_uint16 valpha = vx_setall_u16(alpha); v_int16 vdescale = vx_setall_s16(descaleShift); @@ -850,30 +867,30 @@ struct XYZ2RGB_i sz = v_reinterpret_as_s16(z); // fixing 16bit signed multiplication - v_int16 mx = sx < zero, my = sy < zero, mz = sz < zero; + v_int16 mx = v_lt(sx, zero), my = v_lt(sy, zero), mz = v_lt(sz, zero); v_int16 bmx, bmy, bmz; v_int16 gmx, gmy, gmz; v_int16 rmx, rmy, rmz; - bmx = mx & vc0; - bmy = my & vc1; - bmz = mz & vc2; - gmx = mx & vc3; - gmy = my & vc4; - gmz = mz & vc5; - rmx = mx & vc6; - rmy = my & vc7; - rmz = mz & vc8; + bmx = v_and(mx, vc0); + bmy = v_and(my, vc1); + bmz = v_and(mz, vc2); + gmx = v_and(mx, vc3); + gmy = v_and(my, vc4); + gmz = v_and(mz, vc5); + rmx = v_and(mx, vc6); + rmy = v_and(my, vc7); + rmz = v_and(mz, vc8); v_int32 bfix0, bfix1, gfix0, gfix1, rfix0, rfix1; - v_expand(bmx + bmy + bmz, bfix0, bfix1); - v_expand(gmx + gmy + gmz, gfix0, gfix1); - v_expand(rmx + rmy + rmz, rfix0, rfix1); + v_expand(v_add(v_add(bmx, bmy), bmz), bfix0, bfix1); + v_expand(v_add(v_add(gmx, gmy), gmz), gfix0, gfix1); + v_expand(v_add(v_add(rmx, rmy), rmz), rfix0, rfix1); - bfix0 = bfix0 << 16; bfix1 = bfix1 << 16; - gfix0 = gfix0 << 16; gfix1 = gfix1 << 16; - rfix0 = rfix0 << 16; rfix1 = rfix1 << 16; + bfix0 = v_shl<16>(bfix0); bfix1 = v_shl<16>(bfix1); + gfix0 = v_shl<16>(gfix0); gfix1 = v_shl<16>(gfix1); + rfix0 = v_shl<16>(rfix0); rfix1 = v_shl<16>(rfix1); v_int16 xy0, xy1, zd0, zd1; v_zip(sx, sy, xy0, xy1); @@ -881,12 +898,12 @@ struct XYZ2RGB_i v_int32 b0, b1, g0, g1, r0, r1; - b0 = (v_dotprod(xy0, cbxy) + v_dotprod(zd0, cbz1) + bfix0) >> shift; - b1 = (v_dotprod(xy1, cbxy) + v_dotprod(zd1, cbz1) + bfix1) >> shift; - g0 = (v_dotprod(xy0, cgxy) + v_dotprod(zd0, cgz1) + gfix0) >> shift; - g1 = (v_dotprod(xy1, cgxy) + v_dotprod(zd1, cgz1) + gfix1) >> shift; - r0 = (v_dotprod(xy0, crxy) + v_dotprod(zd0, crz1) + rfix0) >> shift; - r1 = (v_dotprod(xy1, crxy) + v_dotprod(zd1, crz1) + rfix1) >> shift; + b0 = v_shr(v_add(v_add(v_dotprod(xy0, cbxy), v_dotprod(zd0, cbz1)), bfix0)); + b1 = 
v_shr(v_add(v_add(v_dotprod(xy1, cbxy), v_dotprod(zd1, cbz1)), bfix1)); + g0 = v_shr(v_add(v_add(v_dotprod(xy0, cgxy), v_dotprod(zd0, cgz1)), gfix0)); + g1 = v_shr(v_add(v_add(v_dotprod(xy1, cgxy), v_dotprod(zd1, cgz1)), gfix1)); + r0 = v_shr(v_add(v_add(v_dotprod(xy0, crxy), v_dotprod(zd0, crz1)), rfix0)); + r1 = v_shr(v_add(v_add(v_dotprod(xy1, crxy), v_dotprod(zd1, crz1)), rfix1)); v_uint16 b, g, r; b = v_pack_u(b0, b1); g = v_pack_u(g0, g1); r = v_pack_u(r0, r1); @@ -1452,19 +1469,19 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin #undef DOT_SHIFT_PACK } -#elif CV_SIMD +#elif CV_SIMD // Fixed size v_int16x8 used below, CV_SIMD_SCALABLE is disabled. // inValues are in [0; LAB_BASE] static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint16& inY, const v_uint16& inZ, const int16_t* LUT, v_uint16& outA, v_uint16& outB, v_uint16& outC) { - const int vsize = v_uint16::nlanes; + const int vsize = VTraits::max_nlanes; // LUT idx of origin pt of cube - v_uint16 tx = inX >> (lab_base_shift - lab_lut_shift); - v_uint16 ty = inY >> (lab_base_shift - lab_lut_shift); - v_uint16 tz = inZ >> (lab_base_shift - lab_lut_shift); + v_uint16 tx = v_shr(inX); + v_uint16 ty = v_shr(inY); + v_uint16 tz = v_shr(inZ); v_uint32 btmp00, btmp01, btmp10, btmp11, btmp20, btmp21; v_uint32 baseIdx0, baseIdx1; @@ -1472,8 +1489,8 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1 v_mul_expand(tx, vx_setall_u16(3*8), btmp00, btmp01); v_mul_expand(ty, vx_setall_u16(3*8*LAB_LUT_DIM), btmp10, btmp11); v_mul_expand(tz, vx_setall_u16(3*8*LAB_LUT_DIM*LAB_LUT_DIM), btmp20, btmp21); - baseIdx0 = btmp00 + btmp10 + btmp20; - baseIdx1 = btmp01 + btmp11 + btmp21; + baseIdx0 = v_add(v_add(btmp00, btmp10), btmp20); + baseIdx1 = v_add(v_add(btmp01, btmp11), btmp21); uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vbaseIdx[vsize]; v_store_aligned(vbaseIdx + 0*vsize/2, baseIdx0); @@ -1482,9 +1499,9 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1 // fracX, fracY, fracZ are [0; TRILINEAR_BASE) const uint16_t bitMask = (1 << trilinear_shift) - 1; v_uint16 bitMaskReg = vx_setall_u16(bitMask); - v_uint16 fracX = (inX >> (lab_base_shift - 8 - 1)) & bitMaskReg; - v_uint16 fracY = (inY >> (lab_base_shift - 8 - 1)) & bitMaskReg; - v_uint16 fracZ = (inZ >> (lab_base_shift - 8 - 1)) & bitMaskReg; + v_uint16 fracX = v_and(v_shr(inX), bitMaskReg); + v_uint16 fracY = v_and(v_shr(inY), bitMaskReg); + v_uint16 fracZ = v_and(v_shr(inZ), bitMaskReg); // trilinearIdx = 8*x + 8*TRILINEAR_BASE*y + 8*TRILINEAR_BASE*TRILINEAR_BASE*z v_uint32 trilinearIdx0, trilinearIdx1; @@ -1493,8 +1510,8 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1 v_expand(fracY, fracY0, fracY1); v_expand(fracZ, fracZ0, fracZ1); - trilinearIdx0 = (fracX0 << 3) + (fracY0 << (3+trilinear_shift)) + (fracZ0 << (3+trilinear_shift*2)); - trilinearIdx1 = (fracX1 << 3) + (fracY1 << (3+trilinear_shift)) + (fracZ1 << (3+trilinear_shift*2)); + trilinearIdx0 = v_add(v_add(v_shl<3>(fracX0), v_shl<3 + trilinear_shift>(fracY0)), v_shl<3 + trilinear_shift * 2>(fracZ0)); + trilinearIdx1 = v_add(v_add(v_shl<3>(fracX1), v_shl<3 + trilinear_shift>(fracY1)), v_shl<3 + trilinear_shift * 2>(fracZ1)); uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vtrilinearIdx[vsize]; v_store_aligned(vtrilinearIdx + 0*vsize/2, trilinearIdx0); @@ -1528,12 +1545,12 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1 // CV_DESCALE const v_uint32 
descaleShift = vx_setall_u32(1 << (trilinear_shift*3 - 1)); - a0 = (a0 + descaleShift) >> (trilinear_shift*3); - a1 = (a1 + descaleShift) >> (trilinear_shift*3); - b0 = (b0 + descaleShift) >> (trilinear_shift*3); - b1 = (b1 + descaleShift) >> (trilinear_shift*3); - c0 = (c0 + descaleShift) >> (trilinear_shift*3); - c1 = (c1 + descaleShift) >> (trilinear_shift*3); + a0 = v_shr(v_add(a0, descaleShift)); + a1 = v_shr(v_add(a1, descaleShift)); + b0 = v_shr(v_add(b0, descaleShift)); + b1 = v_shr(v_add(b1, descaleShift)); + c0 = v_shr(v_add(c0, descaleShift)); + c1 = v_shr(v_add(c1, descaleShift)); outA = v_pack(a0, a1); outB = v_pack(b0, b1); outC = v_pack(c0, c1); } diff --git a/modules/imgproc/src/color_yuv.simd.hpp b/modules/imgproc/src/color_yuv.simd.hpp index b5f73d873a..580329f660 100644 --- a/modules/imgproc/src/color_yuv.simd.hpp +++ b/modules/imgproc/src/color_yuv.simd.hpp @@ -49,6 +49,15 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, namespace { //constants for conversion from/to RGB and YUV, YCrCb according to BT.601 +#if CV_SIMD_SCALABLE +template +static void swap(T&a, T&b) { + T t = a; + a = b; + b = t; +} +#endif + //to YCbCr static const float YCBF = 0.564f; // == 1/2/(1-B2YF) static const float YCRF = 0.713f; // == 1/2/(1-R2YF) @@ -143,11 +152,11 @@ struct RGB2YCrCb_f float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; int i = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2); v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4); v_float32 vdelta = vx_setall_f32(delta); - const int vsize = v_float32::nlanes; + const int vsize = VTraits::vlanes(); for( ; i <= n-vsize; i += vsize, src += vsize*scn, dst += vsize*3) { @@ -162,13 +171,13 @@ struct RGB2YCrCb_f } v_float32 y, cr, cb; - y = v_fma(b, vc0, v_fma(g, vc1, r*vc2)); + y = v_fma(b, vc0, v_fma(g, vc1, v_mul(r, vc2))); if(bidx) - std::swap(r, b); + swap(r, b); - cr = v_fma(r - y, vc3, vdelta); - cb = v_fma(b - y, vc4, vdelta); + cr = v_fma(v_sub(r, y), vc3, vdelta); + cb = v_fma(v_sub(b, y), vc4, vdelta); if(yuvOrder) { @@ -266,8 +275,8 @@ struct RGB2YCrCb_i int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; int sdelta = ColorChannel::half()*(1 << shift); int i = 0; -#if CV_SIMD - const int vsize = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); const int descale = 1 << (shift-1); v_int16 b2y = vx_setall_s16((short)C0); @@ -312,13 +321,13 @@ struct RGB2YCrCb_i // fixing 16bit signed multiplication v_int16 mr, mg, mb; - mr = (sr < z) & r2y; - mg = (sg < z) & g2y; - mb = (sb < z) & b2y; - v_int16 fixmul = v_add_wrap(mr, v_add_wrap(mg, mb)) << fix_shift; + mr = v_and(v_lt(sr, z), r2y); + mg = v_and(v_lt(sg, z), g2y); + mb = v_and(v_lt(sb, z), b2y); + v_int16 fixmul = v_shl(v_add_wrap(mr, v_add_wrap(mg, mb)), fix_shift); - v_int32 ssy0 = (v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)) >> shift; - v_int32 ssy1 = (v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)) >> shift; + v_int32 ssy0 = v_shr(v_add(v_dotprod(bg0, bg2y), v_dotprod(rd0, r12y)), shift); + v_int32 ssy1 = v_shr(v_add(v_dotprod(bg1, bg2y), v_dotprod(rd1, r12y)), shift); y = v_reinterpret_as_u16(v_add_wrap(v_pack(ssy0, ssy1), fixmul)); @@ -340,15 +349,15 @@ struct RGB2YCrCb_i v_int32 sy0 = v_reinterpret_as_s32(uy0); v_int32 sy1 = v_reinterpret_as_s32(uy1); - sr0 = sr0 - sy0; sr1 = sr1 - sy1; - sb0 = sb0 - sy0; sb1 = sb1 - sy1; + sr0 = v_sub(sr0, sy0); sr1 = 
v_sub(sr1, sy1); + sb0 = v_sub(sb0, sy0); sb1 = v_sub(sb1, sy1); v_int32 v_scr0, v_scr1, v_scb0, v_scb1; - v_scr0 = (sr0*vc3 + vdd) >> shift; - v_scr1 = (sr1*vc3 + vdd) >> shift; - v_scb0 = (sb0*vc4 + vdd) >> shift; - v_scb1 = (sb1*vc4 + vdd) >> shift; + v_scr0 = v_shr(v_add(v_mul(sr0, vc3), vdd), shift); + v_scr1 = v_shr(v_add(v_mul(sr1, vc3), vdd), shift); + v_scb0 = v_shr(v_add(v_mul(sb0, vc4), vdd), shift); + v_scb1 = v_shr(v_add(v_mul(sb1, vc4), vdd), shift); // saturate and pack cr = v_pack_u(v_scr0, v_scr1); @@ -407,8 +416,8 @@ struct RGB2YCrCb_i int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; int delta = ColorChannel::half()*(1 << shift); -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); const int descaleShift = 1 << (shift-1); v_int16 bg2y; v_int16 r12y; @@ -458,10 +467,10 @@ struct RGB2YCrCb_i v_zip(sr0, vdescale, rd00, rd01); v_zip(sr1, vdescale, rd10, rd11); - y00 = v_reinterpret_as_u32(v_dotprod(bg00, bg2y) + v_dotprod(rd00, r12y)) >> shift; - y01 = v_reinterpret_as_u32(v_dotprod(bg01, bg2y) + v_dotprod(rd01, r12y)) >> shift; - y10 = v_reinterpret_as_u32(v_dotprod(bg10, bg2y) + v_dotprod(rd10, r12y)) >> shift; - y11 = v_reinterpret_as_u32(v_dotprod(bg11, bg2y) + v_dotprod(rd11, r12y)) >> shift; + y00 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg00, bg2y), v_dotprod(rd00, r12y))), shift); + y01 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg01, bg2y), v_dotprod(rd01, r12y))), shift); + y10 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg10, bg2y), v_dotprod(rd10, r12y))), shift); + y11 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg11, bg2y), v_dotprod(rd11, r12y))), shift); } v_uint16 y0, y1; @@ -512,15 +521,15 @@ struct RGB2YCrCb_i v_uint8 cr, cb; - cr00 = cr00 >> shift; - cr01 = cr01 >> shift; - cr10 = cr10 >> shift; - cr11 = cr11 >> shift; + cr00 = v_shr(cr00, shift); + cr01 = v_shr(cr01, shift); + cr10 = v_shr(cr10, shift); + cr11 = v_shr(cr11, shift); - cb00 = cb00 >> shift; - cb01 = cb01 >> shift; - cb10 = cb10 >> shift; - cb11 = cb11 >> shift; + cb00 = v_shr(cb00, shift); + cb01 = v_shr(cb01, shift); + cb10 = v_shr(cb10, shift); + cb11 = v_shr(cb11, shift); v_int16 cr0, cr1, cb0, cb1; cr0 = v_pack(cr00, cr01); cr1 = v_pack(cr10, cr11); @@ -623,12 +632,12 @@ struct YCrCb2RGB_f float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; int i = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1); v_float32 vc2 = vx_setall_f32(C2), vc3 = vx_setall_f32(C3); v_float32 vdelta = vx_setall_f32(delta); v_float32 valpha = vx_setall_f32(alpha); - const int vsize = v_float32::nlanes; + const int vsize = VTraits::vlanes(); for( ; i <= n-vsize; i += vsize, src += vsize*3, dst += vsize*dcn) { @@ -640,7 +649,7 @@ struct YCrCb2RGB_f v_float32 b, g, r; - cb -= vdelta; cr -= vdelta; + cb = v_sub(cb, vdelta); cr = v_sub(cr, vdelta); b = v_fma(cb, vc3, y); g = v_fma(cr, vc1, v_fma(cb, vc2, y)); r = v_fma(cr, vc0, y); @@ -746,8 +755,8 @@ struct YCrCb2RGB_i const uchar delta = ColorChannel::half(), alpha = ColorChannel::max(); int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_uint8 valpha = vx_setall_u8(alpha); v_uint8 vdelta = vx_setall_u8(delta); const int descaleShift = 1 << (shift - 1); @@ -794,8 +803,8 @@ struct YCrCb2RGB_i v_int32 cb00, cb01, cb10, cb11; 
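For readers following these hunks: the rewrite is mechanical, each overloaded operator on a vector value becomes the corresponding named wrapper (v_add, v_mul, v_shr, ...), and lane counts come from VTraits<T>::vlanes() instead of T::nlanes so the same source also builds when CV_SIMD_SCALABLE is set. A minimal standalone sketch of the fixed-point pattern `(a*c + d) >> shift` that recurs around this point; the function and buffer names here are illustrative, not part of the patch.

#include <opencv2/core/hal/intrin.hpp>   // universal intrinsics (v_add, v_shr, VTraits, ...)
using namespace cv;

// Fixed-point descale of one row: dst[i] = (a[i]*c + d) >> shift.
static void descaleRow(const int* a, int c, int d, int shift, int* dst, int n)
{
    int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int vlanes = VTraits<v_int32>::vlanes();
    v_int32 vc = vx_setall_s32(c), vd = vx_setall_s32(d);
    for (; i <= n - vlanes; i += vlanes)
    {
        v_int32 va = vx_load(a + i);
        // was: (va * vc + vd) >> shift
        v_store(dst + i, v_shr(v_add(v_mul(va, vc), vd), shift));
    }
#endif
    for (; i < n; i++)                    // scalar tail
        dst[i] = (a[i] * c + d) >> shift;
}

When the shift amount is a compile-time constant, the template form v_shr<shift>(x) is used instead, as in the color_lab hunks above.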
v_expand(v_scb0, cb00, cb01); v_expand(v_scb1, cb10, cb11); - b00 += cb00 << 15; b01 += cb01 << 15; - b10 += cb10 << 15; b11 += cb11 << 15; + b00 = v_add(b00, v_shl<15>(cb00)); b01 = v_add(b01, v_shl<15>(cb01)); + b10 = v_add(b10, v_shl<15>(cb10)); b11 = v_add(b11, v_shl<15>(cb11)); } v_int32 t00, t01, t10, t11; @@ -803,17 +812,17 @@ struct YCrCb2RGB_i v_mul_expand(v_scb1, vc2, t10, t11); v_mul_expand(v_scr0, vc1, g00, g01); v_mul_expand(v_scr1, vc1, g10, g11); - g00 += t00; g01 += t01; - g10 += t10; g11 += t11; + g00 = v_add(g00, t00); g01 = v_add(g01, t01); + g10 = v_add(g10, t10); g11 = v_add(g11, t11); v_mul_expand(v_scr0, vc0, r00, r01); v_mul_expand(v_scr1, vc0, r10, r11); - b00 = (b00 + vdescale) >> shift; b01 = (b01 + vdescale) >> shift; - b10 = (b10 + vdescale) >> shift; b11 = (b11 + vdescale) >> shift; - g00 = (g00 + vdescale) >> shift; g01 = (g01 + vdescale) >> shift; - g10 = (g10 + vdescale) >> shift; g11 = (g11 + vdescale) >> shift; - r00 = (r00 + vdescale) >> shift; r01 = (r01 + vdescale) >> shift; - r10 = (r10 + vdescale) >> shift; r11 = (r11 + vdescale) >> shift; + b00 = v_shr(v_add(b00, vdescale), shift); b01 = v_shr(v_add(b01, vdescale), shift); + b10 = v_shr(v_add(b10, vdescale), shift); b11 = v_shr(v_add(b11, vdescale), shift); + g00 = v_shr(v_add(g00, vdescale), shift); g01 = v_shr(v_add(g01, vdescale), shift); + g10 = v_shr(v_add(g10, vdescale), shift); g11 = v_shr(v_add(g11, vdescale), shift); + r00 = v_shr(v_add(r00, vdescale), shift); r01 = v_shr(v_add(r01, vdescale), shift); + r10 = v_shr(v_add(r10, vdescale), shift); r11 = v_shr(v_add(r11, vdescale), shift); v_int16 b0, b1, g0, g1, r0, r1; b0 = v_pack(b00, b01); b1 = v_pack(b10, b11); @@ -897,8 +906,8 @@ struct YCrCb2RGB_i const ushort delta = ColorChannel::half(), alpha = ColorChannel::max(); int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; -#if CV_SIMD - const int vsize = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); const int descaleShift = 1 << (shift-1); v_uint16 valpha = vx_setall_u16(alpha); v_uint16 vdelta = vx_setall_u16(delta); @@ -939,22 +948,22 @@ struct YCrCb2RGB_i // so we fix the multiplication v_int32 cb0, cb1; v_expand(scb, cb0, cb1); - b0 += cb0 << 15; - b1 += cb1 << 15; + b0 = v_add(b0, v_shl<15>(cb0)); + b1 = v_add(b1, v_shl<15>(cb1)); } v_int32 t0, t1; v_mul_expand(scb, vc2, t0, t1); v_mul_expand(scr, vc1, g0, g1); - g0 += t0; g1 += t1; + g0 = v_add(g0, t0); g1 = v_add(g1, t1); v_mul_expand(scr, vc0, r0, r1); // shifted term doesn't fit into 16 bits, addition is to be done in 32 bits - b0 = ((b0 + vdescale) >> shift) + y0; - b1 = ((b1 + vdescale) >> shift) + y1; - g0 = ((g0 + vdescale) >> shift) + y0; - g1 = ((g1 + vdescale) >> shift) + y1; - r0 = ((r0 + vdescale) >> shift) + y0; - r1 = ((r1 + vdescale) >> shift) + y1; + b0 = v_add(v_shr(v_add(b0, vdescale), shift), y0); + b1 = v_add(v_shr(v_add(b1, vdescale), shift), y1); + g0 = v_add(v_shr(v_add(g0, vdescale), shift), y0); + g1 = v_add(v_shr(v_add(g1, vdescale), shift), y1); + r0 = v_add(v_shr(v_add(r0, vdescale), shift), y0); + r1 = v_add(v_shr(v_add(r1, vdescale), shift), y1); // saturate and pack v_uint16 b, g, r; @@ -1038,11 +1047,11 @@ static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, i buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu; } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v, - v_int32 (&ruv)[4], - v_int32 (&guv)[4], - v_int32 (&buv)[4]) + v_int32 &ruv0, 
v_int32 &ruv1, v_int32 &ruv2, v_int32 &ruv3, + v_int32 &guv0, v_int32 &guv1, v_int32 &guv2, v_int32 &guv3, + v_int32 &buv0, v_int32 &buv1, v_int32 &buv2, v_int32 &buv3) { v_uint8 v128 = vx_setall_u8(128); v_int8 su = v_reinterpret_as_s8(v_sub_wrap(u, v128)); @@ -1051,9 +1060,10 @@ static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v, v_int16 uu0, uu1, vv0, vv1; v_expand(su, uu0, uu1); v_expand(sv, vv0, vv1); - v_int32 uu[4], vv[4]; - v_expand(uu0, uu[0], uu[1]); v_expand(uu1, uu[2], uu[3]); - v_expand(vv0, vv[0], vv[1]); v_expand(vv1, vv[2], vv[3]); + v_int32 uuu0, uuu1, uuu2, uuu3; + v_int32 vvv0, vvv1, vvv2, vvv3; + v_expand(uu0, uuu0, uuu1); v_expand(uu1, uuu2, uuu3); + v_expand(vv0, vvv0, vvv1); v_expand(vv1, vvv2, vvv3); v_int32 vshift = vx_setall_s32(1 << (ITUR_BT_601_SHIFT - 1)); v_int32 vr = vx_setall_s32(ITUR_BT_601_CVR); @@ -1061,12 +1071,15 @@ static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v, v_int32 ug = vx_setall_s32(ITUR_BT_601_CUG); v_int32 ub = vx_setall_s32(ITUR_BT_601_CUB); - for (int k = 0; k < 4; k++) - { - ruv[k] = vshift + vr * vv[k]; - guv[k] = vshift + vg * vv[k] + ug * uu[k]; - buv[k] = vshift + ub * uu[k]; - } + auto process_uv = [&](v_int32& ruv, v_int32& guv, v_int32& buv, const v_int32& vv, const v_int32& uu) { + ruv = v_add(vshift, v_mul(vr, vv)); + guv = v_add(v_add(vshift, v_mul(vg, vv)), v_mul(ug, uu)); + buv = v_add(vshift, v_mul(ub, uu)); + }; + process_uv(ruv0, guv0, buv0, vvv0, uuu0); + process_uv(ruv1, guv1, buv1, vvv1, uuu1); + process_uv(ruv2, guv2, buv2, vvv2, uuu2); + process_uv(ruv3, guv3, buv3, vvv3, uuu3); } #endif @@ -1081,44 +1094,48 @@ static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, co a = uchar(0xff); } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline void yRGBuvToRGBA(const v_uint8& vy, - const v_int32 (&ruv)[4], - const v_int32 (&guv)[4], - const v_int32 (&buv)[4], + const v_int32 &ruv0, const v_int32 &ruv1, const v_int32 &ruv2, const v_int32 &ruv3, + const v_int32 &guv0, const v_int32 &guv1, const v_int32 &guv2, const v_int32 &guv3, + const v_int32 &buv0, const v_int32 &buv1, const v_int32 &buv2, const v_int32 &buv3, v_uint8& rr, v_uint8& gg, v_uint8& bb) { v_uint8 v16 = vx_setall_u8(16); - v_uint8 posY = vy - v16; + v_uint8 posY = v_sub(vy, v16); v_uint16 yy0, yy1; v_expand(posY, yy0, yy1); - v_int32 yy[4]; - v_int32 yy00, yy01, yy10, yy11; - v_expand(v_reinterpret_as_s16(yy0), yy[0], yy[1]); - v_expand(v_reinterpret_as_s16(yy1), yy[2], yy[3]); + v_int32 yyy0, yyy1, yyy2, yyy3; + v_expand(v_reinterpret_as_s16(yy0), yyy0, yyy1); + v_expand(v_reinterpret_as_s16(yy1), yyy2, yyy3); v_int32 vcy = vx_setall_s32(ITUR_BT_601_CY); - v_int32 y[4], r[4], g[4], b[4]; - for(int k = 0; k < 4; k++) - { - y[k] = yy[k]*vcy; - r[k] = (y[k] + ruv[k]) >> ITUR_BT_601_SHIFT; - g[k] = (y[k] + guv[k]) >> ITUR_BT_601_SHIFT; - b[k] = (y[k] + buv[k]) >> ITUR_BT_601_SHIFT; - } + v_int32 y0, y1, y2, y3, r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3; - v_int16 r0, r1, g0, g1, b0, b1; - r0 = v_pack(r[0], r[1]); - r1 = v_pack(r[2], r[3]); - g0 = v_pack(g[0], g[1]); - g1 = v_pack(g[2], g[3]); - b0 = v_pack(b[0], b[1]); - b1 = v_pack(b[2], b[3]); + auto process_yrgb = [&](const v_int32& yy, v_int32& y, v_int32& r, v_int32& g, v_int32& b, + const v_int32& ruv, const v_int32& guv, const v_int32& buv) { + y = v_mul(yy, vcy); + r = v_shr(v_add(y, ruv), ITUR_BT_601_SHIFT); + g = v_shr(v_add(y, guv), ITUR_BT_601_SHIFT); + b = v_shr(v_add(y, buv), ITUR_BT_601_SHIFT); + }; + process_yrgb(yyy0, y0, r0, g0, b0, 
ruv0, guv0, buv0); + process_yrgb(yyy1, y1, r1, g1, b1, ruv1, guv1, buv1); + process_yrgb(yyy2, y2, r2, g2, b2, ruv2, guv2, buv2); + process_yrgb(yyy3, y3, r3, g3, b3, ruv3, guv3, buv3); - rr = v_pack_u(r0, r1); - gg = v_pack_u(g0, g1); - bb = v_pack_u(b0, b1); + v_int16 _r0, _r1, _g0, _g1, _b0, _b1; + _r0 = v_pack(r0, r1); + _r1 = v_pack(r2, r3); + _g0 = v_pack(g0, g1); + _g1 = v_pack(g2, g3); + _b0 = v_pack(b0, b1); + _b1 = v_pack(b2, b3); + + rr = v_pack_u(_r0, _r1); + gg = v_pack_u(_g0, _g1); + bb = v_pack_u(_b0, _b1); } #endif @@ -1201,8 +1218,8 @@ struct YUV420sp2RGB8Invoker : ParallelLoopBody const uchar* y2 = y1 + my1_step; int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_uint8 a = vx_setall_u8(uchar(0xff)); for( ; i <= width - 2*vsize; i += 2*vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2) @@ -1215,36 +1232,50 @@ struct YUV420sp2RGB8Invoker : ParallelLoopBody swap(u, v); } - v_uint8 vy[4]; - v_load_deinterleave(y1 + i, vy[0], vy[1]); - v_load_deinterleave(y2 + i, vy[2], vy[3]); + v_uint8 vy0, vy1, vy2, vy3; + v_load_deinterleave(y1 + i, vy0, vy1); + v_load_deinterleave(y2 + i, vy2, vy3); - v_int32 ruv[4], guv[4], buv[4]; - uvToRGBuv(u, v, ruv, guv, buv); + v_int32 ruv0, ruv1, ruv2, ruv3, + guv0, guv1, guv2, guv3, + buv0, buv1, buv2, buv3; + uvToRGBuv(u, v, + ruv0, ruv1, ruv2, ruv3, + guv0, guv1, guv2, guv3, + buv0, buv1, buv2, buv3); - v_uint8 r[4], g[4], b[4]; + v_uint8 r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3; - for(int k = 0; k < 4; k++) - { - yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]); - } + auto call_yRGBuvToRGBA = [&](const v_uint8& vy, v_uint8& r, v_uint8& g, v_uint8& b) { + yRGBuvToRGBA(vy, + ruv0, ruv1, ruv2, ruv3, + guv0, guv1, guv2, guv3, + buv0, buv1, buv2, buv3, + r, g, b); + }; + call_yRGBuvToRGBA(vy0, r0, g0, b0); + call_yRGBuvToRGBA(vy1, r1, g1, b1); + call_yRGBuvToRGBA(vy2, r2, g2, b2); + call_yRGBuvToRGBA(vy3, r3, g3, b3); if(bIdx) { - for(int k = 0; k < 4; k++) - swap(r[k], b[k]); + swap(r0, b0); + swap(r1, b1); + swap(r2, b2); + swap(r3, b3); } // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...] 
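The comment above describes the step sketched here: after the per-register colour values are computed, v_zip puts the even/odd pixels back into pixel order before v_store_interleave writes BGR/BGRA out. A small self-contained illustration of that interleave, assuming only the universal intrinsics header; buffer names are illustrative.

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// Interleave two deinterleaved byte rows back into pixel order.
static void interleaveEvenOdd(const uchar* even, const uchar* odd, uchar* out, int n)
{
    int i = 0;                            // index into even/odd (each holds n/2 bytes)
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int vlanes = VTraits<v_uint8>::vlanes();
    for (; i <= n/2 - vlanes; i += vlanes)
    {
        v_uint8 e = vx_load(even + i), o = vx_load(odd + i);
        v_uint8 lo, hi;
        v_zip(e, o, lo, hi);              // lo = e0 o0 e1 o1 ..., hi continues with the second half
        v_store(out + 2*i, lo);
        v_store(out + 2*i + vlanes, hi);
    }
#endif
    for (; i < n/2; i++)                  // scalar tail
    {
        out[2*i]     = even[i];
        out[2*i + 1] = odd[i];
    }
}

The lo/hi pair here corresponds to the r0_0/r0_1 style registers produced just below.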
v_uint8 r0_0, r0_1, r1_0, r1_1; - v_zip(r[0], r[1], r0_0, r0_1); - v_zip(r[2], r[3], r1_0, r1_1); + v_zip(r0, r1, r0_0, r0_1); + v_zip(r2, r3, r1_0, r1_1); v_uint8 g0_0, g0_1, g1_0, g1_1; - v_zip(g[0], g[1], g0_0, g0_1); - v_zip(g[2], g[3], g1_0, g1_1); + v_zip(g0, g1, g0_0, g0_1); + v_zip(g2, g3, g1_0, g1_1); v_uint8 b0_0, b0_1, b1_0, b1_1; - v_zip(b[0], b[1], b0_0, b0_1); - v_zip(b[2], b[3], b1_0, b1_1); + v_zip(b0, b1, b0_0, b0_1); + v_zip(b2, b3, b1_0, b1_1); if(dcn == 4) { @@ -1319,8 +1350,8 @@ struct YUV420p2RGB8Invoker : ParallelLoopBody const uchar* y2 = y1 + stride; int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_uint8 a = vx_setall_u8(uchar(0xff)); for( ; i <= width/2 - vsize; i += vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2) @@ -1329,36 +1360,50 @@ struct YUV420p2RGB8Invoker : ParallelLoopBody u = vx_load(u1 + i); v = vx_load(v1 + i); - v_uint8 vy[4]; - v_load_deinterleave(y1 + 2*i, vy[0], vy[1]); - v_load_deinterleave(y2 + 2*i, vy[2], vy[3]); + v_uint8 vy0, vy1, vy2, vy3; + v_load_deinterleave(y1 + 2*i, vy0, vy1); + v_load_deinterleave(y2 + 2*i, vy2, vy3); - v_int32 ruv[4], guv[4], buv[4]; - uvToRGBuv(u, v, ruv, guv, buv); + v_int32 ruv0, ruv1, ruv2, ruv3, + guv0, guv1, guv2, guv3, + buv0, buv1, buv2, buv3; + uvToRGBuv(u, v, + ruv0, ruv1, ruv2, ruv3, + guv0, guv1, guv2, guv3, + buv0, buv1, buv2, buv3); - v_uint8 r[4], g[4], b[4]; + v_uint8 r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3; - for(int k = 0; k < 4; k++) - { - yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]); - } + auto call_yRGBuvToRGBA = [&](const v_uint8& vy, v_uint8& r, v_uint8& g, v_uint8& b) { + yRGBuvToRGBA(vy, + ruv0, ruv1, ruv2, ruv3, + guv0, guv1, guv2, guv3, + buv0, buv1, buv2, buv3, + r, g, b); + }; + call_yRGBuvToRGBA(vy0, r0, g0, b0); + call_yRGBuvToRGBA(vy1, r1, g1, b1); + call_yRGBuvToRGBA(vy2, r2, g2, b2); + call_yRGBuvToRGBA(vy3, r3, g3, b3); if(bIdx) { - for(int k = 0; k < 4; k++) - swap(r[k], b[k]); + swap(r0, b0); + swap(r1, b1); + swap(r2, b2); + swap(r3, b3); } // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...] 
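Why the v_uint8 r[4] style arrays keep turning into r0..r3 plus a lambda: with CV_SIMD_SCALABLE the vector types are sizeless, so they cannot be stored in C arrays or indexed in a loop, and the four-way loops have to be unrolled by hand. A hedged sketch of that pattern outside the patch (function and variable names are illustrative):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if (CV_SIMD || CV_SIMD_SCALABLE)
// Scales 4*VTraits<v_float32>::vlanes() consecutive floats by s.
static void scaleFourRegisters(const float* src, float s, float* dst)
{
    const int vl = VTraits<v_float32>::vlanes();
    v_float32 vs = vx_setall_f32(s);
    v_float32 a0, a1, a2, a3;             // instead of v_float32 a[4]
    auto scale = [&](const float* p, v_float32& a) { a = v_mul(vx_load(p), vs); };
    scale(src,        a0);
    scale(src +   vl, a1);
    scale(src + 2*vl, a2);
    scale(src + 3*vl, a3);
    v_store(dst,        a0);
    v_store(dst +   vl, a1);
    v_store(dst + 2*vl, a2);
    v_store(dst + 3*vl, a3);
}
#endif

The process_uv and call_yRGBuvToRGBA lambdas in the hunks above follow the same shape, just with more registers.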
v_uint8 r0_0, r0_1, r1_0, r1_1; - v_zip(r[0], r[1], r0_0, r0_1); - v_zip(r[2], r[3], r1_0, r1_1); + v_zip(r0, r1, r0_0, r0_1); + v_zip(r2, r3, r1_0, r1_1); v_uint8 g0_0, g0_1, g1_0, g1_1; - v_zip(g[0], g[1], g0_0, g0_1); - v_zip(g[2], g[3], g1_0, g1_1); + v_zip(g0, g1, g0_0, g0_1); + v_zip(g2, g3, g1_0, g1_1); v_uint8 b0_0, b0_1, b1_0, b1_1; - v_zip(b[0], b[1], b0_0, b0_1); - v_zip(b[2], b[3], b1_0, b1_1); + v_zip(b0, b1, b0_0, b0_1); + v_zip(b2, b3, b1_0, b1_1); if(dcn == 4) { @@ -1430,7 +1475,7 @@ static inline uchar rgbToY42x(uchar r, uchar g, uchar b) return saturate_cast(yy >> ITUR_BT_601_SHIFT); } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint8& b) { const int shifted16 = (16 << ITUR_BT_601_SHIFT); @@ -1440,25 +1485,25 @@ static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint v_expand(g, g0, g1); v_expand(b, b0, b1); - v_uint32 rq[4], gq[4], bq[4]; - v_expand(r0, rq[0], rq[1]); v_expand(r1, rq[2], rq[3]); - v_expand(g0, gq[0], gq[1]); v_expand(g1, gq[2], gq[3]); - v_expand(b0, bq[0], bq[1]); v_expand(b1, bq[2], bq[3]); + v_uint32 rq0, rq1, rq2, rq3, gq0, gq1, gq2, gq3, bq0, bq1, bq2, bq3; + v_expand(r0, rq0, rq1); v_expand(r1, rq2, rq3); + v_expand(g0, gq0, gq1); v_expand(g1, gq2, gq3); + v_expand(b0, bq0, bq1); v_expand(b1, bq2, bq3); v_uint32 ry = vx_setall_u32(ITUR_BT_601_CRY), gy = vx_setall_u32(ITUR_BT_601_CGY); v_uint32 by = vx_setall_u32(ITUR_BT_601_CBY), shift = vx_setall_u32(halfShift + shifted16); - v_uint32 y[4]; - for(int k = 0; k < 4; k++) - { - y[k] = (rq[k]*ry + gq[k]*gy + bq[k]*by + shift) >> ITUR_BT_601_SHIFT; - } + v_uint32 y0, y1, y2, y3; + y0 = v_shr(v_add(v_add(v_add(v_mul(rq0, ry), v_mul(gq0, gy)), v_mul(bq0, by)), shift)); + y1 = v_shr(v_add(v_add(v_add(v_mul(rq1, ry), v_mul(gq1, gy)), v_mul(bq1, by)), shift)); + y2 = v_shr(v_add(v_add(v_add(v_mul(rq2, ry), v_mul(gq2, gy)), v_mul(bq2, by)), shift)); + y3 = v_shr(v_add(v_add(v_add(v_mul(rq3, ry), v_mul(gq3, gy)), v_mul(bq3, by)), shift)); - v_uint16 y0, y1; - y0 = v_pack(y[0], y[1]); - y1 = v_pack(y[2], y[3]); + v_uint16 _y0, _y1; + _y0 = v_pack(y0, y1); + _y1 = v_pack(y2, y3); - return v_pack(y0, y1); + return v_pack(_y0, _y1); } #endif @@ -1473,27 +1518,27 @@ static inline void rgbToUV42x(uchar r, uchar g, uchar b, uchar& u, uchar& v) v = saturate_cast(vv >> ITUR_BT_601_SHIFT); } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint8& g0, const v_uint8& g1, const v_uint8& b0, const v_uint8& b1, v_uint8& u, v_uint8& v) { // [r0, r1, r2, r3,..] => [r0, 0, r2, 0,..] 
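The comment just above is the key to the next few lines: reinterpreting the u8 register as s16 and masking with 0x00ff keeps only the first byte of each horizontal pixel pair, which is the 4:2:x chroma subsampling, now spelled v_and instead of operator&. A minimal sketch of just that step (the helper name is illustrative):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if (CV_SIMD || CV_SIMD_SCALABLE)
// Keep every other byte of a packed u8 register, widened to 16 bits:
// [r0, r1, r2, r3, ...] -> [r0, 0, r2, 0, ...] viewed as s16 lanes.
static v_int16 evenBytesAsS16(const v_uint8& px)
{
    v_int16 lowByte = vx_setall_s16(0x00ff);
    // was: v_reinterpret_as_s16(px) & lowByte
    return v_and(v_reinterpret_as_s16(px), lowByte);
}
#endif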
v_int16 vlowByte = vx_setall_s16(0x00ff); v_int16 rd0, rd1, gd0, gd1, bd0, bd1; - rd0 = v_reinterpret_as_s16(r0) & vlowByte; - rd1 = v_reinterpret_as_s16(r1) & vlowByte; - gd0 = v_reinterpret_as_s16(g0) & vlowByte; - gd1 = v_reinterpret_as_s16(g1) & vlowByte; - bd0 = v_reinterpret_as_s16(b0) & vlowByte; - bd1 = v_reinterpret_as_s16(b1) & vlowByte; + rd0 = v_and(v_reinterpret_as_s16(r0), vlowByte); + rd1 = v_and(v_reinterpret_as_s16(r1), vlowByte); + gd0 = v_and(v_reinterpret_as_s16(g0), vlowByte); + gd1 = v_and(v_reinterpret_as_s16(g1), vlowByte); + bd0 = v_and(v_reinterpret_as_s16(b0), vlowByte); + bd1 = v_and(v_reinterpret_as_s16(b1), vlowByte); - v_int32 rq[4], gq[4], bq[4]; - v_expand(rd0, rq[0], rq[1]); - v_expand(rd1, rq[2], rq[3]); - v_expand(gd0, gq[0], gq[1]); - v_expand(gd1, gq[2], gq[3]); - v_expand(bd0, bq[0], bq[1]); - v_expand(bd1, bq[2], bq[3]); + v_int32 rq0, rq1, rq2, rq3, gq0, gq1, gq2, gq3, bq0, bq1, bq2, bq3; + v_expand(rd0, rq0, rq1); + v_expand(rd1, rq2, rq3); + v_expand(gd0, gq0, gq1); + v_expand(gd1, gq2, gq3); + v_expand(bd0, bq0, bq1); + v_expand(bd1, bq2, bq3); const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); const int shifted128 = (128 << ITUR_BT_601_SHIFT); @@ -1505,18 +1550,21 @@ static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint bu = vx_setall_s32(ITUR_BT_601_CBU); bv = vx_setall_s32(ITUR_BT_601_CBV); - v_int32 uq[4], vq[4]; - for(int k = 0; k < 4; k++) - { - uq[k] = (ru*rq[k] + gu*gq[k] + bu*bq[k] + shift) >> ITUR_BT_601_SHIFT; - vq[k] = (bu*rq[k] + gv*gq[k] + bv*bq[k] + shift) >> ITUR_BT_601_SHIFT; - } + v_int32 uq0, uq1, uq2, uq3, vq0, vq1, vq2, vq3; + uq0 = v_shr(v_add(v_add(v_add(v_mul(ru, rq0), v_mul(gu, gq0)), v_mul(bu, bq0)), shift)); + vq0 = v_shr(v_add(v_add(v_add(v_mul(bu, rq0), v_mul(gv, gq0)), v_mul(bv, bq0)), shift)); + uq1 = v_shr(v_add(v_add(v_add(v_mul(ru, rq1), v_mul(gu, gq1)), v_mul(bu, bq1)), shift)); + vq1 = v_shr(v_add(v_add(v_add(v_mul(bu, rq1), v_mul(gv, gq1)), v_mul(bv, bq1)), shift)); + uq2 = v_shr(v_add(v_add(v_add(v_mul(ru, rq2), v_mul(gu, gq2)), v_mul(bu, bq2)), shift)); + vq2 = v_shr(v_add(v_add(v_add(v_mul(bu, rq2), v_mul(gv, gq2)), v_mul(bv, bq2)), shift)); + uq3 = v_shr(v_add(v_add(v_add(v_mul(ru, rq3), v_mul(gu, gq3)), v_mul(bu, bq3)), shift)); + vq3 = v_shr(v_add(v_add(v_add(v_mul(bu, rq3), v_mul(gv, gq3)), v_mul(bv, bq3)), shift)); v_int16 u0, u1, v0, v1; - u0 = v_pack(uq[0], uq[1]); - u1 = v_pack(uq[2], uq[3]); - v0 = v_pack(vq[0], vq[1]); - v1 = v_pack(vq[2], vq[3]); + u0 = v_pack(uq0, uq1); + u1 = v_pack(uq2, uq3); + v0 = v_pack(vq0, vq1); + v1 = v_pack(vq2, vq3); u = v_pack_u(u0, u1); v = v_pack_u(v0, v1); @@ -1559,8 +1607,8 @@ struct RGB8toYUV420pInvoker: public ParallelLoopBody } } int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); for( ; i <= w/2 - vsize; i += vsize) @@ -1708,47 +1756,61 @@ struct YUV422toRGB8Invoker : ParallelLoopBody { uchar* row = dst_data + dst_step * j; int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_uint8 a = vx_setall_u8(uchar(0xff)); for(; i <= 2*width - 4*vsize; i += 4*vsize, row += vsize*dcn*2) { - v_uint8 u, v, vy[2]; + v_uint8 u, v, vy0, vy1; if(yIdx == 1) // UYVY { - v_load_deinterleave(yuv_src + i, u, vy[0], v, vy[1]); + v_load_deinterleave(yuv_src + i, u, vy0, v, vy1); } else // YUYV or YVYU { - v_load_deinterleave(yuv_src + i, vy[0], u, vy[1], v); + v_load_deinterleave(yuv_src + 
i, vy0, u, vy1, v); if(uIdx == 1) // YVYU { swap(u, v); } } - v_int32 ruv[4], guv[4], buv[4]; - uvToRGBuv(u, v, ruv, guv, buv); + v_int32 ruv0, ruv1, ruv2, ruv3, + guv0, guv1, guv2, guv3, + buv0, buv1, buv2, buv3; + uvToRGBuv(u, v, + ruv0, ruv1, ruv2, ruv3, + guv0, guv1, guv2, guv3, + buv0, buv1, buv2, buv3); - v_uint8 r[2], g[2], b[2]; + v_uint8 r0, r1, g0, g1, b0, b1; - yRGBuvToRGBA(vy[0], ruv, guv, buv, r[0], g[0], b[0]); - yRGBuvToRGBA(vy[1], ruv, guv, buv, r[1], g[1], b[1]); + + yRGBuvToRGBA(vy0, + ruv0, ruv1, ruv2, ruv3, + guv0, guv1, guv2, guv3, + buv0, buv1, buv2, buv3, + r0, g0, b0); + yRGBuvToRGBA(vy1, + ruv0, ruv1, ruv2, ruv3, + guv0, guv1, guv2, guv3, + buv0, buv1, buv2, buv3, + r1, g1, b1); if(bIdx) { - swap(r[0], b[0]); - swap(r[1], b[1]); + swap(r0, b0); + swap(r1, b1); } // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...] v_uint8 r0_0, r0_1; - v_zip(r[0], r[1], r0_0, r0_1); + v_zip(r0, r1, r0_0, r0_1); v_uint8 g0_0, g0_1; - v_zip(g[0], g[1], g0_0, g0_1); + v_zip(g0, g1, g0_0, g0_1); v_uint8 b0_0, b0_1; - v_zip(b[0], b[1], b0_0, b0_1); + v_zip(b0, b1, b0_0, b0_1); if(dcn == 4) { diff --git a/modules/imgproc/src/filter.simd.hpp b/modules/imgproc/src/filter.simd.hpp index 8dcf5235af..06053e63fe 100644 --- a/modules/imgproc/src/filter.simd.hpp +++ b/modules/imgproc/src/filter.simd.hpp @@ -349,7 +349,7 @@ struct FilterNoVec }; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) ///////////////////////////////////// 8u-16s & 8u-8u ////////////////////////////////// @@ -383,7 +383,7 @@ struct RowVec_8u32s if( smallValues ) { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) { const uchar* src = _src + i; v_int32 s0 = vx_setzero_s32(); @@ -396,27 +396,27 @@ struct RowVec_8u32s v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16)); v_uint8 x0, x1; v_zip(vx_load(src), vx_load(src + cn), x0, x1); - s0 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f)); - s1 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f)); - s2 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f)); - s3 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f)); + s0 = v_add(s0, v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f))); + s1 = v_add(s1, v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f))); + s2 = v_add(s2, v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f))); + s3 = v_add(s3, v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f))); } if (k < _ksize) { v_int32 f = vx_setall_s32(_kx[k]); v_uint16 x0, x1; v_expand(vx_load(src), x0, x1); - s0 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f)); - s1 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f)); - s2 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f)); - s3 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f)); + s0 = v_add(s0, v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f))); + s1 = v_add(s1, v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f))); + s2 = v_add(s2, v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f))); + s3 = v_add(s3, v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f))); } v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - v_store(dst + i + 
2*v_int32::nlanes, s2); - v_store(dst + i + 3*v_int32::nlanes, s3); + v_store(dst + i + VTraits::vlanes(), s1); + v_store(dst + i + 2*VTraits::vlanes(), s2); + v_store(dst + i + 3*VTraits::vlanes(), s3); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { const uchar* src = _src + i; v_int32 s0 = vx_setzero_s32(); @@ -427,22 +427,22 @@ struct RowVec_8u32s v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16)); v_uint16 x0, x1; v_zip(vx_load_expand(src), vx_load_expand(src + cn), x0, x1); - s0 += v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f)); - s1 += v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f)); + s0 = v_add(s0, v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f))); + s1 = v_add(s1, v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f))); } if( k < _ksize ) { v_int32 f = vx_setall_s32(_kx[k]); v_uint32 x0, x1; v_expand(vx_load_expand(src), x0, x1); - s0 += v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f)); - s1 += v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f)); + s0 = v_add(s0, v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f))); + s1 = v_add(s1, v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f))); } v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - i += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), s1); + i += VTraits::vlanes(); } - if( i <= width - v_uint32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int32 d = vx_setzero_s32(); k = 0; @@ -452,12 +452,12 @@ struct RowVec_8u32s v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16)); v_uint32 x0, x1; v_zip(vx_load_expand_q(src), vx_load_expand_q(src + cn), x0, x1); - d += v_dotprod(v_pack(v_reinterpret_as_s32(x0), v_reinterpret_as_s32(x1)), v_reinterpret_as_s16(f)); + d = v_add(d, v_dotprod(v_pack(v_reinterpret_as_s32(x0), v_reinterpret_as_s32(x1)), v_reinterpret_as_s16(f))); } if (k < _ksize) - d += v_dotprod(v_reinterpret_as_s16(vx_load_expand_q(src)), v_reinterpret_as_s16(vx_setall_s32(_kx[k]))); + d = v_add(d, v_dotprod(v_reinterpret_as_s16(vx_load_expand_q(src)), v_reinterpret_as_s16(vx_setall_s32(_kx[k])))); v_store(dst + i, d); - i += v_uint32::nlanes; + i += VTraits::vlanes(); } } return i; @@ -480,7 +480,7 @@ struct RowVec_8u32f float* dst = (float*)_dst; const float* _kx = kernel.ptr(); width *= cn; - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) { v_float32 s0 = vx_setzero_f32(); v_float32 s1 = vx_setzero_f32(); @@ -492,18 +492,18 @@ struct RowVec_8u32f v_float32 f = vx_setall_f32(_kx[k]); const uchar* src = (const uchar*)_src + i + k * cn; v_float32 vs_ll = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src))); - v_float32 vs_lh = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + v_float32::nlanes))); - v_float32 vs_hl = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + 2*v_float32::nlanes))); - v_float32 vs_hh = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + 3*v_float32::nlanes))); + v_float32 vs_lh = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + VTraits::vlanes()))); + v_float32 vs_hl = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + 2*VTraits::vlanes()))); + v_float32 vs_hh = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + 3*VTraits::vlanes()))); s0 = v_muladd(vs_ll, f, s0); s1 = v_muladd(vs_lh, f, s1); s2 = v_muladd(vs_hl, f, s2); s3 = v_muladd(vs_hh, f, s3); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - 
v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); + v_store(dst + i + VTraits::vlanes(), s1); + v_store(dst + i + 2*VTraits::vlanes(), s2); + v_store(dst + i + 3*VTraits::vlanes(), s3); } return i; } @@ -553,7 +553,7 @@ struct SymmRowSmallVec_8u32s { if( kx[0] == 2 && kx[1] == 1 ) { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; v_expand(vx_load(src - cn), x0l, x0h); @@ -562,29 +562,29 @@ struct SymmRowSmallVec_8u32s x1l = v_add_wrap(v_add_wrap(x1l, x1l), v_add_wrap(x0l, x2l)); x1h = v_add_wrap(v_add_wrap(x1h, x1h), v_add_wrap(x0h, x2h)); v_store(dst + i, v_reinterpret_as_s32(v_expand_low(x1l))); - v_store(dst + i + v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x1l))); - v_store(dst + i + 2*v_int32::nlanes, v_reinterpret_as_s32(v_expand_low(x1h))); - v_store(dst + i + 3*v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x1h))); + v_store(dst + i + VTraits::vlanes(), v_reinterpret_as_s32(v_expand_high(x1l))); + v_store(dst + i + 2*VTraits::vlanes(), v_reinterpret_as_s32(v_expand_low(x1h))); + v_store(dst + i + 3*VTraits::vlanes(), v_reinterpret_as_s32(v_expand_high(x1h))); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_uint16 x = vx_load_expand(src); x = v_add_wrap(v_add_wrap(x, x), v_add_wrap(vx_load_expand(src - cn), vx_load_expand(src + cn))); v_store(dst + i, v_reinterpret_as_s32(v_expand_low(x))); - v_store(dst + i + v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x))); - i += v_uint16::nlanes; src += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), v_reinterpret_as_s32(v_expand_high(x))); + i += VTraits::vlanes(); src += VTraits::vlanes(); } - if( i <= width - v_uint32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_uint32 x = vx_load_expand_q(src); - x = (x + x) + vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn); + x = v_add(v_add(v_add(x, x), vx_load_expand_q(src - cn)), vx_load_expand_q(src + cn)); v_store(dst + i, v_reinterpret_as_s32(x)); - i += v_uint32::nlanes; + i += VTraits::vlanes(); } } else if( kx[0] == -2 && kx[1] == 1 ) { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; v_expand(vx_load(src - cn), x0l, x0h); @@ -593,31 +593,31 @@ struct SymmRowSmallVec_8u32s x1l = v_sub_wrap(v_add_wrap(x0l, x2l), v_add_wrap(x1l, x1l)); x1h = v_sub_wrap(v_add_wrap(x0h, x2h), v_add_wrap(x1h, x1h)); v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x1l))); - v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1l))); - v_store(dst + i + 2*v_int32::nlanes, v_expand_low(v_reinterpret_as_s16(x1h))); - v_store(dst + i + 3*v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1h))); + v_store(dst + i + VTraits::vlanes(), v_expand_high(v_reinterpret_as_s16(x1l))); + v_store(dst + i + 2*VTraits::vlanes(), v_expand_low(v_reinterpret_as_s16(x1h))); + v_store(dst + i + 3*VTraits::vlanes(), v_expand_high(v_reinterpret_as_s16(x1h))); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_uint16 x = vx_load_expand(src); x = v_sub_wrap(v_add_wrap(vx_load_expand(src - cn), vx_load_expand(src + cn)), v_add_wrap(x, x)); v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x))); - v_store(dst + i + v_int32::nlanes, 
v_expand_high(v_reinterpret_as_s16(x))); - i += v_uint16::nlanes; src += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), v_expand_high(v_reinterpret_as_s16(x))); + i += VTraits::vlanes(); src += VTraits::vlanes(); } - if( i <= width - v_uint32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int32 x = v_reinterpret_as_s32(vx_load_expand_q(src)); - x = v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)) - (x + x); + x = v_sub(v_reinterpret_as_s32(v_add(vx_load_expand_q(src - cn), vx_load_expand_q(src + cn))), v_add(x, x)); v_store(dst + i, x); - i += v_uint32::nlanes; + i += VTraits::vlanes(); } } else { v_int16 k0 = vx_setall_s16((short)kx[0]); v_int16 k1 = vx_setall_s16((short)kx[1]); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; v_expand(vx_load(src - cn), x0l, x0h); @@ -628,34 +628,34 @@ struct SymmRowSmallVec_8u32s v_int16 x0, x1; v_mul_expand(v_reinterpret_as_s16(x1l), k0, dl, dh); v_zip(v_reinterpret_as_s16(x0l), v_reinterpret_as_s16(x2l), x0, x1); - dl += v_dotprod(x0, k1); - dh += v_dotprod(x1, k1); + dl = v_add(dl, v_dotprod(x0, k1)); + dh = v_add(dh, v_dotprod(x1, k1)); v_store(dst + i, dl); - v_store(dst + i + v_int32::nlanes, dh); + v_store(dst + i + VTraits::vlanes(), dh); v_mul_expand(v_reinterpret_as_s16(x1h), k0, dl, dh); v_zip(v_reinterpret_as_s16(x0h), v_reinterpret_as_s16(x2h), x0, x1); - dl += v_dotprod(x0, k1); - dh += v_dotprod(x1, k1); - v_store(dst + i + 2*v_int32::nlanes, dl); - v_store(dst + i + 3*v_int32::nlanes, dh); + dl = v_add(dl, v_dotprod(x0, k1)); + dh = v_add(dh, v_dotprod(x1, k1)); + v_store(dst + i + 2*VTraits::vlanes(), dl); + v_store(dst + i + 3*VTraits::vlanes(), dh); } - if ( i <= width - v_uint16::nlanes ) + if ( i <= width - VTraits::vlanes() ) { v_int32 dl, dh; v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, dl, dh); v_int16 x0, x1; v_zip(v_reinterpret_as_s16(vx_load_expand(src - cn)), v_reinterpret_as_s16(vx_load_expand(src + cn)), x0, x1); - dl += v_dotprod(x0, k1); - dh += v_dotprod(x1, k1); + dl = v_add(dl, v_dotprod(x0, k1)); + dh = v_add(dh, v_dotprod(x1, k1)); v_store(dst + i, dl); - v_store(dst + i + v_int32::nlanes, dh); - i += v_uint16::nlanes; src += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), dh); + i += VTraits::vlanes(); src += VTraits::vlanes(); } - if ( i <= width - v_uint32::nlanes ) + if ( i <= width - VTraits::vlanes() ) { - v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]), v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)) * vx_setall_s32(kx[1]))); - i += v_uint32::nlanes; + v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]), v_mul(v_reinterpret_as_s32(v_add(vx_load_expand_q(src - cn), vx_load_expand_q(src + cn))), vx_setall_s32(kx[1])))); + i += VTraits::vlanes(); } } } @@ -663,7 +663,7 @@ struct SymmRowSmallVec_8u32s { if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 ) { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; v_expand(vx_load(src - 2*cn), x0l, x0h); @@ -672,31 +672,31 @@ struct SymmRowSmallVec_8u32s x1l = v_sub_wrap(v_add_wrap(x0l, x2l), v_add_wrap(x1l, x1l)); x1h = v_sub_wrap(v_add_wrap(x0h, x2h), 
v_add_wrap(x1h, x1h)); v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x1l))); - v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1l))); - v_store(dst + i + 2*v_int32::nlanes, v_expand_low(v_reinterpret_as_s16(x1h))); - v_store(dst + i + 3*v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1h))); + v_store(dst + i + VTraits::vlanes(), v_expand_high(v_reinterpret_as_s16(x1l))); + v_store(dst + i + 2*VTraits::vlanes(), v_expand_low(v_reinterpret_as_s16(x1h))); + v_store(dst + i + 3*VTraits::vlanes(), v_expand_high(v_reinterpret_as_s16(x1h))); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_uint16 x = vx_load_expand(src); x = v_sub_wrap(v_add_wrap(vx_load_expand(src - 2*cn), vx_load_expand(src + 2*cn)), v_add_wrap(x, x)); v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x))); - v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x))); - i += v_uint16::nlanes; src += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), v_expand_high(v_reinterpret_as_s16(x))); + i += VTraits::vlanes(); src += VTraits::vlanes(); } - if( i <= width - v_uint32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int32 x = v_reinterpret_as_s32(vx_load_expand_q(src)); - x = v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn) + vx_load_expand_q(src + 2*cn)) - (x + x); + x = v_sub(v_reinterpret_as_s32(v_add(vx_load_expand_q(src - 2 * cn), vx_load_expand_q(src + 2 * cn))), v_add(x, x)); v_store(dst + i, x); - i += v_uint32::nlanes; + i += VTraits::vlanes(); } } else { v_int16 k0 = vx_setall_s16((short)(kx[0])); v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (kx[2] << 16))); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_int32 x0, x1, x2, x3; v_uint16 x0l, x0h, x1l, x1h, x2l, x2h, x3l, x3h; @@ -710,45 +710,45 @@ struct SymmRowSmallVec_8u32s v_expand(vx_load(src + cn), x1l, x1h); v_expand(vx_load(src - 2*cn), x2l, x2h); v_expand(vx_load(src + 2*cn), x3l, x3h); - v_zip(v_reinterpret_as_s16(x0l + x1l), v_reinterpret_as_s16(x2l + x3l), xl, xh); - x0 += v_dotprod(xl, k12); - x1 += v_dotprod(xh, k12); - v_zip(v_reinterpret_as_s16(x0h + x1h), v_reinterpret_as_s16(x2h + x3h), xl, xh); - x2 += v_dotprod(xl, k12); - x3 += v_dotprod(xh, k12); + v_zip(v_reinterpret_as_s16(v_add(x0l, x1l)), v_reinterpret_as_s16(v_add(x2l, x3l)), xl, xh); + x0 = v_add(x0, v_dotprod(xl, k12)); + x1 = v_add(x1, v_dotprod(xh, k12)); + v_zip(v_reinterpret_as_s16(v_add(x0h, x1h)), v_reinterpret_as_s16(v_add(x2h, x3h)), xl, xh); + x2 = v_add(x2, v_dotprod(xl, k12)); + x3 = v_add(x3, v_dotprod(xh, k12)); v_store(dst + i, x0); - v_store(dst + i + v_int32::nlanes, x1); - v_store(dst + i + 2*v_int32::nlanes, x2); - v_store(dst + i + 3*v_int32::nlanes, x3); + v_store(dst + i + VTraits::vlanes(), x1); + v_store(dst + i + 2*VTraits::vlanes(), x2); + v_store(dst + i + 3*VTraits::vlanes(), x3); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int32 x1, x2; v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, x1, x2); v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(vx_load_expand(src - cn) + vx_load_expand(src + cn)), v_reinterpret_as_s16(vx_load_expand(src - 2*cn) + vx_load_expand(src + 2*cn)), xl, xh); - x1 += v_dotprod(xl, k12); - x2 += v_dotprod(xh, k12); + v_zip(v_reinterpret_as_s16(v_add(vx_load_expand(src - cn), vx_load_expand(src + cn))), v_reinterpret_as_s16(v_add(vx_load_expand(src - 
2 * cn), vx_load_expand(src + 2 * cn))), xl, xh); + x1 = v_add(x1, v_dotprod(xl, k12)); + x2 = v_add(x2, v_dotprod(xh, k12)); v_store(dst + i, x1); - v_store(dst + i + v_int32::nlanes, x2); - i += v_uint16::nlanes, src += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), x2); + i += VTraits::vlanes(), src += VTraits::vlanes(); } - if( i <= width - v_uint32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]), - v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]), - v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn) + vx_load_expand_q(src + 2*cn)) * vx_setall_s32(kx[2])))); - i += v_uint32::nlanes; + v_muladd(v_reinterpret_as_s32(v_add(vx_load_expand_q(src - cn), vx_load_expand_q(src + cn))), vx_setall_s32(kx[1]), + v_mul(v_reinterpret_as_s32(v_add(vx_load_expand_q(src - 2 * cn), vx_load_expand_q(src + 2 * cn))), vx_setall_s32(kx[2]))))); + i += VTraits::vlanes(); } } } else { v_int16 k0 = vx_setall_s16((short)(kx[0])); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_uint8 v_src = vx_load(src); v_int32 s0, s1, s2, s3; @@ -764,12 +764,12 @@ struct SymmRowSmallVec_8u32s v_uint8 v_src3 = vx_load(src + j + cn); v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(v_expand_low(v_src0) + v_expand_low(v_src2)), v_reinterpret_as_s16(v_expand_low(v_src1) + v_expand_low(v_src3)), xl, xh); - s0 += v_dotprod(xl, k12); - s1 += v_dotprod(xh, k12); - v_zip(v_reinterpret_as_s16(v_expand_high(v_src0) + v_expand_high(v_src2)), v_reinterpret_as_s16(v_expand_high(v_src1) + v_expand_high(v_src3)), xl, xh); - s2 += v_dotprod(xl, k12); - s3 += v_dotprod(xh, k12); + v_zip(v_reinterpret_as_s16(v_add(v_expand_low(v_src0), v_expand_low(v_src2))), v_reinterpret_as_s16(v_add(v_expand_low(v_src1), v_expand_low(v_src3))), xl, xh); + s0 = v_add(s0, v_dotprod(xl, k12)); + s1 = v_add(s1, v_dotprod(xh, k12)); + v_zip(v_reinterpret_as_s16(v_add(v_expand_high(v_src0), v_expand_high(v_src2))), v_reinterpret_as_s16(v_add(v_expand_high(v_src1), v_expand_high(v_src3))), xl, xh); + s2 = v_add(s2, v_dotprod(xl, k12)); + s3 = v_add(s3, v_dotprod(xh, k12)); } if( k < _ksize / 2 + 1 ) { @@ -780,48 +780,48 @@ struct SymmRowSmallVec_8u32s v_int16 xl, xh; v_zip(v_reinterpret_as_s16(v_expand_low(v_src0)), v_reinterpret_as_s16(v_expand_low(v_src1)), xl, xh); - s0 += v_dotprod(xl, k1); - s1 += v_dotprod(xh, k1); + s0 = v_add(s0, v_dotprod(xl, k1)); + s1 = v_add(s1, v_dotprod(xh, k1)); v_zip(v_reinterpret_as_s16(v_expand_high(v_src0)), v_reinterpret_as_s16(v_expand_high(v_src1)), xl, xh); - s2 += v_dotprod(xl, k1); - s3 += v_dotprod(xh, k1); + s2 = v_add(s2, v_dotprod(xl, k1)); + s3 = v_add(s3, v_dotprod(xh, k1)); } v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - v_store(dst + i + 2*v_int32::nlanes, s2); - v_store(dst + i + 3*v_int32::nlanes, s3); + v_store(dst + i + VTraits::vlanes(), s1); + v_store(dst + i + 2*VTraits::vlanes(), s2); + v_store(dst + i + 3*VTraits::vlanes(), s3); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int32 s0, s1; v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, s0, s1); for (k = 1, j = cn; k <= _ksize / 2 - 1; k+=2, j += 2*cn) { v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(vx_load_expand(src - j) + vx_load_expand(src + j)), v_reinterpret_as_s16(vx_load_expand(src - j - cn) + 
vx_load_expand(src + j + cn)), xl, xh); + v_zip(v_reinterpret_as_s16(v_add(vx_load_expand(src - j), vx_load_expand(src + j))), v_reinterpret_as_s16(v_add(vx_load_expand(src - j - cn), vx_load_expand(src + j + cn))), xl, xh); v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k+1] << 16))); - s0 += v_dotprod(xl, k12); - s1 += v_dotprod(xh, k12); + s0 = v_add(s0, v_dotprod(xl, k12)); + s1 = v_add(s1, v_dotprod(xh, k12)); } if ( k < _ksize / 2 + 1 ) { v_int16 xl, xh; v_zip(v_reinterpret_as_s16(vx_load_expand(src - j)), v_reinterpret_as_s16(vx_load_expand(src + j)), xl, xh); v_int16 k1 = vx_setall_s16((short)(kx[k])); - s0 += v_dotprod(xl, k1); - s1 += v_dotprod(xh, k1); + s0 = v_add(s0, v_dotprod(xl, k1)); + s1 = v_add(s1, v_dotprod(xh, k1)); } v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - i += v_uint16::nlanes; src += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), s1); + i += VTraits::vlanes(); src += VTraits::vlanes(); } - if( i <= width - v_uint32::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_int32 s0 = v_reinterpret_as_s32(vx_load_expand_q(src)) * vx_setall_s32(kx[0]); + v_int32 s0 = v_mul(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0])); for( k = 1, j = cn; k < _ksize / 2 + 1; k++, j += cn ) - s0 = v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src - j) + vx_load_expand_q(src + j)), vx_setall_s32(kx[k]), s0); + s0 = v_muladd(v_reinterpret_as_s32(v_add(vx_load_expand_q(src - j), vx_load_expand_q(src + j))), vx_setall_s32(kx[k]), s0); v_store(dst + i, s0); - i += v_uint32::nlanes; + i += VTraits::vlanes(); } } } @@ -831,7 +831,7 @@ struct SymmRowSmallVec_8u32s { if( kx[0] == 0 && kx[1] == 1 ) { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_uint16 x0l, x0h, x2l, x2h; v_expand(vx_load(src - cn), x0l, x0h); @@ -839,27 +839,27 @@ struct SymmRowSmallVec_8u32s v_int16 dl = v_reinterpret_as_s16(v_sub_wrap(x2l, x0l)); v_int16 dh = v_reinterpret_as_s16(v_sub_wrap(x2h, x0h)); v_store(dst + i, v_expand_low(dl)); - v_store(dst + i + v_int32::nlanes, v_expand_high(dl)); - v_store(dst + i + 2*v_int32::nlanes, v_expand_low(dh)); - v_store(dst + i + 3*v_int32::nlanes, v_expand_high(dh)); + v_store(dst + i + VTraits::vlanes(), v_expand_high(dl)); + v_store(dst + i + 2*VTraits::vlanes(), v_expand_low(dh)); + v_store(dst + i + 3*VTraits::vlanes(), v_expand_high(dh)); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int16 dl = v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + cn), vx_load_expand(src - cn))); v_store(dst + i, v_expand_low(dl)); - v_store(dst + i + v_int32::nlanes, v_expand_high(dl)); - i += v_uint16::nlanes; src += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), v_expand_high(dl)); + i += VTraits::vlanes(); src += VTraits::vlanes(); } - if (i <= width - v_uint32::nlanes) + if (i <= width - VTraits::vlanes()) { - v_store(dst + i, v_reinterpret_as_s32(vx_load_expand_q(src + cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - cn))); - i += v_uint32::nlanes; + v_store(dst + i, v_sub(v_reinterpret_as_s32(vx_load_expand_q(src + cn)), v_reinterpret_as_s32(vx_load_expand_q(src - cn)))); + i += VTraits::vlanes(); } } else { v_int16 k0 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (-kx[1] << 16))); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += 
VTraits::vlanes(), src += VTraits::vlanes() ) { v_uint16 x0l, x0h, x2l, x2h; v_expand(vx_load(src - cn), x0l, x0h); @@ -867,30 +867,30 @@ struct SymmRowSmallVec_8u32s v_int16 xl, xh; v_zip(v_reinterpret_as_s16(x2l), v_reinterpret_as_s16(x0l), xl, xh); v_store(dst + i, v_dotprod(xl, k0)); - v_store(dst + i + v_int32::nlanes, v_dotprod(xh, k0)); + v_store(dst + i + VTraits::vlanes(), v_dotprod(xh, k0)); v_zip(v_reinterpret_as_s16(x2h), v_reinterpret_as_s16(x0h), xl, xh); - v_store(dst + i + 2*v_int32::nlanes, v_dotprod(xl, k0)); - v_store(dst + i + 3*v_int32::nlanes, v_dotprod(xh, k0)); + v_store(dst + i + 2*VTraits::vlanes(), v_dotprod(xl, k0)); + v_store(dst + i + 3*VTraits::vlanes(), v_dotprod(xh, k0)); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int16 xl, xh; v_zip(v_reinterpret_as_s16(vx_load_expand(src + cn)), v_reinterpret_as_s16(vx_load_expand(src - cn)), xl, xh); v_store(dst + i, v_dotprod(xl, k0)); - v_store(dst + i + v_int32::nlanes, v_dotprod(xh, k0)); - i += v_uint16::nlanes; src += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), v_dotprod(xh, k0)); + i += VTraits::vlanes(); src += VTraits::vlanes(); } - if (i <= width - v_uint32::nlanes) + if (i <= width - VTraits::vlanes()) { - v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]), v_reinterpret_as_s32(vx_load_expand_q(src - cn)) * vx_setall_s32(-kx[1]))); - i += v_uint32::nlanes; + v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]), v_mul(v_reinterpret_as_s32(vx_load_expand_q(src - cn)), vx_setall_s32(-kx[1])))); + i += VTraits::vlanes(); } } } else if( _ksize == 5 ) { v_int16 k0 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (kx[2] << 16))); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_uint16 x0l, x0h, x1l, x1h, x2l, x2h, x3l, x3h; v_expand(vx_load(src - cn), x0l, x0h); @@ -900,31 +900,31 @@ struct SymmRowSmallVec_8u32s v_int16 x0, x1; v_zip(v_reinterpret_as_s16(v_sub_wrap(x2l, x0l)), v_reinterpret_as_s16(v_sub_wrap(x3l, x1l)), x0, x1); v_store(dst + i, v_dotprod(x0, k0)); - v_store(dst + i + v_int32::nlanes, v_dotprod(x1, k0)); + v_store(dst + i + VTraits::vlanes(), v_dotprod(x1, k0)); v_zip(v_reinterpret_as_s16(v_sub_wrap(x2h, x0h)), v_reinterpret_as_s16(v_sub_wrap(x3h, x1h)), x0, x1); - v_store(dst + i + 2*v_int32::nlanes, v_dotprod(x0, k0)); - v_store(dst + i + 3*v_int32::nlanes, v_dotprod(x1, k0)); + v_store(dst + i + 2*VTraits::vlanes(), v_dotprod(x0, k0)); + v_store(dst + i + 3*VTraits::vlanes(), v_dotprod(x1, k0)); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int16 x0, x1; v_zip(v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + cn), vx_load_expand(src - cn))), v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + 2*cn), vx_load_expand(src - 2*cn))), x0, x1); v_store(dst + i, v_dotprod(x0, k0)); - v_store(dst + i + v_int32::nlanes, v_dotprod(x1, k0)); - i += v_uint16::nlanes; src += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), v_dotprod(x1, k0)); + i += VTraits::vlanes(); src += VTraits::vlanes(); } - if( i <= width - v_uint32::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - cn)), vx_setall_s32(kx[1]), - (v_reinterpret_as_s32(vx_load_expand_q(src + 2*cn)) - 
v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn))) * vx_setall_s32(kx[2]))); - i += v_uint32::nlanes; + v_store(dst + i, v_muladd(v_sub(v_reinterpret_as_s32(vx_load_expand_q(src + cn)), v_reinterpret_as_s32(vx_load_expand_q(src - cn))), vx_setall_s32(kx[1]), + v_mul(v_sub(v_reinterpret_as_s32(vx_load_expand_q(src + 2 * cn)), v_reinterpret_as_s32(vx_load_expand_q(src - 2 * cn))), vx_setall_s32(kx[2])))); + i += VTraits::vlanes(); } } else { v_int16 k0 = vx_setall_s16((short)(kx[0])); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_uint8 v_src = vx_load(src); v_int32 s0, s1, s2, s3; @@ -941,11 +941,11 @@ struct SymmRowSmallVec_8u32s v_int16 xl, xh; v_zip(v_reinterpret_as_s16(v_sub_wrap(v_expand_low(v_src2), v_expand_low(v_src0))), v_reinterpret_as_s16(v_sub_wrap(v_expand_low(v_src3), v_expand_low(v_src1))), xl, xh); - s0 += v_dotprod(xl, k12); - s1 += v_dotprod(xh, k12); + s0 = v_add(s0, v_dotprod(xl, k12)); + s1 = v_add(s1, v_dotprod(xh, k12)); v_zip(v_reinterpret_as_s16(v_sub_wrap(v_expand_high(v_src2), v_expand_high(v_src0))), v_reinterpret_as_s16(v_sub_wrap(v_expand_high(v_src3), v_expand_high(v_src1))), xl, xh); - s2 += v_dotprod(xl, k12); - s3 += v_dotprod(xh, k12); + s2 = v_add(s2, v_dotprod(xl, k12)); + s3 = v_add(s3, v_dotprod(xh, k12)); } if( k < _ksize / 2 + 1 ) { @@ -955,18 +955,18 @@ struct SymmRowSmallVec_8u32s v_int16 xl, xh; v_zip(v_reinterpret_as_s16(v_expand_low(v_src1)), v_reinterpret_as_s16(v_expand_low(v_src0)), xl, xh); - s0 += v_dotprod(xl, k12); - s1 += v_dotprod(xh, k12); + s0 = v_add(s0, v_dotprod(xl, k12)); + s1 = v_add(s1, v_dotprod(xh, k12)); v_zip(v_reinterpret_as_s16(v_expand_high(v_src1)), v_reinterpret_as_s16(v_expand_high(v_src0)), xl, xh); - s2 += v_dotprod(xl, k12); - s3 += v_dotprod(xh, k12); + s2 = v_add(s2, v_dotprod(xl, k12)); + s3 = v_add(s3, v_dotprod(xh, k12)); } v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - v_store(dst + i + 2*v_int32::nlanes, s2); - v_store(dst + i + 3*v_int32::nlanes, s3); + v_store(dst + i + VTraits::vlanes(), s1); + v_store(dst + i + 2*VTraits::vlanes(), s2); + v_store(dst + i + 3*VTraits::vlanes(), s3); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int32 s0, s1; v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, s0, s1); @@ -975,28 +975,28 @@ struct SymmRowSmallVec_8u32s v_int16 xl, xh; v_zip(v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + j), vx_load_expand(src - j))), v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + j + cn), vx_load_expand(src - j - cn))), xl, xh); v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16))); - s0 += v_dotprod(xl, k12); - s1 += v_dotprod(xh, k12); + s0 = v_add(s0, v_dotprod(xl, k12)); + s1 = v_add(s1, v_dotprod(xh, k12)); } if( k < _ksize / 2 + 1 ) { v_int16 k1 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (-kx[k] << 16))); v_int16 xl, xh; v_zip(v_reinterpret_as_s16(vx_load_expand(src + j)), v_reinterpret_as_s16(vx_load_expand(src - j)), xl, xh); - s0 += v_dotprod(xl, k1); - s1 += v_dotprod(xh, k1); + s0 = v_add(s0, v_dotprod(xl, k1)); + s1 = v_add(s1, v_dotprod(xh, k1)); } v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - i += v_uint16::nlanes; src += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), s1); + i += VTraits::vlanes(); src += VTraits::vlanes(); } - if( i <= width - v_uint32::nlanes ) + if( i <= width - 
VTraits::vlanes() ) { - v_int32 s0 = v_reinterpret_as_s32(vx_load_expand_q(src)) * vx_setall_s32(kx[0]); + v_int32 s0 = v_mul(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0])); for (k = 1, j = cn; k < _ksize / 2 + 1; k++, j += cn) - s0 = v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + j)) - v_reinterpret_as_s32(vx_load_expand_q(src - j)), vx_setall_s32(kx[k]), s0); + s0 = v_muladd(v_sub(v_reinterpret_as_s32(vx_load_expand_q(src + j)), v_reinterpret_as_s32(vx_load_expand_q(src - j))), vx_setall_s32(kx[k]), s0); v_store(dst + i, s0); - i += v_uint32::nlanes; + i += VTraits::vlanes(); } } } @@ -1038,120 +1038,120 @@ struct SymmColumnVec_32s8u { v_float32 f0 = vx_setall_f32(ky[0]); v_float32 f1 = vx_setall_f32(ky[1]); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) { const int* S = src[0] + i; v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4); - v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4); - v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S + 2*v_int32::nlanes)), f0, d4); - v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S + 3*v_int32::nlanes)), f0, d4); + v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + VTraits::vlanes())), f0, d4); + v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S + 2*VTraits::vlanes())), f0, d4); + v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S + 3*VTraits::vlanes())), f0, d4); const int* S0 = src[1] + i; const int* S1 = src[-1] + i; - s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f1, s0); - s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f1, s1); - s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2 * v_int32::nlanes) + vx_load(S1 + 2 * v_int32::nlanes)), f1, s2); - s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3 * v_int32::nlanes) + vx_load(S1 + 3 * v_int32::nlanes)), f1, s3); + s0 = v_muladd(v_cvt_f32(v_add(vx_load(S0), vx_load(S1))), f1, s0); + s1 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + VTraits::vlanes()), vx_load(S1 + VTraits::vlanes()))), f1, s1); + s2 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + 2 * VTraits::vlanes()), vx_load(S1 + 2 * VTraits::vlanes()))), f1, s2); + s3 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + 3 * VTraits::vlanes()), vx_load(S1 + 3 * VTraits::vlanes()))), f1, s3); for( k = 2; k <= ksize2; k++ ) { v_float32 f = vx_setall_f32(ky[k]); S0 = src[k] + i; S1 = src[-k] + i; - s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0); - s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1); - s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) + vx_load(S1 + 2*v_int32::nlanes)), f, s2); - s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) + vx_load(S1 + 3*v_int32::nlanes)), f, s3); + s0 = v_muladd(v_cvt_f32(v_add(vx_load(S0), vx_load(S1))), f, s0); + s1 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + VTraits::vlanes()), vx_load(S1 + VTraits::vlanes()))), f, s1); + s2 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + 2 * VTraits::vlanes()), vx_load(S1 + 2 * VTraits::vlanes()))), f, s2); + s3 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + 3 * VTraits::vlanes()), vx_load(S1 + 3 * VTraits::vlanes()))), f, s3); } v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3)))); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { const int* S = src[0] + i; v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4); - v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4); + v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + 
VTraits<v_int32>::vlanes())), f0, d4);
const int* S0 = src[1] + i;
const int* S1 = src[-1] + i;
- s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f1, s0);
- s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f1, s1);
+ s0 = v_muladd(v_cvt_f32(v_add(vx_load(S0), vx_load(S1))), f1, s0);
+ s1 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + VTraits<v_int32>::vlanes()), vx_load(S1 + VTraits<v_int32>::vlanes()))), f1, s1);
for( k = 2; k <= ksize2; k++ )
{
v_float32 f = vx_setall_f32(ky[k]);
S0 = src[k] + i;
S1 = src[-k] + i;
- s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0);
- s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1);
+ s0 = v_muladd(v_cvt_f32(v_add(vx_load(S0), vx_load(S1))), f, s0);
+ s1 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + VTraits<v_int32>::vlanes()), vx_load(S1 + VTraits<v_int32>::vlanes()))), f, s1);
}
v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
- i += v_uint16::nlanes;
+ i += VTraits<v_uint16>::vlanes();
}
#if CV_SIMD_WIDTH > 16
- while( i <= width - v_int32x4::nlanes )
+ while( i <= width - 4 /*v_int32x4::nlanes*/ )
#else
- if( i <= width - v_int32x4::nlanes )
+ if( i <= width - v_int32::nlanes )
#endif
{
- v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[0] + i)), v_setall_f32(ky[0]), v_setall_f32(delta));
- s0 = v_muladd(v_cvt_f32(v_load(src[1] + i) + v_load(src[-1] + i)), v_setall_f32(ky[1]), s0);
+ v_float32 s0 = v_muladd(v_cvt_f32(vx_load(src[0] + i)), vx_setall_f32(ky[0]), vx_setall_f32(delta));
+ s0 = v_muladd(v_cvt_f32(v_add(vx_load(src[1] + i), vx_load(src[-1] + i))), vx_setall_f32(ky[1]), s0);
for( k = 2; k <= ksize2; k++ )
- s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) + v_load(src[-k] + i)), v_setall_f32(ky[k]), s0);
- v_int32x4 s32 = v_round(s0);
- v_int16x8 s16 = v_pack(s32, s32);
- *(unaligned_int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0();
- i += v_int32x4::nlanes;
+ s0 = v_muladd(v_cvt_f32(v_add(vx_load(src[k] + i), vx_load(src[-k] + i))), vx_setall_f32(ky[k]), s0);
+ v_int32 s32 = v_round(s0);
+ v_int16 s16 = v_pack(s32, s32);
+ *(unaligned_int*)(dst + i) = v_get0(v_reinterpret_as_s32(v_pack_u(s16, s16)));
+ i += 4 /*v_int32x4::nlanes*/ ;
}
}
else
{
v_float32 f1 = vx_setall_f32(ky[1]);
- for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
+ for( ; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes() )
{
const int* S0 = src[1] + i;
const int* S1 = src[-1] + i;
- v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f1, d4);
- v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f1, d4);
- v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2 * v_int32::nlanes) - vx_load(S1 + 2 * v_int32::nlanes)), f1, d4);
- v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3 * v_int32::nlanes) - vx_load(S1 + 3 * v_int32::nlanes)), f1, d4);
+ v_float32 s0 = v_muladd(v_cvt_f32(v_sub(vx_load(S0), vx_load(S1))), f1, d4);
+ v_float32 s1 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + VTraits<v_int32>::vlanes()), vx_load(S1 + VTraits<v_int32>::vlanes()))), f1, d4);
+ v_float32 s2 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + 2 * VTraits<v_int32>::vlanes()), vx_load(S1 + 2 * VTraits<v_int32>::vlanes()))), f1, d4);
+ v_float32 s3 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + 3 * VTraits<v_int32>::vlanes()), vx_load(S1 + 3 * VTraits<v_int32>::vlanes()))), f1, d4);
for ( k = 2; k <= ksize2; k++ )
{
v_float32 f = vx_setall_f32(ky[k]);
S0 = src[k] + i;
S1 = src[-k] + i;
- s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0);
- s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1);
- s2 =
v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) - vx_load(S1 + 2*v_int32::nlanes)), f, s2); - s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) - vx_load(S1 + 3*v_int32::nlanes)), f, s3); + s0 = v_muladd(v_cvt_f32(v_sub(vx_load(S0), vx_load(S1))), f, s0); + s1 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + VTraits::vlanes()), vx_load(S1 + VTraits::vlanes()))), f, s1); + s2 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + 2 * VTraits::vlanes()), vx_load(S1 + 2 * VTraits::vlanes()))), f, s2); + s3 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + 3 * VTraits::vlanes()), vx_load(S1 + 3 * VTraits::vlanes()))), f, s3); } v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3)))); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { const int* S0 = src[1] + i; const int* S1 = src[-1] + i; - v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f1, d4); - v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f1, d4); + v_float32 s0 = v_muladd(v_cvt_f32(v_sub(vx_load(S0), vx_load(S1))), f1, d4); + v_float32 s1 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + VTraits::vlanes()), vx_load(S1 + VTraits::vlanes()))), f1, d4); for ( k = 2; k <= ksize2; k++ ) { v_float32 f = vx_setall_f32(ky[k]); S0 = src[k] + i; S1 = src[-k] + i; - s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0); - s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1); + s0 = v_muladd(v_cvt_f32(v_sub(vx_load(S0), vx_load(S1))), f, s0); + s1 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + VTraits::vlanes()), vx_load(S1 + VTraits::vlanes()))), f, s1); } v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_uint16::nlanes; + i += VTraits::vlanes(); } #if CV_SIMD_WIDTH > 16 - while( i <= width - v_int32x4::nlanes ) + while( i <= width - 4 /*v_int32x4::nlanes*/ ) #else - if( i <= width - v_int32x4::nlanes ) + if( i <= width - v_int32::nlanes ) #endif { - v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[1] + i) - v_load(src[-1] + i)), v_setall_f32(ky[1]), v_setall_f32(delta)); + v_float32 s0 = v_muladd(v_cvt_f32(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i))), vx_setall_f32(ky[1]), vx_setall_f32(delta)); for (k = 2; k <= ksize2; k++) - s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) - v_load(src[-k] + i)), v_setall_f32(ky[k]), s0); - v_int32x4 s32 = v_round(s0); - v_int16x8 s16 = v_pack(s32, s32); - *(unaligned_int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0(); - i += v_int32x4::nlanes; + s0 = v_muladd(v_cvt_f32(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i))), vx_setall_f32(ky[k]), s0); + v_int32 s32 = v_round(s0); + v_int16 s16 = v_pack(s32, s32); + *(unaligned_int*)(dst + i) = v_get0(v_reinterpret_as_s32(v_pack_u(s16, s16))); + i += 4 /*v_int32x4::nlanes*/ ; } } return i; @@ -1187,31 +1187,31 @@ struct SymmColumnVec_32f8u if( symmetrical ) { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) { v_float32 v_ky0 = vx_setall_f32(ky[0]); v_float32 v32_delta = vx_setall_f32(delta); const float* S = src[0] + i; v_float32 s0 = v_muladd(v_ky0, vx_load(S), v32_delta); - v_float32 s1 = v_muladd(v_ky0, vx_load(S + v_float32::nlanes), v32_delta); - v_float32 s2 = v_muladd(v_ky0, vx_load(S + 2*v_float32::nlanes), v32_delta); - v_float32 s3 = v_muladd(v_ky0, vx_load(S + 3*v_float32::nlanes), v32_delta); + v_float32 s1 = v_muladd(v_ky0, vx_load(S + VTraits::vlanes()), v32_delta); + v_float32 s2 = v_muladd(v_ky0, vx_load(S + 
2*VTraits::vlanes()), v32_delta); + v_float32 s3 = v_muladd(v_ky0, vx_load(S + 3*VTraits::vlanes()), v32_delta); for( k = 1; k <= ksize2; k++ ) { v_float32 v_kyk = vx_setall_f32(ky[k]); const float* S0 = src[k] + i; const float* S1 = src[-k] + i; - s0 = v_muladd(v_kyk, vx_load(S0) + vx_load(S1), s0); - s1 = v_muladd(v_kyk, vx_load(S0 + v_float32::nlanes) + vx_load(S1 + v_float32::nlanes), s1); - s2 = v_muladd(v_kyk, vx_load(S0 + 2*v_float32::nlanes) + vx_load(S1 + 2*v_float32::nlanes), s2); - s3 = v_muladd(v_kyk, vx_load(S0 + 3*v_float32::nlanes) + vx_load(S1 + 3*v_float32::nlanes), s3); + s0 = v_muladd(v_kyk, v_add(vx_load(S0), vx_load(S1)), s0); + s1 = v_muladd(v_kyk, v_add(vx_load(S0 + VTraits::vlanes()), vx_load(S1 + VTraits::vlanes())), s1); + s2 = v_muladd(v_kyk, v_add(vx_load(S0 + 2 * VTraits::vlanes()), vx_load(S1 + 2 * VTraits::vlanes())), s2); + s3 = v_muladd(v_kyk, v_add(vx_load(S0 + 3 * VTraits::vlanes()), vx_load(S1 + 3 * VTraits::vlanes())), s3); } v_store(_dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3)))); } } else { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) { v_float32 s0 = vx_setall_f32(delta); v_float32 s1 = vx_setall_f32(delta); @@ -1222,10 +1222,10 @@ struct SymmColumnVec_32f8u v_float32 v_kyk = vx_setall_f32(ky[k]); const float* S0 = src[k] + i; const float* S1 = src[-k] + i; - s0 = v_muladd(v_kyk, vx_load(S0) - vx_load(S1), s0); - s1 = v_muladd(v_kyk, vx_load(S0 + v_float32::nlanes) - vx_load(S1 + v_float32::nlanes), s1); - s2 = v_muladd(v_kyk, vx_load(S0 + 2*v_float32::nlanes) - vx_load(S1 + 2*v_float32::nlanes), s2); - s3 = v_muladd(v_kyk, vx_load(S0 + 3*v_float32::nlanes) - vx_load(S1 + 3*v_float32::nlanes), s3); + s0 = v_muladd(v_kyk, v_sub(vx_load(S0), vx_load(S1)), s0); + s1 = v_muladd(v_kyk, v_sub(vx_load(S0 + VTraits::vlanes()), vx_load(S1 + VTraits::vlanes())), s1); + s2 = v_muladd(v_kyk, v_sub(vx_load(S0 + 2 * VTraits::vlanes()), vx_load(S1 + 2 * VTraits::vlanes())), s2); + s3 = v_muladd(v_kyk, v_sub(vx_load(S0 + 3 * VTraits::vlanes()), vx_load(S1 + 3 * VTraits::vlanes())), s3); } v_store(_dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3)))); } @@ -1268,55 +1268,52 @@ struct SymmColumnSmallVec_32s16s { if( ky[0] == 2 && ky[1] == 1 ) { - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + for( ; i <= width - 2*VTraits::vlanes(); i += 2*VTraits::vlanes() ) { v_int32 s0 = vx_load(S1 + i); - v_int32 s1 = vx_load(S1 + i + v_int32::nlanes); - v_int32 s2 = vx_load(S1 + i + 2*v_int32::nlanes); - v_int32 s3 = vx_load(S1 + i + 3*v_int32::nlanes); - v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + (s0 + s0), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + (s1 + s1)) + d8); - v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes) + (s2 + s2), - vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes) + (s3 + s3)) + d8); + v_int32 s1 = vx_load(S1 + i + VTraits::vlanes()); + v_int32 s2 = vx_load(S1 + i + 2*VTraits::vlanes()); + v_int32 s3 = vx_load(S1 + i + 3*VTraits::vlanes()); + v_store(dst + i, v_add(v_pack(v_add(v_add(vx_load(S0 + i), vx_load(S2 + i)), v_add(s0, s0)), v_add(v_add(vx_load(S0 + i + VTraits::vlanes()), vx_load(S2 + i + VTraits::vlanes())), v_add(s1, s1))), d8)); + v_store(dst + i + VTraits::vlanes(), v_add(v_pack(v_add(v_add(vx_load(S0 + i + 2 * VTraits::vlanes()), vx_load(S2 
+ i + 2 * VTraits::vlanes())), v_add(s2, s2)), v_add(v_add(vx_load(S0 + i + 3 * VTraits::vlanes()), vx_load(S2 + i + 3 * VTraits::vlanes())), v_add(s3, s3))), d8)); } - if( i <= width - v_int16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int32 sl = vx_load(S1 + i); - v_int32 sh = vx_load(S1 + i + v_int32::nlanes); - v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + (sh + sh)) + d8); - i += v_int16::nlanes; + v_int32 sh = vx_load(S1 + i + VTraits::vlanes()); + v_store(dst + i, v_add(v_pack(v_add(v_add(vx_load(S0 + i), vx_load(S2 + i)), v_add(sl, sl)), v_add(v_add(vx_load(S0 + i + VTraits::vlanes()), vx_load(S2 + i + VTraits::vlanes())), v_add(sh, sh))), d8)); + i += VTraits::vlanes(); } - if( i <= width - v_int32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int32 s = vx_load(S1 + i); - v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + vx_setall_s32(d) + (s + s)); - i += v_int32::nlanes; + v_pack_store(dst + i, v_add(v_add(v_add(vx_load(S0 + i), vx_load(S2 + i)), vx_setall_s32(d)), v_add(s, s))); + i += VTraits::vlanes(); } } else if( ky[0] == -2 && ky[1] == 1 ) { - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + for( ; i <= width - 2*VTraits::vlanes(); i += 2*VTraits::vlanes() ) { v_int32 s0 = vx_load(S1 + i); - v_int32 s1 = vx_load(S1 + i + v_int32::nlanes); - v_int32 s2 = vx_load(S1 + i + 2*v_int32::nlanes); - v_int32 s3 = vx_load(S1 + i + 3*v_int32::nlanes); - v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) - (s0 + s0), - vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) - (s1 + s1)) + d8); - v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes) - (s2 + s2), - vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes) - (s3 + s3)) + d8); + v_int32 s1 = vx_load(S1 + i + VTraits::vlanes()); + v_int32 s2 = vx_load(S1 + i + 2*VTraits::vlanes()); + v_int32 s3 = vx_load(S1 + i + 3*VTraits::vlanes()); + v_store(dst + i, v_add(v_pack(v_sub(v_add(vx_load(S0 + i), vx_load(S2 + i)), v_add(s0, s0)), v_sub(v_add(vx_load(S0 + i + VTraits::vlanes()), vx_load(S2 + i + VTraits::vlanes())), v_add(s1, s1))), d8)); + v_store(dst + i + VTraits::vlanes(), v_add(v_pack(v_sub(v_add(vx_load(S0 + i + 2 * VTraits::vlanes()), vx_load(S2 + i + 2 * VTraits::vlanes())), v_add(s2, s2)), v_sub(v_add(vx_load(S0 + i + 3 * VTraits::vlanes()), vx_load(S2 + i + 3 * VTraits::vlanes())), v_add(s3, s3))), d8)); } - if( i <= width - v_int16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int32 sl = vx_load(S1 + i); - v_int32 sh = vx_load(S1 + i + v_int32::nlanes); - v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) - (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) - (sh + sh)) + d8); - i += v_int16::nlanes; + v_int32 sh = vx_load(S1 + i + VTraits::vlanes()); + v_store(dst + i, v_add(v_pack(v_sub(v_add(vx_load(S0 + i), vx_load(S2 + i)), v_add(sl, sl)), v_sub(v_add(vx_load(S0 + i + VTraits::vlanes()), vx_load(S2 + i + VTraits::vlanes())), v_add(sh, sh))), d8)); + i += VTraits::vlanes(); } - if( i <= width - v_int32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int32 s = vx_load(S1 + i); - v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + vx_setall_s32(d) - (s + s)); - i += v_int32::nlanes; + v_pack_store(dst + i, v_sub(v_add(v_add(vx_load(S0 + i), vx_load(S2 + i)), vx_setall_s32(d)), v_add(s, s))); + i += VTraits::vlanes(); } } #if 
CV_NEON @@ -1347,23 +1344,23 @@ struct SymmColumnSmallVec_32s16s else { v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + for( ; i <= width - 2*VTraits::vlanes(); i += 2*VTraits::vlanes() ) { - v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), - v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4))))); - v_store(dst + i + v_int16::nlanes, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 2*v_int32::nlanes)), k0, df4))), - v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 3*v_int32::nlanes)), k0, df4))))); + v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i), vx_load(S2 + i))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), + v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i + VTraits::vlanes()), vx_load(S2 + i + VTraits::vlanes()))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + VTraits::vlanes())), k0, df4))))); + v_store(dst + i + VTraits::vlanes(), v_pack(v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i + 2 * VTraits::vlanes()), vx_load(S2 + i + 2 * VTraits::vlanes()))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 2*VTraits::vlanes())), k0, df4))), + v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i + 3 * VTraits::vlanes()), vx_load(S2 + i + 3 * VTraits::vlanes()))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 3*VTraits::vlanes())), k0, df4))))); } - if( i <= width - v_int16::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), - v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4))))); - i += v_int16::nlanes; + v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i), vx_load(S2 + i))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), + v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i + VTraits::vlanes()), vx_load(S2 + i + VTraits::vlanes()))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + VTraits::vlanes())), k0, df4))))); + i += VTraits::vlanes(); } - if( i <= width - v_int32::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4)))); - i += v_int32::nlanes; + v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i), vx_load(S2 + i))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4)))); + i += VTraits::vlanes(); } } } @@ -1373,42 +1370,42 @@ struct SymmColumnSmallVec_32s16s { if( ky[1] < 0 ) std::swap(S0, S2); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + for( ; i <= width - 2*VTraits::vlanes(); i += 2*VTraits::vlanes() ) { - v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i), vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)) + d8); - v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S2 + i + 2*v_int32::nlanes) - vx_load(S0 + i + 2*v_int32::nlanes), vx_load(S2 + i + 3*v_int32::nlanes) - vx_load(S0 + i + 3*v_int32::nlanes)) + d8); + v_store(dst + i, 
v_add(v_pack(v_sub(vx_load(S2 + i), vx_load(S0 + i)), v_sub(vx_load(S2 + i + VTraits::vlanes()), vx_load(S0 + i + VTraits::vlanes()))), d8)); + v_store(dst + i + VTraits::vlanes(), v_add(v_pack(v_sub(vx_load(S2 + i + 2 * VTraits::vlanes()), vx_load(S0 + i + 2 * VTraits::vlanes())), v_sub(vx_load(S2 + i + 3 * VTraits::vlanes()), vx_load(S0 + i + 3 * VTraits::vlanes()))), d8)); } - if( i <= width - v_int16::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i), vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)) + d8); - i += v_int16::nlanes; + v_store(dst + i, v_add(v_pack(v_sub(vx_load(S2 + i), vx_load(S0 + i)), v_sub(vx_load(S2 + i + VTraits::vlanes()), vx_load(S0 + i + VTraits::vlanes()))), d8)); + i += VTraits::vlanes(); } - if( i <= width - v_int32::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_pack_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + vx_setall_s32(d)); - i += v_int32::nlanes; + v_pack_store(dst + i, v_add(v_sub(vx_load(S2 + i), vx_load(S0 + i)), vx_setall_s32(d))); + i += VTraits::vlanes(); } } else { v_float32 k1 = vx_setall_f32(ky[1]); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + for( ; i <= width - 2*VTraits::vlanes(); i += 2*VTraits::vlanes() ) { - v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)), - v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4)))); - v_store(dst + i + v_int16::nlanes, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + 2*v_int32::nlanes) - vx_load(S0 + i + 2*v_int32::nlanes)), k1, df4)), - v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + 3*v_int32::nlanes) - vx_load(S0 + i + 3*v_int32::nlanes)), k1, df4)))); + v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i), vx_load(S0 + i))), k1, df4)), + v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i + VTraits::vlanes()), vx_load(S0 + i + VTraits::vlanes()))), k1, df4)))); + v_store(dst + i + VTraits::vlanes(), v_pack(v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i + 2 * VTraits::vlanes()), vx_load(S0 + i + 2 * VTraits::vlanes()))), k1, df4)), + v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i + 3 * VTraits::vlanes()), vx_load(S0 + i + 3 * VTraits::vlanes()))), k1, df4)))); } - if( i <= width - v_int16::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)), - v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4)))); - i += v_int16::nlanes; + v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i), vx_load(S0 + i))), k1, df4)), + v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i + VTraits::vlanes()), vx_load(S0 + i + VTraits::vlanes()))), k1, df4)))); + i += VTraits::vlanes(); } - if( i <= width - v_int32::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4))); - i += v_int32::nlanes; + v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i), vx_load(S0 + i))), k1, df4))); + i += VTraits::vlanes(); } } } @@ -1440,7 +1437,7 @@ struct RowVec_16s32f const float* _kx = kernel.ptr(); width *= cn; - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + for( ; i <= width - 2*VTraits::vlanes(); i += 2*VTraits::vlanes() ) { const short* src = (const short*)_src + i; v_float32 s0 = vx_setzero_f32(); @@ -1451,18 
+1448,18 @@ struct RowVec_16s32f { v_float32 f = vx_setall_f32(_kx[k]); v_int16 xl = vx_load(src); - v_int16 xh = vx_load(src + v_int16::nlanes); + v_int16 xh = vx_load(src + VTraits::vlanes()); s0 = v_muladd(v_cvt_f32(v_expand_low(xl)), f, s0); s1 = v_muladd(v_cvt_f32(v_expand_high(xl)), f, s1); s2 = v_muladd(v_cvt_f32(v_expand_low(xh)), f, s2); s3 = v_muladd(v_cvt_f32(v_expand_high(xh)), f, s3); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); + v_store(dst + i + VTraits::vlanes(), s1); + v_store(dst + i + 2*VTraits::vlanes(), s2); + v_store(dst + i + 3*VTraits::vlanes(), s3); } - if( i <= width - v_int16::nlanes ) + if( i <= width - VTraits::vlanes() ) { const short* src = (const short*)_src + i; v_float32 s0 = vx_setzero_f32(); @@ -1475,17 +1472,17 @@ struct RowVec_16s32f s1 = v_muladd(v_cvt_f32(v_expand_high(x)), f, s1); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - i += v_int16::nlanes; + v_store(dst + i + VTraits::vlanes(), s1); + i += VTraits::vlanes(); } - if( i <= width - v_float32::nlanes ) + if( i <= width - VTraits::vlanes() ) { const short* src = (const short*)_src + i; v_float32 s0 = vx_setzero_f32(); for( k = 0; k < _ksize; k++, src += cn ) s0 = v_muladd(v_cvt_f32(vx_load_expand(src)), vx_setall_f32(_kx[k]), s0); v_store(dst + i, s0); - i += v_float32::nlanes; + i += VTraits::vlanes(); } return i; } @@ -1524,92 +1521,92 @@ struct SymmColumnVec_32f16s { v_float32 k0 = vx_setall_f32(ky[0]); v_float32 k1 = vx_setall_f32(ky[1]); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + for( ; i <= width - 2*VTraits::vlanes(); i += 2*VTraits::vlanes() ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); - v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), k0, d4); - v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), k0, d4); - s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); - s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) + vx_load(src[-1] + i + v_float32::nlanes), k1, s1); - s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) + vx_load(src[-1] + i + 2*v_float32::nlanes), k1, s2); - s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) + vx_load(src[-1] + i + 3*v_float32::nlanes), k1, s3); + v_float32 s1 = v_muladd(vx_load(src[0] + i + VTraits::vlanes()), k0, d4); + v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*VTraits::vlanes()), k0, d4); + v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*VTraits::vlanes()), k0, d4); + s0 = v_muladd(v_add(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, s0); + s1 = v_muladd(v_add(vx_load(src[1] + i + VTraits::vlanes()), vx_load(src[-1] + i + VTraits::vlanes())), k1, s1); + s2 = v_muladd(v_add(vx_load(src[1] + i + 2 * VTraits::vlanes()), vx_load(src[-1] + i + 2 * VTraits::vlanes())), k1, s2); + s3 = v_muladd(v_add(vx_load(src[1] + i + 3 * VTraits::vlanes()), vx_load(src[-1] + i + 3 * VTraits::vlanes())), k1, s3); for( k = 2; k <= ksize2; k++ ) { v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k2, s1); - s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) + vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); - s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) + vx_load(src[-k] + i + 
3*v_float32::nlanes), k2, s3); + s0 = v_muladd(v_add(vx_load(src[k] + i), vx_load(src[-k] + i)), k2, s0); + s1 = v_muladd(v_add(vx_load(src[k] + i + VTraits::vlanes()), vx_load(src[-k] + i + VTraits::vlanes())), k2, s1); + s2 = v_muladd(v_add(vx_load(src[k] + i + 2 * VTraits::vlanes()), vx_load(src[-k] + i + 2 * VTraits::vlanes())), k2, s2); + s3 = v_muladd(v_add(vx_load(src[k] + i + 3 * VTraits::vlanes()), vx_load(src[-k] + i + 3 * VTraits::vlanes())), k2, s3); } v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3))); + v_store(dst + i + VTraits::vlanes(), v_pack(v_round(s2), v_round(s3))); } - if( i <= width - v_int16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); - s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); - s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) + vx_load(src[-1] + i + v_float32::nlanes), k1, s1); + v_float32 s1 = v_muladd(vx_load(src[0] + i + VTraits::vlanes()), k0, d4); + s0 = v_muladd(v_add(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, s0); + s1 = v_muladd(v_add(vx_load(src[1] + i + VTraits::vlanes()), vx_load(src[-1] + i + VTraits::vlanes())), k1, s1); for( k = 2; k <= ksize2; k++ ) { v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k2, s1); + s0 = v_muladd(v_add(vx_load(src[k] + i), vx_load(src[-k] + i)), k2, s0); + s1 = v_muladd(v_add(vx_load(src[k] + i + VTraits::vlanes()), vx_load(src[-k] + i + VTraits::vlanes())), k2, s1); } v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_int16::nlanes; + i += VTraits::vlanes(); } - if( i <= width - v_float32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); + s0 = v_muladd(v_add(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, s0); for( k = 2; k <= ksize2; k++ ) - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); + s0 = v_muladd(v_add(vx_load(src[k] + i), vx_load(src[-k] + i)), vx_setall_f32(ky[k]), s0); v_pack_store(dst + i, v_round(s0)); - i += v_float32::nlanes; + i += VTraits::vlanes(); } } else { v_float32 k1 = vx_setall_f32(ky[1]); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + for( ; i <= width - 2*VTraits::vlanes(); i += 2*VTraits::vlanes() ) { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); - v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); - v_float32 s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) - vx_load(src[-1] + i + 2*v_float32::nlanes), k1, d4); - v_float32 s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) - vx_load(src[-1] + i + 3*v_float32::nlanes), k1, d4); + v_float32 s0 = v_muladd(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, d4); + v_float32 s1 = v_muladd(v_sub(vx_load(src[1] + i + VTraits::vlanes()), vx_load(src[-1] + i + VTraits::vlanes())), k1, d4); + v_float32 s2 = v_muladd(v_sub(vx_load(src[1] + i + 2 * VTraits::vlanes()), vx_load(src[-1] + i + 2 * VTraits::vlanes())), k1, d4); + v_float32 s3 = v_muladd(v_sub(vx_load(src[1] + i + 3 * VTraits::vlanes()), vx_load(src[-1] + i + 3 * VTraits::vlanes())), k1, d4); for( k 
= 2; k <= ksize2; k++ ) { v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); - s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) - vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); - s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) - vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3); + s0 = v_muladd(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i)), k2, s0); + s1 = v_muladd(v_sub(vx_load(src[k] + i + VTraits::vlanes()), vx_load(src[-k] + i + VTraits::vlanes())), k2, s1); + s2 = v_muladd(v_sub(vx_load(src[k] + i + 2 * VTraits::vlanes()), vx_load(src[-k] + i + 2 * VTraits::vlanes())), k2, s2); + s3 = v_muladd(v_sub(vx_load(src[k] + i + 3 * VTraits::vlanes()), vx_load(src[-k] + i + 3 * VTraits::vlanes())), k2, s3); } v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3))); + v_store(dst + i + VTraits::vlanes(), v_pack(v_round(s2), v_round(s3))); } - if( i <= width - v_int16::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); - v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); + v_float32 s0 = v_muladd(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, d4); + v_float32 s1 = v_muladd(v_sub(vx_load(src[1] + i + VTraits::vlanes()), vx_load(src[-1] + i + VTraits::vlanes())), k1, d4); for( k = 2; k <= ksize2; k++ ) { v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); + s0 = v_muladd(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i)), k2, s0); + s1 = v_muladd(v_sub(vx_load(src[k] + i + VTraits::vlanes()), vx_load(src[-k] + i + VTraits::vlanes())), k2, s1); } v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_int16::nlanes; + i += VTraits::vlanes(); } - if( i <= width - v_float32::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); + v_float32 s0 = v_muladd(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, d4); for( k = 2; k <= ksize2; k++ ) - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); + s0 = v_muladd(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i)), vx_setall_f32(ky[k]), s0); v_pack_store(dst + i, v_round(s0)); - i += v_float32::nlanes; + i += VTraits::vlanes(); } } @@ -1682,52 +1679,52 @@ struct RowVec_32f } #endif v_float32 k0 = vx_setall_f32(_kx[0]); - for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) + for( ; i <= width - 4*VTraits::vlanes(); i += 4*VTraits::vlanes() ) { const float* src = src0 + i; - v_float32 s0 = vx_load(src) * k0; - v_float32 s1 = vx_load(src + v_float32::nlanes) * k0; - v_float32 s2 = vx_load(src + 2*v_float32::nlanes) * k0; - v_float32 s3 = vx_load(src + 3*v_float32::nlanes) * k0; + v_float32 s0 = v_mul(vx_load(src), k0); + v_float32 s1 = v_mul(vx_load(src + VTraits::vlanes()), k0); + v_float32 s2 = v_mul(vx_load(src + 2 * VTraits::vlanes()), k0); + v_float32 s3 = v_mul(vx_load(src + 3 * VTraits::vlanes()), k0); src += cn; for( k = 1; k < _ksize; k++, src += cn ) { v_float32 k1 = vx_setall_f32(_kx[k]); s0 = v_muladd(vx_load(src), k1, s0); - s1 = v_muladd(vx_load(src + 
v_float32::nlanes), k1, s1); - s2 = v_muladd(vx_load(src + 2*v_float32::nlanes), k1, s2); - s3 = v_muladd(vx_load(src + 3*v_float32::nlanes), k1, s3); + s1 = v_muladd(vx_load(src + VTraits::vlanes()), k1, s1); + s2 = v_muladd(vx_load(src + 2*VTraits::vlanes()), k1, s2); + s3 = v_muladd(vx_load(src + 3*VTraits::vlanes()), k1, s3); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); + v_store(dst + i + VTraits::vlanes(), s1); + v_store(dst + i + 2*VTraits::vlanes(), s2); + v_store(dst + i + 3*VTraits::vlanes(), s3); } - if( i <= width - 2*v_float32::nlanes ) + if( i <= width - 2*VTraits::vlanes() ) { const float* src = src0 + i; - v_float32 s0 = vx_load(src) * k0; - v_float32 s1 = vx_load(src + v_float32::nlanes) * k0; + v_float32 s0 = v_mul(vx_load(src), k0); + v_float32 s1 = v_mul(vx_load(src + VTraits::vlanes()), k0); src += cn; for( k = 1; k < _ksize; k++, src += cn ) { v_float32 k1 = vx_setall_f32(_kx[k]); s0 = v_muladd(vx_load(src), k1, s0); - s1 = v_muladd(vx_load(src + v_float32::nlanes), k1, s1); + s1 = v_muladd(vx_load(src + VTraits::vlanes()), k1, s1); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - i += 2*v_float32::nlanes; + v_store(dst + i + VTraits::vlanes(), s1); + i += 2*VTraits::vlanes(); } - if( i <= width - v_float32::nlanes ) + if( i <= width - VTraits::vlanes() ) { const float* src = src0 + i; - v_float32 s0 = vx_load(src) * k0; + v_float32 s0 = v_mul(vx_load(src), k0); src += cn; for( k = 1; k < _ksize; k++, src += cn ) s0 = v_muladd(vx_load(src), vx_setall_f32(_kx[k]), s0); v_store(dst + i, s0); - i += v_float32::nlanes; + i += VTraits::vlanes(); } return i; } @@ -1806,28 +1803,28 @@ struct SymmRowSmallVec_32f { #if CV_FMA3 || CV_AVX2 v_float32 k0 = vx_setall_f32(kx[0]); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - cn) + vx_load(src + cn))); + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) + v_store(dst + i, v_muladd(vx_load(src), k0, v_add(vx_load(src - cn), vx_load(src + cn)))); #else if( kx[0] > 0 ) - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_float32 x = vx_load(src); - v_store(dst + i, vx_load(src - cn) + vx_load(src + cn) + (x + x)); + v_store(dst + i, v_add(vx_load(src - cn), vx_load(src + cn), x , x)); } else - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_float32 x = vx_load(src); - v_store(dst + i, vx_load(src - cn) + vx_load(src + cn) - (x + x)); + v_store(dst + i, v_sub(v_add(vx_load(src - cn), vx_load(src + cn)), v_add(x, x))); } #endif } else { v_float32 k0 = vx_setall_f32(kx[0]), k1 = vx_setall_f32(kx[1]); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(src), k0, (vx_load(src - cn) + vx_load(src + cn)) * k1)); + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) + v_store(dst + i, v_muladd(vx_load(src), k0, v_mul(v_add(vx_load(src - cn), vx_load(src + cn)), k1))); } } else if( _ksize == 5 ) @@ -1836,21 +1833,21 @@ struct SymmRowSmallVec_32f { #if CV_FMA3 || CV_AVX2 v_float32 k0 = 
vx_setall_f32(-2); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - 2*cn) + vx_load(src + 2*cn))); + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) + v_store(dst + i, v_muladd(vx_load(src), k0, v_add(vx_load(src - 2 * cn), vx_load(src + 2 * cn)))); #else - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_float32 x = vx_load(src); - v_store(dst + i, vx_load(src - 2*cn) + vx_load(src + 2*cn) - (x + x)); + v_store(dst + i, v_sub(v_add(vx_load(src - 2*cn), vx_load(src + 2*cn)), v_add(x, x))); } #endif } else { v_float32 k0 = vx_setall_f32(kx[0]), k1 = vx_setall_f32(kx[1]), k2 = vx_setall_f32(kx[2]); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(src + 2*cn) + vx_load(src - 2*cn), k2, v_muladd(vx_load(src), k0, (vx_load(src - cn) + vx_load(src + cn)) * k1))); + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) + v_store(dst + i, v_muladd(v_add(vx_load(src + 2 * cn), vx_load(src - 2 * cn)), k2, v_muladd(vx_load(src), k0, v_mul(v_add(vx_load(src - cn), vx_load(src + cn)), k1)))); } } } @@ -1859,20 +1856,20 @@ struct SymmRowSmallVec_32f if( _ksize == 3 ) { if( kx[0] == 0 && kx[1] == 1 ) - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, vx_load(src + cn) - vx_load(src - cn)); + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) + v_store(dst + i, v_sub(vx_load(src + cn), vx_load(src - cn))); else { v_float32 k1 = vx_setall_f32(kx[1]); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, (vx_load(src + cn) - vx_load(src - cn)) * k1); + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) + v_store(dst + i, v_mul(v_sub(vx_load(src + cn), vx_load(src - cn)), k1)); } } else if( _ksize == 5 ) { v_float32 k1 = vx_setall_f32(kx[1]), k2 = vx_setall_f32(kx[2]); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(src + 2*cn) - vx_load(src - 2*cn), k2, (vx_load(src + cn) - vx_load(src - cn)) * k1)); + for ( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) + v_store(dst + i, v_muladd(v_sub(vx_load(src + 2 * cn), vx_load(src - 2 * cn)), k2, v_mul(v_sub(vx_load(src + cn), vx_load(src - cn)), k1))); } } return i; @@ -1961,46 +1958,46 @@ struct SymmColumnVec_32f #endif const v_float32 d4 = vx_setall_f32(delta); const v_float32 k0 = vx_setall_f32(ky[0]); - for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) + for( ; i <= width - 4*VTraits::vlanes(); i += 4*VTraits::vlanes() ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); - v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), k0, d4); - v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), k0, d4); + v_float32 s1 = v_muladd(vx_load(src[0] + i + VTraits::vlanes()), k0, d4); + v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*VTraits::vlanes()), k0, d4); + v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*VTraits::vlanes()), k0, d4); for( k = 1; k 
<= ksize2; k++ ) { v_float32 k1 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1); - s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) + vx_load(src[-k] + i + 2*v_float32::nlanes), k1, s2); - s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) + vx_load(src[-k] + i + 3*v_float32::nlanes), k1, s3); + s0 = v_muladd(v_add(vx_load(src[k] + i), vx_load(src[-k] + i)), k1, s0); + s1 = v_muladd(v_add(vx_load(src[k] + i + VTraits::vlanes()), vx_load(src[-k] + i + VTraits::vlanes())), k1, s1); + s2 = v_muladd(v_add(vx_load(src[k] + i + 2 * VTraits::vlanes()), vx_load(src[-k] + i + 2 * VTraits::vlanes())), k1, s2); + s3 = v_muladd(v_add(vx_load(src[k] + i + 3 * VTraits::vlanes()), vx_load(src[-k] + i + 3 * VTraits::vlanes())), k1, s3); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); + v_store(dst + i + VTraits::vlanes(), s1); + v_store(dst + i + 2*VTraits::vlanes(), s2); + v_store(dst + i + 3*VTraits::vlanes(), s3); } - if( i <= width - 2*v_float32::nlanes ) + if( i <= width - 2*VTraits::vlanes() ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); + v_float32 s1 = v_muladd(vx_load(src[0] + i + VTraits::vlanes()), k0, d4); for( k = 1; k <= ksize2; k++ ) { v_float32 k1 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1); + s0 = v_muladd(v_add(vx_load(src[k] + i), vx_load(src[-k] + i)), k1, s0); + s1 = v_muladd(v_add(vx_load(src[k] + i + VTraits::vlanes()), vx_load(src[-k] + i + VTraits::vlanes())), k1, s1); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - i += 2*v_float32::nlanes; + v_store(dst + i + VTraits::vlanes(), s1); + i += 2*VTraits::vlanes(); } - if( i <= width - v_float32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); for( k = 1; k <= ksize2; k++ ) - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); + s0 = v_muladd(v_add(vx_load(src[k] + i), vx_load(src[-k] + i)), vx_setall_f32(ky[k]), s0); v_store(dst + i, s0); - i += v_float32::nlanes; + i += VTraits::vlanes(); } } else @@ -2042,46 +2039,46 @@ struct SymmColumnVec_32f #endif const v_float32 d4 = vx_setall_f32(delta); const v_float32 k1 = vx_setall_f32(ky[1]); - for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) + for( ; i <= width - 4*VTraits::vlanes(); i += 4*VTraits::vlanes() ) { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); - v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); - v_float32 s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) - vx_load(src[-1] + i + 2*v_float32::nlanes), k1, d4); - v_float32 s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) - vx_load(src[-1] + i + 3*v_float32::nlanes), k1, d4); + v_float32 s0 = v_muladd(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, d4); + v_float32 s1 = v_muladd(v_sub(vx_load(src[1] + i + VTraits::vlanes()), vx_load(src[-1] + i + VTraits::vlanes())), k1, d4); + v_float32 s2 = v_muladd(v_sub(vx_load(src[1] + i + 2 * VTraits::vlanes()), vx_load(src[-1] + i 
+ 2 * VTraits::vlanes())), k1, d4); + v_float32 s3 = v_muladd(v_sub(vx_load(src[1] + i + 3 * VTraits::vlanes()), vx_load(src[-1] + i + 3 * VTraits::vlanes())), k1, d4); for( k = 2; k <= ksize2; k++ ) { v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); - s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) - vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); - s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) - vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3); + s0 = v_muladd(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i)), k2, s0); + s1 = v_muladd(v_sub(vx_load(src[k] + i + VTraits::vlanes()), vx_load(src[-k] + i + VTraits::vlanes())), k2, s1); + s2 = v_muladd(v_sub(vx_load(src[k] + i + 2 * VTraits::vlanes()), vx_load(src[-k] + i + 2 * VTraits::vlanes())), k2, s2); + s3 = v_muladd(v_sub(vx_load(src[k] + i + 3 * VTraits::vlanes()), vx_load(src[-k] + i + 3 * VTraits::vlanes())), k2, s3); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); + v_store(dst + i + VTraits::vlanes(), s1); + v_store(dst + i + 2*VTraits::vlanes(), s2); + v_store(dst + i + 3*VTraits::vlanes(), s3); } - if( i <= width - 2*v_float32::nlanes ) + if( i <= width - 2*VTraits::vlanes() ) { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); - v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); + v_float32 s0 = v_muladd(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, d4); + v_float32 s1 = v_muladd(v_sub(vx_load(src[1] + i + VTraits::vlanes()), vx_load(src[-1] + i + VTraits::vlanes())), k1, d4); for( k = 2; k <= ksize2; k++ ) { v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); + s0 = v_muladd(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i)), k2, s0); + s1 = v_muladd(v_sub(vx_load(src[k] + i + VTraits::vlanes()), vx_load(src[-k] + i + VTraits::vlanes())), k2, s1); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - i += 2*v_float32::nlanes; + v_store(dst + i + VTraits::vlanes(), s1); + i += 2*VTraits::vlanes(); } - if( i <= width - v_float32::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); + v_float32 s0 = v_muladd(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, d4); for( k = 2; k <= ksize2; k++ ) - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); + s0 = v_muladd(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i)), vx_setall_f32(ky[k]), s0); v_store(dst + i, s0); - i += v_float32::nlanes; + i += VTraits::vlanes(); } } return i; @@ -2123,28 +2120,28 @@ struct SymmColumnSmallVec_32f { #if CV_FMA3 || CV_AVX2 v_float32 k0 = vx_setall_f32(ky[0]); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(S1 + i), k0, vx_load(S0 + i) + vx_load(S2 + i) + d4)); + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) + v_store(dst + i, v_muladd(vx_load(S1 + i), k0, v_add(v_add(vx_load(S0 + i), vx_load(S2 + i)), d4))); #else if(ky[0] > 0) - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + for( 
; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) { v_float32 x = vx_load(S1 + i); - v_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 + (x + x)); + v_store(dst + i, v_add(vx_load(S0 + i), vx_load(S2 + i), d4, x, x)); } else - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) { v_float32 x = vx_load(S1 + i); - v_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 - (x + x)); + v_store(dst + i, v_sub(v_add(vx_load(S0 + i), vx_load(S2 + i), d4), v_add(x, x))); } #endif } else { v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4))); + for ( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) + v_store(dst + i, v_muladd(v_add(vx_load(S0 + i), vx_load(S2 + i)), k1, v_muladd(vx_load(S1 + i), k0, d4))); } } else @@ -2153,14 +2150,14 @@ struct SymmColumnSmallVec_32f { if( ky[1] < 0 ) std::swap(S0, S2); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) - v_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + d4); + for ( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) + v_store(dst + i, v_add(v_sub(vx_load(S2 + i), vx_load(S0 + i)), d4)); } else { v_float32 k1 = vx_setall_f32(ky[1]); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(S2 + i) - vx_load(S0 + i), k1, d4)); + for ( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) + v_store(dst + i, v_muladd(v_sub(vx_load(S2 + i), vx_load(S0 + i)), k1, d4)); } } return i; @@ -2199,7 +2196,7 @@ struct FilterVec_8u v_float32 d4 = vx_setall_f32(delta); v_float32 f0 = vx_setall_f32(kf[0]); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) { v_uint16 xl, xh; v_expand(vx_load(src[0] + i), xl, xh); @@ -2223,7 +2220,7 @@ struct FilterVec_8u } v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3)))); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_uint32 x0, x1; v_expand(vx_load_expand(src[0] + i), x0, x1); @@ -2237,21 +2234,21 @@ struct FilterVec_8u s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1); } v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_uint16::nlanes; + i += VTraits::vlanes(); } #if CV_SIMD_WIDTH > 16 - while( i <= width - v_int32x4::nlanes ) + while( i <= width - 4 /*v_int32x4::nlanes*/ ) #else - if( i <= width - v_int32x4::nlanes ) + if( i <= width - v_int32::nlanes ) #endif { - v_float32x4 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[0] + i))), v_setall_f32(kf[0]), v_setall_f32(delta)); + v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[0] + i))), vx_setall_f32(kf[0]), vx_setall_f32(delta)); for( k = 1; k < nz; k++ ) - s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[k] + i))), v_setall_f32(kf[k]), s0); - v_int32x4 s32 = v_round(s0); - v_int16x8 s16 = v_pack(s32, s32); - *(unaligned_int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0(); - i += v_int32x4::nlanes; + s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[k] + i))), vx_setall_f32(kf[k]), s0); + v_int32 s32 = v_round(s0); + v_int16 s16 = v_pack(s32, s32); + *(unaligned_int*)(dst + i) = v_get0(v_reinterpret_as_s32(v_pack_u(s16, s16))); + i += 4 
/*v_int32x4::nlanes*/ ; } return i; } @@ -2286,7 +2283,7 @@ struct FilterVec_8u16s v_float32 d4 = vx_setall_f32(delta); v_float32 f0 = vx_setall_f32(kf[0]); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) { v_uint16 xl, xh; v_expand(vx_load(src[0] + i), xl, xh); @@ -2304,9 +2301,9 @@ struct FilterVec_8u16s s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xh))), f, s3); } v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3))); + v_store(dst + i + VTraits::vlanes(), v_pack(v_round(s2), v_round(s3))); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_uint16 x = vx_load_expand(src[0] + i); v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f0, d4); @@ -2319,15 +2316,15 @@ struct FilterVec_8u16s s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f, s1); } v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_uint16::nlanes; + i += VTraits::vlanes(); } - if( i <= width - v_int32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[0] + i))), f0, d4); for( k = 1; k < nz; k++ ) s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[k] + i))), vx_setall_f32(kf[k]), s0); v_pack_store(dst + i, v_round(s0)); - i += v_int32::nlanes; + i += VTraits::vlanes(); } return i; } @@ -2360,46 +2357,46 @@ struct FilterVec_32f v_float32 d4 = vx_setall_f32(delta); v_float32 f0 = vx_setall_f32(kf[0]); - for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) + for( ; i <= width - 4*VTraits::vlanes(); i += 4*VTraits::vlanes() ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), f0, d4); - v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), f0, d4); - v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), f0, d4); + v_float32 s1 = v_muladd(vx_load(src[0] + i + VTraits::vlanes()), f0, d4); + v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*VTraits::vlanes()), f0, d4); + v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*VTraits::vlanes()), f0, d4); for( k = 1; k < nz; k++ ) { v_float32 f1 = vx_setall_f32(kf[k]); s0 = v_muladd(vx_load(src[k] + i), f1, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes), f1, s1); - s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes), f1, s2); - s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes), f1, s3); + s1 = v_muladd(vx_load(src[k] + i + VTraits::vlanes()), f1, s1); + s2 = v_muladd(vx_load(src[k] + i + 2*VTraits::vlanes()), f1, s2); + s3 = v_muladd(vx_load(src[k] + i + 3*VTraits::vlanes()), f1, s3); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); + v_store(dst + i + VTraits::vlanes(), s1); + v_store(dst + i + 2*VTraits::vlanes(), s2); + v_store(dst + i + 3*VTraits::vlanes(), s3); } - if( i <= width - 2*v_float32::nlanes ) + if( i <= width - 2*VTraits::vlanes() ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), f0, d4); + v_float32 s1 = v_muladd(vx_load(src[0] + i + VTraits::vlanes()), f0, d4); for( k = 1; k < nz; k++ ) { v_float32 f1 = vx_setall_f32(kf[k]); s0 = v_muladd(vx_load(src[k] + i), f1, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes), f1, s1); + 
s1 = v_muladd(vx_load(src[k] + i + VTraits::vlanes()), f1, s1); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - i += 2*v_float32::nlanes; + v_store(dst + i + VTraits::vlanes(), s1); + i += 2*VTraits::vlanes(); } - if( i <= width - v_float32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); for( k = 1; k < nz; k++ ) s0 = v_muladd(vx_load(src[k] + i), vx_setall_f32(kf[k]), s0); v_store(dst + i, s0); - i += v_float32::nlanes; + i += VTraits::vlanes(); } return i; } diff --git a/modules/imgproc/src/hough.cpp b/modules/imgproc/src/hough.cpp index 96ba338f30..9961e9aace 100644 --- a/modules/imgproc/src/hough.cpp +++ b/modules/imgproc/src/hough.cpp @@ -1156,13 +1156,13 @@ public: for(; x < numCols; ++x ) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) { v_uint8 v_zero = vx_setzero_u8(); - for(; x <= numCols - 2*v_uint8::nlanes; x += 2*v_uint8::nlanes) { - v_uint8 v_edge1 = (vx_load(edgeData + x ) != v_zero); - v_uint8 v_edge2 = (vx_load(edgeData + x + v_uint8::nlanes) != v_zero); + for(; x <= numCols - 2*VTraits::vlanes(); x += 2*VTraits::vlanes()) { + v_uint8 v_edge1 = (v_ne(vx_load(edgeData + x), v_zero)); + v_uint8 v_edge2 = (v_ne(vx_load(edgeData + x + VTraits::vlanes()), v_zero)); if(v_check_any(v_edge1)) { @@ -1172,7 +1172,7 @@ public: if(v_check_any(v_edge2)) { - x += v_uint8::nlanes + v_scan_forward(v_edge2); + x += VTraits::vlanes() + v_scan_forward(v_edge2); goto _next_step; } } @@ -1183,7 +1183,7 @@ public: if(x == numCols) continue; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) _next_step: #endif float vx, vy; @@ -1514,7 +1514,7 @@ inline int HoughCircleEstimateRadiusInvoker::filterCircles(const Po int nzCount = 0; const Point* nz_ = &nz[0]; int j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) { const v_float32 v_minRadius2 = vx_setall_f32(minRadius2); const v_float32 v_maxRadius2 = vx_setall_f32(maxRadius2); @@ -1522,9 +1522,9 @@ inline int HoughCircleEstimateRadiusInvoker::filterCircles(const Po v_float32 v_curCenterX = vx_setall_f32(curCenter.x); v_float32 v_curCenterY = vx_setall_f32(curCenter.y); - float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[v_float32::nlanes]; - int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[v_int32::nlanes]; - for(; j <= nzSz - v_float32::nlanes; j += v_float32::nlanes) + float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[VTraits::max_nlanes]; + int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[VTraits::max_nlanes]; + for(; j <= nzSz - VTraits::vlanes(); j += VTraits::vlanes()) { v_float32 v_nzX, v_nzY; v_load_deinterleave((const float*)&nz_[j], v_nzX, v_nzY); // FIXIT use proper datatype @@ -1532,16 +1532,16 @@ inline int HoughCircleEstimateRadiusInvoker::filterCircles(const Po v_float32 v_x = v_cvt_f32(v_reinterpret_as_s32(v_nzX)); v_float32 v_y = v_cvt_f32(v_reinterpret_as_s32(v_nzY)); - v_float32 v_dx = v_x - v_curCenterX; - v_float32 v_dy = v_y - v_curCenterY; + v_float32 v_dx = v_sub(v_x, v_curCenterX); + v_float32 v_dy = v_sub(v_y, v_curCenterY); - v_float32 v_r2 = (v_dx * v_dx) + (v_dy * v_dy); - v_float32 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2); + v_float32 v_r2 = v_add(v_mul(v_dx, v_dx), v_mul(v_dy, v_dy)); + v_float32 vmask = v_and(v_le(v_minRadius2, v_r2), v_le(v_r2, v_maxRadius2)); if (v_check_any(vmask)) { v_store_aligned(rmask, v_reinterpret_as_s32(vmask)); v_store_aligned(rbuf, v_r2); - for (int i = 0; i < v_int32::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) if (rmask[i]) ddata[nzCount++] = rbuf[i]; } } @@ -1573,13 +1573,13 @@ inline int 
HoughCircleEstimateRadiusInvoker::filterCircles(const Poi const Range xOuter = Range(std::max(int(curCenter.x - rOuter), 0), std::min(int(curCenter.x + rOuter), positions.cols)); const Range yOuter = Range(std::max(int(curCenter.y - rOuter), 0), std::min(int(curCenter.y + rOuter), positions.rows)); -#if CV_SIMD - float v_seq[v_float32::nlanes]; - for (int i = 0; i < v_float32::nlanes; ++i) +#if (CV_SIMD || CV_SIMD_SCALABLE) + float v_seq[VTraits::max_nlanes]; + for (int i = 0; i < VTraits::vlanes(); ++i) v_seq[i] = (float)i; const v_float32 v_minRadius2 = vx_setall_f32(minRadius2); const v_float32 v_maxRadius2 = vx_setall_f32(maxRadius2); - const v_float32 v_curCenterX_0123 = vx_setall_f32(curCenter.x) - vx_load(v_seq); + const v_float32 v_curCenterX_0123 = v_sub(vx_setall_f32(curCenter.x), vx_load(v_seq)); #endif for (int y = yOuter.start; y < yOuter.end; y++) @@ -1589,27 +1589,27 @@ inline int HoughCircleEstimateRadiusInvoker::filterCircles(const Poi float dy2 = dy * dy; int x = xOuter.start; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) { const v_float32 v_dy2 = vx_setall_f32(dy2); const v_uint32 v_zero_u32 = vx_setall_u32(0); - float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[v_float32::nlanes]; - int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[v_int32::nlanes]; - for (; x <= xOuter.end - v_float32::nlanes; x += v_float32::nlanes) + float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[VTraits::max_nlanes]; + int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[VTraits::max_nlanes]; + for (; x <= xOuter.end - VTraits::vlanes(); x += VTraits::vlanes()) { v_uint32 v_mask = vx_load_expand_q(ptr + x); - v_mask = v_mask != v_zero_u32; + v_mask = v_ne(v_mask, v_zero_u32); v_float32 v_x = v_cvt_f32(vx_setall_s32(x)); - v_float32 v_dx = v_x - v_curCenterX_0123; + v_float32 v_dx = v_sub(v_x, v_curCenterX_0123); - v_float32 v_r2 = (v_dx * v_dx) + v_dy2; - v_float32 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2) & v_reinterpret_as_f32(v_mask); + v_float32 v_r2 = v_add(v_mul(v_dx, v_dx), v_dy2); + v_float32 vmask = v_and(v_and(v_le(v_minRadius2, v_r2), v_le(v_r2, v_maxRadius2)), v_reinterpret_as_f32(v_mask)); if (v_check_any(vmask)) { v_store_aligned(rmask, v_reinterpret_as_s32(vmask)); v_store_aligned(rbuf, v_r2); - for (int i = 0; i < v_int32::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) if (rmask[i]) ddata[nzCount++] = rbuf[i]; } } diff --git a/modules/imgproc/src/stackblur.cpp b/modules/imgproc/src/stackblur.cpp index 5d60a1d365..6becbe5c41 100644 --- a/modules/imgproc/src/stackblur.cpp +++ b/modules/imgproc/src/stackblur.cpp @@ -88,7 +88,7 @@ static unsigned char const stackblurShr[255] = namespace cv{ -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) template inline int opRow(const T* , T* , const std::vector& , const float , const int radius, const int CN, const int ) { @@ -107,7 +107,7 @@ inline int opRow(const uchar* srcPtr, uchar* dstPtr, const std::vector::vlanes(); if (kernelSize == 3) { @@ -126,10 +126,10 @@ inline int opRow(const uchar* srcPtr, uchar* dstPtr, const std::vector>shrValTab; - y01 = (y01 * v_mulVal)>>shrValTab; - y10 = (y10 * v_mulVal)>>shrValTab; - y11 = (y11 * v_mulVal)>>shrValTab; + y00 = v_shr(v_mul(y00, v_mulVal), shrValTab); + y01 = v_shr(v_mul(y01, v_mulVal), shrValTab); + y10 = v_shr(v_mul(y10, v_mulVal), shrValTab); + y11 = v_shr(v_mul(y11, v_mulVal), shrValTab); v_store(dstPtr + i, v_pack(v_pack(y00, y01), v_pack(y10, y11))); } @@ -159,12 +159,12 @@ inline int opRow(const uchar* srcPtr, uchar* dstPtr, const std::vector(const uchar* srcPtr, uchar* dstPtr, const 
std::vector>shrValTab; - s1 = (s1 * v_mulVal)>>shrValTab; - s2 = (s2 * v_mulVal)>>shrValTab; - s3 = (s3 * v_mulVal)>>shrValTab; + s0 = v_shr(v_mul(s0, v_mulVal), shrValTab); + s1 = v_shr(v_mul(s1, v_mulVal), shrValTab); + s2 = v_shr(v_mul(s2, v_mulVal), shrValTab); + s3 = v_shr(v_mul(s3, v_mulVal), shrValTab); v_store(dstPtr + i, v_pack(v_reinterpret_as_u16(v_pack(s0, s1)), v_reinterpret_as_u16(v_pack(s2, s3)))); } @@ -205,7 +205,7 @@ inline int opRow(const ushort* srcPtr, ushort* dstPtr, const std::vector const int mulValTab= stackblurMul[radius]; const int shrValTab= stackblurShr[radius]; - const int VEC_LINE = v_uint16::nlanes; + const int VEC_LINE = VTraits::vlanes(); v_uint32 v_mulVal = vx_setall_u32(mulValTab); if (kernelSize == 3) @@ -220,7 +220,7 @@ inline int opRow(const ushort* srcPtr, ushort* dstPtr, const std::vector x1l = v_add(v_add(x1l, x1l), v_add(x0l, x2l)); x1h = v_add(v_add(x1h, x1h), v_add(x0h, x2h)); - v_store(dstPtr + i, v_pack((x1l * v_mulVal)>>shrValTab, (x1h * v_mulVal)>>shrValTab)); + v_store(dstPtr + i, v_pack(v_shr(v_mul(x1l, v_mulVal), shrValTab), v_shr(v_mul(x1h, v_mulVal), shrValTab))); } } else @@ -243,25 +243,25 @@ inline int opRow(const ushort* srcPtr, ushort* dstPtr, const std::vector v_uint16 k2 = vx_setall_u16(kx[k + 1]); v_uint32 y0, y1; - v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1); - s0 += y0; - s1 += y1; - v_mul_expand(vx_load(srcPtr - j - CN) + vx_load(srcPtr + j + CN), k2, y0, y1); - s0 += y0; - s1 += y1; + v_mul_expand(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1, y0, y1); + s0 = v_add(s0, y0); + s1 = v_add(s1, y1); + v_mul_expand(v_add(vx_load(srcPtr - j - CN), vx_load(srcPtr + j + CN)), k2, y0, y1); + s0 = v_add(s0, y0); + s1 = v_add(s1, y1); } if( k < kernelSize / 2 + 1 ) { v_uint16 k1 = vx_setall_u16(kx[k]); v_uint32 y0, y1; - v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1); - s0 += y0; - s1 += y1; + v_mul_expand(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1, y0, y1); + s0 = v_add(s0, y0); + s1 = v_add(s1, y1); } - s0 = (s0 * v_mulVal)>>shrValTab; - s1 = (s1 * v_mulVal)>>shrValTab; + s0 = v_shr(v_mul(s0, v_mulVal), shrValTab); + s1 = v_shr(v_mul(s1, v_mulVal), shrValTab); v_store(dstPtr + i, v_pack(s0, s1)); } @@ -282,7 +282,7 @@ inline int opRow(const short* srcPtr, short* dstPtr, const std::vector::vlanes(); v_int32 v_mulVal = vx_setall_s32(mulValTab); if (kernelSize == 3) @@ -297,7 +297,7 @@ inline int opRow(const short* srcPtr, short* dstPtr, const std::vector>shrValTab, (x1h * v_mulVal)>>shrValTab)); + v_store(dstPtr + i, v_pack(v_shr(v_mul(x1l, v_mulVal), shrValTab), v_shr(v_mul(x1h, v_mulVal), shrValTab))); } } else @@ -320,24 +320,24 @@ inline int opRow(const short* srcPtr, short* dstPtr, const std::vector>shrValTab; - s1 = (s1 * v_mulVal)>>shrValTab; + s0 = v_shr(v_mul(s0, v_mulVal), shrValTab); + s1 = v_shr(v_mul(s1, v_mulVal), shrValTab); v_store(dstPtr + i, v_pack(s0, s1)); } @@ -352,7 +352,7 @@ inline int opRow(const float* srcPtr, float* dstPtr, const std::vector::vlanes(); const int VEC_LINE4 = VEC_LINE * 4; if (kernelSize == 3) @@ -364,22 +364,22 @@ inline int opRow(const float* srcPtr, float* dstPtr, const std::vector(const float* srcPtr, float* dstPtr, const std::vector(const float* srcPtr, float* dstPtr, const std::vector inline int opComputeDiff(const uchar*& srcPtr, int*& diff0, const int w, const int CNR1) { int index = 0; - const int VEC_LINE_8 = v_uint8::nlanes; - const int VEC_LINE_32 = v_int32::nlanes; + const int VEC_LINE_8 = VTraits::vlanes(); + const int 
VEC_LINE_32 = VTraits::vlanes(); for (; index <= w - VEC_LINE_8; index += VEC_LINE_8, diff0+=VEC_LINE_8, srcPtr+=VEC_LINE_8) { v_uint16 x0l, x0h, x1l, x1h; @@ -435,8 +435,8 @@ inline int opComputeDiff(const uchar*& srcPtr, int*& diff0, const in v_expand(vx_load(srcPtr), x1l, x1h); v_int32 y0, y1, y2, y3; - v_expand(v_reinterpret_as_s16(x0l) - v_reinterpret_as_s16(x1l), y0, y1); - v_expand(v_reinterpret_as_s16(x0h) - v_reinterpret_as_s16(x1h), y2, y3); + v_expand(v_sub(v_reinterpret_as_s16(x0l), v_reinterpret_as_s16(x1l)), y0, y1); + v_expand(v_sub(v_reinterpret_as_s16(x0h), v_reinterpret_as_s16(x1h)), y2, y3); v_store(diff0, y0); v_store(diff0 + VEC_LINE_32, y1); @@ -517,7 +517,7 @@ public: // middle int wc = radius * CN; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) wc = opRow(srcPtr, dstPtr, kVec, mulVal, radius, CN, widthCN); #endif for (; wc < widthCN; wc++) @@ -586,7 +586,7 @@ public: // middle auto diff0 = diff + radius * CN; int index = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) index = opComputeDiff(srcPtr, diff0, widthCN, CNR1); #endif @@ -688,7 +688,7 @@ private: float mulVal; }; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) template inline int opColumn(const T* , T* , T* , TBuf* , TBuf* , TBuf* , const float , const int , const int , const int , const int , const int ) @@ -703,7 +703,7 @@ inline int opColumn(const float* srcPtr, float* dstPtr, float* sta { int k = 0; v_float32 v_mulVal = vx_setall_f32(mulVal); - const int VEC_LINE = v_float32::nlanes; + const int VEC_LINE = VTraits::vlanes(); const int VEC_LINE4 = 4 * VEC_LINE; auto stackStartPtr = stack + ss * widthLen; @@ -726,20 +726,20 @@ inline int opColumn(const float* srcPtr, float* dstPtr, float* sta v_float32 v_sumIn2 = vx_load(sumIn + VEC_LINE * 2 + k); v_float32 v_sumIn3 = vx_load(sumIn + VEC_LINE * 3+ k); - v_store(dstPtr + k, v_sum0 * v_mulVal); - v_store(dstPtr + VEC_LINE + k, v_sum1 * v_mulVal); - v_store(dstPtr + VEC_LINE * 2 + k, v_sum2 * v_mulVal); - v_store(dstPtr + VEC_LINE * 3 + k, v_sum3 * v_mulVal); + v_store(dstPtr + k, v_mul(v_sum0, v_mulVal)); + v_store(dstPtr + VEC_LINE + k, v_mul(v_sum1, v_mulVal)); + v_store(dstPtr + VEC_LINE * 2 + k, v_mul(v_sum2, v_mulVal)); + v_store(dstPtr + VEC_LINE * 3 + k, v_mul(v_sum3, v_mulVal)); - v_sum0 -= v_sumOut0; - v_sum1 -= v_sumOut1; - v_sum2 -= v_sumOut2; - v_sum3 -= v_sumOut3; + v_sum0 = v_sub(v_sum0, v_sumOut0); + v_sum1 = v_sub(v_sum1, v_sumOut1); + v_sum2 = v_sub(v_sum2, v_sumOut2); + v_sum3 = v_sub(v_sum3, v_sumOut3); - v_sumOut0 -= vx_load(stackStartPtr + k); - v_sumOut1 -= vx_load(stackStartPtr + VEC_LINE + k); - v_sumOut2 -= vx_load(stackStartPtr + VEC_LINE * 2 + k); - v_sumOut3 -= vx_load(stackStartPtr + VEC_LINE * 3 + k); + v_sumOut0 = v_sub(v_sumOut0, vx_load(stackStartPtr + k)); + v_sumOut1 = v_sub(v_sumOut1, vx_load(stackStartPtr + VEC_LINE + k)); + v_sumOut2 = v_sub(v_sumOut2, vx_load(stackStartPtr + VEC_LINE * 2 + k)); + v_sumOut3 = v_sub(v_sumOut3, vx_load(stackStartPtr + VEC_LINE * 3 + k)); v_float32 v_srcPtr0 = vx_load(srcPtr + k); v_float32 v_srcPtr1 = vx_load(srcPtr + VEC_LINE + k); @@ -751,35 +751,35 @@ inline int opColumn(const float* srcPtr, float* dstPtr, float* sta v_store(stackStartPtr + VEC_LINE * 2 + k, v_srcPtr2); v_store(stackStartPtr + VEC_LINE * 3 + k, v_srcPtr3); - v_sumIn0 += v_srcPtr0; - v_sumIn1 += v_srcPtr1; - v_sumIn2 += v_srcPtr2; - v_sumIn3 += v_srcPtr3; + v_sumIn0 = v_add(v_sumIn0, v_srcPtr0); + v_sumIn1 = v_add(v_sumIn1, v_srcPtr1); + v_sumIn2 = v_add(v_sumIn2, v_srcPtr2); + v_sumIn3 = v_add(v_sumIn3, v_srcPtr3); 
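/* Illustrative sketch (not part of the patch): the conversion pattern used throughout
   these stackblur/filter hunks, i.e. operator-based universal intrinsics rewritten as
   named calls (v_add/v_sub/v_mul) and v_float32::nlanes replaced by the runtime
   VTraits<v_float32>::vlanes(), so the same kernel compiles for fixed-size SIMD and
   scalable RVV. Assumes <opencv2/core/hal/intrin.hpp>; axpy_f32 is a hypothetical
   helper used only for illustration.

   #include <opencv2/core/hal/intrin.hpp>

   static void axpy_f32(const float* a, const float* b, float* dst, int n, float scale)
   {
   #if (CV_SIMD || CV_SIMD_SCALABLE)
       using namespace cv;
       const int VEC_LINE = VTraits<v_float32>::vlanes(); // lane count, known only at run time on RVV
       const v_float32 v_scale = vx_setall_f32(scale);
       int i = 0;
       for (; i <= n - VEC_LINE; i += VEC_LINE)            // was: (vx_load(a+i) + vx_load(b+i)) * v_scale
           v_store(dst + i, v_mul(v_add(vx_load(a + i), vx_load(b + i)), v_scale));
       for (; i < n; i++)                                  // scalar tail
           dst[i] = (a[i] + b[i]) * scale;
   #else
       for (int i = 0; i < n; i++)
           dst[i] = (a[i] + b[i]) * scale;
   #endif
   }
*/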
- v_store(sum + k, v_sum0 + v_sumIn0); - v_store(sum + VEC_LINE + k, v_sum1 + v_sumIn1); - v_store(sum + VEC_LINE * 2 + k, v_sum2 + v_sumIn2); - v_store(sum + VEC_LINE * 3 + k, v_sum3 + v_sumIn3); + v_store(sum + k, v_add(v_sum0, v_sumIn0)); + v_store(sum + VEC_LINE + k, v_add(v_sum1, v_sumIn1)); + v_store(sum + VEC_LINE * 2 + k, v_add(v_sum2, v_sumIn2)); + v_store(sum + VEC_LINE * 3 + k, v_add(v_sum3, v_sumIn3)); v_srcPtr0 = vx_load(stackSp1Ptr + k); v_srcPtr1 = vx_load(stackSp1Ptr + VEC_LINE + k); v_srcPtr2 = vx_load(stackSp1Ptr + VEC_LINE * 2 + k); v_srcPtr3 = vx_load(stackSp1Ptr + VEC_LINE * 3 + k); - v_sumOut0 += v_srcPtr0; - v_sumOut1 += v_srcPtr1; - v_sumOut2 += v_srcPtr2; - v_sumOut3 += v_srcPtr3; + v_sumOut0 = v_add(v_sumOut0, v_srcPtr0); + v_sumOut1 = v_add(v_sumOut1, v_srcPtr1); + v_sumOut2 = v_add(v_sumOut2, v_srcPtr2); + v_sumOut3 = v_add(v_sumOut3, v_srcPtr3); v_store(sumOut + k, v_sumOut0); v_store(sumOut + VEC_LINE + k, v_sumOut1); v_store(sumOut + VEC_LINE * 2 + k, v_sumOut2); v_store(sumOut + VEC_LINE * 3 + k, v_sumOut3); - v_sumIn0 -= v_srcPtr0; - v_sumIn1 -= v_srcPtr1; - v_sumIn2 -= v_srcPtr2; - v_sumIn3 -= v_srcPtr3; + v_sumIn0 = v_sub(v_sumIn0, v_srcPtr0); + v_sumIn1 = v_sub(v_sumIn1, v_srcPtr1); + v_sumIn2 = v_sub(v_sumIn2, v_srcPtr2); + v_sumIn3 = v_sub(v_sumIn3, v_srcPtr3); v_store(sumIn + k, v_sumIn0); v_store(sumIn + VEC_LINE + k, v_sumIn1); @@ -793,20 +793,20 @@ inline int opColumn(const float* srcPtr, float* dstPtr, float* sta v_float32 v_sumOut = vx_load(sumOut + k); v_float32 v_sumIn = vx_load(sumIn + k); - v_store(dstPtr + k, v_sum * v_mulVal); - v_sum -= v_sumOut; - v_sumOut -= vx_load(stackStartPtr + k); + v_store(dstPtr + k, v_mul(v_sum, v_mulVal)); + v_sum = v_sub(v_sum, v_sumOut); + v_sumOut = v_sub(v_sumOut, vx_load(stackStartPtr + k)); v_float32 v_srcPtr = vx_load(srcPtr + k); v_store(stackStartPtr + k, v_srcPtr); - v_sumIn += v_srcPtr; - v_store(sum + k, v_sum + v_sumIn); + v_sumIn = v_add(v_sumIn, v_srcPtr); + v_store(sum + k, v_add(v_sum, v_sumIn)); v_srcPtr = vx_load(stackSp1Ptr + k); - v_sumOut += v_srcPtr; + v_sumOut = v_add(v_sumOut, v_srcPtr); v_store(sumOut + k, v_sumOut); - v_sumIn -= v_srcPtr; + v_sumIn = v_sub(v_sumIn, v_srcPtr); v_store(sumIn + k, v_sumIn); } return k; @@ -820,8 +820,8 @@ inline int opColumn(const uchar* srcPtr, uchar* dstPtr, uchar* stack int k = 0; if (mulValTab != 0 && shrValTab != 0) { - const int VEC_LINE_8 = v_uint8::nlanes; - const int VEC_LINE_32 = v_int32::nlanes; + const int VEC_LINE_8 = VTraits::vlanes(); + const int VEC_LINE_32 = VTraits::vlanes(); v_int32 v_mulVal = vx_setall_s32(mulValTab); auto stackStartPtr = stack + ss * widthLen; @@ -850,13 +850,13 @@ inline int opColumn(const uchar* srcPtr, uchar* dstPtr, uchar* stack v_store(dstPtr + k, v_pack( - v_reinterpret_as_u16(v_pack((v_sum0 * v_mulVal)>>shrValTab, (v_sum1 * v_mulVal)>>shrValTab)), - v_reinterpret_as_u16(v_pack((v_sum2 * v_mulVal)>>shrValTab, (v_sum3 * v_mulVal)>>shrValTab)))); + v_reinterpret_as_u16(v_pack(v_shr(v_mul(v_sum0, v_mulVal), shrValTab), v_shr(v_mul(v_sum1, v_mulVal), shrValTab))), + v_reinterpret_as_u16(v_pack(v_shr(v_mul(v_sum2, v_mulVal), shrValTab), v_shr(v_mul(v_sum3, v_mulVal), shrValTab))))); - v_sum0 -= v_sumOut0; - v_sum1 -= v_sumOut1; - v_sum2 -= v_sumOut2; - v_sum3 -= v_sumOut3; + v_sum0 = v_sub(v_sum0, v_sumOut0); + v_sum1 = v_sub(v_sum1, v_sumOut1); + v_sum2 = v_sub(v_sum2, v_sumOut2); + v_sum3 = v_sub(v_sum3, v_sumOut3); v_uint16 x0l, x0h; v_int32 v_ss0, v_ss1, v_ss2, v_ss3; @@ -865,10 +865,10 @@ inline int 
opColumn(const uchar* srcPtr, uchar* dstPtr, uchar* stack v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1); v_expand(v_reinterpret_as_s16(x0h), v_ss2, v_ss3); - v_sumOut0 -= v_ss0; - v_sumOut1 -= v_ss1; - v_sumOut2 -= v_ss2; - v_sumOut3 -= v_ss3; + v_sumOut0 = v_sub(v_sumOut0, v_ss0); + v_sumOut1 = v_sub(v_sumOut1, v_ss1); + v_sumOut2 = v_sub(v_sumOut2, v_ss2); + v_sumOut3 = v_sub(v_sumOut3, v_ss3); v_expand(vx_load(srcPtr + k), x0l, x0h); v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1); @@ -876,34 +876,34 @@ inline int opColumn(const uchar* srcPtr, uchar* dstPtr, uchar* stack memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_8 * sizeof (uchar)); - v_sumIn0 += v_ss0; - v_sumIn1 += v_ss1; - v_sumIn2 += v_ss2; - v_sumIn3 += v_ss3; + v_sumIn0 = v_add(v_sumIn0, v_ss0); + v_sumIn1 = v_add(v_sumIn1, v_ss1); + v_sumIn2 = v_add(v_sumIn2, v_ss2); + v_sumIn3 = v_add(v_sumIn3, v_ss3); - v_store(sum + k, v_sum0 + v_sumIn0); - v_store(sum + VEC_LINE_32 + k, v_sum1 + v_sumIn1); - v_store(sum + VEC_LINE_32 * 2 + k, v_sum2 + v_sumIn2); - v_store(sum + VEC_LINE_32 * 3 + k, v_sum3 + v_sumIn3); + v_store(sum + k, v_add(v_sum0, v_sumIn0)); + v_store(sum + VEC_LINE_32 + k, v_add(v_sum1, v_sumIn1)); + v_store(sum + VEC_LINE_32 * 2 + k, v_add(v_sum2, v_sumIn2)); + v_store(sum + VEC_LINE_32 * 3 + k, v_add(v_sum3, v_sumIn3)); v_expand(vx_load(stackSp1Ptr + k), x0l, x0h); v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1); v_expand(v_reinterpret_as_s16(x0h), v_ss2, v_ss3); - v_sumOut0 += v_ss0; - v_sumOut1 += v_ss1; - v_sumOut2 += v_ss2; - v_sumOut3 += v_ss3; + v_sumOut0 = v_add(v_sumOut0, v_ss0); + v_sumOut1 = v_add(v_sumOut1, v_ss1); + v_sumOut2 = v_add(v_sumOut2, v_ss2); + v_sumOut3 = v_add(v_sumOut3, v_ss3); v_store(sumOut + k, v_sumOut0); v_store(sumOut + VEC_LINE_32 + k, v_sumOut1); v_store(sumOut + VEC_LINE_32 * 2 + k, v_sumOut2); v_store(sumOut + VEC_LINE_32 * 3 + k, v_sumOut3); - v_sumIn0 -= v_ss0; - v_sumIn1 -= v_ss1; - v_sumIn2 -= v_ss2; - v_sumIn3 -= v_ss3; + v_sumIn0 = v_sub(v_sumIn0, v_ss0); + v_sumIn1 = v_sub(v_sumIn1, v_ss1); + v_sumIn2 = v_sub(v_sumIn2, v_ss2); + v_sumIn3 = v_sub(v_sumIn3, v_ss3); v_store(sumIn + k, v_sumIn0); v_store(sumIn + VEC_LINE_32 + k, v_sumIn1); @@ -922,8 +922,8 @@ inline int opColumn(const short* srcPtr, short* dstPtr, short* stack int k = 0; if (mulValTab != 0 && shrValTab != 0) { - const int VEC_LINE_16 = v_int16::nlanes; - const int VEC_LINE_32 = v_int32::nlanes; + const int VEC_LINE_16 = VTraits::vlanes(); + const int VEC_LINE_32 = VTraits::vlanes(); v_int32 v_mulVal = vx_setall_s32(mulValTab); auto stackStartPtr = stack + ss * widthLen; @@ -943,39 +943,39 @@ inline int opColumn(const short* srcPtr, short* dstPtr, short* stack v_sumOut0 = vx_load(sumOut + k); v_sumOut1 = vx_load(sumOut + k + VEC_LINE_32); - v_store(dstPtr + k,v_pack((v_sum0 * v_mulVal)>>shrValTab, (v_sum1 * v_mulVal)>>shrValTab)); + v_store(dstPtr + k,v_pack(v_shr(v_mul(v_sum0, v_mulVal), shrValTab), v_shr(v_mul(v_sum1, v_mulVal), shrValTab))); - v_sum0 -= v_sumOut0; - v_sum1 -= v_sumOut1; + v_sum0 = v_sub(v_sum0, v_sumOut0); + v_sum1 = v_sub(v_sum1, v_sumOut1); v_int32 v_ss0, v_ss1; v_expand(vx_load(stackStartPtr + k), v_ss0, v_ss1); - v_sumOut0 -= v_ss0; - v_sumOut1 -= v_ss1; + v_sumOut0 = v_sub(v_sumOut0, v_ss0); + v_sumOut1 = v_sub(v_sumOut1, v_ss1); v_expand(vx_load(srcPtr + k), v_ss0, v_ss1); memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_16 * sizeof (short)); - v_sumIn0 += v_ss0; - v_sumIn1 += v_ss1; + v_sumIn0 = v_add(v_sumIn0, v_ss0); + v_sumIn1 = v_add(v_sumIn1, v_ss1); - v_sum0 += v_sumIn0; - 
v_sum1 += v_sumIn1; + v_sum0 = v_add(v_sum0, v_sumIn0); + v_sum1 = v_add(v_sum1, v_sumIn1); v_store(sum + k, v_sum0); v_store(sum + VEC_LINE_32 + k, v_sum1); v_expand(vx_load(stackSp1Ptr + k), v_ss0, v_ss1); - v_sumOut0 += v_ss0; - v_sumOut1 += v_ss1; + v_sumOut0 = v_add(v_sumOut0, v_ss0); + v_sumOut1 = v_add(v_sumOut1, v_ss1); v_store(sumOut + k, v_sumOut0); v_store(sumOut + VEC_LINE_32 + k, v_sumOut1); - v_sumIn0 -= v_ss0; - v_sumIn1 -= v_ss1; + v_sumIn0 = v_sub(v_sumIn0, v_ss0); + v_sumIn1 = v_sub(v_sumIn1, v_ss1); v_store(sumIn + k, v_sumIn0); v_store(sumIn + VEC_LINE_32 + k, v_sumIn1); @@ -992,8 +992,8 @@ inline int opColumn(const ushort* srcPtr, ushort* dstPtr, ushort* s int k = 0; if (mulValTab != 0 && shrValTab != 0) { - const int VEC_LINE_16 = v_uint16::nlanes; - const int VEC_LINE_32 = v_int32::nlanes; + const int VEC_LINE_16 = VTraits::vlanes(); + const int VEC_LINE_32 = VTraits::vlanes(); v_uint32 v_mulVal = vx_setall_u32((uint32_t)mulValTab); auto stackStartPtr = stack + ss * widthLen; @@ -1013,40 +1013,40 @@ inline int opColumn(const ushort* srcPtr, ushort* dstPtr, ushort* s v_sumOut0 = vx_load(sumOut + k); v_sumOut1 = vx_load(sumOut + k + VEC_LINE_32); - v_store(dstPtr + k, v_pack((v_reinterpret_as_u32(v_sum0) * v_mulVal)>>shrValTab, (v_reinterpret_as_u32(v_sum1) * v_mulVal)>>shrValTab)); + v_store(dstPtr + k, v_pack(v_shr(v_mul(v_reinterpret_as_u32(v_sum0), v_mulVal), shrValTab), v_shr(v_mul(v_reinterpret_as_u32(v_sum1), v_mulVal), shrValTab))); - v_sum0 -= v_sumOut0; - v_sum1 -= v_sumOut1; + v_sum0 = v_sub(v_sum0, v_sumOut0); + v_sum1 = v_sub(v_sum1, v_sumOut1); v_uint32 v_ss0, v_ss1; v_expand(vx_load(stackStartPtr + k), v_ss0, v_ss1); - v_sumOut0 -= v_reinterpret_as_s32(v_ss0); - v_sumOut1 -= v_reinterpret_as_s32(v_ss1); + v_sumOut0 = v_sub(v_sumOut0, v_reinterpret_as_s32(v_ss0)); + v_sumOut1 = v_sub(v_sumOut1, v_reinterpret_as_s32(v_ss1)); v_expand(vx_load(srcPtr + k), v_ss0, v_ss1); memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_16 * sizeof (ushort)); - v_sumIn0 += v_reinterpret_as_s32(v_ss0); - v_sumIn1 += v_reinterpret_as_s32(v_ss1); + v_sumIn0 = v_add(v_sumIn0, v_reinterpret_as_s32(v_ss0)); + v_sumIn1 = v_add(v_sumIn1, v_reinterpret_as_s32(v_ss1)); - v_sum0 += v_sumIn0; - v_sum1 += v_sumIn1; + v_sum0 = v_add(v_sum0, v_sumIn0); + v_sum1 = v_add(v_sum1, v_sumIn1); v_store(sum + k, v_sum0); v_store(sum + VEC_LINE_32 + k, v_sum1); v_expand(vx_load(stackSp1Ptr + k), v_ss0, v_ss1); - v_sumOut0 += v_reinterpret_as_s32(v_ss0); - v_sumOut1 += v_reinterpret_as_s32(v_ss1); + v_sumOut0 = v_add(v_sumOut0, v_reinterpret_as_s32(v_ss0)); + v_sumOut1 = v_add(v_sumOut1, v_reinterpret_as_s32(v_ss1)); v_store(sumOut + k, v_sumOut0); v_store(sumOut + VEC_LINE_32 + k, v_sumOut1); - v_sumIn0 -= v_reinterpret_as_s32(v_ss0); - v_sumIn1 -= v_reinterpret_as_s32(v_ss1); + v_sumIn0 = v_sub(v_sumIn0, v_reinterpret_as_s32(v_ss0)); + v_sumIn1 = v_sub(v_sumIn1, v_reinterpret_as_s32(v_ss1)); v_store(sumIn + k, v_sumIn0); v_store(sumIn + VEC_LINE_32 + k, v_sumIn1); @@ -1152,7 +1152,7 @@ public: } int k = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) k = opColumn(srcPtr, dstPtr, stack, sum, sumIn, sumOut, mulVal, mulValTab, shrValTab, widthLen, stackStart, sp1); #endif diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp index bed0d37f26..4622691e68 100644 --- a/modules/imgproc/src/thresh.cpp +++ b/modules/imgproc/src/thresh.cpp @@ -190,7 +190,7 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type ) int j = 0; const uchar* src = _src.ptr(); uchar* 
dst = _dst.ptr(); -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_uint8 thresh_u = vx_setall_u8( thresh ); v_uint8 maxval16 = vx_setall_u8( maxval ); @@ -199,12 +199,12 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type ) case THRESH_BINARY: for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { - for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes) + for( j = 0; j <= roi.width - VTraits::vlanes(); j += VTraits::vlanes()) { v_uint8 v0; v0 = vx_load( src + j ); - v0 = thresh_u < v0; - v0 = v0 & maxval16; + v0 = v_lt(thresh_u, v0); + v0 = v_and(v0, maxval16); v_store( dst + j, v0 ); } } @@ -213,12 +213,12 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type ) case THRESH_BINARY_INV: for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { - for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes) + for( j = 0; j <= roi.width - VTraits::vlanes(); j += VTraits::vlanes()) { v_uint8 v0; v0 = vx_load( src + j ); - v0 = v0 <= thresh_u; - v0 = v0 & maxval16; + v0 = v_le(v0, thresh_u); + v0 = v_and(v0, maxval16); v_store( dst + j, v0 ); } } @@ -227,11 +227,11 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type ) case THRESH_TRUNC: for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { - for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes) + for( j = 0; j <= roi.width - VTraits::vlanes(); j += VTraits::vlanes()) { v_uint8 v0; v0 = vx_load( src + j ); - v0 = v0 - ( v0 - thresh_u ); + v0 = v_sub(v0, v_sub(v0, thresh_u)); v_store( dst + j, v0 ); } } @@ -240,11 +240,11 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type ) case THRESH_TOZERO: for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { - for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes) + for( j = 0; j <= roi.width - VTraits::vlanes(); j += VTraits::vlanes()) { v_uint8 v0; v0 = vx_load( src + j ); - v0 = ( thresh_u < v0 ) & v0; + v0 = v_and(v_lt(thresh_u, v0), v0); v_store( dst + j, v0 ); } } @@ -253,11 +253,11 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type ) case THRESH_TOZERO_INV: for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { - for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes) + for( j = 0; j <= roi.width - VTraits::vlanes(); j += VTraits::vlanes()) { v_uint8 v0; v0 = vx_load( src + j ); - v0 = ( v0 <= thresh_u ) & v0; + v0 = v_and(v_le(v0, thresh_u), v0); v_store( dst + j, v0 ); } } @@ -351,7 +351,7 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type) const ushort* src = _src.ptr(); ushort* dst = _dst.ptr(); -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) int i, j; v_uint16 thresh_u = vx_setall_u16(thresh); v_uint16 maxval16 = vx_setall_u16(maxval); @@ -361,25 +361,25 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type) case THRESH_BINARY: for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) { - for (j = 0; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes) + for (j = 0; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes()) { v_uint16 v0, v1; v0 = vx_load(src + j); - v1 = vx_load(src + j + v_uint16::nlanes); - v0 = thresh_u < v0; - v1 = thresh_u < v1; - v0 = v0 & maxval16; - v1 = v1 & maxval16; + v1 = vx_load(src + j + VTraits::vlanes()); + v0 = v_lt(thresh_u, v0); + v1 = v_lt(thresh_u, v1); + v0 = v_and(v0, maxval16); + v1 = v_and(v1, 
maxval16); v_store(dst + j, v0); - v_store(dst + j + v_uint16::nlanes, v1); + v_store(dst + j + VTraits::vlanes(), v1); } - if (j <= roi.width - v_uint16::nlanes) + if (j <= roi.width - VTraits::vlanes()) { v_uint16 v0 = vx_load(src + j); - v0 = thresh_u < v0; - v0 = v0 & maxval16; + v0 = v_lt(thresh_u, v0); + v0 = v_and(v0, maxval16); v_store(dst + j, v0); - j += v_uint16::nlanes; + j += VTraits::vlanes(); } for (; j < roi.width; j++) @@ -391,25 +391,25 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type) for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) { j = 0; - for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes) + for (; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes()) { v_uint16 v0, v1; v0 = vx_load(src + j); - v1 = vx_load(src + j + v_uint16::nlanes); - v0 = v0 <= thresh_u; - v1 = v1 <= thresh_u; - v0 = v0 & maxval16; - v1 = v1 & maxval16; + v1 = vx_load(src + j + VTraits::vlanes()); + v0 = v_le(v0, thresh_u); + v1 = v_le(v1, thresh_u); + v0 = v_and(v0, maxval16); + v1 = v_and(v1, maxval16); v_store(dst + j, v0); - v_store(dst + j + v_uint16::nlanes, v1); + v_store(dst + j + VTraits::vlanes(), v1); } - if (j <= roi.width - v_uint16::nlanes) + if (j <= roi.width - VTraits::vlanes()) { v_uint16 v0 = vx_load(src + j); - v0 = v0 <= thresh_u; - v0 = v0 & maxval16; + v0 = v_le(v0, thresh_u); + v0 = v_and(v0, maxval16); v_store(dst + j, v0); - j += v_uint16::nlanes; + j += VTraits::vlanes(); } for (; j < roi.width; j++) @@ -421,22 +421,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type) for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) { j = 0; - for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes) + for (; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes()) { v_uint16 v0, v1; v0 = vx_load(src + j); - v1 = vx_load(src + j + v_uint16::nlanes); + v1 = vx_load(src + j + VTraits::vlanes()); v0 = v_min(v0, thresh_u); v1 = v_min(v1, thresh_u); v_store(dst + j, v0); - v_store(dst + j + v_uint16::nlanes, v1); + v_store(dst + j + VTraits::vlanes(), v1); } - if (j <= roi.width - v_uint16::nlanes) + if (j <= roi.width - VTraits::vlanes()) { v_uint16 v0 = vx_load(src + j); v0 = v_min(v0, thresh_u); v_store(dst + j, v0); - j += v_uint16::nlanes; + j += VTraits::vlanes(); } for (; j < roi.width; j++) @@ -448,22 +448,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type) for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) { j = 0; - for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes) + for (; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes()) { v_uint16 v0, v1; v0 = vx_load(src + j); - v1 = vx_load(src + j + v_uint16::nlanes); - v0 = (thresh_u < v0) & v0; - v1 = (thresh_u < v1) & v1; + v1 = vx_load(src + j + VTraits::vlanes()); + v0 = v_and(v_lt(thresh_u, v0), v0); + v1 = v_and(v_lt(thresh_u, v1), v1); v_store(dst + j, v0); - v_store(dst + j + v_uint16::nlanes, v1); + v_store(dst + j + VTraits::vlanes(), v1); } - if (j <= roi.width - v_uint16::nlanes) + if (j <= roi.width - VTraits::vlanes()) { v_uint16 v0 = vx_load(src + j); - v0 = (thresh_u < v0) & v0; + v0 = v_and(v_lt(thresh_u, v0), v0); v_store(dst + j, v0); - j += v_uint16::nlanes; + j += VTraits::vlanes(); } for (; j < roi.width; j++) @@ -475,22 +475,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type) for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) { j = 0; - for (; j 
<= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes) + for (; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes()) { v_uint16 v0, v1; v0 = vx_load(src + j); - v1 = vx_load(src + j + v_uint16::nlanes); - v0 = (v0 <= thresh_u) & v0; - v1 = (v1 <= thresh_u) & v1; + v1 = vx_load(src + j + VTraits::vlanes()); + v0 = v_and(v_le(v0, thresh_u), v0); + v1 = v_and(v_le(v1, thresh_u), v1); v_store(dst + j, v0); - v_store(dst + j + v_uint16::nlanes, v1); + v_store(dst + j + VTraits::vlanes(), v1); } - if (j <= roi.width - v_uint16::nlanes) + if (j <= roi.width - VTraits::vlanes()) { v_uint16 v0 = vx_load(src + j); - v0 = (v0 <= thresh_u) & v0; + v0 = v_and(v_le(v0, thresh_u), v0); v_store(dst + j, v0); - j += v_uint16::nlanes; + j += VTraits::vlanes(); } for (; j < roi.width; j++) @@ -571,7 +571,7 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) } #endif -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) int i, j; v_int16 thresh8 = vx_setall_s16( thresh ); v_int16 maxval8 = vx_setall_s16( maxval ); @@ -582,25 +582,25 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_int16 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_int16::nlanes ); - v0 = thresh8 < v0; - v1 = thresh8 < v1; - v0 = v0 & maxval8; - v1 = v1 & maxval8; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_lt(thresh8, v0); + v1 = v_lt(thresh8, v1); + v0 = v_and(v0, maxval8); + v1 = v_and(v1, maxval8); v_store( dst + j, v0 ); - v_store( dst + j + v_int16::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_int16::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_int16 v0 = vx_load( src + j ); - v0 = thresh8 < v0; - v0 = v0 & maxval8; + v0 = v_lt(thresh8, v0); + v0 = v_and(v0, maxval8); v_store( dst + j, v0 ); - j += v_int16::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -612,25 +612,25 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_int16 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_int16::nlanes ); - v0 = v0 <= thresh8; - v1 = v1 <= thresh8; - v0 = v0 & maxval8; - v1 = v1 & maxval8; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_le(v0, thresh8); + v1 = v_le(v1, thresh8); + v0 = v_and(v0, maxval8); + v1 = v_and(v1, maxval8); v_store( dst + j, v0 ); - v_store( dst + j + v_int16::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_int16::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_int16 v0 = vx_load( src + j ); - v0 = v0 <= thresh8; - v0 = v0 & maxval8; + v0 = v_le(v0, thresh8); + v0 = v_and(v0, maxval8); v_store( dst + j, v0 ); - j += v_int16::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -642,22 +642,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_int16 v0, v1; 
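/* Illustrative sketch (not part of the patch): the comparison/bitwise rewrite used in
   these thresh.cpp hunks - `thresh < v0` becomes v_lt(thresh, v0) and `mask & val`
   becomes v_and(mask, val). Assumes <opencv2/core/hal/intrin.hpp> with CV_SIMD or
   CV_SIMD_SCALABLE enabled; binary_row_16s is a hypothetical helper, not OpenCV API.

   static void binary_row_16s(const short* src, short* dst, int width, short thresh, short maxval)
   {
       using namespace cv;
       const v_int16 vthresh = vx_setall_s16(thresh);
       const v_int16 vmaxval = vx_setall_s16(maxval);
       int j = 0;
       for (; j <= width - VTraits<v_int16>::vlanes(); j += VTraits<v_int16>::vlanes())
       {
           v_int16 v0 = vx_load(src + j);
           v0 = v_and(v_lt(vthresh, v0), vmaxval); // all-ones lane where src > thresh, then keep maxval there
           v_store(dst + j, v0);
       }
       for (; j < width; j++)                      // scalar tail for the remainder
           dst[j] = src[j] > thresh ? maxval : 0;
   }
*/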
v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_int16::nlanes ); + v1 = vx_load( src + j + VTraits::vlanes() ); v0 = v_min( v0, thresh8 ); v1 = v_min( v1, thresh8 ); v_store( dst + j, v0 ); - v_store( dst + j + v_int16::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_int16::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_int16 v0 = vx_load( src + j ); v0 = v_min( v0, thresh8 ); v_store( dst + j, v0 ); - j += v_int16::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -669,22 +669,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_int16 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_int16::nlanes ); - v0 = ( thresh8 < v0 ) & v0; - v1 = ( thresh8 < v1 ) & v1; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_and(v_lt(thresh8, v0), v0); + v1 = v_and(v_lt(thresh8, v1), v1); v_store( dst + j, v0 ); - v_store( dst + j + v_int16::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_int16::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_int16 v0 = vx_load( src + j ); - v0 = ( thresh8 < v0 ) & v0; + v0 = v_and(v_lt(thresh8, v0), v0); v_store( dst + j, v0 ); - j += v_int16::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -696,22 +696,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_int16 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_int16::nlanes ); - v0 = ( v0 <= thresh8 ) & v0; - v1 = ( v1 <= thresh8 ) & v1; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_and(v_le(v0, thresh8), v0); + v1 = v_and(v_le(v1, thresh8), v1); v_store( dst + j, v0 ); - v_store( dst + j + v_int16::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_int16::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_int16 v0 = vx_load( src + j ); - v0 = ( v0 <= thresh8 ) & v0; + v0 = v_and(v_le(v0, thresh8), v0); v_store( dst + j, v0 ); - j += v_int16::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -777,7 +777,7 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) } #endif -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) int i, j; v_float32 thresh4 = vx_setall_f32( thresh ); v_float32 maxval4 = vx_setall_f32( maxval ); @@ -788,25 +788,25 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_float32 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_float32::nlanes ); - v0 = thresh4 < v0; - v1 = thresh4 < v1; - v0 = v0 & maxval4; - v1 = v1 & maxval4; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_lt(thresh4, v0); + v1 = v_lt(thresh4, v1); + v0 = v_and(v0, maxval4); + v1 = v_and(v1, maxval4); v_store( dst + j, v0 ); - v_store( dst + j + v_float32::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), 
v1 ); } - if( j <= roi.width - v_float32::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_float32 v0 = vx_load( src + j ); - v0 = thresh4 < v0; - v0 = v0 & maxval4; + v0 = v_lt(thresh4, v0); + v0 = v_and(v0, maxval4); v_store( dst + j, v0 ); - j += v_float32::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -818,25 +818,25 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_float32 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_float32::nlanes ); - v0 = v0 <= thresh4; - v1 = v1 <= thresh4; - v0 = v0 & maxval4; - v1 = v1 & maxval4; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_le(v0, thresh4); + v1 = v_le(v1, thresh4); + v0 = v_and(v0, maxval4); + v1 = v_and(v1, maxval4); v_store( dst + j, v0 ); - v_store( dst + j + v_float32::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_float32::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_float32 v0 = vx_load( src + j ); - v0 = v0 <= thresh4; - v0 = v0 & maxval4; + v0 = v_le(v0, thresh4); + v0 = v_and(v0, maxval4); v_store( dst + j, v0 ); - j += v_float32::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -848,22 +848,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_float32 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_float32::nlanes ); + v1 = vx_load( src + j + VTraits::vlanes() ); v0 = v_min( v0, thresh4 ); v1 = v_min( v1, thresh4 ); v_store( dst + j, v0 ); - v_store( dst + j + v_float32::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_float32::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_float32 v0 = vx_load( src + j ); v0 = v_min( v0, thresh4 ); v_store( dst + j, v0 ); - j += v_float32::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -875,22 +875,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_float32 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_float32::nlanes ); - v0 = ( thresh4 < v0 ) & v0; - v1 = ( thresh4 < v1 ) & v1; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_and(v_lt(thresh4, v0), v0); + v1 = v_and(v_lt(thresh4, v1), v1); v_store( dst + j, v0 ); - v_store( dst + j + v_float32::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_float32::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_float32 v0 = vx_load( src + j ); - v0 = ( thresh4 < v0 ) & v0; + v0 = v_and(v_lt(thresh4, v0), v0); v_store( dst + j, v0 ); - j += v_float32::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -902,22 +902,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 
2*v_float32::nlanes; j += 2*v_float32::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_float32 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_float32::nlanes ); - v0 = ( v0 <= thresh4 ) & v0; - v1 = ( v1 <= thresh4 ) & v1; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_and(v_le(v0, thresh4), v0); + v1 = v_and(v_le(v1, thresh4), v1); v_store( dst + j, v0 ); - v_store( dst + j + v_float32::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_float32::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_float32 v0 = vx_load( src + j ); - v0 = ( v0 <= thresh4 ) & v0; + v0 = v_and(v_le(v0, thresh4), v0); v_store( dst + j, v0 ); - j += v_float32::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -948,7 +948,7 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) roi.height = 1; } -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) int i, j; v_float64 thresh2 = vx_setall_f64( thresh ); v_float64 maxval2 = vx_setall_f64( maxval ); @@ -959,25 +959,25 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_float64 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_float64::nlanes ); - v0 = thresh2 < v0; - v1 = thresh2 < v1; - v0 = v0 & maxval2; - v1 = v1 & maxval2; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_lt(thresh2, v0); + v1 = v_lt(thresh2, v1); + v0 = v_and(v0, maxval2); + v1 = v_and(v1, maxval2); v_store( dst + j, v0 ); - v_store( dst + j + v_float64::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_float64::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_float64 v0 = vx_load( src + j ); - v0 = thresh2 < v0; - v0 = v0 & maxval2; + v0 = v_lt(thresh2, v0); + v0 = v_and(v0, maxval2); v_store( dst + j, v0 ); - j += v_float64::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -989,25 +989,25 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_float64 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_float64::nlanes ); - v0 = v0 <= thresh2; - v1 = v1 <= thresh2; - v0 = v0 & maxval2; - v1 = v1 & maxval2; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_le(v0, thresh2); + v1 = v_le(v1, thresh2); + v0 = v_and(v0, maxval2); + v1 = v_and(v1, maxval2); v_store( dst + j, v0 ); - v_store( dst + j + v_float64::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_float64::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_float64 v0 = vx_load( src + j ); - v0 = v0 <= thresh2; - v0 = v0 & maxval2; + v0 = v_le(v0, thresh2); + v0 = v_and(v0, maxval2); v_store( dst + j, v0 ); - j += v_float64::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -1019,22 +1019,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes ) + for( ; j <= 
roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_float64 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_float64::nlanes ); + v1 = vx_load( src + j + VTraits::vlanes() ); v0 = v_min( v0, thresh2 ); v1 = v_min( v1, thresh2 ); v_store( dst + j, v0 ); - v_store( dst + j + v_float64::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_float64::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_float64 v0 = vx_load( src + j ); v0 = v_min( v0, thresh2 ); v_store( dst + j, v0 ); - j += v_float64::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -1046,22 +1046,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_float64 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_float64::nlanes ); - v0 = ( thresh2 < v0 ) & v0; - v1 = ( thresh2 < v1 ) & v1; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_and(v_lt(thresh2, v0), v0); + v1 = v_and(v_lt(thresh2, v1), v1); v_store( dst + j, v0 ); - v_store( dst + j + v_float64::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_float64::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_float64 v0 = vx_load( src + j ); - v0 = ( thresh2 < v0 ) & v0; + v0 = v_and(v_lt(thresh2, v0), v0); v_store( dst + j, v0 ); - j += v_float64::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -1073,22 +1073,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_float64 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_float64::nlanes ); - v0 = ( v0 <= thresh2 ) & v0; - v1 = ( v1 <= thresh2 ) & v1; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_and(v_le(v0, thresh2), v0); + v1 = v_and(v_le(v1, thresh2), v1); v_store( dst + j, v0 ); - v_store( dst + j + v_float64::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_float64::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_float64 v0 = vx_load( src + j ); - v0 = ( v0 <= thresh2 ) & v0; + v0 = v_and(v_le(v0, thresh2), v0); v_store( dst + j, v0 ); - j += v_float64::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ )
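/* Illustrative sketch (not part of the patch): why the hough.cpp hunks above size their
   scratch arrays with VTraits<...>::max_nlanes but loop with vlanes(). Under scalable
   SIMD the lane count is a run-time quantity, so fixed-size stack buffers use the
   compile-time upper bound while per-iteration code queries the actual width. Assumes
   <opencv2/core/hal/intrin.hpp> with CV_SIMD or CV_SIMD_SCALABLE; collect_in_range is a
   hypothetical helper used only for illustration.

   static int collect_in_range(const float* r2, int n, float rmin2, float rmax2, float* out)
   {
       using namespace cv;
       const v_float32 v_min2 = vx_setall_f32(rmin2), v_max2 = vx_setall_f32(rmax2);
       float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[VTraits<v_float32>::max_nlanes];  // upper bound, compile time
       int   CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[VTraits<v_int32>::max_nlanes];
       int cnt = 0, j = 0;
       for (; j <= n - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes()) // actual width, run time
       {
           v_float32 v_r2 = vx_load(r2 + j);
           v_float32 vmask = v_and(v_le(v_min2, v_r2), v_le(v_r2, v_max2));
           if (v_check_any(vmask))
           {
               v_store_aligned(rmask, v_reinterpret_as_s32(vmask));
               v_store_aligned(rbuf, v_r2);
               for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
                   if (rmask[i]) out[cnt++] = rbuf[i];
           }
       }
       for (; j < n; j++)                                   // scalar tail
           if (r2[j] >= rmin2 && r2[j] <= rmax2) out[cnt++] = r2[j];
       return cnt;
   }
*/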