Mirror of https://github.com/opencv/opencv.git
Merge pull request #24132 from hanliutong:rewrite-imgproc2
Rewrite Universal Intrinsic code by using new API: ImgProc module Part 2 #24132

The goal of this series of PRs is to modify the SIMD code blocks guarded by the CV_SIMD macro in the opencv/modules/imgproc folder: rewrite them using the new Universal Intrinsic API. This is the second part of the modification to the imgproc module (Part 1: #24058). I tested this patch on RVV (QEMU) and AVX devices; `opencv_test_imgproc` passes. The patch is partially auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter).

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is an accuracy test, performance test and test data in the opencv_extra repository, if applicable. The patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
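To make the rewrite pattern concrete, here is a minimal sketch (the helper and its loop are illustrative, not taken from the patch): the operator overloads and the compile-time `v_float32::nlanes` constant of the old `CV_SIMD`-only code are replaced by named functions (`v_add`, `v_mul`, `v_muladd`, ...) and `VTraits<v_float32>::vlanes()`, which also compile when `CV_SIMD_SCALABLE` is set and the lane count is only known at run time.

```cpp
#include "opencv2/core/hal/intrin.hpp"

// Hypothetical helper: dst[i] += a[i] * b[i] over n floats.
static void fma_rows(const float* a, const float* b, float* dst, int n)
{
    int j = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)   // new guard: fixed-size OR scalable SIMD
    for (; j <= n - cv::VTraits<cv::v_float32>::vlanes();
           j += cv::VTraits<cv::v_float32>::vlanes())
    {
        cv::v_float32 va = cv::vx_load(a + j);
        cv::v_float32 vb = cv::vx_load(b + j);
        // Old style was: v_store(dst + j, vx_load(dst + j) + va * vb);
        cv::v_store(dst + j, cv::v_muladd(va, vb, cv::vx_load(dst + j)));
    }
#endif
    for (; j < n; j++)              // scalar tail
        dst[j] += a[j] * b[j];
}
```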
This commit is contained in:
parent 8f2e6640e3
commit f617fbe166
@@ -745,7 +745,22 @@ namespace CV__SIMD_NAMESPACE {
 inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
 return v_add(f1 + f2, vf...); \
 }
+#define OPENCV_HAL_WRAP_SHIFT_OP(_Tpvec) \
+inline _Tpvec v_shr(const _Tpvec& a, int n) \
+{ \
+return a >> n; \
+} \
+inline _Tpvec v_shl(const _Tpvec& a, int n) \
+{ \
+return a << n; \
+}

+OPENCV_HAL_WRAP_SHIFT_OP(v_uint16)
+OPENCV_HAL_WRAP_SHIFT_OP(v_uint32)
+OPENCV_HAL_WRAP_SHIFT_OP(v_uint64)
+OPENCV_HAL_WRAP_SHIFT_OP(v_int16)
+OPENCV_HAL_WRAP_SHIFT_OP(v_int32)
+OPENCV_HAL_WRAP_SHIFT_OP(v_int64)
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
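The wrapper above turns the shift amount into an ordinary function argument, so the same spelling works whether or not `n` is a compile-time constant. A usage sketch (function name and values are illustrative):

```cpp
#include "opencv2/core/hal/intrin.hpp"

// Assuming a CV_SIMD target where the wrappers above are in effect:
static cv::v_uint16 shift_demo(int n)       // n: runtime shift amount
{
    cv::v_uint16 v    = cv::vx_setall_u16(0x0F0F);
    cv::v_uint16 up   = cv::v_shl(v, n);    // previously spelled: v << n
    cv::v_uint16 down = cv::v_shr(v, n);    // previously spelled: v >> n
    return cv::v_add(up, down);
}
```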
@@ -769,6 +784,12 @@ namespace CV__SIMD_NAMESPACE {
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4)
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2)
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4)
+OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x8)
+OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x4)
+OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x2)
+OPENCV_HAL_WRAP_SHIFT_OP(v_int16x8)
+OPENCV_HAL_WRAP_SHIFT_OP(v_int32x4)
+OPENCV_HAL_WRAP_SHIFT_OP(v_int64x2)
 #if CV_SIMD_64F
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2)
 #endif
@@ -784,6 +805,12 @@ namespace CV__SIMD_NAMESPACE {
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8)
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4)
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8)
+OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x16)
+OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x8)
+OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x4)
+OPENCV_HAL_WRAP_SHIFT_OP(v_int16x16)
+OPENCV_HAL_WRAP_SHIFT_OP(v_int32x8)
+OPENCV_HAL_WRAP_SHIFT_OP(v_int64x4)
 #if CV_SIMD_64F
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4)
 #endif
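Both the width-agnostic types (`v_uint16`, ...) and the explicitly sized ones (`v_uint16x8`, `v_uint16x16`, ...) get instantiations because they can be distinct types on wider builds; an illustrative sketch (assuming an AVX2 build, where `CV_SIMD_WIDTH == 32` and `v_uint16` aliases `v_uint16x16`, while the 128-bit `v_uint16x8` stays usable):

```cpp
#include "opencv2/core/hal/intrin.hpp"

static void width_demo()
{
    cv::v_uint16 wide = cv::vx_setall_u16(1);     // default-width vector type
    wide = cv::v_shl(wide, 3);       // uses OPENCV_HAL_WRAP_SHIFT_OP(v_uint16)
#if CV_SIMD128
    cv::v_uint16x8 narrow = cv::v_setall_u16(1);  // always 8 lanes
    narrow = cv::v_shl(narrow, 3);   // uses OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x8)
#endif
}
```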
@@ -801,7 +828,9 @@ namespace CV__SIMD_NAMESPACE {
 inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \
 { \
 return a ^ b; \
-} \
+}
+
+#define OPENCV_HAL_WRAP_NOT_OP(_Tpvec) \
 inline _Tpvec v_not(const _Tpvec& a) \
 { \
 return ~a; \
@@ -815,6 +844,18 @@ namespace CV__SIMD_NAMESPACE {
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16)
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32)
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64)
+OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32)
+OPENCV_HAL_WRAP_NOT_OP(v_uint8)
+OPENCV_HAL_WRAP_NOT_OP(v_uint16)
+OPENCV_HAL_WRAP_NOT_OP(v_uint32)
+OPENCV_HAL_WRAP_NOT_OP(v_uint64)
+OPENCV_HAL_WRAP_NOT_OP(v_int8)
+OPENCV_HAL_WRAP_NOT_OP(v_int16)
+OPENCV_HAL_WRAP_NOT_OP(v_int32)
+OPENCV_HAL_WRAP_NOT_OP(v_int64)
+#if CV_SIMD_64F
+OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64)
+#endif
 #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x16)
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x8)
@@ -824,6 +865,18 @@ namespace CV__SIMD_NAMESPACE {
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x8)
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x4)
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x2)
+OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x4)
+OPENCV_HAL_WRAP_NOT_OP(v_uint8x16)
+OPENCV_HAL_WRAP_NOT_OP(v_uint16x8)
+OPENCV_HAL_WRAP_NOT_OP(v_uint32x4)
+OPENCV_HAL_WRAP_NOT_OP(v_uint64x2)
+OPENCV_HAL_WRAP_NOT_OP(v_int8x16)
+OPENCV_HAL_WRAP_NOT_OP(v_int16x8)
+OPENCV_HAL_WRAP_NOT_OP(v_int32x4)
+OPENCV_HAL_WRAP_NOT_OP(v_int64x2)
+#if CV_SIMD_64F
+OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x2)
+#endif
 #endif
 #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x32)
@@ -834,6 +887,18 @@ namespace CV__SIMD_NAMESPACE {
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x16)
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x8)
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x4)
+OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x8)
+OPENCV_HAL_WRAP_NOT_OP(v_uint8x32)
+OPENCV_HAL_WRAP_NOT_OP(v_uint16x16)
+OPENCV_HAL_WRAP_NOT_OP(v_uint32x8)
+OPENCV_HAL_WRAP_NOT_OP(v_uint64x4)
+OPENCV_HAL_WRAP_NOT_OP(v_int8x32)
+OPENCV_HAL_WRAP_NOT_OP(v_int16x16)
+OPENCV_HAL_WRAP_NOT_OP(v_int32x8)
+OPENCV_HAL_WRAP_NOT_OP(v_int64x4)
+#if CV_SIMD_64F
+OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x4)
+#endif
 #endif

 #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
@@ -45,6 +45,7 @@ OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m2_t, u8m2, vuint8m2_t, i8)
 OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m4_t, u8m4, vuint8m4_t, i8)
 OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m8_t, u8m8, vuint8m8_t, i8)
 OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vfloat32m1_t, f32m1, vuint32m1_t, i32)
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint32m1_t, u32m1, vuint32m1_t, i32)
 #if CV_SIMD_SCALABLE_64F
 OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vfloat64m1_t, f64m1, vuint32mf2_t, i32)
 #endif
@@ -475,6 +475,25 @@ OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1)
 OPENCV_HAL_IMPL_RVV_LUT(v_float64, double, mf2)
 #endif

+#define OPENCV_HAL_IMPL_RVV_LUT_VEC(_Tpvec, _Tp) \
+inline _Tpvec v_lut(const _Tp* tab, const v_int32& vidx) \
+{ \
+v_uint32 vidx_ = vmul(vreinterpret_u32m1(vidx), sizeof(_Tp), VTraits<v_int32>::vlanes()); \
+return vloxei32(tab, vidx_, VTraits<_Tpvec>::vlanes()); \
+}
+OPENCV_HAL_IMPL_RVV_LUT_VEC(v_float32, float)
+OPENCV_HAL_IMPL_RVV_LUT_VEC(v_int32, int)
+OPENCV_HAL_IMPL_RVV_LUT_VEC(v_uint32, unsigned)
+
+#if CV_SIMD_SCALABLE_64F
+inline v_float64 v_lut(const double* tab, const v_int32& vidx) \
+{ \
+vuint32mf2_t vidx_ = vmul(vlmul_trunc_u32mf2(vreinterpret_u32m1(vidx)), sizeof(double), VTraits<v_float64>::vlanes()); \
+return vloxei32(tab, vidx_, VTraits<v_float64>::vlanes()); \
+}
+#endif
+
+
 inline v_uint8 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
 inline v_uint8 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
 inline v_uint8 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
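A note on the new vector-index `v_lut`: `vloxei32` is RVV's ordered indexed (gather) load, and its indices are byte offsets, which is why the lane indices are first multiplied by `sizeof(_Tp)`. A scalar model of what the wrapper computes (a sketch, not the real implementation):

```cpp
#include <cstddef>

// Scalar model of v_lut(tab, vidx): gather tab[vidx[i]] per lane, with each
// index scaled to a byte offset as vloxei32 expects.
static void v_lut_model(const float* tab, const int* vidx, float* dst, int vlanes)
{
    for (int i = 0; i < vlanes; i++)
        dst[i] = *(const float*)((const char*)tab + (size_t)vidx[i] * sizeof(float));
}
```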
@@ -690,23 +709,27 @@ inline v_float64 v_not (const v_float64& a) \


 ////////////// Bitwise shifts //////////////
+/* Usage
+1. v_shl<N>(vec);
+2. v_shl(vec, N); // instead of vec << N, when N is non-constant.
+*/

 #define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, vl) \
-template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+template<int s = 0> inline _Tpvec v_shl(const _Tpvec& a, int n = s) \
 { \
 return _Tpvec(vsll(a, uint8_t(n), vl)); \
 } \
-template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+template<int s = 0> inline _Tpvec v_shr(const _Tpvec& a, int n = s) \
 { \
 return _Tpvec(vsrl(a, uint8_t(n), vl)); \
 }

 #define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, vl) \
-template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+template<int s = 0> inline _Tpvec v_shl(const _Tpvec& a, int n = s) \
 { \
 return _Tpvec(vsll(a, uint8_t(n), vl)); \
 } \
-template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+template<int s = 0> inline _Tpvec v_shr(const _Tpvec& a, int n = s) \
 { \
 return _Tpvec(vsra(a, uint8_t(n), vl)); \
 }
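The default template argument keeps both call forms from the Usage comment alive with a single definition: in `v_shl<N>(vec)` the runtime parameter `n` defaults to the template parameter, and in `v_shl(vec, n)` the template parameter defaults to 0 and the runtime value wins. A usage sketch:

```cpp
#include "opencv2/core/hal/intrin.hpp"

// Both spellings resolve to the same generated wrapper (RVV build assumed):
static cv::v_uint32 shl_demo(const cv::v_uint32& v, int n)
{
    cv::v_uint32 a = cv::v_shl<2>(v);   // compile-time amount: n defaults to s == 2
    cv::v_uint32 b = cv::v_shl(v, n);   // runtime amount: s defaults to 0
    return cv::v_add(a, b);
}
```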
@@ -99,33 +99,33 @@ public:
 const uchar* ksptr2 = sptr + space_ofs[k+2];
 const uchar* ksptr3 = sptr + space_ofs[k+3];
 j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 v_float32 kweight0 = vx_setall_f32(space_weight[k]);
 v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
 v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
 v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
-for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
 {
 v_uint32 rval = vx_load_expand_q(sptr + j);

 v_uint32 val = vx_load_expand_q(ksptr0 + j);
-v_float32 w = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
+v_float32 w = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
+v_float32 v_wsum = v_add(vx_load_aligned(wsum + j), w);
 v_float32 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j));

 val = vx_load_expand_q(ksptr1 + j);
-w = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-v_wsum += w;
+w = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
+v_wsum = v_add(v_wsum, w);
 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);

 val = vx_load_expand_q(ksptr2 + j);
-w = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-v_wsum += w;
+w = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
+v_wsum = v_add(v_wsum, w);
 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);

 val = vx_load_expand_q(ksptr3 + j);
-w = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-v_wsum += w;
+w = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
+v_wsum = v_add(v_wsum, w);
 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);

 v_store_aligned(wsum + j, v_wsum);
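For readers new to this kernel, a scalar restatement of one `k` iteration of the vector block above (the wrapper function is hypothetical; it mirrors the scalar tail of the same loop): the per-pixel weight combines the precomputed spatial weight with a color weight looked up by absolute intensity difference, and both the weight sum and the weighted pixel sum are accumulated.

```cpp
#include <cstdlib>

// Scalar model of one k-iteration of the u8 single-channel path (sketch):
static void accumulate_row(const unsigned char* ksptr0, const unsigned char* sptr,
                           float kweight, const float* color_weight,
                           float* wsum, float* sum, int width)
{
    for (int j = 0; j < width; j++)
    {
        float w = kweight * color_weight[std::abs(ksptr0[j] - sptr[j])]; // v_mul + v_lut
        wsum[j] += w;               // v_add(vx_load_aligned(wsum + j), w)
        sum[j]  += ksptr0[j] * w;   // v_muladd(v_cvt_f32(val), w, vx_load_aligned(sum + j))
    }
}
```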
@@ -172,13 +172,13 @@ public:
 {
 const uchar* ksptr = sptr + space_ofs[k];
 j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 v_float32 kweight = vx_setall_f32(space_weight[k]);
-for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
 {
 v_uint32 val = vx_load_expand_q(ksptr + j);
-v_float32 w = kweight * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, vx_load_expand_q(sptr + j))));
-v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
+v_float32 w = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, vx_load_expand_q(sptr + j)))));
+v_store_aligned(wsum + j, v_add(vx_load_aligned(wsum + j), w));
 v_store_aligned(sum + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j)));
 }
 #endif
@@ -191,10 +191,10 @@ public:
 }
 }
 j = 0;
-#if CV_SIMD
-for (; j <= size.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes)
-v_pack_u_store(dptr + j, v_pack(v_round(vx_load_aligned(sum + j ) / vx_load_aligned(wsum + j )),
-v_round(vx_load_aligned(sum + j + v_float32::nlanes) / vx_load_aligned(wsum + j + v_float32::nlanes))));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+for (; j <= size.width - 2*VTraits<v_float32>::vlanes(); j += 2*VTraits<v_float32>::vlanes())
+v_pack_u_store(dptr + j, v_pack(v_round(v_div(vx_load_aligned(sum + j), vx_load_aligned(wsum + j))),
+v_round(v_div(vx_load_aligned(sum + j + VTraits<v_float32>::vlanes()), vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes())))));
 #endif
 for (; j < size.width; j++)
 {
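The rewritten packing loop divides the two accumulators lane-wise (`v_div`), rounds, and saturates down to u8; a scalar model (sketch, matching the scalar tail that follows it):

```cpp
#include "opencv2/core.hpp"  // saturate_cast, cvRound

// Scalar model of the normalization / pack store above (sketch):
static void normalize_row(const float* sum, const float* wsum,
                          unsigned char* dptr, int width)
{
    for (int j = 0; j < width; j++)
        dptr[j] = cv::saturate_cast<unsigned char>(cvRound(sum[j] / wsum[j]));
}
```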
@@ -221,13 +221,13 @@ public:
 const uchar* ksptr3 = sptr + space_ofs[k+3];
 const uchar* rsptr = sptr;
 j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 v_float32 kweight0 = vx_setall_f32(space_weight[k]);
 v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
 v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
 v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
-for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, rsptr += 3*v_uint8::nlanes,
-ksptr0 += 3*v_uint8::nlanes, ksptr1 += 3*v_uint8::nlanes, ksptr2 += 3*v_uint8::nlanes, ksptr3 += 3*v_uint8::nlanes)
+for (; j <= size.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes(), rsptr += 3*VTraits<v_uint8>::vlanes(),
+ksptr0 += 3*VTraits<v_uint8>::vlanes(), ksptr1 += 3*VTraits<v_uint8>::vlanes(), ksptr2 += 3*VTraits<v_uint8>::vlanes(), ksptr3 += 3*VTraits<v_uint8>::vlanes())
 {
 v_uint8 kb, kg, kr, rb, rg, rr;
 v_load_deinterleave(rsptr, rb, rg, rr);
@@ -236,163 +236,163 @@ public:
 v_uint16 val0, val1, val2, val3, val4;
 v_expand(v_absdiff(kb, rb), val0, val1);
 v_expand(v_absdiff(kg, rg), val2, val3);
-val0 += val2; val1 += val3;
+val0 = v_add(val0, val2); val1 = v_add(val1, val3);
 v_expand(v_absdiff(kr, rr), val2, val3);
-val0 += val2; val1 += val3;
+val0 = v_add(val0, val2); val1 = v_add(val1, val3);

 v_uint32 vall, valh;
 v_expand(val0, vall, valh);
-v_float32 w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-v_float32 w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
-v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+v_float32 w0 = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+v_float32 w1 = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+v_store_aligned(wsum + j, v_add(w0, vx_load_aligned(wsum + j)));
+v_store_aligned(wsum + j + VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes())));
 v_expand(kb, val0, val2);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_b + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
-v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+v_store_aligned(sum_b + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + VTraits<v_float32>::vlanes())));
 v_expand(kg, val0, val3);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_g + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
-v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+v_store_aligned(sum_g + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + VTraits<v_float32>::vlanes())));
 v_expand(kr, val0, val4);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_r + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
-v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+v_store_aligned(sum_r + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + VTraits<v_float32>::vlanes())));

 v_expand(val1, vall, valh);
-w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
-v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+w0 = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+w1 = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+v_store_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes(), v_add(w0, vx_load_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val2, vall, valh);
-v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val3, vall, valh);
-v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val4, vall, valh);
-v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2*v_float32::nlanes)));
-v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3*v_float32::nlanes)));
+v_store_aligned(sum_r + j + 2*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2*VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_r + j + 3*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3*VTraits<v_float32>::vlanes())));

 v_load_deinterleave(ksptr1, kb, kg, kr);
 v_expand(v_absdiff(kb, rb), val0, val1);
 v_expand(v_absdiff(kg, rg), val2, val3);
-val0 += val2; val1 += val3;
+val0 = v_add(val0, val2); val1 = v_add(val1, val3);
 v_expand(v_absdiff(kr, rr), val2, val3);
-val0 += val2; val1 += val3;
+val0 = v_add(val0, val2); val1 = v_add(val1, val3);

 v_expand(val0, vall, valh);
-w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
-v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+w0 = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+w1 = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+v_store_aligned(wsum + j, v_add(w0, vx_load_aligned(wsum + j)));
+v_store_aligned(wsum + j + VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes())));
 v_expand(kb, val0, val2);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
-v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+v_store_aligned(sum_b + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + VTraits<v_float32>::vlanes())));
 v_expand(kg, val0, val3);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
-v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+v_store_aligned(sum_g + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + VTraits<v_float32>::vlanes())));
 v_expand(kr, val0, val4);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
-v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+v_store_aligned(sum_r + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + VTraits<v_float32>::vlanes())));

 v_expand(val1, vall, valh);
-w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
-v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+w0 = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+w1 = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+v_store_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes(), v_add(w0, vx_load_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val2, vall, valh);
-v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val3, vall, valh);
-v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val4, vall, valh);
-v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes())));

 v_load_deinterleave(ksptr2, kb, kg, kr);
 v_expand(v_absdiff(kb, rb), val0, val1);
 v_expand(v_absdiff(kg, rg), val2, val3);
-val0 += val2; val1 += val3;
+val0 = v_add(val0, val2); val1 = v_add(val1, val3);
 v_expand(v_absdiff(kr, rr), val2, val3);
-val0 += val2; val1 += val3;
+val0 = v_add(val0, val2); val1 = v_add(val1, val3);

 v_expand(val0, vall, valh);
-w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
-v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+w0 = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+w1 = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+v_store_aligned(wsum + j, v_add(w0, vx_load_aligned(wsum + j)));
+v_store_aligned(wsum + j + VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes())));
 v_expand(kb, val0, val2);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
-v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+v_store_aligned(sum_b + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + VTraits<v_float32>::vlanes())));
 v_expand(kg, val0, val3);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
-v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+v_store_aligned(sum_g + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + VTraits<v_float32>::vlanes())));
 v_expand(kr, val0, val4);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
-v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+v_store_aligned(sum_r + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + VTraits<v_float32>::vlanes())));

 v_expand(val1, vall, valh);
-w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
-v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+w0 = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+w1 = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+v_store_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes(), v_add(w0, vx_load_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val2, vall, valh);
-v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val3, vall, valh);
-v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val4, vall, valh);
-v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes())));

 v_load_deinterleave(ksptr3, kb, kg, kr);
 v_expand(v_absdiff(kb, rb), val0, val1);
 v_expand(v_absdiff(kg, rg), val2, val3);
-val0 += val2; val1 += val3;
+val0 = v_add(val0, val2); val1 = v_add(val1, val3);
 v_expand(v_absdiff(kr, rr), val2, val3);
-val0 += val2; val1 += val3;
+val0 = v_add(val0, val2); val1 = v_add(val1, val3);

 v_expand(val0, vall, valh);
-w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
-v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+w0 = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+w1 = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+v_store_aligned(wsum + j, v_add(w0, vx_load_aligned(wsum + j)));
+v_store_aligned(wsum + j + VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes())));
 v_expand(kb, val0, val2);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
-v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+v_store_aligned(sum_b + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + VTraits<v_float32>::vlanes())));
 v_expand(kg, val0, val3);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
-v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+v_store_aligned(sum_g + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + VTraits<v_float32>::vlanes())));
 v_expand(kr, val0, val4);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
-v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+v_store_aligned(sum_r + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + VTraits<v_float32>::vlanes())));

 v_expand(val1, vall, valh);
-w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
-v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+w0 = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+w1 = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+v_store_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes(), v_add(w0, vx_load_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val2, vall, valh);
-v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val3, vall, valh);
-v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val4, vall, valh);
-v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes())));
 }
 #endif
 #if CV_SIMD128
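In the three-channel block above, the color-weight LUT index is the sum of per-channel absolute differences; the u8 absdiffs are widened step by step (`v_expand` to u16, then to u32) so the three-channel sum cannot overflow. A scalar model of that distance (sketch):

```cpp
#include <cstdlib>

// Scalar model of the color distance used as the LUT index above (sketch):
static int color_dist(const unsigned char* k, const unsigned char* r)
{
    return std::abs(k[0] - r[0])    // b
         + std::abs(k[1] - r[1])    // g
         + std::abs(k[2] - r[2]);   // r; range 0..765, too wide for u8
}
// per-pixel weight: space_weight[k] * color_weight[color_dist(ksptr, rsptr)]
```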
@@ -442,9 +442,9 @@ public:
 const uchar* ksptr = sptr + space_ofs[k];
 const uchar* rsptr = sptr;
 j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 v_float32 kweight = vx_setall_f32(space_weight[k]);
-for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, ksptr += 3*v_uint8::nlanes, rsptr += 3*v_uint8::nlanes)
+for (; j <= size.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes(), ksptr += 3*VTraits<v_uint8>::vlanes(), rsptr += 3*VTraits<v_uint8>::vlanes())
 {
 v_uint8 kb, kg, kr, rb, rg, rr;
 v_load_deinterleave(ksptr, kb, kg, kr);
@@ -456,39 +456,39 @@ public:
 v_expand(v_absdiff(kr, rr), r_l, r_h);

 v_uint32 val0, val1, val2, val3;
-v_expand(b_l + g_l + r_l, val0, val1);
-v_expand(b_h + g_h + r_h, val2, val3);
+v_expand(v_add(v_add(b_l, g_l), r_l), val0, val1);
+v_expand(v_add(v_add(b_h, g_h), r_h), val2, val3);

 v_expand(kb, b_l, b_h);
 v_expand(kg, g_l, g_h);
 v_expand(kr, r_l, r_h);

-v_float32 w0 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val0));
-v_float32 w1 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val1));
-v_float32 w2 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val2));
-v_float32 w3 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val3));
-v_store_aligned(wsum + j , w0 + vx_load_aligned(wsum + j));
-v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
-v_store_aligned(wsum + j + 2*v_float32::nlanes, w2 + vx_load_aligned(wsum + j + 2*v_float32::nlanes));
-v_store_aligned(wsum + j + 3*v_float32::nlanes, w3 + vx_load_aligned(wsum + j + 3*v_float32::nlanes));
+v_float32 w0 = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(val0)));
+v_float32 w1 = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(val1)));
+v_float32 w2 = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(val2)));
+v_float32 w3 = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(val3)));
+v_store_aligned(wsum + j , v_add(w0, vx_load_aligned(wsum + j)));
+v_store_aligned(wsum + j + VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes())));
+v_store_aligned(wsum + j + 2*VTraits<v_float32>::vlanes(), v_add(w2, vx_load_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(wsum + j + 3*VTraits<v_float32>::vlanes(), v_add(w3, vx_load_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(b_l, val0, val1);
 v_expand(b_h, val2, val3);
 v_store_aligned(sum_b + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_b + j)));
-v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
-v_store_aligned(sum_b + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_b + j + 2*v_float32::nlanes)));
-v_store_aligned(sum_b + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_b + j + 3*v_float32::nlanes)));
+v_store_aligned(sum_b + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_b + j + VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_b + j + 2*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_b + j + 2*VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_b + j + 3*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_b + j + 3*VTraits<v_float32>::vlanes())));
 v_expand(g_l, val0, val1);
 v_expand(g_h, val2, val3);
 v_store_aligned(sum_g + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_g + j)));
-v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
-v_store_aligned(sum_g + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_g + j + 2*v_float32::nlanes)));
-v_store_aligned(sum_g + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_g + j + 3*v_float32::nlanes)));
+v_store_aligned(sum_g + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_g + j + VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_g + j + 2*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_g + j + 2*VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_g + j + 3*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_g + j + 3*VTraits<v_float32>::vlanes())));
 v_expand(r_l, val0, val1);
 v_expand(r_h, val2, val3);
 v_store_aligned(sum_r + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_r + j)));
-v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
-v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_r + j + 2*v_float32::nlanes)));
-v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_r + j + 3*v_float32::nlanes)));
+v_store_aligned(sum_r + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_r + j + VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_r + j + 2*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_r + j + 2*VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_r + j + 3*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_r + j + 3*VTraits<v_float32>::vlanes())));
 }
 #endif
 for(; j < size.width; j++, ksptr += 3, rsptr += 3)
@@ -500,27 +500,27 @@ public:
 }
 }
 j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 v_float32 v_one = vx_setall_f32(1.f);
-for(; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, dptr += 3*v_uint8::nlanes)
+for(; j <= size.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes(), dptr += 3*VTraits<v_uint8>::vlanes())
 {
-v_float32 w0 = v_one / vx_load_aligned(wsum + j);
-v_float32 w1 = v_one / vx_load_aligned(wsum + j + v_float32::nlanes);
-v_float32 w2 = v_one / vx_load_aligned(wsum + j + 2*v_float32::nlanes);
-v_float32 w3 = v_one / vx_load_aligned(wsum + j + 3*v_float32::nlanes);
+v_float32 w0 = v_div(v_one, vx_load_aligned(wsum + j));
+v_float32 w1 = v_div(v_one, vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes()));
+v_float32 w2 = v_div(v_one, vx_load_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes()));
+v_float32 w3 = v_div(v_one, vx_load_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes()));

-v_store_interleave(dptr, v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_b + j)),
-v_round(w1 * vx_load_aligned(sum_b + j + v_float32::nlanes))),
-v_pack(v_round(w2 * vx_load_aligned(sum_b + j + 2*v_float32::nlanes)),
-v_round(w3 * vx_load_aligned(sum_b + j + 3*v_float32::nlanes)))),
-v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_g + j)),
-v_round(w1 * vx_load_aligned(sum_g + j + v_float32::nlanes))),
-v_pack(v_round(w2 * vx_load_aligned(sum_g + j + 2*v_float32::nlanes)),
-v_round(w3 * vx_load_aligned(sum_g + j + 3*v_float32::nlanes)))),
-v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_r + j)),
-v_round(w1 * vx_load_aligned(sum_r + j + v_float32::nlanes))),
-v_pack(v_round(w2 * vx_load_aligned(sum_r + j + 2*v_float32::nlanes)),
-v_round(w3 * vx_load_aligned(sum_r + j + 3*v_float32::nlanes)))));
+v_store_interleave(dptr, v_pack_u(v_pack(v_round(v_mul(w0, vx_load_aligned(sum_b + j))),
+v_round(v_mul(w1, vx_load_aligned(sum_b + j + VTraits<v_float32>::vlanes())))),
+v_pack(v_round(v_mul(w2, vx_load_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes()))),
+v_round(v_mul(w3, vx_load_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes()))))),
+v_pack_u(v_pack(v_round(v_mul(w0, vx_load_aligned(sum_g + j))),
+v_round(v_mul(w1, vx_load_aligned(sum_g + j + VTraits<v_float32>::vlanes())))),
+v_pack(v_round(v_mul(w2, vx_load_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes()))),
+v_round(v_mul(w3, vx_load_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes()))))),
+v_pack_u(v_pack(v_round(v_mul(w0, vx_load_aligned(sum_r + j))),
+v_round(v_mul(w1, vx_load_aligned(sum_r + j + VTraits<v_float32>::vlanes())))),
+v_pack(v_round(v_mul(w2, vx_load_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes()))),
+v_round(v_mul(w3, vx_load_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes()))))));
 }
 #endif
 for(; j < size.width; j++)
@@ -533,7 +533,7 @@ public:
 }
 }
 }
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 vx_cleanup();
 #endif
 }
@@ -589,7 +589,7 @@ public:
 memset(buf.data(), 0, buf.size() * sizeof(float));
 float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH);
 float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH);
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 v_float32 v_one = vx_setall_f32(1.f);
 v_float32 sindex = vx_setall_f32(scale_index);
 #endif
@@ -601,50 +601,50 @@ public:
 const float* ksptr2 = sptr + space_ofs[k + 2];
 const float* ksptr3 = sptr + space_ofs[k + 3];
 j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 v_float32 kweight0 = vx_setall_f32(space_weight[k]);
 v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
 v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
 v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
-for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
 {
 v_float32 rval = vx_load(sptr + j);

 v_float32 val = vx_load(ksptr0 + j);
 v_float32 knan = v_not_nan(val);
-v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+v_float32 alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan);
 v_int32 idx = v_trunc(alpha);
-alpha -= v_cvt_f32(idx);
-v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan;
-v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
-v_float32 v_sum = v_muladd(val & knan, w, vx_load_aligned(sum + j));
+alpha = v_sub(alpha, v_cvt_f32(idx));
+v_float32 w = v_and(v_mul(kweight0, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+v_float32 v_wsum = v_add(vx_load_aligned(wsum + j), w);
+v_float32 v_sum = v_muladd(v_and(val, knan), w, vx_load_aligned(sum + j));

 val = vx_load(ksptr1 + j);
 knan = v_not_nan(val);
-alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan);
 idx = v_trunc(alpha);
-alpha -= v_cvt_f32(idx);
-w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-v_wsum += w;
-v_sum = v_muladd(val & knan, w, v_sum);
+alpha = v_sub(alpha, v_cvt_f32(idx));
+w = v_and(v_mul(kweight1, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+v_wsum = v_add(v_wsum, w);
+v_sum = v_muladd(v_and(val, knan), w, v_sum);

 val = vx_load(ksptr2 + j);
 knan = v_not_nan(val);
-alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan);
 idx = v_trunc(alpha);
-alpha -= v_cvt_f32(idx);
-w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-v_wsum += w;
-v_sum = v_muladd(val & knan, w, v_sum);
+alpha = v_sub(alpha, v_cvt_f32(idx));
+w = v_and(v_mul(kweight2, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+v_wsum = v_add(v_wsum, w);
+v_sum = v_muladd(v_and(val, knan), w, v_sum);

 val = vx_load(ksptr3 + j);
 knan = v_not_nan(val);
-alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan);
 idx = v_trunc(alpha);
-alpha -= v_cvt_f32(idx);
-w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-v_wsum += w;
-v_sum = v_muladd(val & knan, w, v_sum);
+alpha = v_sub(alpha, v_cvt_f32(idx));
+w = v_and(v_mul(kweight3, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+v_wsum = v_add(v_wsum, w);
+v_sum = v_muladd(v_and(val, knan), w, v_sum);

 v_store_aligned(wsum + j, v_wsum);
 v_store_aligned(sum + j, v_sum);
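The float path differs from the u8 path in two ways, both visible above: the range weight is linearly interpolated from `expLUT`, and NaN pixels are masked out (`v_not_nan` yields an all-ones mask for non-NaN lanes, so the `v_and` calls zero their contribution). A scalar model of the weight for one lane (sketch):

```cpp
#include <cmath>

// Scalar model of the float-path weight above (sketch):
static float range_weight(float val, float rval, float kweight,
                          const float* expLUT, float scale_index)
{
    if (std::isnan(val))
        return 0.f;                                    // the "& knan" mask
    float alpha = std::isnan(rval) ? 0.f               // alpha is masked too
                                   : std::abs(val - rval) * scale_index;
    int idx = (int)alpha;                              // v_trunc
    alpha -= (float)idx;
    return kweight * (expLUT[idx] * (1.f - alpha) + expLUT[idx + 1] * alpha);
}
```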
@@ -720,20 +720,20 @@ public:
 {
 const float* ksptr = sptr + space_ofs[k];
 j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 v_float32 kweight = vx_setall_f32(space_weight[k]);
-for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
 {
 v_float32 val = vx_load(ksptr + j);
 v_float32 rval = vx_load(sptr + j);
 v_float32 knan = v_not_nan(val);
-v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+v_float32 alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan);
 v_int32 idx = v_trunc(alpha);
-alpha -= v_cvt_f32(idx);
+alpha = v_sub(alpha, v_cvt_f32(idx));

-v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan;
-v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
-v_store_aligned(sum + j, v_muladd(val & knan, w, vx_load_aligned(sum + j)));
+v_float32 w = v_and(v_mul(kweight, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+v_store_aligned(wsum + j, v_add(vx_load_aligned(wsum + j), w));
+v_store_aligned(sum + j, v_muladd(v_and(val, knan), w, vx_load_aligned(sum + j)));
 }
 #endif
 for (; j < size.width; j++)
@@ -752,11 +752,11 @@ public:
 }
 }
 j = 0;
-#if CV_SIMD
-for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
 {
 v_float32 v_val = vx_load(sptr + j);
-v_store(dptr + j, (vx_load_aligned(sum + j) + (v_val & v_not_nan(v_val))) / (vx_load_aligned(wsum + j) + (v_one & v_not_nan(v_val))));
+v_store(dptr + j, v_div(v_add(vx_load_aligned(sum + j), v_and(v_val, v_not_nan(v_val))), v_add(vx_load_aligned(wsum + j), v_and(v_one, v_not_nan(v_val)))));
 }
 #endif
 for (; j < size.width; j++)
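The final division also has a NaN guard: the center pixel joins the average with weight 1 unless it is NaN, in which case only the accumulated neighborhood is used. A scalar model of the store above (sketch):

```cpp
#include <cmath>

// Scalar model of the final division above (sketch):
static float normalize_px(float val, float sum_j, float wsum_j)
{
    return std::isnan(val) ? sum_j / wsum_j
                           : (sum_j + val) / (wsum_j + 1.f);
}
```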
@@ -774,7 +774,7 @@ public:
 float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH);
 float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH);
 float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH);
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 v_float32 v_one = vx_setall_f32(1.f);
 v_float32 sindex = vx_setall_f32(scale_index);
 #endif
@@ -787,60 +787,60 @@ public:
 const float* ksptr3 = sptr + space_ofs[k+3];
 const float* rsptr = sptr;
 j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 v_float32 kweight0 = vx_setall_f32(space_weight[k]);
 v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
 v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
 v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
-for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, rsptr += 3 * v_float32::nlanes,
-ksptr0 += 3 * v_float32::nlanes, ksptr1 += 3 * v_float32::nlanes, ksptr2 += 3 * v_float32::nlanes, ksptr3 += 3 * v_float32::nlanes)
+for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes(), rsptr += 3 * VTraits<v_float32>::vlanes(),
+ksptr0 += 3 * VTraits<v_float32>::vlanes(), ksptr1 += 3 * VTraits<v_float32>::vlanes(), ksptr2 += 3 * VTraits<v_float32>::vlanes(), ksptr3 += 3 * VTraits<v_float32>::vlanes())
 {
 v_float32 kb, kg, kr, rb, rg, rr;
 v_load_deinterleave(rsptr, rb, rg, rr);

 v_load_deinterleave(ksptr0, kb, kg, kr);
-v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
-v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+v_float32 knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr));
+v_float32 alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan);
 v_int32 idx = v_trunc(alpha);
-alpha -= v_cvt_f32(idx);
-v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
-v_float32 v_sum_b = v_muladd(kb & knan, w, vx_load_aligned(sum_b + j));
-v_float32 v_sum_g = v_muladd(kg & knan, w, vx_load_aligned(sum_g + j));
-v_float32 v_sum_r = v_muladd(kr & knan, w, vx_load_aligned(sum_r + j));
+alpha = v_sub(alpha, v_cvt_f32(idx));
+v_float32 w = v_and(v_mul(kweight0, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+v_float32 v_wsum = v_add(vx_load_aligned(wsum + j), w);
+v_float32 v_sum_b = v_muladd(v_and(kb, knan), w, vx_load_aligned(sum_b + j));
+v_float32 v_sum_g = v_muladd(v_and(kg, knan), w, vx_load_aligned(sum_g + j));
+v_float32 v_sum_r = v_muladd(v_and(kr, knan), w, vx_load_aligned(sum_r + j));

 v_load_deinterleave(ksptr1, kb, kg, kr);
-knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
-alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr));
+alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan);
 idx = v_trunc(alpha);
-alpha -= v_cvt_f32(idx);
-w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-v_wsum += w;
-v_sum_b = v_muladd(kb & knan, w, v_sum_b);
-v_sum_g = v_muladd(kg & knan, w, v_sum_g);
-v_sum_r = v_muladd(kr & knan, w, v_sum_r);
+alpha = v_sub(alpha, v_cvt_f32(idx));
+w = v_and(v_mul(kweight1, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+v_wsum = v_add(v_wsum, w);
+v_sum_b = v_muladd(v_and(kb, knan), w, v_sum_b);
+v_sum_g = v_muladd(v_and(kg, knan), w, v_sum_g);
+v_sum_r = v_muladd(v_and(kr, knan), w, v_sum_r);

 v_load_deinterleave(ksptr2, kb, kg, kr);
-knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
-alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr));
+alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan);
 idx = v_trunc(alpha);
-alpha -= v_cvt_f32(idx);
-w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-v_wsum += w;
-v_sum_b = v_muladd(kb & knan, w, v_sum_b);
-v_sum_g = v_muladd(kg & knan, w, v_sum_g);
-v_sum_r = v_muladd(kr & knan, w, v_sum_r);
+alpha = v_sub(alpha, v_cvt_f32(idx));
+w = v_and(v_mul(kweight2, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+v_wsum = v_add(v_wsum, w);
+v_sum_b = v_muladd(v_and(kb, knan), w, v_sum_b);
+v_sum_g = v_muladd(v_and(kg, knan), w, v_sum_g);
+v_sum_r = v_muladd(v_and(kr, knan), w, v_sum_r);

 v_load_deinterleave(ksptr3, kb, kg, kr);
-knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
-alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr));
+alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan);
 idx = v_trunc(alpha);
-alpha -= v_cvt_f32(idx);
-w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-v_wsum += w;
-v_sum_b = v_muladd(kb & knan, w, v_sum_b);
-v_sum_g = v_muladd(kg & knan, w, v_sum_g);
-v_sum_r = v_muladd(kr & knan, w, v_sum_r);
+alpha = v_sub(alpha, v_cvt_f32(idx));
+w = v_and(v_mul(kweight3, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+v_wsum = v_add(v_wsum, w);
+v_sum_b = v_muladd(v_and(kb, knan), w, v_sum_b);
+v_sum_g = v_muladd(v_and(kg, knan), w, v_sum_g);
+v_sum_r = v_muladd(v_and(kr, knan), w, v_sum_r);

 v_store_aligned(wsum + j, v_wsum);
 v_store_aligned(sum_b + j, v_sum_b);
@ -938,24 +938,24 @@ public:
|
||||
const float* ksptr = sptr + space_ofs[k];
|
||||
const float* rsptr = sptr;
|
||||
j = 0;
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
v_float32 kweight = vx_setall_f32(space_weight[k]);
|
||||
for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, ksptr += 3*v_float32::nlanes, rsptr += 3*v_float32::nlanes)
|
||||
for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes(), ksptr += 3*VTraits<v_float32>::vlanes(), rsptr += 3*VTraits<v_float32>::vlanes())
|
||||
{
|
||||
v_float32 kb, kg, kr, rb, rg, rr;
|
||||
v_load_deinterleave(ksptr, kb, kg, kr);
|
||||
v_load_deinterleave(rsptr, rb, rg, rr);
|
||||
|
||||
v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
|
||||
v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
|
||||
v_float32 knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr));
|
||||
v_float32 alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan);
|
||||
v_int32 idx = v_trunc(alpha);
|
||||
alpha -= v_cvt_f32(idx);
|
||||
alpha = v_sub(alpha, v_cvt_f32(idx));
|
||||
|
||||
v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
|
||||
v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
|
||||
v_store_aligned(sum_b + j, v_muladd(kb & knan, w, vx_load_aligned(sum_b + j)));
|
||||
v_store_aligned(sum_g + j, v_muladd(kg & knan, w, vx_load_aligned(sum_g + j)));
|
||||
v_store_aligned(sum_r + j, v_muladd(kr & knan, w, vx_load_aligned(sum_r + j)));
|
||||
v_float32 w = v_and(v_mul(kweight, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
|
||||
v_store_aligned(wsum + j, v_add(vx_load_aligned(wsum + j), w));
|
||||
v_store_aligned(sum_b + j, v_muladd(v_and(kb, knan), w, vx_load_aligned(sum_b + j)));
|
||||
v_store_aligned(sum_g + j, v_muladd(v_and(kg, knan), w, vx_load_aligned(sum_g + j)));
|
||||
v_store_aligned(sum_r + j, v_muladd(v_and(kr, knan), w, vx_load_aligned(sum_r + j)));
|
||||
}
|
||||
#endif
|
||||
for (; j < size.width; j++, ksptr += 3, rsptr += 3)
|
||||
@ -978,14 +978,14 @@ public:
|
||||
}
|
||||
}
|
||||
j = 0;
|
||||
#if CV_SIMD
|
||||
for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, sptr += 3*v_float32::nlanes, dptr += 3*v_float32::nlanes)
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes(), sptr += 3*VTraits<v_float32>::vlanes(), dptr += 3*VTraits<v_float32>::vlanes())
|
||||
{
|
||||
v_float32 b, g, r;
|
||||
v_load_deinterleave(sptr, b, g, r);
|
||||
v_float32 mask = v_not_nan(b) & v_not_nan(g) & v_not_nan(r);
|
||||
v_float32 w = v_one / (vx_load_aligned(wsum + j) + (v_one & mask));
|
||||
v_store_interleave(dptr, (vx_load_aligned(sum_b + j) + (b & mask)) * w, (vx_load_aligned(sum_g + j) + (g & mask)) * w, (vx_load_aligned(sum_r + j) + (r & mask)) * w);
|
||||
v_float32 mask = v_and(v_and(v_not_nan(b), v_not_nan(g)), v_not_nan(r));
|
||||
v_float32 w = v_div(v_one, v_add(vx_load_aligned(wsum + j), v_and(v_one, mask)));
|
||||
v_store_interleave(dptr, v_mul(v_add(vx_load_aligned(sum_b + j), v_and(b, mask)), w), v_mul(v_add(vx_load_aligned(sum_g + j), v_and(g, mask)), w), v_mul(v_add(vx_load_aligned(sum_r + j), v_and(r, mask)), w));
|
||||
}
|
||||
#endif
|
||||
for (; j < size.width; j++)
|
||||
@ -1011,7 +1011,7 @@ public:
|
||||
}
|
||||
}
|
||||
}
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
vx_cleanup();
|
||||
#endif
|
||||
}
|
||||
|
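The hunks above are the template for the whole patch: arithmetic and bitwise operator overloads (+, -, *, /, &) become named wrappers (v_add, v_mul, v_div, v_and, ...), and the compile-time lane count v_float32::nlanes becomes the run-time query VTraits<v_float32>::vlanes(), because scalable (RVV) vector types are sizeless and support neither overloaded operators nor a constexpr lane count. A minimal, hedged sketch of the same rewrite on a toy kernel (the helper name scaleAdd32f and its buffers are illustrative, not part of the patch):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    // dst[i] += scale * src[i], written against the new Universal Intrinsic API.
    static void scaleAdd32f(const float* src, float* dst, int n, float scale)
    {
        int i = 0;
    #if (CV_SIMD || CV_SIMD_SCALABLE)
        const int vlanes = VTraits<v_float32>::vlanes(); // run-time lane count
        v_float32 vscale = vx_setall_f32(scale);
        for (; i <= n - vlanes; i += vlanes)             // was: v_float32::nlanes
            v_store(dst + i, v_add(v_mul(vx_load(src + i), vscale), vx_load(dst + i)));
    #endif
        for (; i < n; i++)                               // scalar tail
            dst[i] += scale * src[i];
    }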
@ -56,40 +56,38 @@ template<typename _Tp> static inline _Tp splineInterpolate(_Tp x, const _Tp* tab
return ((tab[3]*x + tab[2])*x + tab[1])*x + tab[0];
}

#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)

template<typename _Tp> static inline cv::v_float32 splineInterpolate(const cv::v_float32& x, const _Tp* tab, int n)
{
using namespace cv;
v_int32 ix = v_min(v_max(v_trunc(x), vx_setzero_s32()), vx_setall_s32(n-1));
cv::v_float32 xx = x - v_cvt_f32(ix);
ix = ix << 2;
cv::v_float32 xx = v_sub(x, v_cvt_f32(ix));
ix = v_shl<2>(ix);

v_float32 t[4];
v_float32 t0, t1, t2, t3;
// assume that v_float32::nlanes == v_int32::nlanes
if(v_float32::nlanes == 4)
if(VTraits<v_float32>::vlanes() == 4)
{
#if CV_SIMD_WIDTH == 16
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx[4];
v_store_aligned(idx, ix);
v_float32x4 tt[4];
tt[0] = v_load(tab + idx[0]);
tt[1] = v_load(tab + idx[1]);
tt[2] = v_load(tab + idx[2]);
tt[3] = v_load(tab + idx[3]);
v_transpose4x4(tt[0], tt[1], tt[2], tt[3],
t[0], t[1], t[2], t[3]);
#endif
v_float32 tt0, tt1, tt2, tt3;
tt0 = vx_load(tab + idx[0]);
tt1 = vx_load(tab + idx[1]);
tt2 = vx_load(tab + idx[2]);
tt3 = vx_load(tab + idx[3]);
v_transpose4x4(tt0, tt1, tt2, tt3,
t0, t1, t2, t3);
}
else
{
t[0] = v_lut(tab + 0, ix);
t[1] = v_lut(tab + 1, ix);
t[2] = v_lut(tab + 2, ix);
t[3] = v_lut(tab + 3, ix);
t0 = v_lut(tab + 0, ix);
t1 = v_lut(tab + 1, ix);
t2 = v_lut(tab + 2, ix);
t3 = v_lut(tab + 3, ix);
}

return v_fma(v_fma(v_fma(t[3], xx, t[2]), xx, t[1]), xx, t[0]);
return v_fma(v_fma(v_fma(t3, xx, t2), xx, t1), xx, t0);
}

#endif
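The converted helper gathers the four polynomial coefficients with v_lut on the general path and keeps the transpose-based load only when vlanes() is exactly 4. A hedged usage sketch (xbuf/ybuf/evalSpline are assumptions; the table layout of 4 floats per segment mirrors the scalar overload above):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    static void evalSpline(const float* xbuf, float* ybuf, const float* tab, int n)
    {
        v_float32 xs = vx_load(xbuf);                 // one query point per lane
        v_float32 ys = splineInterpolate(xs, tab, n); // ((t3*x+t2)*x+t1)*x+t0 per lane
        v_store(ybuf, ys);
    }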
@ -207,8 +205,8 @@ struct RGB2XYZ_f<float>
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
int i = 0;
#if CV_SIMD
const int vsize = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_float32>::vlanes();
v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2);
v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5);
v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8);
@ -226,9 +224,9 @@ struct RGB2XYZ_f<float>
}

v_float32 x, y, z;
x = v_fma(b, vc0, v_fma(g, vc1, r*vc2));
y = v_fma(b, vc3, v_fma(g, vc4, r*vc5));
z = v_fma(b, vc6, v_fma(g, vc7, r*vc8));
x = v_fma(b, vc0, v_fma(g, vc1, v_mul(r, vc2)));
y = v_fma(b, vc3, v_fma(g, vc4, v_mul(r, vc5)));
z = v_fma(b, vc6, v_fma(g, vc7, v_mul(r, vc8)));

v_store_interleave(dst, x, y, z);
}
@ -313,8 +311,8 @@ struct RGB2XYZ_i<uchar>
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];

#if CV_SIMD
const int vsize = v_uint8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint8>::vlanes();
int descaleShift = 1 << (shift-1);
v_int16 vdescale = vx_setall_s16((short)descaleShift);
v_int16 cxbg, cxr1, cybg, cyr1, czbg, czr1;
@ -349,27 +347,36 @@ struct RGB2XYZ_i<uchar>
sg0 = v_reinterpret_as_s16(g0); sg1 = v_reinterpret_as_s16(g1);
sb0 = v_reinterpret_as_s16(b0); sb1 = v_reinterpret_as_s16(b1);

v_int16 bg[4], rd[4];
v_zip(sb0, sg0, bg[0], bg[1]);
v_zip(sb1, sg1, bg[2], bg[3]);
v_zip(sr0, vdescale, rd[0], rd[1]);
v_zip(sr1, vdescale, rd[2], rd[3]);
v_int16 bg0, bg1, bg2, bg3, rd0, rd1, rd2, rd3;
v_zip(sb0, sg0, bg0, bg1);
v_zip(sb1, sg1, bg2, bg3);
v_zip(sr0, vdescale, rd0, rd1);
v_zip(sr1, vdescale, rd2, rd3);

v_uint32 vx[4], vy[4], vz[4];
for(int j = 0; j < 4; j++)
{
vx[j] = v_reinterpret_as_u32(v_dotprod(bg[j], cxbg) + v_dotprod(rd[j], cxr1)) >> shift;
vy[j] = v_reinterpret_as_u32(v_dotprod(bg[j], cybg) + v_dotprod(rd[j], cyr1)) >> shift;
vz[j] = v_reinterpret_as_u32(v_dotprod(bg[j], czbg) + v_dotprod(rd[j], czr1)) >> shift;
}
v_uint32 vx0, vx1, vx2, vx3;
v_uint32 vy0, vy1, vy2, vy3;
v_uint32 vz0, vz1, vz2, vz3;

vx0 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg0, cxbg), v_dotprod(rd0, cxr1))));
vy0 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg0, cybg), v_dotprod(rd0, cyr1))));
vz0 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg0, czbg), v_dotprod(rd0, czr1))));
vx1 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg1, cxbg), v_dotprod(rd1, cxr1))));
vy1 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg1, cybg), v_dotprod(rd1, cyr1))));
vz1 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg1, czbg), v_dotprod(rd1, czr1))));
vx2 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg2, cxbg), v_dotprod(rd2, cxr1))));
vy2 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg2, cybg), v_dotprod(rd2, cyr1))));
vz2 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg2, czbg), v_dotprod(rd2, czr1))));
vx3 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg3, cxbg), v_dotprod(rd3, cxr1))));
vy3 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg3, cybg), v_dotprod(rd3, cyr1))));
vz3 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg3, czbg), v_dotprod(rd3, czr1))));

v_uint16 x0, x1, y0, y1, z0, z1;
x0 = v_pack(vx[0], vx[1]);
x1 = v_pack(vx[2], vx[3]);
y0 = v_pack(vy[0], vy[1]);
y1 = v_pack(vy[2], vy[3]);
z0 = v_pack(vz[0], vz[1]);
z1 = v_pack(vz[2], vz[3]);
x0 = v_pack(vx0, vx1);
x1 = v_pack(vx2, vx3);
y0 = v_pack(vy0, vy1);
y1 = v_pack(vy2, vy3);
z0 = v_pack(vz0, vz1);
z1 = v_pack(vz2, vz3);

v_uint8 x, y, z;
x = v_pack(x0, x1);
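Two constraints force the unrolled form above: arrays of scalable vector types (v_uint32 vx[4]) are ill-formed because the types are sizeless, and the shift count of v_shr must be a compile-time constant when spelled as a template argument. A hedged sketch of the idiom in isolation (descale_pair is a hypothetical name):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    static v_uint16 descale_pair(const v_uint32& s0, const v_uint32& s1)
    {
        v_uint32 acc0 = v_shr<4>(s0); // shift amount is a template argument
        v_uint32 acc1 = v_shr<4>(s1); // named scalars replace "v_uint32 acc[2];"
        return v_pack(acc0, acc1);    // narrow 2x u32 -> u16
    }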
@ -424,8 +431,8 @@ struct RGB2XYZ_i<ushort>
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
#if CV_SIMD
const int vsize = v_uint16::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint16>::vlanes();
const int descaleShift = 1 << (shift-1);
v_int16 vdescale = vx_setall_s16(descaleShift);
v_int16 vc0 = vx_setall_s16((short)C0), vc1 = vx_setall_s16((short)C1), vc2 = vx_setall_s16((short)C2);
@ -464,29 +471,29 @@ struct RGB2XYZ_i<ushort>
v_int16 ymr, ymg, ymb;
v_int16 zmr, zmg, zmb;

v_int16 mr = sr < zero, mg = sg < zero, mb = sb < zero;
v_int16 mr = v_lt(sr, zero), mg = v_lt(sg, zero), mb = v_lt(sb, zero);

xmb = mb & vc0;
xmg = mg & vc1;
xmr = mr & vc2;
ymb = mb & vc3;
ymg = mg & vc4;
ymr = mr & vc5;
zmb = mb & vc6;
zmg = mg & vc7;
zmr = mr & vc8;
xmb = v_and(mb, vc0);
xmg = v_and(mg, vc1);
xmr = v_and(mr, vc2);
ymb = v_and(mb, vc3);
ymg = v_and(mg, vc4);
ymr = v_and(mr, vc5);
zmb = v_and(mb, vc6);
zmg = v_and(mg, vc7);
zmr = v_and(mr, vc8);

v_int32 xfix0, xfix1, yfix0, yfix1, zfix0, zfix1;
v_expand(xmr + xmg + xmb, xfix0, xfix1);
v_expand(ymr + ymg + ymb, yfix0, yfix1);
v_expand(zmr + zmg + zmb, zfix0, zfix1);
v_expand(v_add(v_add(xmr, xmg), xmb), xfix0, xfix1);
v_expand(v_add(v_add(ymr, ymg), ymb), yfix0, yfix1);
v_expand(v_add(v_add(zmr, zmg), zmb), zfix0, zfix1);

xfix0 = xfix0 << 16;
xfix1 = xfix1 << 16;
yfix0 = yfix0 << 16;
yfix1 = yfix1 << 16;
zfix0 = zfix0 << 16;
zfix1 = zfix1 << 16;
xfix0 = v_shl<16>(xfix0);
xfix1 = v_shl<16>(xfix1);
yfix0 = v_shl<16>(yfix0);
yfix1 = v_shl<16>(yfix1);
zfix0 = v_shl<16>(zfix0);
zfix1 = v_shl<16>(zfix1);

v_int16 bg0, bg1, rd0, rd1;
v_zip(sb, sg, bg0, bg1);
@ -494,12 +501,12 @@ struct RGB2XYZ_i<ushort>

v_uint32 x0, x1, y0, y1, z0, z1;

x0 = v_reinterpret_as_u32(v_dotprod(bg0, cxbg) + v_dotprod(rd0, cxr1) + xfix0) >> shift;
x1 = v_reinterpret_as_u32(v_dotprod(bg1, cxbg) + v_dotprod(rd1, cxr1) + xfix1) >> shift;
y0 = v_reinterpret_as_u32(v_dotprod(bg0, cybg) + v_dotprod(rd0, cyr1) + yfix0) >> shift;
y1 = v_reinterpret_as_u32(v_dotprod(bg1, cybg) + v_dotprod(rd1, cyr1) + yfix1) >> shift;
z0 = v_reinterpret_as_u32(v_dotprod(bg0, czbg) + v_dotprod(rd0, czr1) + zfix0) >> shift;
z1 = v_reinterpret_as_u32(v_dotprod(bg1, czbg) + v_dotprod(rd1, czr1) + zfix1) >> shift;
x0 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg0, cxbg), v_dotprod(rd0, cxr1)), xfix0)));
x1 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg1, cxbg), v_dotprod(rd1, cxr1)), xfix1)));
y0 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg0, cybg), v_dotprod(rd0, cyr1)), yfix0)));
y1 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg1, cybg), v_dotprod(rd1, cyr1)), yfix1)));
z0 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg0, czbg), v_dotprod(rd0, czr1)), zfix0)));
z1 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg1, czbg), v_dotprod(rd1, czr1)), zfix1)));

v_uint16 x, y, z;
x = v_pack(x0, x1);
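Comparisons lose their operator overloads as well: sr < zero becomes v_lt(sr, zero), which yields an all-ones mask in lanes where the predicate holds. A hedged sketch of the sign-fix idiom above (negative_correction is a hypothetical name):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    static v_int16 negative_correction(const v_int16& vals, const v_int16& coeff)
    {
        v_int16 zero = vx_setall_s16(0);
        v_int16 mask = v_lt(vals, zero); // all-ones where vals < 0
        return v_and(mask, coeff);       // keep the coefficient only in those lanes
    }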
@ -593,8 +600,8 @@ struct XYZ2RGB_f<float>
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
int i = 0;
#if CV_SIMD
const int vsize = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_float32>::vlanes();
v_float32 valpha = vx_setall_f32(alpha);
v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2);
v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5);
@ -606,9 +613,9 @@ struct XYZ2RGB_f<float>
v_load_deinterleave(src, x, y, z);

v_float32 b, g, r;
b = v_fma(x, vc0, v_fma(y, vc1, z*vc2));
g = v_fma(x, vc3, v_fma(y, vc4, z*vc5));
r = v_fma(x, vc6, v_fma(y, vc7, z*vc8));
b = v_fma(x, vc0, v_fma(y, vc1, v_mul(z, vc2)));
g = v_fma(x, vc3, v_fma(y, vc4, v_mul(z, vc5)));
r = v_fma(x, vc6, v_fma(y, vc7, v_mul(z, vc8)));

if(dcn == 4)
{
@ -707,8 +714,8 @@ struct XYZ2RGB_i<uchar>
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
#if CV_SIMD
const int vsize = v_uint8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint8>::vlanes();
const int descaleShift = 1 << (shift - 1);
v_uint8 valpha = vx_setall_u8(alpha);
v_int16 vdescale = vx_setall_s16(descaleShift);
@ -739,25 +746,35 @@ struct XYZ2RGB_i<uchar>
z0 = v_reinterpret_as_s16(uz0);
z1 = v_reinterpret_as_s16(uz1);

v_int32 b[4], g[4], r[4];
v_int32 bb0, bb1, bb2, bb3,
gg0, gg1, gg2, gg3,
rr0, rr1, rr2, rr3;

v_int16 xy[4], zd[4];
v_zip(x0, y0, xy[0], xy[1]);
v_zip(x1, y1, xy[2], xy[3]);
v_zip(z0, vdescale, zd[0], zd[1]);
v_zip(z1, vdescale, zd[2], zd[3]);
v_int16 xy0, xy1, xy2, xy3;
v_int16 zd0, zd1, zd2, zd3;

for(int j = 0; j < 4; j++)
{
b[j] = (v_dotprod(xy[j], cbxy) + v_dotprod(zd[j], cbz1)) >> shift;
g[j] = (v_dotprod(xy[j], cgxy) + v_dotprod(zd[j], cgz1)) >> shift;
r[j] = (v_dotprod(xy[j], crxy) + v_dotprod(zd[j], crz1)) >> shift;
}
v_zip(x0, y0, xy0, xy1);
v_zip(x1, y1, xy2, xy3);
v_zip(z0, vdescale, zd0, zd1);
v_zip(z1, vdescale, zd2, zd3);

bb0 = v_shr<shift>(v_add(v_dotprod(xy0, cbxy), v_dotprod(zd0, cbz1)));
gg0 = v_shr<shift>(v_add(v_dotprod(xy0, cgxy), v_dotprod(zd0, cgz1)));
rr0 = v_shr<shift>(v_add(v_dotprod(xy0, crxy), v_dotprod(zd0, crz1)));
bb1 = v_shr<shift>(v_add(v_dotprod(xy1, cbxy), v_dotprod(zd1, cbz1)));
gg1 = v_shr<shift>(v_add(v_dotprod(xy1, cgxy), v_dotprod(zd1, cgz1)));
rr1 = v_shr<shift>(v_add(v_dotprod(xy1, crxy), v_dotprod(zd1, crz1)));
bb2 = v_shr<shift>(v_add(v_dotprod(xy2, cbxy), v_dotprod(zd2, cbz1)));
gg2 = v_shr<shift>(v_add(v_dotprod(xy2, cgxy), v_dotprod(zd2, cgz1)));
rr2 = v_shr<shift>(v_add(v_dotprod(xy2, crxy), v_dotprod(zd2, crz1)));
bb3 = v_shr<shift>(v_add(v_dotprod(xy3, cbxy), v_dotprod(zd3, cbz1)));
gg3 = v_shr<shift>(v_add(v_dotprod(xy3, cgxy), v_dotprod(zd3, cgz1)));
rr3 = v_shr<shift>(v_add(v_dotprod(xy3, crxy), v_dotprod(zd3, crz1)));

v_uint16 b0, b1, g0, g1, r0, r1;
b0 = v_pack_u(b[0], b[1]); b1 = v_pack_u(b[2], b[3]);
g0 = v_pack_u(g[0], g[1]); g1 = v_pack_u(g[2], g[3]);
r0 = v_pack_u(r[0], r[1]); r1 = v_pack_u(r[2], r[3]);
b0 = v_pack_u(bb0, bb1); b1 = v_pack_u(bb2, bb3);
g0 = v_pack_u(gg0, gg1); g1 = v_pack_u(gg2, gg3);
r0 = v_pack_u(rr0, rr1); r1 = v_pack_u(rr2, rr3);

v_uint8 bb, gg, rr;
bb = v_pack(b0, b1);
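The recurring dot-product-and-descale step reads more verbosely under the new API: v_dotprod multiplies adjacent s16 pairs into s32 lanes, then a rounding constant is added and the result is shifted back down. A hedged standalone sketch (dot_descale and the precision value are assumptions, not values from the patch):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    static v_int32 dot_descale(const v_int16& ab, const v_int16& coeffs)
    {
        const int shift = 12;                    // assumed fixed-point precision
        v_int32 prod = v_dotprod(ab, coeffs);    // a*c0 + b*c1 per adjacent pair
        v_int32 round = vx_setall_s32(1 << (shift - 1));
        return v_shr<shift>(v_add(prod, round)); // rounded descale
    }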
@ -820,8 +837,8 @@ struct XYZ2RGB_i<ushort>
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
#if CV_SIMD
const int vsize = v_uint16::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint16>::vlanes();
const int descaleShift = 1 << (shift-1);
v_uint16 valpha = vx_setall_u16(alpha);
v_int16 vdescale = vx_setall_s16(descaleShift);
@ -850,30 +867,30 @@ struct XYZ2RGB_i<ushort>
sz = v_reinterpret_as_s16(z);

// fixing 16bit signed multiplication
v_int16 mx = sx < zero, my = sy < zero, mz = sz < zero;
v_int16 mx = v_lt(sx, zero), my = v_lt(sy, zero), mz = v_lt(sz, zero);

v_int16 bmx, bmy, bmz;
v_int16 gmx, gmy, gmz;
v_int16 rmx, rmy, rmz;

bmx = mx & vc0;
bmy = my & vc1;
bmz = mz & vc2;
gmx = mx & vc3;
gmy = my & vc4;
gmz = mz & vc5;
rmx = mx & vc6;
rmy = my & vc7;
rmz = mz & vc8;
bmx = v_and(mx, vc0);
bmy = v_and(my, vc1);
bmz = v_and(mz, vc2);
gmx = v_and(mx, vc3);
gmy = v_and(my, vc4);
gmz = v_and(mz, vc5);
rmx = v_and(mx, vc6);
rmy = v_and(my, vc7);
rmz = v_and(mz, vc8);

v_int32 bfix0, bfix1, gfix0, gfix1, rfix0, rfix1;
v_expand(bmx + bmy + bmz, bfix0, bfix1);
v_expand(gmx + gmy + gmz, gfix0, gfix1);
v_expand(rmx + rmy + rmz, rfix0, rfix1);
v_expand(v_add(v_add(bmx, bmy), bmz), bfix0, bfix1);
v_expand(v_add(v_add(gmx, gmy), gmz), gfix0, gfix1);
v_expand(v_add(v_add(rmx, rmy), rmz), rfix0, rfix1);

bfix0 = bfix0 << 16; bfix1 = bfix1 << 16;
gfix0 = gfix0 << 16; gfix1 = gfix1 << 16;
rfix0 = rfix0 << 16; rfix1 = rfix1 << 16;
bfix0 = v_shl<16>(bfix0); bfix1 = v_shl<16>(bfix1);
gfix0 = v_shl<16>(gfix0); gfix1 = v_shl<16>(gfix1);
rfix0 = v_shl<16>(rfix0); rfix1 = v_shl<16>(rfix1);

v_int16 xy0, xy1, zd0, zd1;
v_zip(sx, sy, xy0, xy1);
@ -881,12 +898,12 @@ struct XYZ2RGB_i<ushort>

v_int32 b0, b1, g0, g1, r0, r1;

b0 = (v_dotprod(xy0, cbxy) + v_dotprod(zd0, cbz1) + bfix0) >> shift;
b1 = (v_dotprod(xy1, cbxy) + v_dotprod(zd1, cbz1) + bfix1) >> shift;
g0 = (v_dotprod(xy0, cgxy) + v_dotprod(zd0, cgz1) + gfix0) >> shift;
g1 = (v_dotprod(xy1, cgxy) + v_dotprod(zd1, cgz1) + gfix1) >> shift;
r0 = (v_dotprod(xy0, crxy) + v_dotprod(zd0, crz1) + rfix0) >> shift;
r1 = (v_dotprod(xy1, crxy) + v_dotprod(zd1, crz1) + rfix1) >> shift;
b0 = v_shr<shift>(v_add(v_add(v_dotprod(xy0, cbxy), v_dotprod(zd0, cbz1)), bfix0));
b1 = v_shr<shift>(v_add(v_add(v_dotprod(xy1, cbxy), v_dotprod(zd1, cbz1)), bfix1));
g0 = v_shr<shift>(v_add(v_add(v_dotprod(xy0, cgxy), v_dotprod(zd0, cgz1)), gfix0));
g1 = v_shr<shift>(v_add(v_add(v_dotprod(xy1, cgxy), v_dotprod(zd1, cgz1)), gfix1));
r0 = v_shr<shift>(v_add(v_add(v_dotprod(xy0, crxy), v_dotprod(zd0, crz1)), rfix0));
r1 = v_shr<shift>(v_add(v_add(v_dotprod(xy1, crxy), v_dotprod(zd1, crz1)), rfix1));

v_uint16 b, g, r;
b = v_pack_u(b0, b1); g = v_pack_u(g0, g1); r = v_pack_u(r0, r1);
@ -1452,19 +1469,19 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin
#undef DOT_SHIFT_PACK
}

#elif CV_SIMD
#elif CV_SIMD // Fixed size v_int16x8 used below, CV_SIMD_SCALABLE is disabled.

// inValues are in [0; LAB_BASE]
static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint16& inY, const v_uint16& inZ,
const int16_t* LUT,
v_uint16& outA, v_uint16& outB, v_uint16& outC)
{
const int vsize = v_uint16::nlanes;
const int vsize = VTraits<v_uint16>::max_nlanes;

// LUT idx of origin pt of cube
v_uint16 tx = inX >> (lab_base_shift - lab_lut_shift);
v_uint16 ty = inY >> (lab_base_shift - lab_lut_shift);
v_uint16 tz = inZ >> (lab_base_shift - lab_lut_shift);
v_uint16 tx = v_shr<lab_base_shift - lab_lut_shift>(inX);
v_uint16 ty = v_shr<lab_base_shift - lab_lut_shift>(inY);
v_uint16 tz = v_shr<lab_base_shift - lab_lut_shift>(inZ);

v_uint32 btmp00, btmp01, btmp10, btmp11, btmp20, btmp21;
v_uint32 baseIdx0, baseIdx1;
@ -1472,8 +1489,8 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1
v_mul_expand(tx, vx_setall_u16(3*8), btmp00, btmp01);
v_mul_expand(ty, vx_setall_u16(3*8*LAB_LUT_DIM), btmp10, btmp11);
v_mul_expand(tz, vx_setall_u16(3*8*LAB_LUT_DIM*LAB_LUT_DIM), btmp20, btmp21);
baseIdx0 = btmp00 + btmp10 + btmp20;
baseIdx1 = btmp01 + btmp11 + btmp21;
baseIdx0 = v_add(v_add(btmp00, btmp10), btmp20);
baseIdx1 = v_add(v_add(btmp01, btmp11), btmp21);

uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vbaseIdx[vsize];
v_store_aligned(vbaseIdx + 0*vsize/2, baseIdx0);
@ -1482,9 +1499,9 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1
// fracX, fracY, fracZ are [0; TRILINEAR_BASE)
const uint16_t bitMask = (1 << trilinear_shift) - 1;
v_uint16 bitMaskReg = vx_setall_u16(bitMask);
v_uint16 fracX = (inX >> (lab_base_shift - 8 - 1)) & bitMaskReg;
v_uint16 fracY = (inY >> (lab_base_shift - 8 - 1)) & bitMaskReg;
v_uint16 fracZ = (inZ >> (lab_base_shift - 8 - 1)) & bitMaskReg;
v_uint16 fracX = v_and(v_shr<lab_base_shift - 8 - 1>(inX), bitMaskReg);
v_uint16 fracY = v_and(v_shr<lab_base_shift - 8 - 1>(inY), bitMaskReg);
v_uint16 fracZ = v_and(v_shr<lab_base_shift - 8 - 1>(inZ), bitMaskReg);

// trilinearIdx = 8*x + 8*TRILINEAR_BASE*y + 8*TRILINEAR_BASE*TRILINEAR_BASE*z
v_uint32 trilinearIdx0, trilinearIdx1;
@ -1493,8 +1510,8 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1
v_expand(fracY, fracY0, fracY1);
v_expand(fracZ, fracZ0, fracZ1);

trilinearIdx0 = (fracX0 << 3) + (fracY0 << (3+trilinear_shift)) + (fracZ0 << (3+trilinear_shift*2));
trilinearIdx1 = (fracX1 << 3) + (fracY1 << (3+trilinear_shift)) + (fracZ1 << (3+trilinear_shift*2));
trilinearIdx0 = v_add(v_add(v_shl<3>(fracX0), v_shl<3 + trilinear_shift>(fracY0)), v_shl<3 + trilinear_shift * 2>(fracZ0));
trilinearIdx1 = v_add(v_add(v_shl<3>(fracX1), v_shl<3 + trilinear_shift>(fracY1)), v_shl<3 + trilinear_shift * 2>(fracZ1));

uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vtrilinearIdx[vsize];
v_store_aligned(vtrilinearIdx + 0*vsize/2, trilinearIdx0);
@ -1528,12 +1545,12 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1

// CV_DESCALE
const v_uint32 descaleShift = vx_setall_u32(1 << (trilinear_shift*3 - 1));
a0 = (a0 + descaleShift) >> (trilinear_shift*3);
a1 = (a1 + descaleShift) >> (trilinear_shift*3);
b0 = (b0 + descaleShift) >> (trilinear_shift*3);
b1 = (b1 + descaleShift) >> (trilinear_shift*3);
c0 = (c0 + descaleShift) >> (trilinear_shift*3);
c1 = (c1 + descaleShift) >> (trilinear_shift*3);
a0 = v_shr<trilinear_shift * 3>(v_add(a0, descaleShift));
a1 = v_shr<trilinear_shift * 3>(v_add(a1, descaleShift));
b0 = v_shr<trilinear_shift * 3>(v_add(b0, descaleShift));
b1 = v_shr<trilinear_shift * 3>(v_add(b1, descaleShift));
c0 = v_shr<trilinear_shift * 3>(v_add(c0, descaleShift));
c1 = v_shr<trilinear_shift * 3>(v_add(c1, descaleShift));

outA = v_pack(a0, a1); outB = v_pack(b0, b1); outC = v_pack(c0, c1);
}
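Note the vsize change in this overload: VTraits<v_uint16>::max_nlanes is a compile-time upper bound on the lane count, which is what sizing a stack scratch array requires, while vlanes() is the count actually stored at run time. Hedged sketch (first_lane is a hypothetical name):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    static uint32_t first_lane(const v_uint32& v)
    {
        uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[VTraits<v_uint32>::max_nlanes];
        v_store_aligned(buf, v); // writes vlanes() elements, never more than max_nlanes
        return buf[0];
    }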
@ -49,6 +49,15 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
namespace {
//constants for conversion from/to RGB and YUV, YCrCb according to BT.601

#if CV_SIMD_SCALABLE
template <class T>
static void swap(T&a, T&b) {
T t = a;
a = b;
b = t;
}
#endif
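The file-local swap() exists apparently because scalable vector types do not meet the requirements of std::swap; later call sites therefore use the unqualified name so that this helper is chosen under CV_SIMD_SCALABLE and std::swap applies otherwise. Hedged usage sketch (demo_swap is a hypothetical name):

    #if CV_SIMD_SCALABLE
    static void demo_swap()
    {
        v_uint8 a = vx_setall_u8(1), b = vx_setall_u8(2);
        swap(a, b); // unqualified call resolves to the local template above
    }
    #endif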

//to YCbCr
static const float YCBF = 0.564f; // == 1/2/(1-B2YF)
static const float YCRF = 0.713f; // == 1/2/(1-R2YF)
@ -143,11 +152,11 @@ struct RGB2YCrCb_f<float>
float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];

int i = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2);
v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4);
v_float32 vdelta = vx_setall_f32(delta);
const int vsize = v_float32::nlanes;
const int vsize = VTraits<v_float32>::vlanes();
for( ; i <= n-vsize;
i += vsize, src += vsize*scn, dst += vsize*3)
{
@ -162,13 +171,13 @@ struct RGB2YCrCb_f<float>
}

v_float32 y, cr, cb;
y = v_fma(b, vc0, v_fma(g, vc1, r*vc2));
y = v_fma(b, vc0, v_fma(g, vc1, v_mul(r, vc2)));

if(bidx)
std::swap(r, b);
swap(r, b);

cr = v_fma(r - y, vc3, vdelta);
cb = v_fma(b - y, vc4, vdelta);
cr = v_fma(v_sub(r, y), vc3, vdelta);
cb = v_fma(v_sub(b, y), vc4, vdelta);

if(yuvOrder)
{
@ -266,8 +275,8 @@ struct RGB2YCrCb_i<ushort>
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
int sdelta = ColorChannel<ushort>::half()*(1 << shift);
int i = 0;
#if CV_SIMD
const int vsize = v_uint16::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint16>::vlanes();
const int descale = 1 << (shift-1);

v_int16 b2y = vx_setall_s16((short)C0);
@ -312,13 +321,13 @@ struct RGB2YCrCb_i<ushort>

// fixing 16bit signed multiplication
v_int16 mr, mg, mb;
mr = (sr < z) & r2y;
mg = (sg < z) & g2y;
mb = (sb < z) & b2y;
v_int16 fixmul = v_add_wrap(mr, v_add_wrap(mg, mb)) << fix_shift;
mr = v_and(v_lt(sr, z), r2y);
mg = v_and(v_lt(sg, z), g2y);
mb = v_and(v_lt(sb, z), b2y);
v_int16 fixmul = v_shl(v_add_wrap(mr, v_add_wrap(mg, mb)), fix_shift);

v_int32 ssy0 = (v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)) >> shift;
v_int32 ssy1 = (v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)) >> shift;
v_int32 ssy0 = v_shr(v_add(v_dotprod(bg0, bg2y), v_dotprod(rd0, r12y)), shift);
v_int32 ssy1 = v_shr(v_add(v_dotprod(bg1, bg2y), v_dotprod(rd1, r12y)), shift);

y = v_reinterpret_as_u16(v_add_wrap(v_pack(ssy0, ssy1), fixmul));

@ -340,15 +349,15 @@ struct RGB2YCrCb_i<ushort>
v_int32 sy0 = v_reinterpret_as_s32(uy0);
v_int32 sy1 = v_reinterpret_as_s32(uy1);

sr0 = sr0 - sy0; sr1 = sr1 - sy1;
sb0 = sb0 - sy0; sb1 = sb1 - sy1;
sr0 = v_sub(sr0, sy0); sr1 = v_sub(sr1, sy1);
sb0 = v_sub(sb0, sy0); sb1 = v_sub(sb1, sy1);

v_int32 v_scr0, v_scr1, v_scb0, v_scb1;

v_scr0 = (sr0*vc3 + vdd) >> shift;
v_scr1 = (sr1*vc3 + vdd) >> shift;
v_scb0 = (sb0*vc4 + vdd) >> shift;
v_scb1 = (sb1*vc4 + vdd) >> shift;
v_scr0 = v_shr(v_add(v_mul(sr0, vc3), vdd), shift);
v_scr1 = v_shr(v_add(v_mul(sr1, vc3), vdd), shift);
v_scb0 = v_shr(v_add(v_mul(sb0, vc4), vdd), shift);
v_scb1 = v_shr(v_add(v_mul(sb1, vc4), vdd), shift);

// saturate and pack
cr = v_pack_u(v_scr0, v_scr1);
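Both shift spellings appear in this hunk: v_shl(x, fix_shift) and v_shr(x, shift) take a run-time count, while the template forms used elsewhere in the patch (v_shl<15>, v_shr<16>) take the count as a compile-time argument. Hedged sketch (shift_forms is a hypothetical name):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    static void shift_forms(const v_int32& a, int n, v_int32& imm, v_int32& run)
    {
        imm = v_shr<4>(a); // immediate form: count is a template argument
        run = v_shr(a, n); // run-time form: count is an ordinary parameter
    }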
@ -407,8 +416,8 @@ struct RGB2YCrCb_i<uchar>
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
int delta = ColorChannel<uchar>::half()*(1 << shift);

#if CV_SIMD
const int vsize = v_uint8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint8>::vlanes();
const int descaleShift = 1 << (shift-1);
v_int16 bg2y;
v_int16 r12y;
@ -458,10 +467,10 @@ struct RGB2YCrCb_i<uchar>
v_zip(sr0, vdescale, rd00, rd01);
v_zip(sr1, vdescale, rd10, rd11);

y00 = v_reinterpret_as_u32(v_dotprod(bg00, bg2y) + v_dotprod(rd00, r12y)) >> shift;
y01 = v_reinterpret_as_u32(v_dotprod(bg01, bg2y) + v_dotprod(rd01, r12y)) >> shift;
y10 = v_reinterpret_as_u32(v_dotprod(bg10, bg2y) + v_dotprod(rd10, r12y)) >> shift;
y11 = v_reinterpret_as_u32(v_dotprod(bg11, bg2y) + v_dotprod(rd11, r12y)) >> shift;
y00 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg00, bg2y), v_dotprod(rd00, r12y))), shift);
y01 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg01, bg2y), v_dotprod(rd01, r12y))), shift);
y10 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg10, bg2y), v_dotprod(rd10, r12y))), shift);
y11 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg11, bg2y), v_dotprod(rd11, r12y))), shift);
}

v_uint16 y0, y1;
@ -512,15 +521,15 @@ struct RGB2YCrCb_i<uchar>

v_uint8 cr, cb;

cr00 = cr00 >> shift;
cr01 = cr01 >> shift;
cr10 = cr10 >> shift;
cr11 = cr11 >> shift;
cr00 = v_shr(cr00, shift);
cr01 = v_shr(cr01, shift);
cr10 = v_shr(cr10, shift);
cr11 = v_shr(cr11, shift);

cb00 = cb00 >> shift;
cb01 = cb01 >> shift;
cb10 = cb10 >> shift;
cb11 = cb11 >> shift;
cb00 = v_shr(cb00, shift);
cb01 = v_shr(cb01, shift);
cb10 = v_shr(cb10, shift);
cb11 = v_shr(cb11, shift);

v_int16 cr0, cr1, cb0, cb1;
cr0 = v_pack(cr00, cr01); cr1 = v_pack(cr10, cr11);
@ -623,12 +632,12 @@ struct YCrCb2RGB_f<float>
float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];

int i = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1);
v_float32 vc2 = vx_setall_f32(C2), vc3 = vx_setall_f32(C3);
v_float32 vdelta = vx_setall_f32(delta);
v_float32 valpha = vx_setall_f32(alpha);
const int vsize = v_float32::nlanes;
const int vsize = VTraits<v_float32>::vlanes();
for( ; i <= n-vsize;
i += vsize, src += vsize*3, dst += vsize*dcn)
{
@ -640,7 +649,7 @@ struct YCrCb2RGB_f<float>

v_float32 b, g, r;

cb -= vdelta; cr -= vdelta;
cb = v_sub(cb, vdelta); cr = v_sub(cr, vdelta);
b = v_fma(cb, vc3, y);
g = v_fma(cr, vc1, v_fma(cb, vc2, y));
r = v_fma(cr, vc0, y);
@ -746,8 +755,8 @@ struct YCrCb2RGB_i<uchar>
const uchar delta = ColorChannel<uchar>::half(), alpha = ColorChannel<uchar>::max();
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];

#if CV_SIMD
const int vsize = v_uint8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint8>::vlanes();
v_uint8 valpha = vx_setall_u8(alpha);
v_uint8 vdelta = vx_setall_u8(delta);
const int descaleShift = 1 << (shift - 1);
@ -794,8 +803,8 @@ struct YCrCb2RGB_i<uchar>
v_int32 cb00, cb01, cb10, cb11;
v_expand(v_scb0, cb00, cb01);
v_expand(v_scb1, cb10, cb11);
b00 += cb00 << 15; b01 += cb01 << 15;
b10 += cb10 << 15; b11 += cb11 << 15;
b00 = v_add(b00, v_shl<15>(cb00)); b01 = v_add(b01, v_shl<15>(cb01));
b10 = v_add(b10, v_shl<15>(cb10)); b11 = v_add(b11, v_shl<15>(cb11));
}

v_int32 t00, t01, t10, t11;
@ -803,17 +812,17 @@ struct YCrCb2RGB_i<uchar>
v_mul_expand(v_scb1, vc2, t10, t11);
v_mul_expand(v_scr0, vc1, g00, g01);
v_mul_expand(v_scr1, vc1, g10, g11);
g00 += t00; g01 += t01;
g10 += t10; g11 += t11;
g00 = v_add(g00, t00); g01 = v_add(g01, t01);
g10 = v_add(g10, t10); g11 = v_add(g11, t11);
v_mul_expand(v_scr0, vc0, r00, r01);
v_mul_expand(v_scr1, vc0, r10, r11);

b00 = (b00 + vdescale) >> shift; b01 = (b01 + vdescale) >> shift;
b10 = (b10 + vdescale) >> shift; b11 = (b11 + vdescale) >> shift;
g00 = (g00 + vdescale) >> shift; g01 = (g01 + vdescale) >> shift;
g10 = (g10 + vdescale) >> shift; g11 = (g11 + vdescale) >> shift;
r00 = (r00 + vdescale) >> shift; r01 = (r01 + vdescale) >> shift;
r10 = (r10 + vdescale) >> shift; r11 = (r11 + vdescale) >> shift;
b00 = v_shr(v_add(b00, vdescale), shift); b01 = v_shr(v_add(b01, vdescale), shift);
b10 = v_shr(v_add(b10, vdescale), shift); b11 = v_shr(v_add(b11, vdescale), shift);
g00 = v_shr(v_add(g00, vdescale), shift); g01 = v_shr(v_add(g01, vdescale), shift);
g10 = v_shr(v_add(g10, vdescale), shift); g11 = v_shr(v_add(g11, vdescale), shift);
r00 = v_shr(v_add(r00, vdescale), shift); r01 = v_shr(v_add(r01, vdescale), shift);
r10 = v_shr(v_add(r10, vdescale), shift); r11 = v_shr(v_add(r11, vdescale), shift);

v_int16 b0, b1, g0, g1, r0, r1;
b0 = v_pack(b00, b01); b1 = v_pack(b10, b11);
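v_mul_expand keeps the full 32-bit product of 16-bit lanes in two halves; the rounding constant plus right shift then descales back to pixel range, which is the chain rewritten above. Hedged sketch (mul_descale is a hypothetical name):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    static void mul_descale(const v_int16& a, const v_int16& coeff, int shift,
                            v_int32& lo, v_int32& hi)
    {
        v_int32 vdescale = vx_setall_s32(1 << (shift - 1));
        v_mul_expand(a, coeff, lo, hi);         // exact 16x16 -> 32-bit products
        lo = v_shr(v_add(lo, vdescale), shift); // rounded descale, low half
        hi = v_shr(v_add(hi, vdescale), shift); // rounded descale, high half
    }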
@ -897,8 +906,8 @@ struct YCrCb2RGB_i<ushort>
const ushort delta = ColorChannel<ushort>::half(), alpha = ColorChannel<ushort>::max();
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];

#if CV_SIMD
const int vsize = v_uint16::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint16>::vlanes();
const int descaleShift = 1 << (shift-1);
v_uint16 valpha = vx_setall_u16(alpha);
v_uint16 vdelta = vx_setall_u16(delta);
@ -939,22 +948,22 @@ struct YCrCb2RGB_i<ushort>
// so we fix the multiplication
v_int32 cb0, cb1;
v_expand(scb, cb0, cb1);
b0 += cb0 << 15;
b1 += cb1 << 15;
b0 = v_add(b0, v_shl<15>(cb0));
b1 = v_add(b1, v_shl<15>(cb1));
}
v_int32 t0, t1;
v_mul_expand(scb, vc2, t0, t1);
v_mul_expand(scr, vc1, g0, g1);
g0 += t0; g1 += t1;
g0 = v_add(g0, t0); g1 = v_add(g1, t1);
v_mul_expand(scr, vc0, r0, r1);

// shifted term doesn't fit into 16 bits, addition is to be done in 32 bits
b0 = ((b0 + vdescale) >> shift) + y0;
b1 = ((b1 + vdescale) >> shift) + y1;
g0 = ((g0 + vdescale) >> shift) + y0;
g1 = ((g1 + vdescale) >> shift) + y1;
r0 = ((r0 + vdescale) >> shift) + y0;
r1 = ((r1 + vdescale) >> shift) + y1;
b0 = v_add(v_shr(v_add(b0, vdescale), shift), y0);
b1 = v_add(v_shr(v_add(b1, vdescale), shift), y1);
g0 = v_add(v_shr(v_add(g0, vdescale), shift), y0);
g1 = v_add(v_shr(v_add(g1, vdescale), shift), y1);
r0 = v_add(v_shr(v_add(r0, vdescale), shift), y0);
r1 = v_add(v_shr(v_add(r1, vdescale), shift), y1);

// saturate and pack
v_uint16 b, g, r;
@ -1038,11 +1047,11 @@ static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, i
buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu;
}

#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
v_int32 (&ruv)[4],
v_int32 (&guv)[4],
v_int32 (&buv)[4])
v_int32 &ruv0, v_int32 &ruv1, v_int32 &ruv2, v_int32 &ruv3,
v_int32 &guv0, v_int32 &guv1, v_int32 &guv2, v_int32 &guv3,
v_int32 &buv0, v_int32 &buv1, v_int32 &buv2, v_int32 &buv3)
{
v_uint8 v128 = vx_setall_u8(128);
v_int8 su = v_reinterpret_as_s8(v_sub_wrap(u, v128));
@ -1051,9 +1060,10 @@ static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
v_int16 uu0, uu1, vv0, vv1;
v_expand(su, uu0, uu1);
v_expand(sv, vv0, vv1);
v_int32 uu[4], vv[4];
v_expand(uu0, uu[0], uu[1]); v_expand(uu1, uu[2], uu[3]);
v_expand(vv0, vv[0], vv[1]); v_expand(vv1, vv[2], vv[3]);
v_int32 uuu0, uuu1, uuu2, uuu3;
v_int32 vvv0, vvv1, vvv2, vvv3;
v_expand(uu0, uuu0, uuu1); v_expand(uu1, uuu2, uuu3);
v_expand(vv0, vvv0, vvv1); v_expand(vv1, vvv2, vvv3);

v_int32 vshift = vx_setall_s32(1 << (ITUR_BT_601_SHIFT - 1));
v_int32 vr = vx_setall_s32(ITUR_BT_601_CVR);
@ -1061,12 +1071,15 @@ static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
v_int32 ug = vx_setall_s32(ITUR_BT_601_CUG);
v_int32 ub = vx_setall_s32(ITUR_BT_601_CUB);

for (int k = 0; k < 4; k++)
{
ruv[k] = vshift + vr * vv[k];
guv[k] = vshift + vg * vv[k] + ug * uu[k];
buv[k] = vshift + ub * uu[k];
}
auto process_uv = [&](v_int32& ruv, v_int32& guv, v_int32& buv, const v_int32& vv, const v_int32& uu) {
ruv = v_add(vshift, v_mul(vr, vv));
guv = v_add(v_add(vshift, v_mul(vg, vv)), v_mul(ug, uu));
buv = v_add(vshift, v_mul(ub, uu));
};
process_uv(ruv0, guv0, buv0, vvv0, uuu0);
process_uv(ruv1, guv1, buv1, vvv1, uuu1);
process_uv(ruv2, guv2, buv2, vvv2, uuu2);
process_uv(ruv3, guv3, buv3, vvv3, uuu3);
}
#endif

@ -1081,44 +1094,48 @@ static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, co
a = uchar(0xff);
}

#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline void yRGBuvToRGBA(const v_uint8& vy,
const v_int32 (&ruv)[4],
const v_int32 (&guv)[4],
const v_int32 (&buv)[4],
const v_int32 &ruv0, const v_int32 &ruv1, const v_int32 &ruv2, const v_int32 &ruv3,
const v_int32 &guv0, const v_int32 &guv1, const v_int32 &guv2, const v_int32 &guv3,
const v_int32 &buv0, const v_int32 &buv1, const v_int32 &buv2, const v_int32 &buv3,
v_uint8& rr, v_uint8& gg, v_uint8& bb)
{
v_uint8 v16 = vx_setall_u8(16);
v_uint8 posY = vy - v16;
v_uint8 posY = v_sub(vy, v16);
v_uint16 yy0, yy1;
v_expand(posY, yy0, yy1);
v_int32 yy[4];
v_int32 yy00, yy01, yy10, yy11;
v_expand(v_reinterpret_as_s16(yy0), yy[0], yy[1]);
v_expand(v_reinterpret_as_s16(yy1), yy[2], yy[3]);
v_int32 yyy0, yyy1, yyy2, yyy3;
v_expand(v_reinterpret_as_s16(yy0), yyy0, yyy1);
v_expand(v_reinterpret_as_s16(yy1), yyy2, yyy3);

v_int32 vcy = vx_setall_s32(ITUR_BT_601_CY);

v_int32 y[4], r[4], g[4], b[4];
for(int k = 0; k < 4; k++)
{
y[k] = yy[k]*vcy;
r[k] = (y[k] + ruv[k]) >> ITUR_BT_601_SHIFT;
g[k] = (y[k] + guv[k]) >> ITUR_BT_601_SHIFT;
b[k] = (y[k] + buv[k]) >> ITUR_BT_601_SHIFT;
}
v_int32 y0, y1, y2, y3, r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3;

v_int16 r0, r1, g0, g1, b0, b1;
r0 = v_pack(r[0], r[1]);
r1 = v_pack(r[2], r[3]);
g0 = v_pack(g[0], g[1]);
g1 = v_pack(g[2], g[3]);
b0 = v_pack(b[0], b[1]);
b1 = v_pack(b[2], b[3]);
auto process_yrgb = [&](const v_int32& yy, v_int32& y, v_int32& r, v_int32& g, v_int32& b,
const v_int32& ruv, const v_int32& guv, const v_int32& buv) {
y = v_mul(yy, vcy);
r = v_shr(v_add(y, ruv), ITUR_BT_601_SHIFT);
g = v_shr(v_add(y, guv), ITUR_BT_601_SHIFT);
b = v_shr(v_add(y, buv), ITUR_BT_601_SHIFT);
};
process_yrgb(yyy0, y0, r0, g0, b0, ruv0, guv0, buv0);
process_yrgb(yyy1, y1, r1, g1, b1, ruv1, guv1, buv1);
process_yrgb(yyy2, y2, r2, g2, b2, ruv2, guv2, buv2);
process_yrgb(yyy3, y3, r3, g3, b3, ruv3, guv3, buv3);

rr = v_pack_u(r0, r1);
gg = v_pack_u(g0, g1);
bb = v_pack_u(b0, b1);
v_int16 _r0, _r1, _g0, _g1, _b0, _b1;
_r0 = v_pack(r0, r1);
_r1 = v_pack(r2, r3);
_g0 = v_pack(g0, g1);
_g1 = v_pack(g2, g3);
_b0 = v_pack(b0, b1);
_b1 = v_pack(b2, b3);

rr = v_pack_u(_r0, _r1);
gg = v_pack_u(_g0, _g1);
bb = v_pack_u(_b0, _b1);
}
#endif

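Since sizeless types cannot form arrays, the (&ruv)[4] reference-to-array parameters become twelve scalar references, and the former for-loops become lambdas applied once per element. Hedged sketch of the unrolling pattern (unrolled/step are hypothetical names):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    static void unrolled(const v_int32& a0, const v_int32& a1,
                         v_int32& o0, v_int32& o1)
    {
        auto step = [&](const v_int32& a, v_int32& o) { o = v_shl<1>(a); };
        step(a0, o0); // manual unroll replaces "for (int k = 0; k < 2; k++)"
        step(a1, o1);
    }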
@ -1201,8 +1218,8 @@ struct YUV420sp2RGB8Invoker : ParallelLoopBody
const uchar* y2 = y1 + my1_step;

int i = 0;
#if CV_SIMD
const int vsize = v_uint8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint8>::vlanes();
v_uint8 a = vx_setall_u8(uchar(0xff));
for( ; i <= width - 2*vsize;
i += 2*vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2)
@ -1215,36 +1232,50 @@ struct YUV420sp2RGB8Invoker : ParallelLoopBody
swap(u, v);
}

v_uint8 vy[4];
v_load_deinterleave(y1 + i, vy[0], vy[1]);
v_load_deinterleave(y2 + i, vy[2], vy[3]);
v_uint8 vy0, vy1, vy2, vy3;
v_load_deinterleave(y1 + i, vy0, vy1);
v_load_deinterleave(y2 + i, vy2, vy3);

v_int32 ruv[4], guv[4], buv[4];
uvToRGBuv(u, v, ruv, guv, buv);
v_int32 ruv0, ruv1, ruv2, ruv3,
guv0, guv1, guv2, guv3,
buv0, buv1, buv2, buv3;
uvToRGBuv(u, v,
ruv0, ruv1, ruv2, ruv3,
guv0, guv1, guv2, guv3,
buv0, buv1, buv2, buv3);

v_uint8 r[4], g[4], b[4];
v_uint8 r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3;

for(int k = 0; k < 4; k++)
{
yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]);
}
auto call_yRGBuvToRGBA = [&](const v_uint8& vy, v_uint8& r, v_uint8& g, v_uint8& b) {
yRGBuvToRGBA(vy,
ruv0, ruv1, ruv2, ruv3,
guv0, guv1, guv2, guv3,
buv0, buv1, buv2, buv3,
r, g, b);
};
call_yRGBuvToRGBA(vy0, r0, g0, b0);
call_yRGBuvToRGBA(vy1, r1, g1, b1);
call_yRGBuvToRGBA(vy2, r2, g2, b2);
call_yRGBuvToRGBA(vy3, r3, g3, b3);

if(bIdx)
{
for(int k = 0; k < 4; k++)
swap(r[k], b[k]);
swap(r0, b0);
swap(r1, b1);
swap(r2, b2);
swap(r3, b3);
}

// [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...]
v_uint8 r0_0, r0_1, r1_0, r1_1;
v_zip(r[0], r[1], r0_0, r0_1);
v_zip(r[2], r[3], r1_0, r1_1);
v_zip(r0, r1, r0_0, r0_1);
v_zip(r2, r3, r1_0, r1_1);
v_uint8 g0_0, g0_1, g1_0, g1_1;
v_zip(g[0], g[1], g0_0, g0_1);
v_zip(g[2], g[3], g1_0, g1_1);
v_zip(g0, g1, g0_0, g0_1);
v_zip(g2, g3, g1_0, g1_1);
v_uint8 b0_0, b0_1, b1_0, b1_1;
v_zip(b[0], b[1], b0_0, b0_1);
v_zip(b[2], b[3], b1_0, b1_1);
v_zip(b0, b1, b0_0, b0_1);
v_zip(b2, b3, b1_0, b1_1);

if(dcn == 4)
{
@ -1319,8 +1350,8 @@ struct YUV420p2RGB8Invoker : ParallelLoopBody
const uchar* y2 = y1 + stride;
int i = 0;

#if CV_SIMD
const int vsize = v_uint8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint8>::vlanes();
v_uint8 a = vx_setall_u8(uchar(0xff));
for( ; i <= width/2 - vsize;
i += vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2)
@ -1329,36 +1360,50 @@ struct YUV420p2RGB8Invoker : ParallelLoopBody
u = vx_load(u1 + i);
v = vx_load(v1 + i);

v_uint8 vy[4];
v_load_deinterleave(y1 + 2*i, vy[0], vy[1]);
v_load_deinterleave(y2 + 2*i, vy[2], vy[3]);
v_uint8 vy0, vy1, vy2, vy3;
v_load_deinterleave(y1 + 2*i, vy0, vy1);
v_load_deinterleave(y2 + 2*i, vy2, vy3);

v_int32 ruv[4], guv[4], buv[4];
uvToRGBuv(u, v, ruv, guv, buv);
v_int32 ruv0, ruv1, ruv2, ruv3,
guv0, guv1, guv2, guv3,
buv0, buv1, buv2, buv3;
uvToRGBuv(u, v,
ruv0, ruv1, ruv2, ruv3,
guv0, guv1, guv2, guv3,
buv0, buv1, buv2, buv3);

v_uint8 r[4], g[4], b[4];
v_uint8 r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3;

for(int k = 0; k < 4; k++)
{
yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]);
}
auto call_yRGBuvToRGBA = [&](const v_uint8& vy, v_uint8& r, v_uint8& g, v_uint8& b) {
yRGBuvToRGBA(vy,
ruv0, ruv1, ruv2, ruv3,
guv0, guv1, guv2, guv3,
buv0, buv1, buv2, buv3,
r, g, b);
};
call_yRGBuvToRGBA(vy0, r0, g0, b0);
call_yRGBuvToRGBA(vy1, r1, g1, b1);
call_yRGBuvToRGBA(vy2, r2, g2, b2);
call_yRGBuvToRGBA(vy3, r3, g3, b3);

if(bIdx)
{
for(int k = 0; k < 4; k++)
swap(r[k], b[k]);
swap(r0, b0);
swap(r1, b1);
swap(r2, b2);
swap(r3, b3);
}

// [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...]
v_uint8 r0_0, r0_1, r1_0, r1_1;
v_zip(r[0], r[1], r0_0, r0_1);
v_zip(r[2], r[3], r1_0, r1_1);
v_zip(r0, r1, r0_0, r0_1);
v_zip(r2, r3, r1_0, r1_1);
v_uint8 g0_0, g0_1, g1_0, g1_1;
v_zip(g[0], g[1], g0_0, g0_1);
v_zip(g[2], g[3], g1_0, g1_1);
v_zip(g0, g1, g0_0, g0_1);
v_zip(g2, g3, g1_0, g1_1);
v_uint8 b0_0, b0_1, b1_0, b1_1;
v_zip(b[0], b[1], b0_0, b0_1);
v_zip(b[2], b[3], b1_0, b1_1);
v_zip(b0, b1, b0_0, b0_1);
v_zip(b2, b3, b1_0, b1_1);

if(dcn == 4)
{
@ -1430,7 +1475,7 @@ static inline uchar rgbToY42x(uchar r, uchar g, uchar b)
return saturate_cast<uchar>(yy >> ITUR_BT_601_SHIFT);
}

#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint8& b)
{
const int shifted16 = (16 << ITUR_BT_601_SHIFT);
@ -1440,25 +1485,25 @@ static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint
v_expand(g, g0, g1);
v_expand(b, b0, b1);

v_uint32 rq[4], gq[4], bq[4];
v_expand(r0, rq[0], rq[1]); v_expand(r1, rq[2], rq[3]);
v_expand(g0, gq[0], gq[1]); v_expand(g1, gq[2], gq[3]);
v_expand(b0, bq[0], bq[1]); v_expand(b1, bq[2], bq[3]);
v_uint32 rq0, rq1, rq2, rq3, gq0, gq1, gq2, gq3, bq0, bq1, bq2, bq3;
v_expand(r0, rq0, rq1); v_expand(r1, rq2, rq3);
v_expand(g0, gq0, gq1); v_expand(g1, gq2, gq3);
v_expand(b0, bq0, bq1); v_expand(b1, bq2, bq3);

v_uint32 ry = vx_setall_u32(ITUR_BT_601_CRY), gy = vx_setall_u32(ITUR_BT_601_CGY);
v_uint32 by = vx_setall_u32(ITUR_BT_601_CBY), shift = vx_setall_u32(halfShift + shifted16);

v_uint32 y[4];
for(int k = 0; k < 4; k++)
{
y[k] = (rq[k]*ry + gq[k]*gy + bq[k]*by + shift) >> ITUR_BT_601_SHIFT;
}
v_uint32 y0, y1, y2, y3;
y0 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(rq0, ry), v_mul(gq0, gy)), v_mul(bq0, by)), shift));
y1 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(rq1, ry), v_mul(gq1, gy)), v_mul(bq1, by)), shift));
y2 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(rq2, ry), v_mul(gq2, gy)), v_mul(bq2, by)), shift));
y3 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(rq3, ry), v_mul(gq3, gy)), v_mul(bq3, by)), shift));

v_uint16 y0, y1;
y0 = v_pack(y[0], y[1]);
y1 = v_pack(y[2], y[3]);
v_uint16 _y0, _y1;
_y0 = v_pack(y0, y1);
_y1 = v_pack(y2, y3);

return v_pack(y0, y1);
return v_pack(_y0, _y1);
}
#endif

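The tail of rgbToY42x shows the narrowing chain that many of these kernels end with: four u32 accumulators pack to two u16 vectors, which pack to one u8 vector. Hedged sketch (pack_u32x4_to_u8 is a hypothetical name):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    static v_uint8 pack_u32x4_to_u8(const v_uint32& a, const v_uint32& b,
                                    const v_uint32& c, const v_uint32& d)
    {
        v_uint16 lo = v_pack(a, b); // u32 -> u16, saturating
        v_uint16 hi = v_pack(c, d);
        return v_pack(lo, hi);      // u16 -> u8
    }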
@ -1473,27 +1518,27 @@ static inline void rgbToUV42x(uchar r, uchar g, uchar b, uchar& u, uchar& v)
v = saturate_cast<uchar>(vv >> ITUR_BT_601_SHIFT);
}

#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint8& g0, const v_uint8& g1,
const v_uint8& b0, const v_uint8& b1, v_uint8& u, v_uint8& v)
{
// [r0, r1, r2, r3,..] => [r0, 0, r2, 0,..]
v_int16 vlowByte = vx_setall_s16(0x00ff);
v_int16 rd0, rd1, gd0, gd1, bd0, bd1;
rd0 = v_reinterpret_as_s16(r0) & vlowByte;
rd1 = v_reinterpret_as_s16(r1) & vlowByte;
gd0 = v_reinterpret_as_s16(g0) & vlowByte;
gd1 = v_reinterpret_as_s16(g1) & vlowByte;
bd0 = v_reinterpret_as_s16(b0) & vlowByte;
bd1 = v_reinterpret_as_s16(b1) & vlowByte;
rd0 = v_and(v_reinterpret_as_s16(r0), vlowByte);
rd1 = v_and(v_reinterpret_as_s16(r1), vlowByte);
gd0 = v_and(v_reinterpret_as_s16(g0), vlowByte);
gd1 = v_and(v_reinterpret_as_s16(g1), vlowByte);
bd0 = v_and(v_reinterpret_as_s16(b0), vlowByte);
bd1 = v_and(v_reinterpret_as_s16(b1), vlowByte);

v_int32 rq[4], gq[4], bq[4];
v_expand(rd0, rq[0], rq[1]);
v_expand(rd1, rq[2], rq[3]);
v_expand(gd0, gq[0], gq[1]);
v_expand(gd1, gq[2], gq[3]);
v_expand(bd0, bq[0], bq[1]);
v_expand(bd1, bq[2], bq[3]);
v_int32 rq0, rq1, rq2, rq3, gq0, gq1, gq2, gq3, bq0, bq1, bq2, bq3;
v_expand(rd0, rq0, rq1);
v_expand(rd1, rq2, rq3);
v_expand(gd0, gq0, gq1);
v_expand(gd1, gq2, gq3);
v_expand(bd0, bq0, bq1);
v_expand(bd1, bq2, bq3);

const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1));
const int shifted128 = (128 << ITUR_BT_601_SHIFT);
@ -1505,18 +1550,21 @@ static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint
bu = vx_setall_s32(ITUR_BT_601_CBU);
bv = vx_setall_s32(ITUR_BT_601_CBV);

v_int32 uq[4], vq[4];
for(int k = 0; k < 4; k++)
{
uq[k] = (ru*rq[k] + gu*gq[k] + bu*bq[k] + shift) >> ITUR_BT_601_SHIFT;
vq[k] = (bu*rq[k] + gv*gq[k] + bv*bq[k] + shift) >> ITUR_BT_601_SHIFT;
}
v_int32 uq0, uq1, uq2, uq3, vq0, vq1, vq2, vq3;
uq0 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(ru, rq0), v_mul(gu, gq0)), v_mul(bu, bq0)), shift));
vq0 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(bu, rq0), v_mul(gv, gq0)), v_mul(bv, bq0)), shift));
uq1 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(ru, rq1), v_mul(gu, gq1)), v_mul(bu, bq1)), shift));
vq1 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(bu, rq1), v_mul(gv, gq1)), v_mul(bv, bq1)), shift));
uq2 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(ru, rq2), v_mul(gu, gq2)), v_mul(bu, bq2)), shift));
vq2 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(bu, rq2), v_mul(gv, gq2)), v_mul(bv, bq2)), shift));
uq3 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(ru, rq3), v_mul(gu, gq3)), v_mul(bu, bq3)), shift));
vq3 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(bu, rq3), v_mul(gv, gq3)), v_mul(bv, bq3)), shift));

v_int16 u0, u1, v0, v1;
u0 = v_pack(uq[0], uq[1]);
u1 = v_pack(uq[2], uq[3]);
v0 = v_pack(vq[0], vq[1]);
v1 = v_pack(vq[2], vq[3]);
u0 = v_pack(uq0, uq1);
u1 = v_pack(uq2, uq3);
v0 = v_pack(vq0, vq1);
v1 = v_pack(vq2, vq3);

u = v_pack_u(u0, u1);
v = v_pack_u(v0, v1);
@ -1559,8 +1607,8 @@ struct RGB8toYUV420pInvoker: public ParallelLoopBody
}
}
int i = 0;
#if CV_SIMD
const int vsize = v_uint8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint8>::vlanes();

for( ; i <= w/2 - vsize;
i += vsize)
@ -1708,47 +1756,61 @@ struct YUV422toRGB8Invoker : ParallelLoopBody
{
uchar* row = dst_data + dst_step * j;
int i = 0;
#if CV_SIMD
const int vsize = v_uint8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint8>::vlanes();
v_uint8 a = vx_setall_u8(uchar(0xff));
for(; i <= 2*width - 4*vsize;
i += 4*vsize, row += vsize*dcn*2)
{
v_uint8 u, v, vy[2];
v_uint8 u, v, vy0, vy1;
if(yIdx == 1) // UYVY
{
v_load_deinterleave(yuv_src + i, u, vy[0], v, vy[1]);
v_load_deinterleave(yuv_src + i, u, vy0, v, vy1);
}
else // YUYV or YVYU
{
v_load_deinterleave(yuv_src + i, vy[0], u, vy[1], v);
v_load_deinterleave(yuv_src + i, vy0, u, vy1, v);
if(uIdx == 1) // YVYU
{
swap(u, v);
}
}

v_int32 ruv[4], guv[4], buv[4];
uvToRGBuv(u, v, ruv, guv, buv);
v_int32 ruv0, ruv1, ruv2, ruv3,
guv0, guv1, guv2, guv3,
buv0, buv1, buv2, buv3;
uvToRGBuv(u, v,
ruv0, ruv1, ruv2, ruv3,
guv0, guv1, guv2, guv3,
buv0, buv1, buv2, buv3);

v_uint8 r[2], g[2], b[2];
v_uint8 r0, r1, g0, g1, b0, b1;

yRGBuvToRGBA(vy[0], ruv, guv, buv, r[0], g[0], b[0]);
yRGBuvToRGBA(vy[1], ruv, guv, buv, r[1], g[1], b[1]);

yRGBuvToRGBA(vy0,
ruv0, ruv1, ruv2, ruv3,
guv0, guv1, guv2, guv3,
buv0, buv1, buv2, buv3,
r0, g0, b0);
yRGBuvToRGBA(vy1,
ruv0, ruv1, ruv2, ruv3,
guv0, guv1, guv2, guv3,
buv0, buv1, buv2, buv3,
r1, g1, b1);

if(bIdx)
{
swap(r[0], b[0]);
swap(r[1], b[1]);
swap(r0, b0);
swap(r1, b1);
}

// [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...]
v_uint8 r0_0, r0_1;
v_zip(r[0], r[1], r0_0, r0_1);
v_zip(r0, r1, r0_0, r0_1);
v_uint8 g0_0, g0_1;
v_zip(g[0], g[1], g0_0, g0_1);
v_zip(g0, g1, g0_0, g0_1);
v_uint8 b0_0, b0_1;
v_zip(b[0], b[1], b0_0, b0_1);
v_zip(b0, b1, b0_0, b0_1);

if(dcn == 4)
{
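The interleaving logic above is the inverse of the load: v_load_deinterleave splits the packed YUYV stream into per-component vectors, and v_zip merges the even-pixel and odd-pixel results back into pixel order. A minimal sketch under the new API (interleave_pair is a hypothetical helper name):

#if (CV_SIMD || CV_SIMD_SCALABLE)
// r0 holds pixels 0,2,4,... and r1 pixels 1,3,5,... of one row segment;
// v_zip restores pixel order: lo = r0[0],r1[0],r0[1],r1[1],...
static inline void interleave_pair(const cv::v_uint8& r0, const cv::v_uint8& r1,
                                   cv::v_uint8& lo, cv::v_uint8& hi)
{
    cv::v_zip(r0, r1, lo, hi);
}
#endif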
File diff suppressed because it is too large
@ -1156,13 +1156,13 @@ public:

for(; x < numCols; ++x )
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_uint8 v_zero = vx_setzero_u8();

for(; x <= numCols - 2*v_uint8::nlanes; x += 2*v_uint8::nlanes) {
v_uint8 v_edge1 = (vx_load(edgeData + x ) != v_zero);
v_uint8 v_edge2 = (vx_load(edgeData + x + v_uint8::nlanes) != v_zero);
for(; x <= numCols - 2*VTraits<v_uint8>::vlanes(); x += 2*VTraits<v_uint8>::vlanes()) {
v_uint8 v_edge1 = (v_ne(vx_load(edgeData + x), v_zero));
v_uint8 v_edge2 = (v_ne(vx_load(edgeData + x + VTraits<v_uint8>::vlanes()), v_zero));

if(v_check_any(v_edge1))
{
@ -1172,7 +1172,7 @@ public:

if(v_check_any(v_edge2))
{
x += v_uint8::nlanes + v_scan_forward(v_edge2);
x += VTraits<v_uint8>::vlanes() + v_scan_forward(v_edge2);
goto _next_step;
}
}
@ -1183,7 +1183,7 @@ public:

if(x == numCols)
continue;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
_next_step:
#endif
float vx, vy;
@ -1514,7 +1514,7 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointList>::filterCircles(const Po
int nzCount = 0;
const Point* nz_ = &nz[0];
int j = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
const v_float32 v_minRadius2 = vx_setall_f32(minRadius2);
const v_float32 v_maxRadius2 = vx_setall_f32(maxRadius2);
@ -1522,9 +1522,9 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointList>::filterCircles(const Po
v_float32 v_curCenterX = vx_setall_f32(curCenter.x);
v_float32 v_curCenterY = vx_setall_f32(curCenter.y);

float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[v_float32::nlanes];
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[v_int32::nlanes];
for(; j <= nzSz - v_float32::nlanes; j += v_float32::nlanes)
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[VTraits<v_float32>::max_nlanes];
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[VTraits<v_int32>::max_nlanes];
for(; j <= nzSz - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
{
v_float32 v_nzX, v_nzY;
v_load_deinterleave((const float*)&nz_[j], v_nzX, v_nzY); // FIXIT use proper datatype
@ -1532,16 +1532,16 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointList>::filterCircles(const Po
v_float32 v_x = v_cvt_f32(v_reinterpret_as_s32(v_nzX));
v_float32 v_y = v_cvt_f32(v_reinterpret_as_s32(v_nzY));

v_float32 v_dx = v_x - v_curCenterX;
v_float32 v_dy = v_y - v_curCenterY;
v_float32 v_dx = v_sub(v_x, v_curCenterX);
v_float32 v_dy = v_sub(v_y, v_curCenterY);

v_float32 v_r2 = (v_dx * v_dx) + (v_dy * v_dy);
v_float32 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2);
v_float32 v_r2 = v_add(v_mul(v_dx, v_dx), v_mul(v_dy, v_dy));
v_float32 vmask = v_and(v_le(v_minRadius2, v_r2), v_le(v_r2, v_maxRadius2));
if (v_check_any(vmask))
{
v_store_aligned(rmask, v_reinterpret_as_s32(vmask));
v_store_aligned(rbuf, v_r2);
for (int i = 0; i < v_int32::nlanes; ++i)
for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
if (rmask[i]) ddata[nzCount++] = rbuf[i];
}
}
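The pattern above — compare, test the whole mask with v_check_any, then compact matching lanes through small aligned buffers — generalises. Below is a sketch under the new API; buffers are sized with max_nlanes so they fit any runtime vector length on scalable targets, and compact_in_range is a hypothetical name for illustration only (out is assumed large enough for all matches).

#include <opencv2/core/hal/intrin.hpp>

#if (CV_SIMD || CV_SIMD_SCALABLE)
static int compact_in_range(const float* src, int n, float lo2, float hi2, float* out)
{
    using namespace cv;
    int cnt = 0, i = 0;
    const v_float32 vlo = vx_setall_f32(lo2), vhi = vx_setall_f32(hi2);
    float CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[VTraits<v_float32>::max_nlanes];
    int   CV_DECL_ALIGNED(CV_SIMD_WIDTH) msk[VTraits<v_int32>::max_nlanes];
    for (; i <= n - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes())
    {
        v_float32 x = vx_load(src + i);
        v_float32 m = v_and(v_le(vlo, x), v_le(x, vhi)); // in-range mask
        if (v_check_any(m)) // skip the per-lane loop when nothing matched
        {
            v_store_aligned(msk, v_reinterpret_as_s32(m));
            v_store_aligned(buf, x);
            for (int k = 0; k < VTraits<v_int32>::vlanes(); ++k)
                if (msk[k]) out[cnt++] = buf[k];
        }
    }
    for (; i < n; ++i) // scalar tail
        if (lo2 <= src[i] && src[i] <= hi2) out[cnt++] = src[i];
    return cnt;
}
#endif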
@ -1573,13 +1573,13 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointSet>::filterCircles(const Poi
const Range xOuter = Range(std::max(int(curCenter.x - rOuter), 0), std::min(int(curCenter.x + rOuter), positions.cols));
const Range yOuter = Range(std::max(int(curCenter.y - rOuter), 0), std::min(int(curCenter.y + rOuter), positions.rows));

#if CV_SIMD
float v_seq[v_float32::nlanes];
for (int i = 0; i < v_float32::nlanes; ++i)
#if (CV_SIMD || CV_SIMD_SCALABLE)
float v_seq[VTraits<v_float32>::max_nlanes];
for (int i = 0; i < VTraits<v_float32>::vlanes(); ++i)
v_seq[i] = (float)i;
const v_float32 v_minRadius2 = vx_setall_f32(minRadius2);
const v_float32 v_maxRadius2 = vx_setall_f32(maxRadius2);
const v_float32 v_curCenterX_0123 = vx_setall_f32(curCenter.x) - vx_load(v_seq);
const v_float32 v_curCenterX_0123 = v_sub(vx_setall_f32(curCenter.x), vx_load(v_seq));
#endif

for (int y = yOuter.start; y < yOuter.end; y++)
@ -1589,27 +1589,27 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointSet>::filterCircles(const Poi
float dy2 = dy * dy;

int x = xOuter.start;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
const v_float32 v_dy2 = vx_setall_f32(dy2);
const v_uint32 v_zero_u32 = vx_setall_u32(0);
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[v_float32::nlanes];
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[v_int32::nlanes];
for (; x <= xOuter.end - v_float32::nlanes; x += v_float32::nlanes)
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[VTraits<v_float32>::max_nlanes];
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[VTraits<v_int32>::max_nlanes];
for (; x <= xOuter.end - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
{
v_uint32 v_mask = vx_load_expand_q(ptr + x);
v_mask = v_mask != v_zero_u32;
v_mask = v_ne(v_mask, v_zero_u32);

v_float32 v_x = v_cvt_f32(vx_setall_s32(x));
v_float32 v_dx = v_x - v_curCenterX_0123;
v_float32 v_dx = v_sub(v_x, v_curCenterX_0123);

v_float32 v_r2 = (v_dx * v_dx) + v_dy2;
v_float32 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2) & v_reinterpret_as_f32(v_mask);
v_float32 v_r2 = v_add(v_mul(v_dx, v_dx), v_dy2);
v_float32 vmask = v_and(v_and(v_le(v_minRadius2, v_r2), v_le(v_r2, v_maxRadius2)), v_reinterpret_as_f32(v_mask));
if (v_check_any(vmask))
{
v_store_aligned(rmask, v_reinterpret_as_s32(vmask));
v_store_aligned(rbuf, v_r2);
for (int i = 0; i < v_int32::nlanes; ++i)
for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
if (rmask[i]) ddata[nzCount++] = rbuf[i];
}
}
@ -88,7 +88,7 @@ static unsigned char const stackblurShr[255] =

namespace cv{

#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
template<typename T>
inline int opRow(const T* , T* , const std::vector<ushort>& , const float , const int radius, const int CN, const int )
{
@ -107,7 +107,7 @@ inline int opRow<uchar>(const uchar* srcPtr, uchar* dstPtr, const std::vector<us
const int mulValTab= stackblurMul[radius];
const int shrValTab= stackblurShr[radius];

const int VEC_LINE = v_uint8::nlanes;
const int VEC_LINE = VTraits<v_uint8>::vlanes();

if (kernelSize == 3)
{
@ -126,10 +126,10 @@ inline int opRow<uchar>(const uchar* srcPtr, uchar* dstPtr, const std::vector<us
v_expand(x1l, y00, y01);
v_expand(x1h, y10, y11);

y00 = (y00 * v_mulVal)>>shrValTab;
y01 = (y01 * v_mulVal)>>shrValTab;
y10 = (y10 * v_mulVal)>>shrValTab;
y11 = (y11 * v_mulVal)>>shrValTab;
y00 = v_shr(v_mul(y00, v_mulVal), shrValTab);
y01 = v_shr(v_mul(y01, v_mulVal), shrValTab);
y10 = v_shr(v_mul(y10, v_mulVal), shrValTab);
y11 = v_shr(v_mul(y11, v_mulVal), shrValTab);

v_store(dstPtr + i, v_pack(v_pack(y00, y01), v_pack(y10, y11)));
}
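The mulValTab/shrValTab pair turns the division by the stack weight into a multiply-and-shift, so the hot loop stays in integers. A scalar sketch of the same operation, using the stackblurMul/stackblurShr tables defined at the top of this file (stackblurDiv is a hypothetical name):

// value / weight is approximated lane-wise above as
// (value * stackblurMul[radius]) >> stackblurShr[radius]
static inline int stackblurDiv(int value, int radius)
{
    return (value * stackblurMul[radius]) >> stackblurShr[radius];
}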
@ -159,12 +159,12 @@ inline int opRow<uchar>(const uchar* srcPtr, uchar* dstPtr, const std::vector<us
v_uint8 v_src3 = vx_load(srcPtr + j + CN);

v_int16 xl, xh;
v_zip(v_reinterpret_as_s16(v_expand_low(v_src0) + v_expand_low(v_src2)), v_reinterpret_as_s16(v_expand_low(v_src1) + v_expand_low(v_src3)), xl, xh);
s0 += v_dotprod(xl, k12);
s1 += v_dotprod(xh, k12);
v_zip(v_reinterpret_as_s16(v_expand_high(v_src0) + v_expand_high(v_src2)), v_reinterpret_as_s16(v_expand_high(v_src1) + v_expand_high(v_src3)), xl, xh);
s2 += v_dotprod(xl, k12);
s3 += v_dotprod(xh, k12);
v_zip(v_reinterpret_as_s16(v_add(v_expand_low(v_src0), v_expand_low(v_src2))), v_reinterpret_as_s16(v_add(v_expand_low(v_src1), v_expand_low(v_src3))), xl, xh);
s0 = v_add(s0, v_dotprod(xl, k12));
s1 = v_add(s1, v_dotprod(xh, k12));
v_zip(v_reinterpret_as_s16(v_add(v_expand_high(v_src0), v_expand_high(v_src2))), v_reinterpret_as_s16(v_add(v_expand_high(v_src1), v_expand_high(v_src3))), xl, xh);
s2 = v_add(s2, v_dotprod(xl, k12));
s3 = v_add(s3, v_dotprod(xh, k12));
}
if( k < kernelSize / 2 + 1 )
{
@ -175,17 +175,17 @@ inline int opRow<uchar>(const uchar* srcPtr, uchar* dstPtr, const std::vector<us

v_int16 xl, xh;
v_zip(v_reinterpret_as_s16(v_expand_low(v_src0)), v_reinterpret_as_s16(v_expand_low(v_src1)), xl, xh);
s0 += v_dotprod(xl, k1);
s1 += v_dotprod(xh, k1);
s0 = v_add(s0, v_dotprod(xl, k1));
s1 = v_add(s1, v_dotprod(xh, k1));
v_zip(v_reinterpret_as_s16(v_expand_high(v_src0)), v_reinterpret_as_s16(v_expand_high(v_src1)), xl, xh);
s2 += v_dotprod(xl, k1);
s3 += v_dotprod(xh, k1);
s2 = v_add(s2, v_dotprod(xl, k1));
s3 = v_add(s3, v_dotprod(xh, k1));
}

s0 = (s0 * v_mulVal)>>shrValTab;
s1 = (s1 * v_mulVal)>>shrValTab;
s2 = (s2 * v_mulVal)>>shrValTab;
s3 = (s3 * v_mulVal)>>shrValTab;
s0 = v_shr(v_mul(s0, v_mulVal), shrValTab);
s1 = v_shr(v_mul(s1, v_mulVal), shrValTab);
s2 = v_shr(v_mul(s2, v_mulVal), shrValTab);
s3 = v_shr(v_mul(s3, v_mulVal), shrValTab);

v_store(dstPtr + i, v_pack(v_reinterpret_as_u16(v_pack(s0, s1)), v_reinterpret_as_u16(v_pack(s2, s3))));
}
@ -205,7 +205,7 @@ inline int opRow<ushort>(const ushort* srcPtr, ushort* dstPtr, const std::vector
const int mulValTab= stackblurMul[radius];
const int shrValTab= stackblurShr[radius];

const int VEC_LINE = v_uint16::nlanes;
const int VEC_LINE = VTraits<v_uint16>::vlanes();

v_uint32 v_mulVal = vx_setall_u32(mulValTab);
if (kernelSize == 3)
@ -220,7 +220,7 @@ inline int opRow<ushort>(const ushort* srcPtr, ushort* dstPtr, const std::vector
x1l = v_add(v_add(x1l, x1l), v_add(x0l, x2l));
x1h = v_add(v_add(x1h, x1h), v_add(x0h, x2h));

v_store(dstPtr + i, v_pack((x1l * v_mulVal)>>shrValTab, (x1h * v_mulVal)>>shrValTab));
v_store(dstPtr + i, v_pack(v_shr(v_mul(x1l, v_mulVal), shrValTab), v_shr(v_mul(x1h, v_mulVal), shrValTab)));
}
}
else
@ -243,25 +243,25 @@ inline int opRow<ushort>(const ushort* srcPtr, ushort* dstPtr, const std::vector
v_uint16 k2 = vx_setall_u16(kx[k + 1]);

v_uint32 y0, y1;
v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1);
s0 += y0;
s1 += y1;
v_mul_expand(vx_load(srcPtr - j - CN) + vx_load(srcPtr + j + CN), k2, y0, y1);
s0 += y0;
s1 += y1;
v_mul_expand(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1, y0, y1);
s0 = v_add(s0, y0);
s1 = v_add(s1, y1);
v_mul_expand(v_add(vx_load(srcPtr - j - CN), vx_load(srcPtr + j + CN)), k2, y0, y1);
s0 = v_add(s0, y0);
s1 = v_add(s1, y1);
}
if( k < kernelSize / 2 + 1 )
{
v_uint16 k1 = vx_setall_u16(kx[k]);

v_uint32 y0, y1;
v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1);
s0 += y0;
s1 += y1;
v_mul_expand(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1, y0, y1);
s0 = v_add(s0, y0);
s1 = v_add(s1, y1);
}

s0 = (s0 * v_mulVal)>>shrValTab;
s1 = (s1 * v_mulVal)>>shrValTab;
s0 = v_shr(v_mul(s0, v_mulVal), shrValTab);
s1 = v_shr(v_mul(s1, v_mulVal), shrValTab);

v_store(dstPtr + i, v_pack(s0, s1));
}
@ -282,7 +282,7 @@ inline int opRow<short>(const short* srcPtr, short* dstPtr, const std::vector<us
const int mulValTab= stackblurMul[radius];
const int shrValTab= stackblurShr[radius];

const int VEC_LINE = v_int16::nlanes;
const int VEC_LINE = VTraits<v_int16>::vlanes();
v_int32 v_mulVal = vx_setall_s32(mulValTab);

if (kernelSize == 3)
@ -297,7 +297,7 @@ inline int opRow<short>(const short* srcPtr, short* dstPtr, const std::vector<us
x1l = v_add(v_add(x1l, x1l), v_add(x0l, x2l));
x1h = v_add(v_add(x1h, x1h), v_add(x0h, x2h));

v_store(dstPtr + i, v_pack((x1l * v_mulVal)>>shrValTab, (x1h * v_mulVal)>>shrValTab));
v_store(dstPtr + i, v_pack(v_shr(v_mul(x1l, v_mulVal), shrValTab), v_shr(v_mul(x1h, v_mulVal), shrValTab)));
}
}
else
@ -320,24 +320,24 @@ inline int opRow<short>(const short* srcPtr, short* dstPtr, const std::vector<us

v_int32 y0, y1;

v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1);
s0 += y0;
s1 += y1;
v_mul_expand(vx_load(srcPtr - j - CN) + vx_load(srcPtr + j + CN), k2, y0, y1);
s0 += y0;
s1 += y1;
v_mul_expand(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1, y0, y1);
s0 = v_add(s0, y0);
s1 = v_add(s1, y1);
v_mul_expand(v_add(vx_load(srcPtr - j - CN), vx_load(srcPtr + j + CN)), k2, y0, y1);
s0 = v_add(s0, y0);
s1 = v_add(s1, y1);
}
if( k < kernelSize / 2 + 1 )
{
v_int16 k1 = vx_setall_s16((short)kx[k]);
v_int32 y0, y1;
v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1);
s0 += y0;
s1 += y1;
v_mul_expand(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1, y0, y1);
s0 = v_add(s0, y0);
s1 = v_add(s1, y1);
}

s0 = (s0 * v_mulVal)>>shrValTab;
s1 = (s1 * v_mulVal)>>shrValTab;
s0 = v_shr(v_mul(s0, v_mulVal), shrValTab);
s1 = v_shr(v_mul(s1, v_mulVal), shrValTab);

v_store(dstPtr + i, v_pack(s0, s1));
}
@ -352,7 +352,7 @@ inline int opRow<float>(const float* srcPtr, float* dstPtr, const std::vector<us
int i = radius * CN;

v_float32 v_mulVal = vx_setall_f32(mulVal);
const int VEC_LINE = v_float32::nlanes;
const int VEC_LINE = VTraits<v_float32>::vlanes();
const int VEC_LINE4 = VEC_LINE * 4;

if (kernelSize == 3)
@ -364,22 +364,22 @@ inline int opRow<float>(const float* srcPtr, float* dstPtr, const std::vector<us
v_float32 v_srcPtr2 = vx_load(srcPtr + VEC_LINE * 2 + i);
v_float32 v_srcPtr3 = vx_load(srcPtr + VEC_LINE * 3 + i);

v_float32 v_sumVal0 = v_srcPtr0 + v_srcPtr0 + vx_load(srcPtr + i - CN) + vx_load(srcPtr + i + CN);
v_float32 v_sumVal1 = v_srcPtr1 + v_srcPtr1 + vx_load(srcPtr + VEC_LINE + i - CN) + vx_load(srcPtr + VEC_LINE + i + CN);
v_float32 v_sumVal2 = v_srcPtr2 + v_srcPtr2 + vx_load(srcPtr + VEC_LINE * 2 + i - CN) + vx_load(srcPtr + VEC_LINE * 2 + i + CN);
v_float32 v_sumVal3 = v_srcPtr3 + v_srcPtr3 + vx_load(srcPtr + VEC_LINE * 3 + i - CN) + vx_load(srcPtr + VEC_LINE * 3 + i + CN);
v_float32 v_sumVal0 = v_add(v_add(v_add(v_srcPtr0, v_srcPtr0), vx_load(srcPtr + i - CN)), vx_load(srcPtr + i + CN));
v_float32 v_sumVal1 = v_add(v_add(v_add(v_srcPtr1, v_srcPtr1), vx_load(srcPtr + VEC_LINE + i - CN)), vx_load(srcPtr + VEC_LINE + i + CN));
v_float32 v_sumVal2 = v_add(v_add(v_add(v_srcPtr2, v_srcPtr2), vx_load(srcPtr + VEC_LINE * 2 + i - CN)), vx_load(srcPtr + VEC_LINE * 2 + i + CN));
v_float32 v_sumVal3 = v_add(v_add(v_add(v_srcPtr3, v_srcPtr3), vx_load(srcPtr + VEC_LINE * 3 + i - CN)), vx_load(srcPtr + VEC_LINE * 3 + i + CN));

v_store(dstPtr + i, v_sumVal0 * v_mulVal);
v_store(dstPtr + i + VEC_LINE, v_sumVal1 * v_mulVal);
v_store(dstPtr + i + VEC_LINE * 2, v_sumVal2 * v_mulVal);
v_store(dstPtr + i + VEC_LINE * 3, v_sumVal3 * v_mulVal);
v_store(dstPtr + i, v_mul(v_sumVal0, v_mulVal));
v_store(dstPtr + i + VEC_LINE, v_mul(v_sumVal1, v_mulVal));
v_store(dstPtr + i + VEC_LINE * 2, v_mul(v_sumVal2, v_mulVal));
v_store(dstPtr + i + VEC_LINE * 3, v_mul(v_sumVal3, v_mulVal));
}

for (; i <= widthCN - VEC_LINE; i += VEC_LINE)
{
v_float32 v_srcPtr = vx_load(srcPtr + i);
v_float32 v_sumVal = v_srcPtr + v_srcPtr + vx_load(srcPtr + i - CN) + vx_load(srcPtr + i + CN);
v_store(dstPtr + i, v_sumVal * v_mulVal);
v_float32 v_sumVal = v_add(v_add(v_add(v_srcPtr, v_srcPtr), vx_load(srcPtr + i - CN)), vx_load(srcPtr + i + CN));
v_store(dstPtr + i, v_mul(v_sumVal, v_mulVal));
}
}
else
@ -392,7 +392,7 @@ inline int opRow<float>(const float* srcPtr, float* dstPtr, const std::vector<us
{
v_float32 v_src = vx_load(srcPtr);
v_float32 s0;
s0 = v_src * k0;
s0 = v_mul(v_src, k0);

int k = 1, j = CN;
for (; k <= kernelSize / 2 - 1; k += 2, j += 2 * CN)
@ -400,17 +400,17 @@ inline int opRow<float>(const float* srcPtr, float* dstPtr, const std::vector<us
v_float32 k1 = vx_setall_f32((float)kx[k]);
v_float32 k2 = vx_setall_f32((float)kx[k + 1]);

s0 += (vx_load(srcPtr - j) + vx_load(srcPtr + j)) * k1;
s0 += (vx_load(srcPtr - j - CN) + vx_load(srcPtr + j + CN)) * k2;
s0 = v_add(s0, v_mul(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1));
s0 = v_add(s0, v_mul(v_add(vx_load(srcPtr - j - CN), vx_load(srcPtr + j + CN)), k2));
}
if( k < kernelSize / 2 + 1 )
{
v_float32 k1 = vx_setall_f32((float)kx[k]);

s0 += (vx_load(srcPtr - j) + vx_load(srcPtr + j)) * k1;
s0 = v_add(s0, v_mul(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1));
}

v_store(dstPtr + i, s0 * v_mulVal);
v_store(dstPtr + i, v_mul(s0, v_mulVal));
}
}
return i;
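Each opRow specialisation above evaluates the same symmetric kernel. In scalar form, one output sample of the float path looks like the sketch below (symRowTap is a hypothetical name; kx holds one half of the kernel with kx[0] as the centre tap, as in the code above):

#include <vector>

static inline float symRowTap(const float* p, const std::vector<unsigned short>& kx,
                              int CN, int kernelSize, float mulVal)
{
    float s = p[0] * (float)kx[0];
    for (int k = 1, j = CN; k < kernelSize / 2 + 1; ++k, j += CN)
        s += (p[-j] + p[j]) * (float)kx[k]; // symmetric taps share one weight
    return s * mulVal;
}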
@ -426,8 +426,8 @@ template<>
inline int opComputeDiff<uchar, int>(const uchar*& srcPtr, int*& diff0, const int w, const int CNR1)
{
int index = 0;
const int VEC_LINE_8 = v_uint8::nlanes;
const int VEC_LINE_32 = v_int32::nlanes;
const int VEC_LINE_8 = VTraits<v_uint8>::vlanes();
const int VEC_LINE_32 = VTraits<v_int32>::vlanes();
for (; index <= w - VEC_LINE_8; index += VEC_LINE_8, diff0+=VEC_LINE_8, srcPtr+=VEC_LINE_8)
{
v_uint16 x0l, x0h, x1l, x1h;
@ -435,8 +435,8 @@ inline int opComputeDiff<uchar, int>(const uchar*& srcPtr, int*& diff0, const in
v_expand(vx_load(srcPtr), x1l, x1h);

v_int32 y0, y1, y2, y3;
v_expand(v_reinterpret_as_s16(x0l) - v_reinterpret_as_s16(x1l), y0, y1);
v_expand(v_reinterpret_as_s16(x0h) - v_reinterpret_as_s16(x1h), y2, y3);
v_expand(v_sub(v_reinterpret_as_s16(x0l), v_reinterpret_as_s16(x1l)), y0, y1);
v_expand(v_sub(v_reinterpret_as_s16(x0h), v_reinterpret_as_s16(x1h)), y2, y3);

v_store(diff0, y0);
v_store(diff0 + VEC_LINE_32, y1);
@ -517,7 +517,7 @@ public:

// middle
int wc = radius * CN;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
wc = opRow<T>(srcPtr, dstPtr, kVec, mulVal, radius, CN, widthCN);
#endif
for (; wc < widthCN; wc++)
@ -586,7 +586,7 @@ public:
// middle
auto diff0 = diff + radius * CN;
int index = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
index = opComputeDiff(srcPtr, diff0, widthCN, CNR1);
#endif

@ -688,7 +688,7 @@ private:
float mulVal;
};

#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
template<typename T, typename TBuf>
inline int opColumn(const T* , T* , T* , TBuf* , TBuf* , TBuf* , const float ,
const int , const int , const int , const int , const int )
@ -703,7 +703,7 @@ inline int opColumn<float, float>(const float* srcPtr, float* dstPtr, float* sta
{
int k = 0;
v_float32 v_mulVal = vx_setall_f32(mulVal);
const int VEC_LINE = v_float32::nlanes;
const int VEC_LINE = VTraits<v_float32>::vlanes();
const int VEC_LINE4 = 4 * VEC_LINE;

auto stackStartPtr = stack + ss * widthLen;
@ -726,20 +726,20 @@ inline int opColumn<float, float>(const float* srcPtr, float* dstPtr, float* sta
v_float32 v_sumIn2 = vx_load(sumIn + VEC_LINE * 2 + k);
v_float32 v_sumIn3 = vx_load(sumIn + VEC_LINE * 3+ k);

v_store(dstPtr + k, v_sum0 * v_mulVal);
v_store(dstPtr + VEC_LINE + k, v_sum1 * v_mulVal);
v_store(dstPtr + VEC_LINE * 2 + k, v_sum2 * v_mulVal);
v_store(dstPtr + VEC_LINE * 3 + k, v_sum3 * v_mulVal);
v_store(dstPtr + k, v_mul(v_sum0, v_mulVal));
v_store(dstPtr + VEC_LINE + k, v_mul(v_sum1, v_mulVal));
v_store(dstPtr + VEC_LINE * 2 + k, v_mul(v_sum2, v_mulVal));
v_store(dstPtr + VEC_LINE * 3 + k, v_mul(v_sum3, v_mulVal));

v_sum0 -= v_sumOut0;
v_sum1 -= v_sumOut1;
v_sum2 -= v_sumOut2;
v_sum3 -= v_sumOut3;
v_sum0 = v_sub(v_sum0, v_sumOut0);
v_sum1 = v_sub(v_sum1, v_sumOut1);
v_sum2 = v_sub(v_sum2, v_sumOut2);
v_sum3 = v_sub(v_sum3, v_sumOut3);

v_sumOut0 -= vx_load(stackStartPtr + k);
v_sumOut1 -= vx_load(stackStartPtr + VEC_LINE + k);
v_sumOut2 -= vx_load(stackStartPtr + VEC_LINE * 2 + k);
v_sumOut3 -= vx_load(stackStartPtr + VEC_LINE * 3 + k);
v_sumOut0 = v_sub(v_sumOut0, vx_load(stackStartPtr + k));
v_sumOut1 = v_sub(v_sumOut1, vx_load(stackStartPtr + VEC_LINE + k));
v_sumOut2 = v_sub(v_sumOut2, vx_load(stackStartPtr + VEC_LINE * 2 + k));
v_sumOut3 = v_sub(v_sumOut3, vx_load(stackStartPtr + VEC_LINE * 3 + k));

v_float32 v_srcPtr0 = vx_load(srcPtr + k);
v_float32 v_srcPtr1 = vx_load(srcPtr + VEC_LINE + k);
@ -751,35 +751,35 @@ inline int opColumn<float, float>(const float* srcPtr, float* dstPtr, float* sta
v_store(stackStartPtr + VEC_LINE * 2 + k, v_srcPtr2);
v_store(stackStartPtr + VEC_LINE * 3 + k, v_srcPtr3);

v_sumIn0 += v_srcPtr0;
v_sumIn1 += v_srcPtr1;
v_sumIn2 += v_srcPtr2;
v_sumIn3 += v_srcPtr3;
v_sumIn0 = v_add(v_sumIn0, v_srcPtr0);
v_sumIn1 = v_add(v_sumIn1, v_srcPtr1);
v_sumIn2 = v_add(v_sumIn2, v_srcPtr2);
v_sumIn3 = v_add(v_sumIn3, v_srcPtr3);

v_store(sum + k, v_sum0 + v_sumIn0);
v_store(sum + VEC_LINE + k, v_sum1 + v_sumIn1);
v_store(sum + VEC_LINE * 2 + k, v_sum2 + v_sumIn2);
v_store(sum + VEC_LINE * 3 + k, v_sum3 + v_sumIn3);
v_store(sum + k, v_add(v_sum0, v_sumIn0));
v_store(sum + VEC_LINE + k, v_add(v_sum1, v_sumIn1));
v_store(sum + VEC_LINE * 2 + k, v_add(v_sum2, v_sumIn2));
v_store(sum + VEC_LINE * 3 + k, v_add(v_sum3, v_sumIn3));

v_srcPtr0 = vx_load(stackSp1Ptr + k);
v_srcPtr1 = vx_load(stackSp1Ptr + VEC_LINE + k);
v_srcPtr2 = vx_load(stackSp1Ptr + VEC_LINE * 2 + k);
v_srcPtr3 = vx_load(stackSp1Ptr + VEC_LINE * 3 + k);

v_sumOut0 += v_srcPtr0;
v_sumOut1 += v_srcPtr1;
v_sumOut2 += v_srcPtr2;
v_sumOut3 += v_srcPtr3;
v_sumOut0 = v_add(v_sumOut0, v_srcPtr0);
v_sumOut1 = v_add(v_sumOut1, v_srcPtr1);
v_sumOut2 = v_add(v_sumOut2, v_srcPtr2);
v_sumOut3 = v_add(v_sumOut3, v_srcPtr3);

v_store(sumOut + k, v_sumOut0);
v_store(sumOut + VEC_LINE + k, v_sumOut1);
v_store(sumOut + VEC_LINE * 2 + k, v_sumOut2);
v_store(sumOut + VEC_LINE * 3 + k, v_sumOut3);

v_sumIn0 -= v_srcPtr0;
v_sumIn1 -= v_srcPtr1;
v_sumIn2 -= v_srcPtr2;
v_sumIn3 -= v_srcPtr3;
v_sumIn0 = v_sub(v_sumIn0, v_srcPtr0);
v_sumIn1 = v_sub(v_sumIn1, v_srcPtr1);
v_sumIn2 = v_sub(v_sumIn2, v_srcPtr2);
v_sumIn3 = v_sub(v_sumIn3, v_srcPtr3);

v_store(sumIn + k, v_sumIn0);
v_store(sumIn + VEC_LINE + k, v_sumIn1);
@ -793,20 +793,20 @@ inline int opColumn<float, float>(const float* srcPtr, float* dstPtr, float* sta
v_float32 v_sumOut = vx_load(sumOut + k);
v_float32 v_sumIn = vx_load(sumIn + k);

v_store(dstPtr + k, v_sum * v_mulVal);
v_sum -= v_sumOut;
v_sumOut -= vx_load(stackStartPtr + k);
v_store(dstPtr + k, v_mul(v_sum, v_mulVal));
v_sum = v_sub(v_sum, v_sumOut);
v_sumOut = v_sub(v_sumOut, vx_load(stackStartPtr + k));

v_float32 v_srcPtr = vx_load(srcPtr + k);
v_store(stackStartPtr + k, v_srcPtr);

v_sumIn += v_srcPtr;
v_store(sum + k, v_sum + v_sumIn);
v_sumIn = v_add(v_sumIn, v_srcPtr);
v_store(sum + k, v_add(v_sum, v_sumIn));

v_srcPtr = vx_load(stackSp1Ptr + k);
v_sumOut += v_srcPtr;
v_sumOut = v_add(v_sumOut, v_srcPtr);
v_store(sumOut + k, v_sumOut);
v_sumIn -= v_srcPtr;
v_sumIn = v_sub(v_sumIn, v_srcPtr);
v_store(sumIn + k, v_sumIn);
}
return k;
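The update vectorised above maintains three running sums per column. A scalar sketch of one step, in the same order as the code (stackblurStep is a hypothetical name; stackStart and stackSp1 point at the oldest and middle stack entries):

static inline float stackblurStep(float src, float* stackStart, const float* stackSp1,
                                  float& sum, float& sumIn, float& sumOut, float mulVal)
{
    float dst = sum * mulVal;   // emit the current pixel
    sum    -= sumOut;           // retire the outgoing half of the stack
    sumOut -= *stackStart;      // drop the oldest sample
    *stackStart = src;          // replace it with the incoming sample
    sumIn  += src;
    sum    += sumIn;            // the incoming half advances the total
    sumOut += *stackSp1;        // the middle sample migrates outwards...
    sumIn  -= *stackSp1;        // ...and leaves the incoming half
    return dst;
}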
@ -820,8 +820,8 @@ inline int opColumn<uchar, int>(const uchar* srcPtr, uchar* dstPtr, uchar* stack
int k = 0;
if (mulValTab != 0 && shrValTab != 0)
{
const int VEC_LINE_8 = v_uint8::nlanes;
const int VEC_LINE_32 = v_int32::nlanes;
const int VEC_LINE_8 = VTraits<v_uint8>::vlanes();
const int VEC_LINE_32 = VTraits<v_int32>::vlanes();
v_int32 v_mulVal = vx_setall_s32(mulValTab);

auto stackStartPtr = stack + ss * widthLen;
@ -850,13 +850,13 @@ inline int opColumn<uchar, int>(const uchar* srcPtr, uchar* dstPtr, uchar* stack

v_store(dstPtr + k,
v_pack(
v_reinterpret_as_u16(v_pack((v_sum0 * v_mulVal)>>shrValTab, (v_sum1 * v_mulVal)>>shrValTab)),
v_reinterpret_as_u16(v_pack((v_sum2 * v_mulVal)>>shrValTab, (v_sum3 * v_mulVal)>>shrValTab))));
v_reinterpret_as_u16(v_pack(v_shr(v_mul(v_sum0, v_mulVal), shrValTab), v_shr(v_mul(v_sum1, v_mulVal), shrValTab))),
v_reinterpret_as_u16(v_pack(v_shr(v_mul(v_sum2, v_mulVal), shrValTab), v_shr(v_mul(v_sum3, v_mulVal), shrValTab)))));

v_sum0 -= v_sumOut0;
v_sum1 -= v_sumOut1;
v_sum2 -= v_sumOut2;
v_sum3 -= v_sumOut3;
v_sum0 = v_sub(v_sum0, v_sumOut0);
v_sum1 = v_sub(v_sum1, v_sumOut1);
v_sum2 = v_sub(v_sum2, v_sumOut2);
v_sum3 = v_sub(v_sum3, v_sumOut3);

v_uint16 x0l, x0h;
v_int32 v_ss0, v_ss1, v_ss2, v_ss3;
@ -865,10 +865,10 @@ inline int opColumn<uchar, int>(const uchar* srcPtr, uchar* dstPtr, uchar* stack
v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1);
v_expand(v_reinterpret_as_s16(x0h), v_ss2, v_ss3);

v_sumOut0 -= v_ss0;
v_sumOut1 -= v_ss1;
v_sumOut2 -= v_ss2;
v_sumOut3 -= v_ss3;
v_sumOut0 = v_sub(v_sumOut0, v_ss0);
v_sumOut1 = v_sub(v_sumOut1, v_ss1);
v_sumOut2 = v_sub(v_sumOut2, v_ss2);
v_sumOut3 = v_sub(v_sumOut3, v_ss3);

v_expand(vx_load(srcPtr + k), x0l, x0h);
v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1);
@ -876,34 +876,34 @@ inline int opColumn<uchar, int>(const uchar* srcPtr, uchar* dstPtr, uchar* stack

memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_8 * sizeof (uchar));

v_sumIn0 += v_ss0;
v_sumIn1 += v_ss1;
v_sumIn2 += v_ss2;
v_sumIn3 += v_ss3;
v_sumIn0 = v_add(v_sumIn0, v_ss0);
v_sumIn1 = v_add(v_sumIn1, v_ss1);
v_sumIn2 = v_add(v_sumIn2, v_ss2);
v_sumIn3 = v_add(v_sumIn3, v_ss3);

v_store(sum + k, v_sum0 + v_sumIn0);
v_store(sum + VEC_LINE_32 + k, v_sum1 + v_sumIn1);
v_store(sum + VEC_LINE_32 * 2 + k, v_sum2 + v_sumIn2);
v_store(sum + VEC_LINE_32 * 3 + k, v_sum3 + v_sumIn3);
v_store(sum + k, v_add(v_sum0, v_sumIn0));
v_store(sum + VEC_LINE_32 + k, v_add(v_sum1, v_sumIn1));
v_store(sum + VEC_LINE_32 * 2 + k, v_add(v_sum2, v_sumIn2));
v_store(sum + VEC_LINE_32 * 3 + k, v_add(v_sum3, v_sumIn3));

v_expand(vx_load(stackSp1Ptr + k), x0l, x0h);
v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1);
v_expand(v_reinterpret_as_s16(x0h), v_ss2, v_ss3);

v_sumOut0 += v_ss0;
v_sumOut1 += v_ss1;
v_sumOut2 += v_ss2;
v_sumOut3 += v_ss3;
v_sumOut0 = v_add(v_sumOut0, v_ss0);
v_sumOut1 = v_add(v_sumOut1, v_ss1);
v_sumOut2 = v_add(v_sumOut2, v_ss2);
v_sumOut3 = v_add(v_sumOut3, v_ss3);

v_store(sumOut + k, v_sumOut0);
v_store(sumOut + VEC_LINE_32 + k, v_sumOut1);
v_store(sumOut + VEC_LINE_32 * 2 + k, v_sumOut2);
v_store(sumOut + VEC_LINE_32 * 3 + k, v_sumOut3);

v_sumIn0 -= v_ss0;
v_sumIn1 -= v_ss1;
v_sumIn2 -= v_ss2;
v_sumIn3 -= v_ss3;
v_sumIn0 = v_sub(v_sumIn0, v_ss0);
v_sumIn1 = v_sub(v_sumIn1, v_ss1);
v_sumIn2 = v_sub(v_sumIn2, v_ss2);
v_sumIn3 = v_sub(v_sumIn3, v_ss3);

v_store(sumIn + k, v_sumIn0);
v_store(sumIn + VEC_LINE_32 + k, v_sumIn1);
@ -922,8 +922,8 @@ inline int opColumn<short, int>(const short* srcPtr, short* dstPtr, short* stack
int k = 0;
if (mulValTab != 0 && shrValTab != 0)
{
const int VEC_LINE_16 = v_int16::nlanes;
const int VEC_LINE_32 = v_int32::nlanes;
const int VEC_LINE_16 = VTraits<v_int16>::vlanes();
const int VEC_LINE_32 = VTraits<v_int32>::vlanes();
v_int32 v_mulVal = vx_setall_s32(mulValTab);

auto stackStartPtr = stack + ss * widthLen;
@ -943,39 +943,39 @@ inline int opColumn<short, int>(const short* srcPtr, short* dstPtr, short* stack
v_sumOut0 = vx_load(sumOut + k);
v_sumOut1 = vx_load(sumOut + k + VEC_LINE_32);

v_store(dstPtr + k,v_pack((v_sum0 * v_mulVal)>>shrValTab, (v_sum1 * v_mulVal)>>shrValTab));
v_store(dstPtr + k,v_pack(v_shr(v_mul(v_sum0, v_mulVal), shrValTab), v_shr(v_mul(v_sum1, v_mulVal), shrValTab)));

v_sum0 -= v_sumOut0;
v_sum1 -= v_sumOut1;
v_sum0 = v_sub(v_sum0, v_sumOut0);
v_sum1 = v_sub(v_sum1, v_sumOut1);

v_int32 v_ss0, v_ss1;
v_expand(vx_load(stackStartPtr + k), v_ss0, v_ss1);

v_sumOut0 -= v_ss0;
v_sumOut1 -= v_ss1;
v_sumOut0 = v_sub(v_sumOut0, v_ss0);
v_sumOut1 = v_sub(v_sumOut1, v_ss1);

v_expand(vx_load(srcPtr + k), v_ss0, v_ss1);
memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_16 * sizeof (short));

v_sumIn0 += v_ss0;
v_sumIn1 += v_ss1;
v_sumIn0 = v_add(v_sumIn0, v_ss0);
v_sumIn1 = v_add(v_sumIn1, v_ss1);

v_sum0 += v_sumIn0;
v_sum1 += v_sumIn1;
v_sum0 = v_add(v_sum0, v_sumIn0);
v_sum1 = v_add(v_sum1, v_sumIn1);

v_store(sum + k, v_sum0);
v_store(sum + VEC_LINE_32 + k, v_sum1);

v_expand(vx_load(stackSp1Ptr + k), v_ss0, v_ss1);

v_sumOut0 += v_ss0;
v_sumOut1 += v_ss1;
v_sumOut0 = v_add(v_sumOut0, v_ss0);
v_sumOut1 = v_add(v_sumOut1, v_ss1);

v_store(sumOut + k, v_sumOut0);
v_store(sumOut + VEC_LINE_32 + k, v_sumOut1);

v_sumIn0 -= v_ss0;
v_sumIn1 -= v_ss1;
v_sumIn0 = v_sub(v_sumIn0, v_ss0);
v_sumIn1 = v_sub(v_sumIn1, v_ss1);

v_store(sumIn + k, v_sumIn0);
v_store(sumIn + VEC_LINE_32 + k, v_sumIn1);
@ -992,8 +992,8 @@ inline int opColumn<ushort, int>(const ushort* srcPtr, ushort* dstPtr, ushort* s
int k = 0;
if (mulValTab != 0 && shrValTab != 0)
{
const int VEC_LINE_16 = v_uint16::nlanes;
const int VEC_LINE_32 = v_int32::nlanes;
const int VEC_LINE_16 = VTraits<v_uint16>::vlanes();
const int VEC_LINE_32 = VTraits<v_int32>::vlanes();
v_uint32 v_mulVal = vx_setall_u32((uint32_t)mulValTab);

auto stackStartPtr = stack + ss * widthLen;
@ -1013,40 +1013,40 @@ inline int opColumn<ushort, int>(const ushort* srcPtr, ushort* dstPtr, ushort* s
v_sumOut0 = vx_load(sumOut + k);
v_sumOut1 = vx_load(sumOut + k + VEC_LINE_32);

v_store(dstPtr + k, v_pack((v_reinterpret_as_u32(v_sum0) * v_mulVal)>>shrValTab, (v_reinterpret_as_u32(v_sum1) * v_mulVal)>>shrValTab));
v_store(dstPtr + k, v_pack(v_shr(v_mul(v_reinterpret_as_u32(v_sum0), v_mulVal), shrValTab), v_shr(v_mul(v_reinterpret_as_u32(v_sum1), v_mulVal), shrValTab)));

v_sum0 -= v_sumOut0;
v_sum1 -= v_sumOut1;
v_sum0 = v_sub(v_sum0, v_sumOut0);
v_sum1 = v_sub(v_sum1, v_sumOut1);

v_uint32 v_ss0, v_ss1;
v_expand(vx_load(stackStartPtr + k), v_ss0, v_ss1);

v_sumOut0 -= v_reinterpret_as_s32(v_ss0);
v_sumOut1 -= v_reinterpret_as_s32(v_ss1);
v_sumOut0 = v_sub(v_sumOut0, v_reinterpret_as_s32(v_ss0));
v_sumOut1 = v_sub(v_sumOut1, v_reinterpret_as_s32(v_ss1));

v_expand(vx_load(srcPtr + k), v_ss0, v_ss1);

memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_16 * sizeof (ushort));

v_sumIn0 += v_reinterpret_as_s32(v_ss0);
v_sumIn1 += v_reinterpret_as_s32(v_ss1);
v_sumIn0 = v_add(v_sumIn0, v_reinterpret_as_s32(v_ss0));
v_sumIn1 = v_add(v_sumIn1, v_reinterpret_as_s32(v_ss1));

v_sum0 += v_sumIn0;
v_sum1 += v_sumIn1;
v_sum0 = v_add(v_sum0, v_sumIn0);
v_sum1 = v_add(v_sum1, v_sumIn1);

v_store(sum + k, v_sum0);
v_store(sum + VEC_LINE_32 + k, v_sum1);

v_expand(vx_load(stackSp1Ptr + k), v_ss0, v_ss1);

v_sumOut0 += v_reinterpret_as_s32(v_ss0);
v_sumOut1 += v_reinterpret_as_s32(v_ss1);
v_sumOut0 = v_add(v_sumOut0, v_reinterpret_as_s32(v_ss0));
v_sumOut1 = v_add(v_sumOut1, v_reinterpret_as_s32(v_ss1));

v_store(sumOut + k, v_sumOut0);
v_store(sumOut + VEC_LINE_32 + k, v_sumOut1);

v_sumIn0 -= v_reinterpret_as_s32(v_ss0);
v_sumIn1 -= v_reinterpret_as_s32(v_ss1);
v_sumIn0 = v_sub(v_sumIn0, v_reinterpret_as_s32(v_ss0));
v_sumIn1 = v_sub(v_sumIn1, v_reinterpret_as_s32(v_ss1));

v_store(sumIn + k, v_sumIn0);
v_store(sumIn + VEC_LINE_32 + k, v_sumIn1);
@ -1152,7 +1152,7 @@ public:
}

int k = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
k = opColumn<T, TBuf>(srcPtr, dstPtr, stack, sum, sumIn, sumOut, mulVal, mulValTab, shrValTab,
widthLen, stackStart, sp1);
#endif
@ -190,7 +190,7 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
int j = 0;
const uchar* src = _src.ptr();
uchar* dst = _dst.ptr();
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_uint8 thresh_u = vx_setall_u8( thresh );
v_uint8 maxval16 = vx_setall_u8( maxval );

@ -199,12 +199,12 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
case THRESH_BINARY:
for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
for( j = 0; j <= roi.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
{
v_uint8 v0;
v0 = vx_load( src + j );
v0 = thresh_u < v0;
v0 = v0 & maxval16;
v0 = v_lt(thresh_u, v0);
v0 = v_and(v0, maxval16);
v_store( dst + j, v0 );
}
}
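Every THRESH_* case below follows the same branchless recipe: a lane-wise comparison produces an all-ones/all-zeros mask, and a bitwise AND selects maxval or 0. A minimal sketch of the binary case under the new API (threshBinaryU8 is a hypothetical helper name):

#include <opencv2/core/hal/intrin.hpp>

#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline cv::v_uint8 threshBinaryU8(const cv::v_uint8& src,
                                         const cv::v_uint8& thresh,
                                         const cv::v_uint8& maxv)
{
    // src > thresh ? maxval : 0, without branches
    return cv::v_and(cv::v_lt(thresh, src), maxv);
}
#endif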
@ -213,12 +213,12 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
case THRESH_BINARY_INV:
for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
for( j = 0; j <= roi.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
{
v_uint8 v0;
v0 = vx_load( src + j );
v0 = v0 <= thresh_u;
v0 = v0 & maxval16;
v0 = v_le(v0, thresh_u);
v0 = v_and(v0, maxval16);
v_store( dst + j, v0 );
}
}
@ -227,11 +227,11 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
case THRESH_TRUNC:
for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
for( j = 0; j <= roi.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
{
v_uint8 v0;
v0 = vx_load( src + j );
v0 = v0 - ( v0 - thresh_u );
v0 = v_sub(v0, v_sub(v0, thresh_u));
v_store( dst + j, v0 );
}
}
@ -240,11 +240,11 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
case THRESH_TOZERO:
for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
for( j = 0; j <= roi.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
{
v_uint8 v0;
v0 = vx_load( src + j );
v0 = ( thresh_u < v0 ) & v0;
v0 = v_and(v_lt(thresh_u, v0), v0);
v_store( dst + j, v0 );
}
}
@ -253,11 +253,11 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
case THRESH_TOZERO_INV:
for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
for( j = 0; j <= roi.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
{
v_uint8 v0;
v0 = vx_load( src + j );
v0 = ( v0 <= thresh_u ) & v0;
v0 = v_and(v_le(v0, thresh_u), v0);
v_store( dst + j, v0 );
}
}
@ -351,7 +351,7 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)

const ushort* src = _src.ptr<ushort>();
ushort* dst = _dst.ptr<ushort>();
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
int i, j;
v_uint16 thresh_u = vx_setall_u16(thresh);
v_uint16 maxval16 = vx_setall_u16(maxval);
@ -361,25 +361,25 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
case THRESH_BINARY:
for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
{
for (j = 0; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
for (j = 0; j <= roi.width - 2*VTraits<v_uint16>::vlanes(); j += 2*VTraits<v_uint16>::vlanes())
{
v_uint16 v0, v1;
v0 = vx_load(src + j);
v1 = vx_load(src + j + v_uint16::nlanes);
v0 = thresh_u < v0;
v1 = thresh_u < v1;
v0 = v0 & maxval16;
v1 = v1 & maxval16;
v1 = vx_load(src + j + VTraits<v_uint16>::vlanes());
v0 = v_lt(thresh_u, v0);
v1 = v_lt(thresh_u, v1);
v0 = v_and(v0, maxval16);
v1 = v_and(v1, maxval16);
v_store(dst + j, v0);
v_store(dst + j + v_uint16::nlanes, v1);
v_store(dst + j + VTraits<v_uint16>::vlanes(), v1);
}
if (j <= roi.width - v_uint16::nlanes)
if (j <= roi.width - VTraits<v_uint16>::vlanes())
{
v_uint16 v0 = vx_load(src + j);
v0 = thresh_u < v0;
v0 = v0 & maxval16;
v0 = v_lt(thresh_u, v0);
v0 = v_and(v0, maxval16);
v_store(dst + j, v0);
j += v_uint16::nlanes;
j += VTraits<v_uint16>::vlanes();
}

for (; j < roi.width; j++)
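The loop shape above recurs through the rest of this file: two vectors per iteration, then at most one leftover vector, then a scalar tail. A self-contained sketch of the same structure (minClampU16 is a hypothetical example operation, not part of the patch):

#include <algorithm>
#include <opencv2/core/hal/intrin.hpp>

#if (CV_SIMD || CV_SIMD_SCALABLE)
static void minClampU16(const unsigned short* src, unsigned short* dst, int w, unsigned short t)
{
    using namespace cv;
    const int vl = VTraits<v_uint16>::vlanes();
    v_uint16 vt = vx_setall_u16(t);
    int j = 0;
    for (; j <= w - 2*vl; j += 2*vl)        // main body: two vectors at a time
    {
        v_store(dst + j,      v_min(vx_load(src + j),      vt));
        v_store(dst + j + vl, v_min(vx_load(src + j + vl), vt));
    }
    if (j <= w - vl)                        // at most one leftover vector
    {
        v_store(dst + j, v_min(vx_load(src + j), vt));
        j += vl;
    }
    for (; j < w; j++)                      // scalar tail
        dst[j] = std::min(src[j], t);
}
#endif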
@ -391,25 +391,25 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
|
||||
for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
|
||||
{
|
||||
j = 0;
|
||||
for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
|
||||
for (; j <= roi.width - 2*VTraits<v_uint16>::vlanes(); j += 2*VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint16 v0, v1;
|
||||
v0 = vx_load(src + j);
|
||||
v1 = vx_load(src + j + v_uint16::nlanes);
|
||||
v0 = v0 <= thresh_u;
|
||||
v1 = v1 <= thresh_u;
|
||||
v0 = v0 & maxval16;
|
||||
v1 = v1 & maxval16;
|
||||
v1 = vx_load(src + j + VTraits<v_uint16>::vlanes());
|
||||
v0 = v_le(v0, thresh_u);
|
||||
v1 = v_le(v1, thresh_u);
|
||||
v0 = v_and(v0, maxval16);
|
||||
v1 = v_and(v1, maxval16);
|
||||
v_store(dst + j, v0);
|
||||
v_store(dst + j + v_uint16::nlanes, v1);
|
||||
v_store(dst + j + VTraits<v_uint16>::vlanes(), v1);
|
||||
}
|
||||
if (j <= roi.width - v_uint16::nlanes)
|
||||
if (j <= roi.width - VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint16 v0 = vx_load(src + j);
|
||||
v0 = v0 <= thresh_u;
|
||||
v0 = v0 & maxval16;
|
||||
v0 = v_le(v0, thresh_u);
|
||||
v0 = v_and(v0, maxval16);
|
||||
v_store(dst + j, v0);
|
||||
j += v_uint16::nlanes;
|
||||
j += VTraits<v_uint16>::vlanes();
|
||||
}
|
||||
|
||||
for (; j < roi.width; j++)
|
||||
@ -421,22 +421,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
|
||||
for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
|
||||
{
|
||||
j = 0;
|
||||
for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
|
||||
for (; j <= roi.width - 2*VTraits<v_uint16>::vlanes(); j += 2*VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint16 v0, v1;
|
||||
v0 = vx_load(src + j);
|
||||
v1 = vx_load(src + j + v_uint16::nlanes);
|
||||
v1 = vx_load(src + j + VTraits<v_uint16>::vlanes());
|
||||
v0 = v_min(v0, thresh_u);
|
||||
v1 = v_min(v1, thresh_u);
|
||||
v_store(dst + j, v0);
|
||||
v_store(dst + j + v_uint16::nlanes, v1);
|
||||
v_store(dst + j + VTraits<v_uint16>::vlanes(), v1);
|
||||
}
|
||||
if (j <= roi.width - v_uint16::nlanes)
|
||||
if (j <= roi.width - VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint16 v0 = vx_load(src + j);
|
||||
v0 = v_min(v0, thresh_u);
|
||||
v_store(dst + j, v0);
|
||||
j += v_uint16::nlanes;
|
||||
j += VTraits<v_uint16>::vlanes();
|
||||
}
|
||||
|
||||
for (; j < roi.width; j++)
|
||||
@ -448,22 +448,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
|
||||
for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
|
||||
{
|
||||
j = 0;
|
||||
for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
|
||||
for (; j <= roi.width - 2*VTraits<v_uint16>::vlanes(); j += 2*VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint16 v0, v1;
|
||||
v0 = vx_load(src + j);
|
||||
v1 = vx_load(src + j + v_uint16::nlanes);
|
||||
v0 = (thresh_u < v0) & v0;
|
||||
v1 = (thresh_u < v1) & v1;
|
||||
v1 = vx_load(src + j + VTraits<v_uint16>::vlanes());
|
||||
v0 = v_and(v_lt(thresh_u, v0), v0);
|
||||
v1 = v_and(v_lt(thresh_u, v1), v1);
|
||||
v_store(dst + j, v0);
|
||||
v_store(dst + j + v_uint16::nlanes, v1);
|
||||
v_store(dst + j + VTraits<v_uint16>::vlanes(), v1);
|
||||
}
|
||||
if (j <= roi.width - v_uint16::nlanes)
|
||||
if (j <= roi.width - VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint16 v0 = vx_load(src + j);
|
||||
v0 = (thresh_u < v0) & v0;
|
||||
v0 = v_and(v_lt(thresh_u, v0), v0);
|
||||
v_store(dst + j, v0);
|
||||
j += v_uint16::nlanes;
|
||||
j += VTraits<v_uint16>::vlanes();
|
||||
}
|
||||
|
||||
for (; j < roi.width; j++)
|
||||
@ -475,22 +475,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
|
||||
for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
|
||||
{
|
||||
j = 0;
|
||||
for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
|
||||
for (; j <= roi.width - 2*VTraits<v_uint16>::vlanes(); j += 2*VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint16 v0, v1;
|
||||
v0 = vx_load(src + j);
|
||||
v1 = vx_load(src + j + v_uint16::nlanes);
|
||||
v0 = (v0 <= thresh_u) & v0;
|
||||
v1 = (v1 <= thresh_u) & v1;
|
||||
v1 = vx_load(src + j + VTraits<v_uint16>::vlanes());
|
||||
v0 = v_and(v_le(v0, thresh_u), v0);
|
||||
v1 = v_and(v_le(v1, thresh_u), v1);
|
||||
v_store(dst + j, v0);
|
||||
v_store(dst + j + v_uint16::nlanes, v1);
|
||||
v_store(dst + j + VTraits<v_uint16>::vlanes(), v1);
|
||||
}
|
||||
if (j <= roi.width - v_uint16::nlanes)
|
||||
if (j <= roi.width - VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint16 v0 = vx_load(src + j);
|
||||
v0 = (v0 <= thresh_u) & v0;
|
||||
v0 = v_and(v_le(v0, thresh_u), v0);
|
||||
v_store(dst + j, v0);
|
||||
j += v_uint16::nlanes;
|
||||
j += VTraits<v_uint16>::vlanes();
|
||||
}
|
||||
|
||||
for (; j < roi.width; j++)
|
||||
@ -571,7 +571,7 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
|
||||
}
|
||||
#endif
|
||||
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
int i, j;
|
||||
v_int16 thresh8 = vx_setall_s16( thresh );
|
||||
v_int16 maxval8 = vx_setall_s16( maxval );
|
||||
@ -582,25 +582,25 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
|
||||
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
|
||||
{
|
||||
j = 0;
|
||||
for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
|
||||
for( ; j <= roi.width - 2*VTraits<v_int16>::vlanes(); j += 2*VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_int16 v0, v1;
|
||||
v0 = vx_load( src + j );
|
||||
v1 = vx_load( src + j + v_int16::nlanes );
|
||||
v0 = thresh8 < v0;
|
||||
v1 = thresh8 < v1;
|
||||
v0 = v0 & maxval8;
|
||||
v1 = v1 & maxval8;
|
||||
v1 = vx_load( src + j + VTraits<v_int16>::vlanes() );
|
||||
v0 = v_lt(thresh8, v0);
|
||||
v1 = v_lt(thresh8, v1);
|
||||
v0 = v_and(v0, maxval8);
|
||||
v1 = v_and(v1, maxval8);
|
||||
v_store( dst + j, v0 );
|
||||
v_store( dst + j + v_int16::nlanes, v1 );
|
||||
v_store( dst + j + VTraits<v_int16>::vlanes(), v1 );
|
||||
}
|
||||
if( j <= roi.width - v_int16::nlanes )
|
||||
if( j <= roi.width - VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_int16 v0 = vx_load( src + j );
|
||||
v0 = thresh8 < v0;
|
||||
v0 = v0 & maxval8;
|
||||
v0 = v_lt(thresh8, v0);
|
||||
v0 = v_and(v0, maxval8);
|
||||
v_store( dst + j, v0 );
|
||||
j += v_int16::nlanes;
|
||||
j += VTraits<v_int16>::vlanes();
|
||||
}
|
||||
|
||||
for( ; j < roi.width; j++ )
|
||||
@ -612,25 +612,25 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
|
||||
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
|
||||
{
|
||||
j = 0;
|
||||
for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
|
||||
for( ; j <= roi.width - 2*VTraits<v_int16>::vlanes(); j += 2*VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_int16 v0, v1;
|
||||
v0 = vx_load( src + j );
|
||||
v1 = vx_load( src + j + v_int16::nlanes );
|
||||
v0 = v0 <= thresh8;
|
||||
v1 = v1 <= thresh8;
|
||||
v0 = v0 & maxval8;
|
||||
v1 = v1 & maxval8;
|
||||
v1 = vx_load( src + j + VTraits<v_int16>::vlanes() );
|
||||
v0 = v_le(v0, thresh8);
|
||||
v1 = v_le(v1, thresh8);
|
||||
v0 = v_and(v0, maxval8);
|
||||
v1 = v_and(v1, maxval8);
|
||||
v_store( dst + j, v0 );
|
||||
v_store( dst + j + v_int16::nlanes, v1 );
|
||||
v_store( dst + j + VTraits<v_int16>::vlanes(), v1 );
|
||||
}
|
||||
if( j <= roi.width - v_int16::nlanes )
|
||||
if( j <= roi.width - VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_int16 v0 = vx_load( src + j );
|
||||
v0 = v0 <= thresh8;
|
||||
v0 = v0 & maxval8;
|
||||
v0 = v_le(v0, thresh8);
|
||||
v0 = v_and(v0, maxval8);
|
||||
v_store( dst + j, v0 );
|
||||
j += v_int16::nlanes;
|
||||
j += VTraits<v_int16>::vlanes();
|
||||
}
|
||||
|
||||
for( ; j < roi.width; j++ )
|
||||
@ -642,22 +642,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
|
||||
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
|
||||
{
|
||||
j = 0;
|
||||
for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
|
||||
for( ; j <= roi.width - 2*VTraits<v_int16>::vlanes(); j += 2*VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_int16 v0, v1;
|
||||
v0 = vx_load( src + j );
|
||||
v1 = vx_load( src + j + v_int16::nlanes );
|
||||
v1 = vx_load( src + j + VTraits<v_int16>::vlanes() );
|
||||
v0 = v_min( v0, thresh8 );
|
||||
v1 = v_min( v1, thresh8 );
|
||||
v_store( dst + j, v0 );
|
||||
v_store( dst + j + v_int16::nlanes, v1 );
|
||||
v_store( dst + j + VTraits<v_int16>::vlanes(), v1 );
|
||||
}
|
||||
if( j <= roi.width - v_int16::nlanes )
|
||||
if( j <= roi.width - VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_int16 v0 = vx_load( src + j );
|
||||
v0 = v_min( v0, thresh8 );
|
||||
v_store( dst + j, v0 );
|
||||
j += v_int16::nlanes;
|
||||
j += VTraits<v_int16>::vlanes();
|
||||
}
|
||||
|
||||
for( ; j < roi.width; j++ )
|
||||
@ -669,22 +669,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
|
||||
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
|
||||
{
|
||||
j = 0;
|
||||
for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
|
||||
for( ; j <= roi.width - 2*VTraits<v_int16>::vlanes(); j += 2*VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_int16 v0, v1;
|
||||
v0 = vx_load( src + j );
|
||||
v1 = vx_load( src + j + v_int16::nlanes );
|
||||
v0 = ( thresh8 < v0 ) & v0;
|
||||
v1 = ( thresh8 < v1 ) & v1;
|
||||
v1 = vx_load( src + j + VTraits<v_int16>::vlanes() );
|
||||
v0 = v_and(v_lt(thresh8, v0), v0);
|
||||
v1 = v_and(v_lt(thresh8, v1), v1);
|
||||
v_store( dst + j, v0 );
|
||||
v_store( dst + j + v_int16::nlanes, v1 );
|
||||
v_store( dst + j + VTraits<v_int16>::vlanes(), v1 );
|
||||
}
|
||||
if( j <= roi.width - v_int16::nlanes )
|
||||
if( j <= roi.width - VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_int16 v0 = vx_load( src + j );
|
||||
v0 = ( thresh8 < v0 ) & v0;
|
||||
v0 = v_and(v_lt(thresh8, v0), v0);
|
||||
v_store( dst + j, v0 );
|
||||
j += v_int16::nlanes;
|
||||
j += VTraits<v_int16>::vlanes();
|
||||
}
|
||||
|
||||
for( ; j < roi.width; j++ )
|
||||
@ -696,22 +696,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
|
||||
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
|
||||
{
|
||||
j = 0;
|
||||
for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
|
||||
for( ; j <= roi.width - 2*VTraits<v_int16>::vlanes(); j += 2*VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_int16 v0, v1;
|
||||
v0 = vx_load( src + j );
|
||||
v1 = vx_load( src + j + v_int16::nlanes );
|
||||
v0 = ( v0 <= thresh8 ) & v0;
|
||||
v1 = ( v1 <= thresh8 ) & v1;
|
||||
v1 = vx_load( src + j + VTraits<v_int16>::vlanes() );
|
||||
v0 = v_and(v_le(v0, thresh8), v0);
|
||||
v1 = v_and(v_le(v1, thresh8), v1);
|
||||
v_store( dst + j, v0 );
|
||||
v_store( dst + j + v_int16::nlanes, v1 );
|
||||
v_store( dst + j + VTraits<v_int16>::vlanes(), v1 );
|
||||
}
|
||||
if( j <= roi.width - v_int16::nlanes )
|
||||
if( j <= roi.width - VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_int16 v0 = vx_load( src + j );
|
||||
v0 = ( v0 <= thresh8 ) & v0;
|
||||
v0 = v_and(v_le(v0, thresh8), v0);
|
||||
v_store( dst + j, v0 );
|
||||
j += v_int16::nlanes;
|
||||
j += VTraits<v_int16>::vlanes();
|
||||
}
|
||||
|
||||
for( ; j < roi.width; j++ )
|
||||
@ -777,7 +777,7 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
|
||||
}
|
||||
#endif
|
||||
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
int i, j;
|
||||
v_float32 thresh4 = vx_setall_f32( thresh );
|
||||
v_float32 maxval4 = vx_setall_f32( maxval );
|
||||
@ -788,25 +788,25 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
|
||||
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
|
||||
{
|
||||
j = 0;
|
||||
for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
|
||||
for( ; j <= roi.width - 2*VTraits<v_float32>::vlanes(); j += 2*VTraits<v_float32>::vlanes() )
|
||||
{
|
||||
v_float32 v0, v1;
|
||||
v0 = vx_load( src + j );
|
||||
v1 = vx_load( src + j + v_float32::nlanes );
|
||||
v0 = thresh4 < v0;
|
||||
v1 = thresh4 < v1;
|
||||
v0 = v0 & maxval4;
|
||||
v1 = v1 & maxval4;
|
||||
v1 = vx_load( src + j + VTraits<v_float32>::vlanes() );
|
||||
v0 = v_lt(thresh4, v0);
|
||||
v1 = v_lt(thresh4, v1);
|
||||
v0 = v_and(v0, maxval4);
|
||||
v1 = v_and(v1, maxval4);
|
||||
v_store( dst + j, v0 );
|
||||
v_store( dst + j + v_float32::nlanes, v1 );
|
||||
v_store( dst + j + VTraits<v_float32>::vlanes(), v1 );
|
||||
}
|
||||
if( j <= roi.width - v_float32::nlanes )
|
||||
if( j <= roi.width - VTraits<v_float32>::vlanes() )
|
||||
{
|
||||
v_float32 v0 = vx_load( src + j );
|
||||
v0 = thresh4 < v0;
|
||||
v0 = v0 & maxval4;
|
||||
v0 = v_lt(thresh4, v0);
|
||||
v0 = v_and(v0, maxval4);
|
||||
v_store( dst + j, v0 );
|
||||
j += v_float32::nlanes;
|
||||
j += VTraits<v_float32>::vlanes();
|
||||
}
|
||||
|
||||
for( ; j < roi.width; j++ )
|
||||
@ -818,25 +818,25 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
|
||||
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
|
||||
{
|
||||
j = 0;
|
||||
for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
|
||||
for( ; j <= roi.width - 2*VTraits<v_float32>::vlanes(); j += 2*VTraits<v_float32>::vlanes() )
|
||||
{
|
||||
v_float32 v0, v1;
|
||||
v0 = vx_load( src + j );
|
||||
v1 = vx_load( src + j + v_float32::nlanes );
|
||||
v0 = v0 <= thresh4;
|
||||
v1 = v1 <= thresh4;
|
||||
v0 = v0 & maxval4;
|
||||
v1 = v1 & maxval4;
|
||||
v1 = vx_load( src + j + VTraits<v_float32>::vlanes() );
|
||||
v0 = v_le(v0, thresh4);
|
||||
v1 = v_le(v1, thresh4);
|
||||
v0 = v_and(v0, maxval4);
|
||||
v1 = v_and(v1, maxval4);
|
||||
v_store( dst + j, v0 );
|
||||
v_store( dst + j + v_float32::nlanes, v1 );
|
||||
v_store( dst + j + VTraits<v_float32>::vlanes(), v1 );
|
||||
}
|
||||
if( j <= roi.width - v_float32::nlanes )
|
||||
if( j <= roi.width - VTraits<v_float32>::vlanes() )
|
||||
{
|
||||
v_float32 v0 = vx_load( src + j );
|
||||
v0 = v0 <= thresh4;
|
||||
v0 = v0 & maxval4;
|
||||
v0 = v_le(v0, thresh4);
|
||||
v0 = v_and(v0, maxval4);
|
||||
v_store( dst + j, v0 );
|
||||
j += v_float32::nlanes;
|
||||
j += VTraits<v_float32>::vlanes();
|
||||
}
|
||||
|
||||
for( ; j < roi.width; j++ )
|
||||
@ -848,22 +848,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
j = 0;
for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
for( ; j <= roi.width - 2*VTraits<v_float32>::vlanes(); j += 2*VTraits<v_float32>::vlanes() )
{
v_float32 v0, v1;
v0 = vx_load( src + j );
v1 = vx_load( src + j + v_float32::nlanes );
v1 = vx_load( src + j + VTraits<v_float32>::vlanes() );
v0 = v_min( v0, thresh4 );
v1 = v_min( v1, thresh4 );
v_store( dst + j, v0 );
v_store( dst + j + v_float32::nlanes, v1 );
v_store( dst + j + VTraits<v_float32>::vlanes(), v1 );
}
if( j <= roi.width - v_float32::nlanes )
if( j <= roi.width - VTraits<v_float32>::vlanes() )
{
v_float32 v0 = vx_load( src + j );
v0 = v_min( v0, thresh4 );
v_store( dst + j, v0 );
j += v_float32::nlanes;
j += VTraits<v_float32>::vlanes();
}

for( ; j < roi.width; j++ )
@ -875,22 +875,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
j = 0;
for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
for( ; j <= roi.width - 2*VTraits<v_float32>::vlanes(); j += 2*VTraits<v_float32>::vlanes() )
{
v_float32 v0, v1;
v0 = vx_load( src + j );
v1 = vx_load( src + j + v_float32::nlanes );
v0 = ( thresh4 < v0 ) & v0;
v1 = ( thresh4 < v1 ) & v1;
v1 = vx_load( src + j + VTraits<v_float32>::vlanes() );
v0 = v_and(v_lt(thresh4, v0), v0);
v1 = v_and(v_lt(thresh4, v1), v1);
v_store( dst + j, v0 );
v_store( dst + j + v_float32::nlanes, v1 );
v_store( dst + j + VTraits<v_float32>::vlanes(), v1 );
}
if( j <= roi.width - v_float32::nlanes )
if( j <= roi.width - VTraits<v_float32>::vlanes() )
{
v_float32 v0 = vx_load( src + j );
v0 = ( thresh4 < v0 ) & v0;
v0 = v_and(v_lt(thresh4, v0), v0);
v_store( dst + j, v0 );
j += v_float32::nlanes;
j += VTraits<v_float32>::vlanes();
}

for( ; j < roi.width; j++ )
@ -902,22 +902,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
j = 0;
for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
for( ; j <= roi.width - 2*VTraits<v_float32>::vlanes(); j += 2*VTraits<v_float32>::vlanes() )
{
v_float32 v0, v1;
v0 = vx_load( src + j );
v1 = vx_load( src + j + v_float32::nlanes );
v0 = ( v0 <= thresh4 ) & v0;
v1 = ( v1 <= thresh4 ) & v1;
v1 = vx_load( src + j + VTraits<v_float32>::vlanes() );
v0 = v_and(v_le(v0, thresh4), v0);
v1 = v_and(v_le(v1, thresh4), v1);
v_store( dst + j, v0 );
v_store( dst + j + v_float32::nlanes, v1 );
v_store( dst + j + VTraits<v_float32>::vlanes(), v1 );
}
if( j <= roi.width - v_float32::nlanes )
if( j <= roi.width - VTraits<v_float32>::vlanes() )
{
v_float32 v0 = vx_load( src + j );
v0 = ( v0 <= thresh4 ) & v0;
v0 = v_and(v_le(v0, thresh4), v0);
v_store( dst + j, v0 );
j += v_float32::nlanes;
j += VTraits<v_float32>::vlanes();
}

for( ; j < roi.width; j++ )
@ -948,7 +948,7 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
roi.height = 1;
}

#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
int i, j;
v_float64 thresh2 = vx_setall_f64( thresh );
v_float64 maxval2 = vx_setall_f64( maxval );
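For the double-precision path the guard grows the same way: `CV_SIMD_64F` becomes `(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)`, so `thresh_64f` keeps its SIMD path on scalable backends that provide `v_float64`. Note that the THRESH_TRUNC case above needed no comparison rewrite at all, since `v_min` was already a named function; a standalone sketch of that case for doubles, with the hypothetical helper `trunc_thresh_row`:

```cpp
#include <algorithm>
#include "opencv2/core/hal/intrin.hpp"

#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
using namespace cv;

// Hypothetical row kernel: dst[j] = min(src[j], thresh)
static void trunc_thresh_row(const double* src, double* dst, int width, double thresh)
{
    v_float64 thresh2 = vx_setall_f64(thresh);
    int j = 0;
    for (; j <= width - VTraits<v_float64>::vlanes();
         j += VTraits<v_float64>::vlanes())
        v_store(dst + j, v_min(vx_load(src + j), thresh2));
    for (; j < width; j++)  // scalar tail
        dst[j] = std::min(src[j], thresh);
}
#endif
```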
@ -959,25 +959,25 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
j = 0;
for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
for( ; j <= roi.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes() )
{
v_float64 v0, v1;
v0 = vx_load( src + j );
v1 = vx_load( src + j + v_float64::nlanes );
v0 = thresh2 < v0;
v1 = thresh2 < v1;
v0 = v0 & maxval2;
v1 = v1 & maxval2;
v1 = vx_load( src + j + VTraits<v_float64>::vlanes() );
v0 = v_lt(thresh2, v0);
v1 = v_lt(thresh2, v1);
v0 = v_and(v0, maxval2);
v1 = v_and(v1, maxval2);
v_store( dst + j, v0 );
v_store( dst + j + v_float64::nlanes, v1 );
v_store( dst + j + VTraits<v_float64>::vlanes(), v1 );
}
if( j <= roi.width - v_float64::nlanes )
if( j <= roi.width - VTraits<v_float64>::vlanes() )
{
v_float64 v0 = vx_load( src + j );
v0 = thresh2 < v0;
v0 = v0 & maxval2;
v0 = v_lt(thresh2, v0);
v0 = v_and(v0, maxval2);
v_store( dst + j, v0 );
j += v_float64::nlanes;
j += VTraits<v_float64>::vlanes();
}

for( ; j < roi.width; j++ )
@ -989,25 +989,25 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
j = 0;
for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
for( ; j <= roi.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes() )
{
v_float64 v0, v1;
v0 = vx_load( src + j );
v1 = vx_load( src + j + v_float64::nlanes );
v0 = v0 <= thresh2;
v1 = v1 <= thresh2;
v0 = v0 & maxval2;
v1 = v1 & maxval2;
v1 = vx_load( src + j + VTraits<v_float64>::vlanes() );
v0 = v_le(v0, thresh2);
v1 = v_le(v1, thresh2);
v0 = v_and(v0, maxval2);
v1 = v_and(v1, maxval2);
v_store( dst + j, v0 );
v_store( dst + j + v_float64::nlanes, v1 );
v_store( dst + j + VTraits<v_float64>::vlanes(), v1 );
}
if( j <= roi.width - v_float64::nlanes )
if( j <= roi.width - VTraits<v_float64>::vlanes() )
{
v_float64 v0 = vx_load( src + j );
v0 = v0 <= thresh2;
v0 = v0 & maxval2;
v0 = v_le(v0, thresh2);
v0 = v_and(v0, maxval2);
v_store( dst + j, v0 );
j += v_float64::nlanes;
j += VTraits<v_float64>::vlanes();
}

for( ; j < roi.width; j++ )
@ -1019,22 +1019,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
j = 0;
for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
for( ; j <= roi.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes() )
{
v_float64 v0, v1;
v0 = vx_load( src + j );
v1 = vx_load( src + j + v_float64::nlanes );
v1 = vx_load( src + j + VTraits<v_float64>::vlanes() );
v0 = v_min( v0, thresh2 );
v1 = v_min( v1, thresh2 );
v_store( dst + j, v0 );
v_store( dst + j + v_float64::nlanes, v1 );
v_store( dst + j + VTraits<v_float64>::vlanes(), v1 );
}
if( j <= roi.width - v_float64::nlanes )
if( j <= roi.width - VTraits<v_float64>::vlanes() )
{
v_float64 v0 = vx_load( src + j );
v0 = v_min( v0, thresh2 );
v_store( dst + j, v0 );
j += v_float64::nlanes;
j += VTraits<v_float64>::vlanes();
}

for( ; j < roi.width; j++ )
@ -1046,22 +1046,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
j = 0;
for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
for( ; j <= roi.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes() )
{
v_float64 v0, v1;
v0 = vx_load( src + j );
v1 = vx_load( src + j + v_float64::nlanes );
v0 = ( thresh2 < v0 ) & v0;
v1 = ( thresh2 < v1 ) & v1;
v1 = vx_load( src + j + VTraits<v_float64>::vlanes() );
v0 = v_and(v_lt(thresh2, v0), v0);
v1 = v_and(v_lt(thresh2, v1), v1);
v_store( dst + j, v0 );
v_store( dst + j + v_float64::nlanes, v1 );
v_store( dst + j + VTraits<v_float64>::vlanes(), v1 );
}
if( j <= roi.width - v_float64::nlanes )
if( j <= roi.width - VTraits<v_float64>::vlanes() )
{
v_float64 v0 = vx_load( src + j );
v0 = ( thresh2 < v0 ) & v0;
v0 = v_and(v_lt(thresh2, v0), v0);
v_store( dst + j, v0 );
j += v_float64::nlanes;
j += VTraits<v_float64>::vlanes();
}

for( ; j < roi.width; j++ )
@ -1073,22 +1073,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
j = 0;
for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
for( ; j <= roi.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes() )
{
v_float64 v0, v1;
v0 = vx_load( src + j );
v1 = vx_load( src + j + v_float64::nlanes );
v0 = ( v0 <= thresh2 ) & v0;
v1 = ( v1 <= thresh2 ) & v1;
v1 = vx_load( src + j + VTraits<v_float64>::vlanes() );
v0 = v_and(v_le(v0, thresh2), v0);
v1 = v_and(v_le(v1, thresh2), v1);
v_store( dst + j, v0 );
v_store( dst + j + v_float64::nlanes, v1 );
v_store( dst + j + VTraits<v_float64>::vlanes(), v1 );
}
if( j <= roi.width - v_float64::nlanes )
if( j <= roi.width - VTraits<v_float64>::vlanes() )
{
v_float64 v0 = vx_load( src + j );
v0 = ( v0 <= thresh2 ) & v0;
v0 = v_and(v_le(v0, thresh2), v0);
v_store( dst + j, v0 );
j += v_float64::nlanes;
j += VTraits<v_float64>::vlanes();
}

for( ; j < roi.width; j++ )
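The THRESH_TOZERO and THRESH_TOZERO_INV branches combine both idioms: a comparison produces an all-ones/all-zeros bit pattern per lane, so ANDing it with the source keeps the original value exactly where the predicate holds, even for floating-point lanes. A self-contained sketch of the TOZERO kernel under the new API, with the hypothetical helper `tozero_thresh_row`:

```cpp
#include "opencv2/core/hal/intrin.hpp"

#if (CV_SIMD || CV_SIMD_SCALABLE)
using namespace cv;

// Hypothetical row kernel: dst[j] = src[j] > thresh ? src[j] : 0
static void tozero_thresh_row(const float* src, float* dst, int width, float thresh)
{
    v_float32 thresh4 = vx_setall_f32(thresh);
    int j = 0;
    for (; j <= width - VTraits<v_float32>::vlanes();
         j += VTraits<v_float32>::vlanes())
    {
        v_float32 v = vx_load(src + j);
        // comparison lanes are full bit masks, so AND keeps v where thresh < v
        v_store(dst + j, v_and(v_lt(thresh4, v), v));
    }
    for (; j < width; j++)  // scalar tail
        dst[j] = src[j] > thresh ? src[j] : 0.f;
}
#endif
```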