diff --git a/modules/imgproc/src/accum.simd.hpp b/modules/imgproc/src/accum.simd.hpp
index e2be2c952e..7a29447497 100644
--- a/modules/imgproc/src/accum.simd.hpp
+++ b/modules/imgproc/src/accum.simd.hpp
@@ -425,9 +425,7 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn
                 v_uint16x8 v_0 = v_setall_u16(0);
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint8x16 _v_mask = v_load(mask + x);
-                    v_uint16x8 v_mask, dummy;
-                    v_expand(_v_mask, v_mask, dummy);
+                    v_uint16x8 v_mask = v_load_expand(mask + x);
                     v_mask = ~(v_mask == v_0);
                     v_uint16x8 v_src = v_load(src + x);
                     v_src = v_src & v_mask;
@@ -443,9 +441,7 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn
                 v_uint16x8 v_0 = v_setall_u16(0);
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint8x16 _v_mask = v_load(mask + x);
-                    v_uint16x8 v_mask, dummy;
-                    v_expand(_v_mask, v_mask, dummy);
+                    v_uint16x8 v_mask = v_load_expand(mask + x);
                     v_mask = ~(v_mask == v_0);
                     v_uint16x8 v_src0, v_src1, v_src2;
                     v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
@@ -491,8 +487,7 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
            {
                for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
                {
-                    v_uint16x8 v_masku16, dummy0;
-                    v_expand(v_load(mask + x), v_masku16, dummy0);
+                    v_uint16x8 v_masku16 = v_load_expand(mask + x);
                     v_uint32x4 v_masku320, v_masku321;
                     v_expand(v_masku16, v_masku320, v_masku321);
                     v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));
@@ -506,8 +501,7 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
            {
                for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
                {
-                    v_uint16x8 v_masku16, dummy0;
-                    v_expand(v_load(mask + x), v_masku16, dummy0);
+                    v_uint16x8 v_masku16 = v_load_expand(mask + x);
                     v_uint32x4 v_masku320, v_masku321;
                     v_expand(v_masku16, v_masku320, v_masku321);
                     v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));
@@ -770,8 +764,7 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c
            {
                for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                {
-                    v_uint16x8 v_mask, dummy;
-                    v_expand(v_load(mask + x), v_mask, dummy);
+                    v_uint16x8 v_mask = v_load_expand(mask + x);
                     v_mask = ~(v_mask == v_0);
                     v_uint16x8 v_src = v_load(src + x);
                     v_src = v_src & v_mask;
@@ -803,8 +796,7 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c
            {
                for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                {
-                    v_uint16x8 v_mask, dummy;
-                    v_expand(v_load(mask + x), v_mask, dummy);
+                    v_uint16x8 v_mask = v_load_expand(mask + x);
                     v_mask = ~(v_mask == v_0);
                     v_uint16x8 v_src0, v_src1, v_src2;
                     v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
@@ -871,10 +863,7 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn
            {
                for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
                {
-                    v_uint16x8 v_masku16, dummy0;
-                    v_expand(v_load(mask + x), v_masku16, dummy0);
-                    v_uint32x4 v_masku32, dummy1;
-                    v_expand(v_masku16, v_masku32, dummy1);
+                    v_uint32x4 v_masku32 = v_load_expand_q(mask + x);
                     v_uint64x2 v_masku640, v_masku641;
                     v_expand(v_masku32, v_masku640, v_masku641);
                     v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
@@ -892,10 +881,7 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn
            {
                for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
                {
-                    v_uint16x8 v_masku16, dummy0;
-                    v_expand(v_load(mask + x), v_masku16, dummy0);
-                    v_uint32x4 v_masku32, dummy1;
-                    v_expand(v_masku16, v_masku32, dummy1);
+                    v_uint32x4 v_masku32 = v_load_expand_q(mask + x);
                     v_uint64x2 v_masku640, v_masku641;
                     v_expand(v_masku32, v_masku640, v_masku641);
                     v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
@@ -947,10 +933,7 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c
            {
                for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
                {
-                    v_uint16x8 v_masku16, dummy0;
-                    v_expand(v_load(mask + x), v_masku16, dummy0);
-                    v_uint32x4 v_masku32, dummy1;
-                    v_expand(v_masku16, v_masku32, dummy1);
+                    v_uint32x4 v_masku32 = v_load_expand_q(mask + x);
                     v_uint64x2 v_masku640, v_masku641;
                     v_expand(v_masku32, v_masku640, v_masku641);
                     v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
@@ -967,10 +950,7 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c
            {
                for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
                {
-                    v_uint16x8 v_masku16, dummy0;
-                    v_expand(v_load(mask + x), v_masku16, dummy0);
-                    v_uint32x4 v_masku32, dummy1;
-                    v_expand(v_masku16, v_masku32, dummy1);
+                    v_uint32x4 v_masku32 = v_load_expand_q(mask + x);
                     v_uint64x2 v_masku640, v_masku641;
                     v_expand(v_masku32, v_masku640, v_masku641);
                     v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
@@ -1157,9 +1137,9 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int
            {
                for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
                {
-                    v_uint16x8 stub = v_load_expand(mask + x);
+                    v_uint16x8 v_mask16 = v_load_expand(mask + x);
                     v_uint32x4 v_mask0, v_mask1;
-                    v_expand(stub, v_mask0, v_mask1);
+                    v_expand(v_mask16, v_mask0, v_mask1);
                     v_mask0 = ~(v_mask0 == v_0);
                     v_mask1 = ~(v_mask1 == v_0);
                     v_uint16x8 v_src = v_load(src + x);
@@ -1182,9 +1162,9 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int
            {
                for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
                {
-                    v_uint16x8 stub = v_load_expand(mask + x);
+                    v_uint16x8 v_mask16 = v_load_expand(mask + x);
                     v_uint32x4 v_mask0, v_mask1;
-                    v_expand(stub, v_mask0, v_mask1);
+                    v_expand(v_mask16, v_mask0, v_mask1);
                     v_mask0 = ~(v_mask0 == v_0);
                     v_mask1 = ~(v_mask1 == v_0);
 
@@ -1254,11 +1234,11 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int
            {
                for (; x <= len - cVectorWidth; x += cVectorWidth)
                {
-                    v_uint16x8 v_stub = v_load_expand(mask + x);
-                    v_uint32x4 v_stub0, v_stub1;
-                    v_expand(v_stub, v_stub0, v_stub1);
-                    v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_stub0 == v_0));
-                    v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_stub1 == v_0));
+                    v_uint16x8 v_mask16 = v_load_expand(mask + x);
+                    v_uint32x4 v_mask_0, v_mask_1;
+                    v_expand(v_mask16, v_mask_0, v_mask_1);
+                    v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));
+                    v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));
                     v_float32x4 v_src0 = v_load(src + x);
                     v_float32x4 v_src1 = v_load(src + x + 4);
                     v_src0 = v_src0 & v_mask0;
@@ -1274,11 +1254,11 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int
            {
                for (; x <= len - cVectorWidth; x += cVectorWidth)
                {
-                    v_uint16x8 v_stub = v_load_expand(mask + x);
-                    v_uint32x4 v_stub0, v_stub1;
-                    v_expand(v_stub, v_stub0, v_stub1);
-                    v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_stub0 == v_0));
-                    v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_stub1 == v_0));
+                    v_uint16x8 v_mask16 = v_load_expand(mask + x);
+                    v_uint32x4 v_mask_0, v_mask_1;
+                    v_expand(v_mask16, v_mask_0, v_mask_1);
+                    v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));
+                    v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));
 
                     v_float32x4 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21;
                     v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
@@ -1319,9 +1299,7 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
-            v_uint8x16 v_src = v_load(src + x);
-            v_uint16x8 v_int, dummy;
-            v_expand(v_src, v_int, dummy);
+            v_uint16x8 v_int = v_load_expand(src + x);
 
             v_uint32x4 v_int0, v_int1;
             v_expand(v_int, v_int0, v_int1);
@@ -1353,17 +1331,15 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int
        }
        else
        {
-            v_uint8x16 v_0 = v_setzero_u8();
+            v_uint16x8 v_0 = v_setzero_u16();
             if (cn == 1)
             {
                 for (; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint8x16 v_mask = v_load(mask + x);
+                    v_uint16x8 v_mask = v_load_expand(mask + x);
                     v_mask = ~(v_mask == v_0);
-                    v_uint8x16 v_src = v_load(src + x);
-                    v_src = v_src & v_mask;
-                    v_uint16x8 v_int, dummy;
-                    v_expand(v_src, v_int, dummy);
+                    v_uint16x8 v_src = v_load_expand(src + x);
+                    v_uint16x8 v_int = v_src & v_mask;
 
                     v_uint32x4 v_int0, v_int1;
                     v_expand(v_int, v_int0, v_int1);
@@ -1395,19 +1371,19 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int
            }
            else if (cn == 3)
            {
-                for (; x <= len - cVectorWidth; x += cVectorWidth)
+                for (; x <= len - /*cVectorWidth*/16; x += cVectorWidth)
                 {
-                    v_uint8x16 v_mask = v_load(mask + x);
-                    v_mask = ~(v_mask == v_0);
                     v_uint8x16 v_src0, v_src1, v_src2;
                     v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
-                    v_src0 = v_src0 & v_mask;
-                    v_src1 = v_src1 & v_mask;
-                    v_src2 = v_src2 & v_mask;
                     v_uint16x8 v_int0, v_int1, v_int2, dummy;
                     v_expand(v_src0, v_int0, dummy);
                     v_expand(v_src1, v_int1, dummy);
                     v_expand(v_src2, v_int2, dummy);
+                    v_uint16x8 v_mask = v_load_expand(mask + x);
+                    v_mask = ~(v_mask == v_0);
+                    v_int0 = v_int0 & v_mask;
+                    v_int1 = v_int1 & v_mask;
+                    v_int2 = v_int2 & v_mask;
 
                     v_uint32x4 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
                     v_expand(v_int0, v_int00, v_int01);
@@ -1627,9 +1603,7 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int
            {
                for (; x <= len - cVectorWidth; x += cVectorWidth)
                {
-                    v_uint16x8 stub = v_load_expand(mask + x);
-                    v_uint32x4 v_mask, dummy;
-                    v_expand(stub, v_mask, dummy);
+                    v_uint32x4 v_mask = v_load_expand_q(mask + x);
                     v_mask = ~(v_mask == v_0);
                     v_float32x4 v_src = v_load(src + x);
                     v_src = v_src & v_reinterpret_as_f32(v_mask);
@@ -1646,9 +1620,7 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int
            {
                for (; x <= len - cVectorWidth; x += cVectorWidth)
                {
-                    v_uint16x8 stub = v_load_expand(mask + x);
-                    v_uint32x4 v_mask, dummy;
-                    v_expand(stub, v_mask, dummy);
+                    v_uint32x4 v_mask = v_load_expand_q(mask + x);
                     v_mask = ~(v_mask == v_0);
 
                     v_float32x4 v_src0, v_src1, v_src2;
@@ -1709,11 +1681,9 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, in
            {
                for (; x <= len - cVectorWidth; x += cVectorWidth)
                {
-                    v_uint16x8 stub = v_load_expand(mask + x);
-                    v_uint32x4 stub0, stub1;
-                    v_expand(stub, stub0, stub1);
+                    v_uint32x4 v_mask32 = v_load_expand_q(mask + x);
                     v_uint64x2 v_masku640, v_masku641;
-                    v_expand(stub0, v_masku640, v_masku641);
+                    v_expand(v_mask32, v_masku640, v_masku641);
                     v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
                     v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
                     v_float64x2 v_src0 = v_load(src + x);
@@ -1731,11 +1701,9 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, in
            {
                for (; x <= len - cVectorWidth; x += cVectorWidth)
                {
-                    v_uint16x8 stub = v_load_expand(mask + x);
-                    v_uint32x4 stub0, stub1;
-                    v_expand(stub, stub0, stub1);
+                    v_uint32x4 v_mask32 = v_load_expand_q(mask + x);
                     v_uint64x2 v_masku640, v_masku641;
-                    v_expand(stub0, v_masku640, v_masku641);
+                    v_expand(v_mask32, v_masku640, v_masku641);
                     v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
                     v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
 
@@ -2059,11 +2027,10 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar
            {
                for (; x <= len - cVectorWidth; x += cVectorWidth)
                {
-                    v_uint16x8 stub = v_load_expand(mask + x);
-                    v_uint32x4 stub0, stub1;
-                    v_expand(stub, stub0, stub1);
-                    v_float32x4 v_mask0 = v_reinterpret_as_f32(~(stub0 == v_0));
-                    v_float32x4 v_mask1 = v_reinterpret_as_f32(~(stub1 == v_0));
+                    v_uint32x4 v_mask32_0 = v_load_expand_q(mask + x);
+                    v_uint32x4 v_mask32_1 = v_load_expand_q(mask + x + 4);
+                    v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
+                    v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));
 
                     v_store(dst + x, v_load(dst + x) + ((v_load(src1 + x) * v_load(src2 + x)) & v_mask0));
                     v_store(dst + x + 4, v_load(dst + x + 4) + ((v_load(src1 + x + 4) * v_load(src2 + x + 4)) & v_mask1));
@@ -2073,11 +2040,10 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar
            {
                for (; x <= len - cVectorWidth; x += cVectorWidth)
                {
-                    v_uint16x8 stub = v_load_expand(mask + x);
-                    v_uint32x4 stub0, stub1;
-                    v_expand(stub, stub0, stub1);
-                    v_float32x4 v_mask0 = v_reinterpret_as_f32(~(stub0 == v_0));
-                    v_float32x4 v_mask1 = v_reinterpret_as_f32(~(stub1 == v_0));
+                    v_uint32x4 v_mask32_0 = v_load_expand_q(mask + x);
+                    v_uint32x4 v_mask32_1 = v_load_expand_q(mask + x + 4);
+                    v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
+                    v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));
 
                     v_float32x4 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21;
                     v_float32x4 v_2src00, v_2src01, v_2src10, v_2src11, v_2src21, v_2src21;
@@ -2109,12 +2075,8 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
-            v_uint8x16 v_1src = v_load(src1 + x);
-            v_uint8x16 v_2src = v_load(src2 + x);
-
-            v_uint16x8 v_1int, v_2int, dummy;
-            v_expand(v_1src, v_1int, dummy);
-            v_expand(v_2src, v_2int, dummy);
+            v_uint16x8 v_1int = v_load_expand(src1 + x);
+            v_uint16x8 v_2int = v_load_expand(src2 + x);
 
             v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
             v_expand(v_1int, v_1int_0, v_1int_1);
@@ -2148,19 +2110,15 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha
        }
        else
        {
-            v_uint8x16 v_0 = v_setzero_u8();
+            v_uint16x8 v_0 = v_setzero_u16();
             if (cn == 1)
             {
                 for (; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint8x16 v_mask = v_load(mask + x);
+                    v_uint16x8 v_mask = v_load_expand(mask + x);
                     v_mask = ~(v_mask == v_0);
-                    v_uint8x16 v_1src = v_load(src1 + x) & v_mask;
-                    v_uint8x16 v_2src = v_load(src2 + x) & v_mask;
-
-                    v_uint16x8 v_1int, v_2int, dummy;
-                    v_expand(v_1src, v_1int, dummy);
-                    v_expand(v_2src, v_2int, dummy);
+                    v_uint16x8 v_1int = v_load_expand(src1 + x) & v_mask;
+                    v_uint16x8 v_2int = v_load_expand(src2 + x) & v_mask;
 
                     v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
                     v_expand(v_1int, v_1int_0, v_1int_1);
@@ -2194,19 +2152,11 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha
            }
            else if (cn == 3)
            {
-                for (; x <= len - cVectorWidth; x += cVectorWidth)
+                for (; x <= len - /*cVectorWidth*/16; x += cVectorWidth)
                 {
-                    v_uint8x16 v_mask = v_load(mask + x);
-                    v_mask = ~(v_mask == v_0);
                     v_uint8x16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
                     v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
                     v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
-                    v_1src0 = v_1src0 & v_mask;
-                    v_1src1 = v_1src1 & v_mask;
-                    v_1src2 = v_1src2 & v_mask;
-                    v_2src0 = v_2src0 & v_mask;
-                    v_2src1 = v_2src1 & v_mask;
-                    v_2src2 = v_2src2 & v_mask;
 
                     v_uint16x8 v_1int0, v_1int1, v_1int2, v_2int0, v_2int1, v_2int2, dummy;
                     v_expand(v_1src0, v_1int0, dummy);
@@ -2216,6 +2166,15 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha
                     v_expand(v_2src1, v_2int1, dummy);
                     v_expand(v_2src2, v_2int2, dummy);
 
+                    v_uint16x8 v_mask = v_load_expand(mask + x);
+                    v_mask = ~(v_mask == v_0);
+                    v_1int0 = v_1int0 & v_mask;
+                    v_1int1 = v_1int1 & v_mask;
+                    v_1int2 = v_1int2 & v_mask;
+                    v_2int0 = v_2int0 & v_mask;
+                    v_2int1 = v_2int1 & v_mask;
+                    v_2int2 = v_2int2 & v_mask;
+
                     v_uint32x4 v_1int00, v_1int01, v_1int10, v_1int11, v_1int20, v_1int21;
                     v_uint32x4 v_2int00, v_2int01, v_2int10, v_2int11, v_2int20, v_2int21;
                     v_expand(v_1int0, v_1int00, v_1int01);
@@ -2440,9 +2399,7 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha
            {
                for (; x <= len - cVectorWidth; x += cVectorWidth)
                {
-                    v_uint16x8 stub = v_load_expand(mask + x);
-                    v_uint32x4 v_mask, dummy;
-                    v_expand(stub, v_mask, dummy);
+                    v_uint32x4 v_mask = v_load_expand_q(mask + x);
                     v_mask = ~(v_mask == v_0);
                     v_float32x4 v_1src = v_load(src1 + x);
                     v_float32x4 v_2src = v_load(src2 + x);
@@ -2462,9 +2419,7 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha
            {
                for (; x <= len - cVectorWidth; x += cVectorWidth)
                {
-                    v_uint16x8 stub = v_load_expand(mask + x);
-                    v_uint32x4 v_mask, dummy;
-                    v_expand(stub, v_mask, dummy);
+                    v_uint32x4 v_mask = v_load_expand_q(mask + x);
                     v_mask = ~(v_mask == v_0);
                     v_float32x4 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
                     v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
@@ -2522,11 +2477,9 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc
            {
                for (; x <= len - cVectorWidth; x += cVectorWidth)
                {
-                    v_uint16x8 stub = v_load_expand(mask + x);
-                    v_uint32x4 stub0, stub1;
-                    v_expand(stub, stub0, stub1);
+                    v_uint32x4 v_mask32 = v_load_expand_q(mask + x);
                     v_uint64x2 v_masku640, v_masku641;
-                    v_expand(stub0, v_masku640, v_masku641);
+                    v_expand(v_mask32, v_masku640, v_masku641);
                     v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
                     v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
 
@@ -2543,11 +2496,9 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc
            {
                for (; x <= len - cVectorWidth; x += cVectorWidth)
                {
-                    v_uint16x8 stub = v_load_expand(mask + x);
-                    v_uint32x4 stub0, stub1;
-                    v_expand(stub, stub0, stub1);
+                    v_uint32x4 v_mask32 = v_load_expand_q(mask + x);
                     v_uint64x2 v_masku640, v_masku641;
-                    v_expand(stub0, v_masku640, v_masku641);
+                    v_expand(v_mask32, v_masku640, v_masku641);
                     v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
                     v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
 
@@ -2704,12 +2655,10 @@ void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int c
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
-            v_uint8x16 v_src = v_load(src + x);
-            v_uint16x8 v_int, dummy;
-            v_expand(v_src, v_int, dummy);
+            v_uint16x8 v_src16 = v_load_expand(src + x);
 
             v_uint32x4 v_int_0, v_int_1;
-            v_expand(v_int, v_int_0, v_int_1);
+            v_expand(v_src16, v_int_0, v_int_1);
             v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0);
             v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1);
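Note: the recurring rewrite in this patch swaps the two-step "v_load plus v_expand with a dummy high half" idiom for OpenCV's single-step widening loads: v_load_expand (8 uchars to v_uint16x8) and v_load_expand_q (4 uchars to v_uint32x4). Besides dropping the unused dummy vector, the one-step form reads only the bytes it consumes, whereas v_load always reads a full 16-byte vector; that matters near the end of a row. A minimal sketch of the equivalence, using OpenCV's universal intrinsics (the helper names are hypothetical, for illustration only; not part of the patch):

#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

// Old idiom: load 16 mask bytes, widen the low 8, discard the high 8.
static inline v_uint16x8 widen_mask_old(const uchar* mask, int x)
{
    v_uint8x16 v_m = v_load(mask + x);   // reads 16 bytes
    v_uint16x8 v_mask, dummy;
    v_expand(v_m, v_mask, dummy);        // dummy (high half) is never used
    return v_mask;
}

// New idiom: load and widen in one call; reads only 8 bytes.
// v_load_expand_q does the same to 32-bit lanes from 4 bytes.
static inline v_uint16x8 widen_mask_new(const uchar* mask, int x)
{
    return v_load_expand(mask + x);
}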