Merge pull request #24371 from hanliutong:clean-up

Clean up the obsolete API of Universal Intrinsic
Alexander Smorkalov, 2023-10-20 12:50:26 +03:00, committed by GitHub
commit 1c0ca41b6e
40 changed files with 1611 additions and 1531 deletions

View File

@ -723,7 +723,7 @@ namespace CV__SIMD_NAMESPACE {
/** @brief SIMD processing state cleanup call */
inline void vx_cleanup() { VXPREFIX(_cleanup)(); }
#if !CV_SIMD_SCALABLE
#if !CV_SIMD_SCALABLE && !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP))
// Compatibility layer
template<typename T> struct VTraits {
@ -1148,6 +1148,74 @@ namespace CV__SIMD_NAMESPACE {
#endif //!CV_SIMD_SCALABLE
#if (CV_NEON /* || CV_others */) && !defined(CV_FORCE_SIMD128_CPP)
// Compatibility layer for backends whose obsolete API has already been cleaned up.
#define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
return v_add(v_add(f1, f2), vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
#endif
#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
return v_mul(v_mul(f1, f2), vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
#endif
#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
{ \
return v_extract_n<VTraits<_Tpvec>::nlanes-1>(v); \
}
OPENCV_HAL_WRAP_EXTRACT(v_uint8)
OPENCV_HAL_WRAP_EXTRACT(v_int8)
OPENCV_HAL_WRAP_EXTRACT(v_uint16)
OPENCV_HAL_WRAP_EXTRACT(v_int16)
OPENCV_HAL_WRAP_EXTRACT(v_uint32)
OPENCV_HAL_WRAP_EXTRACT(v_int32)
OPENCV_HAL_WRAP_EXTRACT(v_uint64)
OPENCV_HAL_WRAP_EXTRACT(v_int64)
OPENCV_HAL_WRAP_EXTRACT(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64)
#endif
#define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \
inline _Tpvec v_broadcast_highest(const _Tpvec& v) \
{ \
return v_broadcast_element<VTraits<_Tpvec>::nlanes-1>(v); \
}
OPENCV_HAL_WRAP_BROADCAST(v_uint32)
OPENCV_HAL_WRAP_BROADCAST(v_int32)
OPENCV_HAL_WRAP_BROADCAST(v_float32)
#endif //CV_NEON
//! @cond IGNORED
// backward compatibility
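A quick usage sketch of the compatibility wrappers above (illustrative only, not part of the patch; assumes an OpenCV build with CV_SIMD enabled):

#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

int main()
{
    using namespace cv;
#if CV_SIMD
    v_float32 a = vx_setall_f32(1.f), b = vx_setall_f32(2.f), c = vx_setall_f32(3.f);
    v_float32 s = v_add(a, b, c);                 // variadic wrapper: v_add(v_add(a, b), c)
    // v_extract_highest(v) expands to v_extract_n<VTraits<v_float32>::nlanes - 1>(v)
    std::printf("last lane: %f\n", (double)v_extract_highest(s));
#endif
    return 0;
}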

View File

@ -131,13 +131,22 @@ OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2, int64x1, s64)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1,f64)
#endif
//////////// Compatibility layer ////////////
template<typename T> struct VTraits {
static inline int vlanes() { return T::nlanes; }
enum { max_nlanes = T::nlanes, nlanes = T::nlanes };
using lane_type = typename T::lane_type;
};
template<typename T>
inline typename VTraits<T>::lane_type v_get0(const T& v) \
{ \
return v.get0(); \
}
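For reference, how the new VTraits/v_get0 pair is used from client code instead of the now-private members (an illustrative sketch; names other than the intrinsics themselves are made up):

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
static int traits_demo()
{
    using namespace cv;
    v_int32x4 v(1, 2, 3, 4);
    const int lanes = VTraits<v_int32x4>::vlanes(); // replaces v_int32x4::nlanes
    int buf[VTraits<v_int32x4>::max_nlanes];        // compile-time bound for stack buffers
    v_store(buf, v);
    return v_get0(v) + lanes + buf[0];              // v_get0(v) replaces v.get0()
}
#endif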
//////////// Types ////////////
struct v_uint8x16
{
typedef uchar lane_type;
enum { nlanes = 16 };
v_uint8x16() {}
explicit v_uint8x16(uint8x16_t v) : val(v) {}
v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
@ -146,19 +155,22 @@ struct v_uint8x16
uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
val = vld1q_u8(v);
}
uint8x16_t val;
private:
friend struct VTraits<v_uint8x16>;
enum { nlanes = 16 };
typedef uchar lane_type;
friend typename VTraits<v_uint8x16>::lane_type v_get0<v_uint8x16>(const v_uint8x16& v);
uchar get0() const
{
return vgetq_lane_u8(val, 0);
}
uint8x16_t val;
};
struct v_int8x16
{
typedef schar lane_type;
enum { nlanes = 16 };
v_int8x16() {}
explicit v_int8x16(int8x16_t v) : val(v) {}
v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
@ -167,19 +179,22 @@ struct v_int8x16
schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
val = vld1q_s8(v);
}
int8x16_t val;
private:
friend struct VTraits<v_int8x16>;
enum { nlanes = 16 };
typedef schar lane_type;
friend typename VTraits<v_int8x16>::lane_type v_get0<v_int8x16>(const v_int8x16& v);
schar get0() const
{
return vgetq_lane_s8(val, 0);
}
int8x16_t val;
};
struct v_uint16x8
{
typedef ushort lane_type;
enum { nlanes = 8 };
v_uint16x8() {}
explicit v_uint16x8(uint16x8_t v) : val(v) {}
v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
@ -187,19 +202,22 @@ struct v_uint16x8
ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
val = vld1q_u16(v);
}
uint16x8_t val;
private:
friend struct VTraits<v_uint16x8>;
enum { nlanes = 8 };
typedef ushort lane_type;
friend typename VTraits<v_uint16x8>::lane_type v_get0<v_uint16x8>(const v_uint16x8& v);
ushort get0() const
{
return vgetq_lane_u16(val, 0);
}
uint16x8_t val;
};
struct v_int16x8
{
typedef short lane_type;
enum { nlanes = 8 };
v_int16x8() {}
explicit v_int16x8(int16x8_t v) : val(v) {}
v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
@ -207,19 +225,22 @@ struct v_int16x8
short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
val = vld1q_s16(v);
}
int16x8_t val;
private:
friend struct VTraits<v_int16x8>;
enum { nlanes = 8 };
typedef short lane_type;
friend typename VTraits<v_int16x8>::lane_type v_get0<v_int16x8>(const v_int16x8& v);
short get0() const
{
return vgetq_lane_s16(val, 0);
}
int16x8_t val;
};
struct v_uint32x4
{
typedef unsigned lane_type;
enum { nlanes = 4 };
v_uint32x4() {}
explicit v_uint32x4(uint32x4_t v) : val(v) {}
v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
@ -227,19 +248,22 @@ struct v_uint32x4
unsigned v[] = {v0, v1, v2, v3};
val = vld1q_u32(v);
}
uint32x4_t val;
private:
friend struct VTraits<v_uint32x4>;
enum { nlanes = 4 };
typedef unsigned lane_type;
friend typename VTraits<v_uint32x4>::lane_type v_get0<v_uint32x4>(const v_uint32x4& v);
unsigned get0() const
{
return vgetq_lane_u32(val, 0);
}
uint32x4_t val;
};
struct v_int32x4
{
typedef int lane_type;
enum { nlanes = 4 };
v_int32x4() {}
explicit v_int32x4(int32x4_t v) : val(v) {}
v_int32x4(int v0, int v1, int v2, int v3)
@ -247,18 +271,22 @@ struct v_int32x4
int v[] = {v0, v1, v2, v3};
val = vld1q_s32(v);
}
int32x4_t val;
private:
friend struct VTraits<v_int32x4>;
enum { nlanes = 4 };
typedef int lane_type;
friend typename VTraits<v_int32x4>::lane_type v_get0<v_int32x4>(const v_int32x4& v);
int get0() const
{
return vgetq_lane_s32(val, 0);
}
int32x4_t val;
};
struct v_float32x4
{
typedef float lane_type;
enum { nlanes = 4 };
v_float32x4() {}
explicit v_float32x4(float32x4_t v) : val(v) {}
v_float32x4(float v0, float v1, float v2, float v3)
@ -266,18 +294,22 @@ struct v_float32x4
float v[] = {v0, v1, v2, v3};
val = vld1q_f32(v);
}
float32x4_t val;
private:
friend struct VTraits<v_float32x4>;
enum { nlanes = 4 };
typedef float lane_type;
friend typename VTraits<v_float32x4>::lane_type v_get0<v_float32x4>(const v_float32x4& v);
float get0() const
{
return vgetq_lane_f32(val, 0);
}
float32x4_t val;
};
struct v_uint64x2
{
typedef uint64 lane_type;
enum { nlanes = 2 };
v_uint64x2() {}
explicit v_uint64x2(uint64x2_t v) : val(v) {}
v_uint64x2(uint64 v0, uint64 v1)
@ -285,18 +317,21 @@ struct v_uint64x2
uint64 v[] = {v0, v1};
val = vld1q_u64(v);
}
uint64x2_t val;
private:
friend struct VTraits<v_uint64x2>;
enum { nlanes = 2 };
typedef uint64 lane_type;
friend typename VTraits<v_uint64x2>::lane_type v_get0<v_uint64x2>(const v_uint64x2& v);
uint64 get0() const
{
return vgetq_lane_u64(val, 0);
}
uint64x2_t val;
};
struct v_int64x2
{
typedef int64 lane_type;
enum { nlanes = 2 };
v_int64x2() {}
explicit v_int64x2(int64x2_t v) : val(v) {}
v_int64x2(int64 v0, int64 v1)
@ -304,19 +339,23 @@ struct v_int64x2
int64 v[] = {v0, v1};
val = vld1q_s64(v);
}
int64x2_t val;
private:
friend struct VTraits<v_int64x2>;
enum { nlanes = 2 };
typedef int64 lane_type;
friend typename VTraits<v_int64x2>::lane_type v_get0<v_int64x2>(const v_int64x2& v);
int64 get0() const
{
return vgetq_lane_s64(val, 0);
}
int64x2_t val;
};
#if CV_SIMD128_64F
struct v_float64x2
{
typedef double lane_type;
enum { nlanes = 2 };
v_float64x2() {}
explicit v_float64x2(float64x2_t v) : val(v) {}
v_float64x2(double v0, double v1)
@ -324,11 +363,18 @@ struct v_float64x2
double v[] = {v0, v1};
val = vld1q_f64(v);
}
float64x2_t val;
private:
friend struct VTraits<v_float64x2>;
enum { nlanes = 2 };
typedef double lane_type;
friend typename VTraits<v_float64x2>::lane_type v_get0<v_float64x2>(const v_float64x2& v);
double get0() const
{
return vgetq_lane_f64(val, 0);
}
float64x2_t val;
};
#endif
@ -460,71 +506,56 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
}
#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
a.val = intrin(a.val, b.val); \
return a; \
}
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint8x16, vqaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint8x16, vqsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint16x8, vqsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int16x8, vqsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_int32x4, vmulq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint32x4, vaddq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint32x4, vsubq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_uint32x4, vmulq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float32x4, vaddq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float32x4, vsubq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float32x4, vmulq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int64x2, vaddq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int64x2, vsubq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint64x2, vaddq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint64x2, vsubq_u64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float32x4, vdivq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float64x2, vaddq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float64x2, vsubq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float64x2, vmulq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float64x2, vdivq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float32x4, vdivq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float64x2, vaddq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float64x2, vsubq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float64x2, vmulq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float64x2, vdivq_f64)
#else
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
inline v_float32x4 v_div (const v_float32x4& a, const v_float32x4& b)
{
float32x4_t reciprocal = vrecpeq_f32(b.val);
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
return v_float32x4(vmulq_f32(a.val, reciprocal));
}
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
{
float32x4_t reciprocal = vrecpeq_f32(b.val);
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
a.val = vmulq_f32(a.val, reciprocal);
return a;
}
#endif
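For clarity, the hand-written expansion of one instantiation of the renamed macro, OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint8x16, vqaddq_u8):

inline v_uint8x16 v_add (const v_uint8x16& a, const v_uint8x16& b)
{
    return v_uint8x16(vqaddq_u8(a.val, b.val)); // saturating 8-bit add
}
// The operator+ / operator+= pair is gone: callers now write v_add(a, b),
// and x = v_add(x, y) where they previously wrote x += y.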
// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_mul (const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpwvec c, d; \
v_mul_expand(a, b, c, d); \
return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
}
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8)
@ -698,7 +729,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
const v_uint32x4& c)
{
return v_dotprod_expand(a, b) + c;
return v_add(v_dotprod_expand(a, b), c);
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
@ -715,7 +746,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
const v_int32x4& c)
{
return v_dotprod_expand(a, b) + c;
return v_add(v_dotprod_expand(a, b), c);
}
#endif
// 16 >> 64
@ -735,7 +766,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
return v_uint64x2(vaddq_u64(s0, s1));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
@ -752,7 +783,7 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
const v_int64x2& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
// 32 >> 64f
#if CV_SIMD128_64F
@ -760,7 +791,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
#endif
//////// Fast Dot Product ////////
@ -850,7 +881,7 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
}
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
return v_dotprod_expand_fast(a, b) + c;
return v_add(v_dotprod_expand_fast(a, b), c);
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
@ -861,7 +892,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
return v_dotprod_expand_fast(a, b) + c;
return v_add(v_dotprod_expand_fast(a, b), c);
}
#endif
@ -875,7 +906,7 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
return v_uint64x2(vaddq_u64(s0, s1));
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
{ return v_add(v_dotprod_expand_fast(a, b), c); }
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
@ -884,22 +915,22 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
return v_int64x2(vaddl_s32(vget_low_s32(prod), vget_high_s32(prod)));
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
{ return v_add(v_dotprod_expand_fast(a, b), c); }
// 32 >> 64f
#if CV_SIMD128_64F
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod_fast(a, b)); }
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
{ return v_add(v_dotprod_expand_fast(a, b), c); }
#endif
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
OPENCV_HAL_IMPL_NEON_BIN_OP(v_and, _Tpvec, vandq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(v_or, _Tpvec, vorrq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(v_xor, _Tpvec, veorq_##suffix) \
inline _Tpvec v_not (const _Tpvec& a) \
{ \
return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
}
@ -914,21 +945,16 @@ OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
inline v_float32x4 bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
return a; \
}
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_and, vandq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_or, vorrq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_xor, veorq_s32)
inline v_float32x4 operator ~ (const v_float32x4& a)
inline v_float32x4 v_not (const v_float32x4& a)
{
return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
}
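A typical use of the renamed float bit operations (an illustrative sketch, not code from this patch), shown for the NEON types touched here: clearing the sign bit with v_and reproduces what v_abs does internally.

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
inline cv::v_float32x4 abs_via_mask(const cv::v_float32x4& x)
{
    using namespace cv;
    v_float32x4 mask = v_reinterpret_as_f32(v_setall_s32(0x7fffffff));
    return v_and(x, mask); // previously written as x & mask
}
#endif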
@ -942,7 +968,7 @@ inline v_float32x4 v_sqrt(const v_float32x4& x)
inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
v_float32x4 one = v_setall_f32(1.0f);
return one / v_sqrt(x);
return v_div(one, v_sqrt(x));
}
#else
inline v_float32x4 v_sqrt(const v_float32x4& x)
@ -975,21 +1001,16 @@ inline v_float32x4 v_abs(v_float32x4 x)
#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
inline v_float64x2 bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
a.val = vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val))); \
return a; \
}
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(&, vandq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(|, vorrq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(^, veorq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_and, vandq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_or, vorrq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_xor, veorq_s64)
inline v_float64x2 operator ~ (const v_float64x2& a)
inline v_float64x2 v_not (const v_float64x2& a)
{
return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val))));
}
@ -1002,7 +1023,7 @@ inline v_float64x2 v_sqrt(const v_float64x2& x)
inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
v_float64x2 one = v_setall_f64(1.0f);
return one / v_sqrt(x);
return v_div(one, v_sqrt(x));
}
inline v_float64x2 v_abs(v_float64x2 x)
@ -1037,17 +1058,17 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
#endif
#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_eq (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_ne (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_lt (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_gt (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_le (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_ge (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
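How the renamed comparisons are typically consumed (illustrative): each of v_eq/v_ne/v_lt/v_gt/v_le/v_ge returns a per-lane all-ones or all-zeros mask of the same vector type, which then feeds v_select.

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
inline cv::v_int32x4 clamp_negatives_to_zero(const cv::v_int32x4& x)
{
    using namespace cv;
    v_int32x4 zero = v_setzero_s32();
    return v_select(v_lt(x, zero), zero, x); // previously: v_select(x < zero, zero, x)
}
#endif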
@ -1065,22 +1086,22 @@ static inline uint64x2_t vmvnq_u64(uint64x2_t a)
}
//OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
//OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
static inline v_uint64x2 operator == (const v_uint64x2& a, const v_uint64x2& b)
static inline v_uint64x2 v_eq (const v_uint64x2& a, const v_uint64x2& b)
{ return v_uint64x2(vceqq_u64(a.val, b.val)); }
static inline v_uint64x2 operator != (const v_uint64x2& a, const v_uint64x2& b)
static inline v_uint64x2 v_ne (const v_uint64x2& a, const v_uint64x2& b)
{ return v_uint64x2(vmvnq_u64(vceqq_u64(a.val, b.val))); }
static inline v_int64x2 operator == (const v_int64x2& a, const v_int64x2& b)
static inline v_int64x2 v_eq (const v_int64x2& a, const v_int64x2& b)
{ return v_int64x2(vreinterpretq_s64_u64(vceqq_s64(a.val, b.val))); }
static inline v_int64x2 operator != (const v_int64x2& a, const v_int64x2& b)
static inline v_int64x2 v_ne (const v_int64x2& a, const v_int64x2& b)
{ return v_int64x2(vreinterpretq_s64_u64(vmvnq_u64(vceqq_s64(a.val, b.val)))); }
#else
static inline v_uint64x2 operator == (const v_uint64x2& a, const v_uint64x2& b)
static inline v_uint64x2 v_eq (const v_uint64x2& a, const v_uint64x2& b)
{
uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val));
uint32x4_t swapped = vrev64q_u32(cmp);
return v_uint64x2(vreinterpretq_u64_u32(vandq_u32(cmp, swapped)));
}
static inline v_uint64x2 operator != (const v_uint64x2& a, const v_uint64x2& b)
static inline v_uint64x2 v_ne (const v_uint64x2& a, const v_uint64x2& b)
{
uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val));
uint32x4_t swapped = vrev64q_u32(cmp);
@ -1088,13 +1109,13 @@ static inline v_uint64x2 operator != (const v_uint64x2& a, const v_uint64x2& b)
uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
return v_uint64x2(veorq_u64(v_eq, vx));
}
static inline v_int64x2 operator == (const v_int64x2& a, const v_int64x2& b)
static inline v_int64x2 v_eq (const v_int64x2& a, const v_int64x2& b)
{
return v_reinterpret_as_s64(v_reinterpret_as_u64(a) == v_reinterpret_as_u64(b));
return v_reinterpret_as_s64(v_eq(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b)));
}
static inline v_int64x2 operator != (const v_int64x2& a, const v_int64x2& b)
static inline v_int64x2 v_ne (const v_int64x2& a, const v_int64x2& b)
{
return v_reinterpret_as_s64(v_reinterpret_as_u64(a) != v_reinterpret_as_u64(b));
return v_reinterpret_as_s64(v_ne(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b)));
}
#endif
#if CV_SIMD128_64F
@ -1207,9 +1228,9 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_
// trade efficiency for convenience
#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
inline _Tpvec v_shl (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
inline _Tpvec v_shr (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
@ -1231,13 +1252,13 @@ OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, VTraits<_Tpvec>::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \
{ return _Tpvec(vextq_##suffix(b.val, a.val, VTraits<_Tpvec>::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }
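Shift usage after the rename (a sketch, not from the patch): the template form with an immediate count is unchanged, while the new v_shl/v_shr functions replace the operator<< / operator>> forms that took a runtime count.

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
inline cv::v_uint16x8 halve_then_shift(const cv::v_uint16x8& a, int bits)
{
    using namespace cv;
    v_uint16x8 h = v_shr<1>(a); // immediate count, unchanged by this patch
    return v_shr(h, bits);      // runtime count, previously h >> bits
}
#endif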

View File

@ -358,8 +358,8 @@ static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_f
static inline void vx_load_as(const double* ptr, v_float32& a)
{
const int VECSZ = v_float32::nlanes;
float buf[VECSZ*2];
const int VECSZ = VTraits<v_float32>::vlanes();
float buf[VTraits<v_float32>::max_nlanes*2];
for( int i = 0; i < VECSZ; i++ )
buf[i] = saturate_cast<float>(ptr[i]);
@ -369,19 +369,19 @@ static inline void vx_load_as(const double* ptr, v_float32& a)
template<typename _Tdvec>
static inline void vx_load_pair_as(const double* ptr, _Tdvec& a, _Tdvec& b)
{
const int VECSZ = _Tdvec::nlanes;
typename _Tdvec::lane_type buf[VECSZ*2];
const int VECSZ = VTraits<_Tdvec>::vlanes();
typename VTraits<_Tdvec>::lane_type buf[VTraits<_Tdvec>::max_nlanes*2];
for( int i = 0; i < VECSZ*2; i++ )
buf[i] = saturate_cast<typename _Tdvec::lane_type>(ptr[i]);
buf[i] = saturate_cast<typename VTraits<_Tdvec>::lane_type>(ptr[i]);
a = vx_load(buf);
b = vx_load(buf + VECSZ);
}
static inline void v_store_as(double* ptr, const v_float32& a)
{
const int VECSZ = v_float32::nlanes;
float buf[VECSZ];
const int VECSZ = VTraits<v_float32>::vlanes();
float buf[VTraits<v_float32>::max_nlanes];
v_store(buf, a);
for( int i = 0; i < VECSZ; i++ )
@ -391,8 +391,8 @@ static inline void v_store_as(double* ptr, const v_float32& a)
template<typename _Tsvec>
static inline void v_store_pair_as(double* ptr, const _Tsvec& a, const _Tsvec& b)
{
const int VECSZ = _Tsvec::nlanes;
typename _Tsvec::lane_type buf[VECSZ*2];
const int VECSZ = VTraits<_Tsvec>::vlanes();
typename VTraits<_Tsvec>::lane_type buf[VTraits<_Tsvec>::max_nlanes*2];
v_store(buf, a); v_store(buf + VECSZ, b);
for( int i = 0; i < VECSZ*2; i++ )

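The buffer-sizing pattern behind these edits, in isolation (a sketch): VTraits<>::vlanes() gives the runtime lane count, while max_nlanes remains a compile-time constant and is therefore what can size a stack array (on scalable backends vlanes() is not a constant expression).

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD
static void doubles_to_f32_vec(const double* ptr, cv::v_float32& a)
{
    using namespace cv;
    const int VECSZ = VTraits<v_float32>::vlanes(); // runtime lane count
    float buf[VTraits<v_float32>::max_nlanes];      // compile-time upper bound
    for (int i = 0; i < VECSZ; i++)
        buf[i] = (float)ptr[i];                     // plain cast here; the real code uses saturate_cast
    a = vx_load(buf);
}
#endif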
View File

@ -93,13 +93,13 @@ struct v_atan_f32
{
v_float32 ax = v_abs(x);
v_float32 ay = v_abs(y);
v_float32 c = v_min(ax, ay) / (v_max(ax, ay) + eps);
v_float32 cc = c * c;
v_float32 a = v_fma(v_fma(v_fma(cc, p7, p5), cc, p3), cc, p1)*c;
a = v_select(ax >= ay, a, val90 - a);
a = v_select(x < z, val180 - a, a);
a = v_select(y < z, val360 - a, a);
return a * s;
v_float32 c = v_div(v_min(ax, ay), v_add(v_max(ax, ay), this->eps));
v_float32 cc = v_mul(c, c);
v_float32 a = v_mul(v_fma(v_fma(v_fma(cc, this->p7, this->p5), cc, this->p3), cc, this->p1), c);
a = v_select(v_ge(ax, ay), a, v_sub(this->val90, a));
a = v_select(v_lt(x, this->z), v_sub(this->val180, a), a);
a = v_select(v_lt(y, this->z), v_sub(this->val360, a), a);
return v_mul(a, this->s);
}
v_float32 eps;
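The same arithmetic in scalar form, for readability (illustrative only; p1..p7, eps and the 90/180/360 constants mirror the struct members used above, and s is the output scale):

#include <algorithm>
#include <cmath>

static float atan_approx(float y, float x, float p1, float p3, float p5, float p7,
                         float eps, float s)
{
    float ax = std::fabs(x), ay = std::fabs(y);
    float c  = std::min(ax, ay) / (std::max(ax, ay) + eps);
    float cc = c * c;
    float a  = (((p7 * cc + p5) * cc + p3) * cc + p1) * c; // polynomial approximation of atan(c) in degrees
    if (ax < ay) a = 90.f - a;
    if (x < 0.f) a = 180.f - a;
    if (y < 0.f) a = 360.f - a;
    return a * s;
}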
@ -125,7 +125,7 @@ static void fastAtan32f_(const float *Y, const float *X, float *angle, int len,
float scale = angleInDegrees ? 1.f : (float)(CV_PI/180);
int i = 0;
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
const int VECSZ = VTraits<v_float32>::vlanes();
v_atan_f32 v(scale);
for( ; i < len; i += VECSZ*2 )
@ -198,7 +198,7 @@ void magnitude32f(const float* x, const float* y, float* mag, int len)
int i = 0;
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
const int VECSZ = VTraits<v_float32>::vlanes();
for( ; i < len; i += VECSZ*2 )
{
if( i + VECSZ*2 > len )
@ -209,8 +209,8 @@ void magnitude32f(const float* x, const float* y, float* mag, int len)
}
v_float32 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ);
v_float32 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ);
x0 = v_sqrt(v_muladd(x0, x0, y0*y0));
x1 = v_sqrt(v_muladd(x1, x1, y1*y1));
x0 = v_sqrt(v_muladd(x0, x0, v_mul(y0, y0)));
x1 = v_sqrt(v_muladd(x1, x1, v_mul(y1, y1)));
v_store(mag + i, x0);
v_store(mag + i + VECSZ, x1);
}
@ -231,7 +231,7 @@ void magnitude64f(const double* x, const double* y, double* mag, int len)
int i = 0;
#if CV_SIMD_64F
const int VECSZ = v_float64::nlanes;
const int VECSZ = VTraits<v_float64>::vlanes();
for( ; i < len; i += VECSZ*2 )
{
if( i + VECSZ*2 > len )
@ -242,8 +242,8 @@ void magnitude64f(const double* x, const double* y, double* mag, int len)
}
v_float64 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ);
v_float64 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ);
x0 = v_sqrt(v_muladd(x0, x0, y0*y0));
x1 = v_sqrt(v_muladd(x1, x1, y1*y1));
x0 = v_sqrt(v_muladd(x0, x0, v_mul(y0, y0)));
x1 = v_sqrt(v_muladd(x1, x1, v_mul(y1, y1)));
v_store(mag + i, x0);
v_store(mag + i + VECSZ, x1);
}
@ -265,7 +265,7 @@ void invSqrt32f(const float* src, float* dst, int len)
int i = 0;
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
const int VECSZ = VTraits<v_float32>::vlanes();
for( ; i < len; i += VECSZ*2 )
{
if( i + VECSZ*2 > len )
@ -293,7 +293,7 @@ void invSqrt64f(const double* src, double* dst, int len)
int i = 0;
#if CV_SIMD_64F
const int VECSZ = v_float64::nlanes;
const int VECSZ = VTraits<v_float64>::vlanes();
for ( ; i < len; i += VECSZ*2)
{
if( i + VECSZ*2 > len )
@ -321,7 +321,7 @@ void sqrt32f(const float* src, float* dst, int len)
int i = 0;
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
const int VECSZ = VTraits<v_float32>::vlanes();
for( ; i < len; i += VECSZ*2 )
{
if( i + VECSZ*2 > len )
@ -350,7 +350,7 @@ void sqrt64f(const double* src, double* dst, int len)
int i = 0;
#if CV_SIMD_64F
const int VECSZ = v_float64::nlanes;
const int VECSZ = VTraits<v_float64>::vlanes();
for( ; i < len; i += VECSZ*2 )
{
if( i + VECSZ*2 > len )
@ -452,7 +452,7 @@ void exp32f( const float *_x, float *y, int n )
float postscale = (float)exp_postscale;
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
const int VECSZ = VTraits<v_float32>::vlanes();
const v_float32 vprescale = vx_setall_f32((float)exp_prescale);
const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);
const v_float32 vminval = vx_setall_f32(minval);
@ -481,26 +481,26 @@ void exp32f( const float *_x, float *y, int n )
xf0 = v_min(v_max(xf0, vminval), vmaxval);
xf1 = v_min(v_max(xf1, vminval), vmaxval);
xf0 *= vprescale;
xf1 *= vprescale;
xf0 = v_mul(xf0, vprescale);
xf1 = v_mul(xf1, vprescale);
v_int32 xi0 = v_round(xf0);
v_int32 xi1 = v_round(xf1);
xf0 = (xf0 - v_cvt_f32(xi0))*vpostscale;
xf1 = (xf1 - v_cvt_f32(xi1))*vpostscale;
xf0 = v_mul(v_sub(xf0, v_cvt_f32(xi0)), vpostscale);
xf1 = v_mul(v_sub(xf1, v_cvt_f32(xi1)), vpostscale);
v_float32 yf0 = v_lut(expTab_f, xi0 & vidxmask);
v_float32 yf1 = v_lut(expTab_f, xi1 & vidxmask);
v_float32 yf0 = v_lut(expTab_f, v_and(xi0, vidxmask));
v_float32 yf1 = v_lut(expTab_f, v_and(xi1, vidxmask));
v_int32 v0 = vx_setzero_s32(), v127 = vx_setall_s32(127), v255 = vx_setall_s32(255);
xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v127, v0), v255);
xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v127, v0), v255);
xi0 = v_min(v_max(v_add(v_shr<6>(xi0), v127), v0), v255);
xi1 = v_min(v_max(v_add(v_shr<6>(xi1), v127), v0), v255);
yf0 *= v_reinterpret_as_f32(v_shl<23>(xi0));
yf1 *= v_reinterpret_as_f32(v_shl<23>(xi1));
yf0 = v_mul(yf0, v_reinterpret_as_f32(v_shl<23>(xi0)));
yf1 = v_mul(yf1, v_reinterpret_as_f32(v_shl<23>(xi1)));
v_float32 zf0 = xf0 + vA1;
v_float32 zf1 = xf1 + vA1;
v_float32 zf0 = v_add(xf0, vA1);
v_float32 zf1 = v_add(xf1, vA1);
zf0 = v_fma(zf0, xf0, vA2);
zf1 = v_fma(zf1, xf1, vA2);
@ -511,8 +511,8 @@ void exp32f( const float *_x, float *y, int n )
zf0 = v_fma(zf0, xf0, vA4);
zf1 = v_fma(zf1, xf1, vA4);
zf0 *= yf0;
zf1 *= yf1;
zf0 = v_mul(zf0, yf0);
zf1 = v_mul(zf1, yf1);
if( y_aligned )
{
@ -566,7 +566,7 @@ void exp64f( const double *_x, double *y, int n )
double maxval = (exp_max_val/exp_prescale);
#if CV_SIMD_64F
const int VECSZ = v_float64::nlanes;
const int VECSZ = VTraits<v_float64>::vlanes();
const v_float64 vprescale = vx_setall_f64(exp_prescale);
const v_float64 vpostscale = vx_setall_f64(exp_postscale);
const v_float64 vminval = vx_setall_f64(minval);
@ -596,30 +596,30 @@ void exp64f( const double *_x, double *y, int n )
xf0 = v_min(v_max(xf0, vminval), vmaxval);
xf1 = v_min(v_max(xf1, vminval), vmaxval);
xf0 *= vprescale;
xf1 *= vprescale;
xf0 = v_mul(xf0, vprescale);
xf1 = v_mul(xf1, vprescale);
v_int32 xi0 = v_round(xf0);
v_int32 xi1 = v_round(xf1);
xf0 = (xf0 - v_cvt_f64(xi0))*vpostscale;
xf1 = (xf1 - v_cvt_f64(xi1))*vpostscale;
xf0 = v_mul(v_sub(xf0, v_cvt_f64(xi0)), vpostscale);
xf1 = v_mul(v_sub(xf1, v_cvt_f64(xi1)), vpostscale);
v_float64 yf0 = v_lut(expTab, xi0 & vidxmask);
v_float64 yf1 = v_lut(expTab, xi1 & vidxmask);
v_float64 yf0 = v_lut(expTab, v_and(xi0, vidxmask));
v_float64 yf1 = v_lut(expTab, v_and(xi1, vidxmask));
v_int32 v0 = vx_setzero_s32(), v1023 = vx_setall_s32(1023), v2047 = vx_setall_s32(2047);
xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v1023, v0), v2047);
xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v1023, v0), v2047);
xi0 = v_min(v_max(v_add(v_shr<6>(xi0), v1023), v0), v2047);
xi1 = v_min(v_max(v_add(v_shr<6>(xi1), v1023), v0), v2047);
v_int64 xq0, xq1, dummy;
v_expand(xi0, xq0, dummy);
v_expand(xi1, xq1, dummy);
yf0 *= v_reinterpret_as_f64(v_shl<52>(xq0));
yf1 *= v_reinterpret_as_f64(v_shl<52>(xq1));
yf0 = v_mul(yf0, v_reinterpret_as_f64(v_shl<52>(xq0)));
yf1 = v_mul(yf1, v_reinterpret_as_f64(v_shl<52>(xq1)));
v_float64 zf0 = xf0 + vA1;
v_float64 zf1 = xf1 + vA1;
v_float64 zf0 = v_add(xf0, vA1);
v_float64 zf1 = v_add(xf1, vA1);
zf0 = v_fma(zf0, xf0, vA2);
zf1 = v_fma(zf1, xf1, vA2);
@ -633,8 +633,8 @@ void exp64f( const double *_x, double *y, int n )
zf0 = v_fma(zf0, xf0, vA5);
zf1 = v_fma(zf1, xf1, vA5);
zf0 *= yf0;
zf1 *= yf1;
zf0 = v_mul(zf0, yf0);
zf1 = v_mul(zf1, yf1);
if( y_aligned )
{
@ -696,7 +696,7 @@ void log32f( const float *_x, float *y, int n )
const int* x = (const int*)_x;
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
const int VECSZ = VTraits<v_float32>::vlanes();
const v_float32 vln2 = vx_setall_f32((float)ln_2);
const v_float32 v1 = vx_setall_f32(1.f);
const v_float32 vshift = vx_setall_f32(-1.f/512);
@ -715,18 +715,18 @@ void log32f( const float *_x, float *y, int n )
}
v_int32 h0 = vx_load(x + i);
v_int32 yi0 = (v_shr<23>(h0) & vx_setall_s32(255)) - vx_setall_s32(127);
v_int32 xi0 = (h0 & vx_setall_s32(LOGTAB_MASK2_32F)) | vx_setall_s32(127 << 23);
v_int32 yi0 = v_sub(v_and(v_shr<23>(h0), vx_setall_s32(255)), vx_setall_s32(127));
v_int32 xi0 = v_or(v_and(h0, vx_setall_s32(LOGTAB_MASK2_32F)), vx_setall_s32(127 << 23));
h0 = v_shr<23 - LOGTAB_SCALE - 1>(h0) & vx_setall_s32(LOGTAB_MASK*2);
h0 = v_and(v_shr<23 - 8 - 1>(h0), vx_setall_s32(((1 << 8) - 1) * 2));
v_float32 yf0, xf0;
v_lut_deinterleave(logTab_f, h0, yf0, xf0);
yf0 = v_fma(v_cvt_f32(yi0), vln2, yf0);
v_float32 delta = v_select(v_reinterpret_as_f32(h0 == vx_setall_s32(510)), vshift, vx_setall<float>(0));
xf0 = v_fma((v_reinterpret_as_f32(xi0) - v1), xf0, delta);
v_float32 delta = v_select(v_reinterpret_as_f32(v_eq(h0, vx_setall_s32(510))), vshift, vx_setall<float>(0));
xf0 = v_fma((v_sub(v_reinterpret_as_f32(xi0), v1)), xf0, delta);
v_float32 zf0 = v_fma(xf0, vA0, vA1);
zf0 = v_fma(zf0, xf0, vA2);
@ -771,7 +771,7 @@ void log64f( const double *x, double *y, int n )
int i = 0;
#if CV_SIMD_64F
const int VECSZ = v_float64::nlanes;
const int VECSZ = VTraits<v_float64>::vlanes();
const v_float64 vln2 = vx_setall_f64(ln_2);
const v_float64
@ -791,20 +791,20 @@ void log64f( const double *x, double *y, int n )
v_int64 h0 = vx_load((const int64*)x + i);
v_int32 yi0 = v_pack(v_shr<52>(h0), vx_setzero_s64());
yi0 = (yi0 & vx_setall_s32(0x7ff)) - vx_setall_s32(1023);
yi0 = v_sub(v_and(yi0, vx_setall_s32(2047)), vx_setall_s32(1023));
v_int64 xi0 = (h0 & vx_setall_s64(LOGTAB_MASK2_64F)) | vx_setall_s64((int64)1023 << 52);
v_int64 xi0 = v_or(v_and(h0, vx_setall_s64(LOGTAB_MASK2_64F)), vx_setall_s64((int64)1023 << 52));
h0 = v_shr<52 - LOGTAB_SCALE - 1>(h0);
v_int32 idx = v_pack(h0, h0) & vx_setall_s32(LOGTAB_MASK*2);
v_int32 idx = v_and(v_pack(h0, h0), vx_setall_s32(((1 << 8) - 1) * 2));
v_float64 xf0, yf0;
v_lut_deinterleave(logTab, idx, yf0, xf0);
yf0 = v_fma(v_cvt_f64(yi0), vln2, yf0);
v_float64 delta = v_cvt_f64(idx == vx_setall_s32(510))*vx_setall_f64(1./512);
xf0 = v_fma(v_reinterpret_as_f64(xi0) - vx_setall_f64(1.), xf0, delta);
v_float64 delta = v_mul(v_cvt_f64(v_eq(idx, vx_setall_s32(510))), vx_setall_f64(1. / 512));
xf0 = v_fma(v_sub(v_reinterpret_as_f64(xi0), vx_setall_f64(1.)), xf0, delta);
v_float64 xq = xf0*xf0;
v_float64 xq = v_mul(xf0, xf0);
v_float64 zf0 = v_fma(xq, vA0, vA2);
v_float64 zf1 = v_fma(xq, vA1, vA3);
zf0 = v_fma(zf0, xq, vA4);

View File

@ -1584,7 +1584,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
v_float32x4 _m2h = v_rotate_left<1>(_m2l);
v_float32x4 _m3h = v_rotate_left<1>(_m3l);
v_int16x8 _delta(0, -32768, -32768, -32768, -32768, -32768, -32768, 0);
for( ; x <= len*3 - v_uint16x8::nlanes; x += 3*v_uint16x8::nlanes/4 )
for( ; x <= len*3 - VTraits<v_uint16x8>::vlanes(); x += 3*VTraits<v_uint16x8>::vlanes()/4 )
v_store(dst + x, v_rotate_right<1>(v_reinterpret_as_u16(v_add_wrap(v_pack(
v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x ))), _m0h, _m1h, _m2h, _m3h)),
v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x + 3))), _m0l, _m1l, _m2l, _m3l))), _delta))));
@ -1664,10 +1664,10 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
v_float32x4 _m2 = v_load(m + 10);
v_float32x4 _m3 = v_load(m + 15);
v_float32x4 _m4(m[4], m[9], m[14], m[19]);
for( ; x < len*4; x += v_float32x4::nlanes )
for( ; x < len*4; x += VTraits<v_float32x4>::vlanes() )
{
v_float32x4 v_src = v_load(src + x);
v_store(dst + x, v_reduce_sum4(v_src * _m0, v_src * _m1, v_src * _m2, v_src * _m3) + _m4);
v_store(dst + x, v_add(v_reduce_sum4(v_mul(v_src, _m0), v_mul(v_src, _m1), v_mul(v_src, _m2), v_mul(v_src, _m3)), _m4));
}
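What the vectorized statement computes, written out per output channel (scalar form of the 4-channel case handled by this loop; m is the 4x5 transform matrix in row-major order):

static void transform4_scalar(const float src[4], float dst[4], const float m[20])
{
    for (int r = 0; r < 4; r++)
        dst[r] = m[r*5+0]*src[0] + m[r*5+1]*src[1] +
                 m[r*5+2]*src[2] + m[r*5+3]*src[3] + m[r*5+4];
}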
#else // CV_SIMD_WIDTH >= 16 && !CV_SIMD128
for( ; x < len*4; x += 4 )
@ -2113,12 +2113,12 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
for( k = 0; k < size.height; k++, tsrc += srcstep )
{
v_float64x2 a = v_setall_f64((double)col_buf[k]);
s0 += a * v_load(tsrc+0);
s1 += a * v_load(tsrc+2);
s0 = v_add(s0, v_mul(a, v_load(tsrc + 0)));
s1 = v_add(s1, v_mul(a, v_load(tsrc + 2)));
}
v_store((double*)(tdst+j), s0*v_scale);
v_store((double*)(tdst+j+2), s1*v_scale);
v_store((double*)(tdst+j), v_mul(s0, v_scale));
v_store((double*)(tdst+j+2), v_mul(s1, v_scale));
} else
#endif
{
@ -2174,12 +2174,12 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep )
{
v_float64x2 a = v_setall_f64((double)col_buf[k]);
s0 += a * (v_load(tsrc+0) - v_load(d+0));
s1 += a * (v_load(tsrc+2) - v_load(d+2));
s0 = v_add(s0, v_mul(a, v_sub(v_load(tsrc + 0), v_load(d + 0))));
s1 = v_add(s1, v_mul(a, v_sub(v_load(tsrc + 2), v_load(d + 2))));
}
v_store((double*)(tdst+j), s0*v_scale);
v_store((double*)(tdst+j+2), s1*v_scale);
v_store((double*)(tdst+j), v_mul(s0, v_scale));
v_store((double*)(tdst+j+2), v_mul(s1, v_scale));
}
else
#endif
@ -2249,8 +2249,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
v_float64x2 v_s = v_setzero_f64();
for( k = 0; k <= size.width - 4; k += 4 )
v_s += (v_load(v_tsrc1+k) * v_load(v_tsrc2+k)) +
(v_load(v_tsrc1+k+2) * v_load(v_tsrc2+k+2));
v_s = v_add(v_s, v_add(v_mul(v_load(v_tsrc1 + k), v_load(v_tsrc2 + k)), v_mul(v_load(v_tsrc1 + k + 2), v_load(v_tsrc2 + k + 2))));
s += v_reduce_sum(v_s);
}
else
@ -2303,8 +2302,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
v_float64x2 v_s = v_setzero_f64();
for( k = 0; k <= size.width - 4; k += 4, v_tdelta2 += delta_shift )
v_s += ((v_load(v_tsrc2+k) - v_load(v_tdelta2)) * v_load(v_row_buf+k)) +
((v_load(v_tsrc2+k+2) - v_load(v_tdelta2+2)) * v_load(v_row_buf+k+2));
v_s = v_add(v_s, v_add(v_mul(v_sub(v_load(v_tsrc2 + k), v_load(v_tdelta2)), v_load(v_row_buf + k)), v_mul(v_sub(v_load(v_tsrc2 + k + 2), v_load(v_tdelta2 + 2)), v_load(v_row_buf + k + 2))));
s += v_reduce_sum(v_s);
tdelta2 = (const dT *)(v_tdelta2);
@ -2566,7 +2564,7 @@ double dotProd_32s(const int* src1, const int* src2, int len)
v_sum0 = v_dotprod_expand_fast(v_src10, v_src20, v_sum0);
v_sum1 = v_dotprod_expand_fast(v_src11, v_src21, v_sum1);
}
v_sum0 += v_sum1;
v_sum0 = v_add(v_sum0, v_sum1);
#endif
for (; i < len - step; i += step, src1 += step, src2 += step)
{

View File

@ -356,10 +356,10 @@ void transposeND(InputArray src_, const std::vector<int>& order, OutputArray dst
#if CV_SIMD128
template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
{
typedef typename V::lane_type T;
typedef typename VTraits<V>::lane_type T;
int end = (int)(size.width*esz);
int width = (end + 1)/2;
int width_1 = width & -v_uint8x16::nlanes;
int width_1 = width & -VTraits<v_uint8x16>::vlanes();
int i, j;
#if CV_STRONG_ALIGNMENT
@ -368,15 +368,15 @@ template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, s
for( ; size.height--; src += sstep, dst += dstep )
{
for( i = 0, j = end; i < width_1; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
for( i = 0, j = end; i < width_1; i += VTraits<v_uint8x16>::vlanes(), j -= VTraits<v_uint8x16>::vlanes() )
{
V t0, t1;
t0 = v_load((T*)((uchar*)src + i));
t1 = v_load((T*)((uchar*)src + j - v_uint8x16::nlanes));
t1 = v_load((T*)((uchar*)src + j - VTraits<v_uint8x16>::vlanes()));
t0 = v_reverse(t0);
t1 = v_reverse(t1);
v_store((T*)(dst + j - v_uint8x16::nlanes), t0);
v_store((T*)(dst + j - VTraits<v_uint8x16>::vlanes()), t0);
v_store((T*)(dst + i), t1);
}
if (isAligned<sizeof(T)>(src, dst))
@ -446,14 +446,14 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
#if CV_STRONG_ALIGNMENT
size_t alignmentMark = ((size_t)src)|((size_t)dst)|sstep|dstep;
#endif
if (esz == 2 * v_uint8x16::nlanes)
if (esz == 2 * (size_t)VTraits<v_uint8x16>::vlanes())
{
int end = (int)(size.width*esz);
int width = end/2;
for( ; size.height--; src += sstep, dst += dstep )
{
for( int i = 0, j = end - 2 * v_uint8x16::nlanes; i < width; i += 2 * v_uint8x16::nlanes, j -= 2 * v_uint8x16::nlanes )
for( int i = 0, j = end - 2 * VTraits<v_uint8x16>::vlanes(); i < width; i += 2 * VTraits<v_uint8x16>::vlanes(), j -= 2 * VTraits<v_uint8x16>::vlanes() )
{
#if CV_SIMD256
v_uint8x32 t0, t1;
@ -466,25 +466,25 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
v_uint8x16 t0, t1, t2, t3;
t0 = v_load((uchar*)src + i);
t1 = v_load((uchar*)src + i + v_uint8x16::nlanes);
t1 = v_load((uchar*)src + i + VTraits<v_uint8x16>::vlanes());
t2 = v_load((uchar*)src + j);
t3 = v_load((uchar*)src + j + v_uint8x16::nlanes);
t3 = v_load((uchar*)src + j + VTraits<v_uint8x16>::vlanes());
v_store(dst + j, t0);
v_store(dst + j + v_uint8x16::nlanes, t1);
v_store(dst + j + VTraits<v_uint8x16>::vlanes(), t1);
v_store(dst + i, t2);
v_store(dst + i + v_uint8x16::nlanes, t3);
v_store(dst + i + VTraits<v_uint8x16>::vlanes(), t3);
#endif
}
}
}
else if (esz == v_uint8x16::nlanes)
else if (esz == (size_t)VTraits<v_uint8x16>::vlanes())
{
int end = (int)(size.width*esz);
int width = end/2;
for( ; size.height--; src += sstep, dst += dstep )
{
for( int i = 0, j = end - v_uint8x16::nlanes; i < width; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
for( int i = 0, j = end - VTraits<v_uint8x16>::vlanes(); i < width; i += VTraits<v_uint8x16>::vlanes(), j -= VTraits<v_uint8x16>::vlanes() )
{
v_uint8x16 t0, t1;
@ -534,19 +534,19 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
for( ; size.height--; src += sstep, dst += dstep )
{
for ( int i = 0, j = end; i < width; i += v_uint8x16::nlanes + sizeof(uint64_t), j -= v_uint8x16::nlanes + sizeof(uint64_t) )
for ( int i = 0, j = end; i < width; i += VTraits<v_uint8x16>::vlanes() + sizeof(uint64_t), j -= VTraits<v_uint8x16>::vlanes() + sizeof(uint64_t) )
{
v_uint8x16 t0, t1;
uint64_t t2, t3;
t0 = v_load((uchar*)src + i);
t2 = *((uint64_t*)((uchar*)src + i + v_uint8x16::nlanes));
t1 = v_load((uchar*)src + j - v_uint8x16::nlanes - sizeof(uint64_t));
t2 = *((uint64_t*)((uchar*)src + i + VTraits<v_uint8x16>::vlanes()));
t1 = v_load((uchar*)src + j - VTraits<v_uint8x16>::vlanes() - sizeof(uint64_t));
t3 = *((uint64_t*)((uchar*)src + j - sizeof(uint64_t)));
v_store(dst + j - v_uint8x16::nlanes - sizeof(uint64_t), t0);
v_store(dst + j - VTraits<v_uint8x16>::vlanes() - sizeof(uint64_t), t0);
*((uint64_t*)(dst + j - sizeof(uint64_t))) = t2;
v_store(dst + i, t1);
*((uint64_t*)(dst + i + v_uint8x16::nlanes)) = t3;
*((uint64_t*)(dst + i + VTraits<v_uint8x16>::vlanes())) = t3;
}
}
}

View File

@ -141,7 +141,7 @@ CV_ALWAYS_INLINE uint64_t v_reduce_min(const v_uint64x2& a)
CV_ALWAYS_INLINE v_uint64x2 v_select(const v_uint64x2& mask, const v_uint64x2& a, const v_uint64x2& b)
{
return b ^ ((a ^ b) & mask);
return v_xor(b, v_and(v_xor(a, b), mask));
}
#endif
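Why b ^ ((a ^ b) & mask) acts as a select (scalar sketch of the same identity): an all-ones mask leaves a, an all-zeros mask leaves b.

#include <cstdint>

static inline uint64_t select_bits(uint64_t mask, uint64_t a, uint64_t b)
{
    return b ^ ((a ^ b) & mask); // mask = ~0ull -> a, mask = 0 -> b
}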
@ -151,16 +151,16 @@ minMaxIdx_reduce_##suffix( VT &valMin, VT &valMax, IT &idxMin, IT &idxMax, IT &n
T &minVal, T &maxVal, size_t &minIdx, size_t &maxIdx, \
size_t delta ) \
{ \
if ( v_check_any(idxMin != none) ) \
if ( v_check_any(v_ne(idxMin, none)) ) \
{ \
minVal = v_reduce_min(valMin); \
minIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_setall_##suffix((IR)minVal) == valMin), \
minIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_eq(v_setall_##suffix((IR)minVal), valMin)), \
idxMin, v_setall_##suffix2(maxLimit))) + delta; \
} \
if ( v_check_any(idxMax != none) ) \
if ( v_check_any(v_ne(idxMax, none)) ) \
{ \
maxVal = v_reduce_max(valMax); \
maxIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_setall_##suffix((IR)maxVal) == valMax), \
maxIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_eq(v_setall_##suffix((IR)maxVal), valMax)), \
idxMax, v_setall_##suffix2(maxLimit))) + delta; \
} \
}
@ -210,18 +210,18 @@ static void minMaxIdx_8u(const uchar* src, const uchar* mask, int* minval, int*
size_t* minidx, size_t* maxidx, int len, size_t startidx )
{
#if CV_SIMD128
if ( len >= v_uint8x16::nlanes )
if ( len >= VTraits<v_uint8x16>::vlanes() )
{
int j, len0;
int minVal, maxVal;
size_t minIdx, maxIdx;
minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx,
(int)0, (int)UCHAR_MAX, v_uint8x16::nlanes, len, startidx, j, len0 );
(int)0, (int)UCHAR_MAX, VTraits<v_uint8x16>::vlanes(), len, startidx, j, len0 );
if ( j <= len0 - v_uint8x16::nlanes )
if ( j <= len0 - VTraits<v_uint8x16>::vlanes() )
{
v_uint8x16 inc = v_setall_u8(v_uint8x16::nlanes);
v_uint8x16 inc = v_setall_u8((uchar)VTraits<v_uint8x16>::vlanes());
v_uint8x16 none = v_reinterpret_as_u8(v_setall_s8(-1));
v_uint8x16 idxStart(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
@ -235,31 +235,31 @@ static void minMaxIdx_8u(const uchar* src, const uchar* mask, int* minval, int*
if ( !mask )
{
for( ; k < std::min(len0, j + 15 * v_uint8x16::nlanes); k += v_uint8x16::nlanes )
for( ; k < std::min(len0, j + 15 * VTraits<v_uint8x16>::vlanes()); k += VTraits<v_uint8x16>::vlanes() )
{
v_uint8x16 data = v_load(src + k);
v_uint8x16 cmpMin = (data < valMin);
v_uint8x16 cmpMax = (data > valMax);
v_uint8x16 cmpMin = (v_lt(data, valMin));
v_uint8x16 cmpMax = (v_gt(data, valMax));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_min(data, valMin);
valMax = v_max(data, valMax);
idx += inc;
idx = v_add(idx, inc);
}
}
else
{
for( ; k < std::min(len0, j + 15 * v_uint8x16::nlanes); k += v_uint8x16::nlanes )
for( ; k < std::min(len0, j + 15 * VTraits<v_uint8x16>::vlanes()); k += VTraits<v_uint8x16>::vlanes() )
{
v_uint8x16 data = v_load(src + k);
v_uint8x16 maskVal = v_load(mask + k) != v_setzero_u8();
v_uint8x16 cmpMin = (data < valMin) & maskVal;
v_uint8x16 cmpMax = (data > valMax) & maskVal;
v_uint8x16 maskVal = v_ne(v_load(mask + k), v_setzero_u8());
v_uint8x16 cmpMin = v_and(v_lt(data, valMin), maskVal);
v_uint8x16 cmpMax = v_and(v_gt(data, valMax), maskVal);
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_select(cmpMin, data, valMin);
valMax = v_select(cmpMax, data, valMax);
idx += inc;
idx = v_add(idx, inc);
}
}
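Scalar equivalent of one lane of the masked update in the loop above (illustrative; the helper name is made up):

static inline void update_min_lane(unsigned char data, unsigned char maskVal,
                                   unsigned char idx,
                                   unsigned char& valMin, unsigned char& idxMin)
{
    // per lane: idxMin = v_select(cmpMin, idx, idxMin); valMin = v_select(cmpMin, data, valMin)
    if (maskVal != 0 && data < valMin) { valMin = data; idxMin = idx; }
}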
@ -287,18 +287,18 @@ static void minMaxIdx_8s(const schar* src, const uchar* mask, int* minval, int*
size_t* minidx, size_t* maxidx, int len, size_t startidx )
{
#if CV_SIMD128
if ( len >= v_int8x16::nlanes )
if ( len >= VTraits<v_int8x16>::vlanes() )
{
int j, len0;
int minVal, maxVal;
size_t minIdx, maxIdx;
minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx,
(int)SCHAR_MIN, (int)SCHAR_MAX, v_int8x16::nlanes, len, startidx, j, len0 );
(int)SCHAR_MIN, (int)SCHAR_MAX, VTraits<v_int8x16>::vlanes(), len, startidx, j, len0 );
if ( j <= len0 - v_int8x16::nlanes )
if ( j <= len0 - VTraits<v_int8x16>::vlanes() )
{
v_uint8x16 inc = v_setall_u8(v_int8x16::nlanes);
v_uint8x16 inc = v_setall_u8((uchar)VTraits<v_int8x16>::vlanes());
v_uint8x16 none = v_reinterpret_as_u8(v_setall_s8(-1));
v_uint8x16 idxStart(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
@ -312,31 +312,31 @@ static void minMaxIdx_8s(const schar* src, const uchar* mask, int* minval, int*
if ( !mask )
{
for( ; k < std::min(len0, j + 15 * v_int8x16::nlanes); k += v_int8x16::nlanes )
for( ; k < std::min(len0, j + 15 * VTraits<v_int8x16>::vlanes()); k += VTraits<v_int8x16>::vlanes() )
{
v_int8x16 data = v_load(src + k);
v_uint8x16 cmpMin = v_reinterpret_as_u8(data < valMin);
v_uint8x16 cmpMax = v_reinterpret_as_u8(data > valMax);
v_uint8x16 cmpMin = v_reinterpret_as_u8(v_lt(data, valMin));
v_uint8x16 cmpMax = v_reinterpret_as_u8(v_gt(data, valMax));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_min(data, valMin);
valMax = v_max(data, valMax);
idx += inc;
idx = v_add(idx, inc);
}
}
else
{
for( ; k < std::min(len0, j + 15 * v_int8x16::nlanes); k += v_int8x16::nlanes )
for( ; k < std::min(len0, j + 15 * VTraits<v_int8x16>::vlanes()); k += VTraits<v_int8x16>::vlanes() )
{
v_int8x16 data = v_load(src + k);
v_uint8x16 maskVal = v_load(mask + k) != v_setzero_u8();
v_uint8x16 cmpMin = v_reinterpret_as_u8(data < valMin) & maskVal;
v_uint8x16 cmpMax = v_reinterpret_as_u8(data > valMax) & maskVal;
v_uint8x16 maskVal = v_ne(v_load(mask + k), v_setzero_u8());
v_uint8x16 cmpMin = v_and(v_reinterpret_as_u8(v_lt(data, valMin)), maskVal);
v_uint8x16 cmpMax = v_and(v_reinterpret_as_u8(v_gt(data, valMax)), maskVal);
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_select(v_reinterpret_as_s8(cmpMin), data, valMin);
valMax = v_select(v_reinterpret_as_s8(cmpMax), data, valMax);
idx += inc;
idx = v_add(idx, inc);
}
}
@ -364,18 +364,18 @@ static void minMaxIdx_16u(const ushort* src, const uchar* mask, int* minval, int
size_t* minidx, size_t* maxidx, int len, size_t startidx )
{
#if CV_SIMD128
if ( len >= v_uint16x8::nlanes )
if ( len >= VTraits<v_uint16x8>::vlanes() )
{
int j, len0;
int minVal, maxVal;
size_t minIdx, maxIdx;
minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx,
(int)0, (int)USHRT_MAX, v_uint16x8::nlanes, len, startidx, j, len0 );
(int)0, (int)USHRT_MAX, VTraits<v_uint16x8>::vlanes(), len, startidx, j, len0 );
if ( j <= len0 - v_uint16x8::nlanes )
if ( j <= len0 - VTraits<v_uint16x8>::vlanes() )
{
v_uint16x8 inc = v_setall_u16(v_uint16x8::nlanes);
v_uint16x8 inc = v_setall_u16((uchar)VTraits<v_uint16x8>::vlanes());
v_uint16x8 none = v_reinterpret_as_u16(v_setall_s16(-1));
v_uint16x8 idxStart(0, 1, 2, 3, 4, 5, 6, 7);
@ -389,31 +389,31 @@ static void minMaxIdx_16u(const ushort* src, const uchar* mask, int* minval, int
if ( !mask )
{
for( ; k < std::min(len0, j + 8191 * v_uint16x8::nlanes); k += v_uint16x8::nlanes )
for( ; k < std::min(len0, j + 8191 * VTraits<v_uint16x8>::vlanes()); k += VTraits<v_uint16x8>::vlanes() )
{
v_uint16x8 data = v_load(src + k);
v_uint16x8 cmpMin = (data < valMin);
v_uint16x8 cmpMax = (data > valMax);
v_uint16x8 cmpMin = (v_lt(data, valMin));
v_uint16x8 cmpMax = (v_gt(data, valMax));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_min(data, valMin);
valMax = v_max(data, valMax);
idx += inc;
idx = v_add(idx, inc);
}
}
else
{
for( ; k < std::min(len0, j + 8191 * v_uint16x8::nlanes); k += v_uint16x8::nlanes )
for( ; k < std::min(len0, j + 8191 * VTraits<v_uint16x8>::vlanes()); k += VTraits<v_uint16x8>::vlanes() )
{
v_uint16x8 data = v_load(src + k);
v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16();
v_uint16x8 cmpMin = (data < valMin) & maskVal;
v_uint16x8 cmpMax = (data > valMax) & maskVal;
v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16());
v_uint16x8 cmpMin = v_and(v_lt(data, valMin), maskVal);
v_uint16x8 cmpMax = v_and(v_gt(data, valMax), maskVal);
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_select(cmpMin, data, valMin);
valMax = v_select(cmpMax, data, valMax);
idx += inc;
idx = v_add(idx, inc);
}
}
@ -441,18 +441,18 @@ static void minMaxIdx_16s(const short* src, const uchar* mask, int* minval, int*
size_t* minidx, size_t* maxidx, int len, size_t startidx )
{
#if CV_SIMD128
if ( len >= v_int16x8::nlanes )
if ( len >= VTraits<v_int16x8>::vlanes() )
{
int j, len0;
int minVal, maxVal;
size_t minIdx, maxIdx;
minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx,
(int)SHRT_MIN, (int)SHRT_MAX, v_int16x8::nlanes, len, startidx, j, len0 );
(int)SHRT_MIN, (int)SHRT_MAX, VTraits<v_int16x8>::vlanes(), len, startidx, j, len0 );
if ( j <= len0 - v_int16x8::nlanes )
if ( j <= len0 - VTraits<v_int16x8>::vlanes() )
{
v_uint16x8 inc = v_setall_u16(v_int16x8::nlanes);
v_uint16x8 inc = v_setall_u16((uchar)VTraits<v_int16x8>::vlanes());
v_uint16x8 none = v_reinterpret_as_u16(v_setall_s16(-1));
v_uint16x8 idxStart(0, 1, 2, 3, 4, 5, 6, 7);
@ -466,31 +466,31 @@ static void minMaxIdx_16s(const short* src, const uchar* mask, int* minval, int*
if ( !mask )
{
for( ; k < std::min(len0, j + 8191 * v_int16x8::nlanes); k += v_int16x8::nlanes )
for( ; k < std::min(len0, j + 8191 * VTraits<v_int16x8>::vlanes()); k += VTraits<v_int16x8>::vlanes() )
{
v_int16x8 data = v_load(src + k);
v_uint16x8 cmpMin = v_reinterpret_as_u16(data < valMin);
v_uint16x8 cmpMax = v_reinterpret_as_u16(data > valMax);
v_uint16x8 cmpMin = v_reinterpret_as_u16(v_lt(data, valMin));
v_uint16x8 cmpMax = v_reinterpret_as_u16(v_gt(data, valMax));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_min(data, valMin);
valMax = v_max(data, valMax);
idx += inc;
idx = v_add(idx, inc);
}
}
else
{
for( ; k < std::min(len0, j + 8191 * v_int16x8::nlanes); k += v_int16x8::nlanes )
for( ; k < std::min(len0, j + 8191 * VTraits<v_int16x8>::vlanes()); k += VTraits<v_int16x8>::vlanes() )
{
v_int16x8 data = v_load(src + k);
v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16();
v_uint16x8 cmpMin = v_reinterpret_as_u16(data < valMin) & maskVal;
v_uint16x8 cmpMax = v_reinterpret_as_u16(data > valMax) & maskVal;
v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16());
v_uint16x8 cmpMin = v_and(v_reinterpret_as_u16(v_lt(data, valMin)), maskVal);
v_uint16x8 cmpMax = v_and(v_reinterpret_as_u16(v_gt(data, valMax)), maskVal);
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_select(v_reinterpret_as_s16(cmpMin), data, valMin);
valMax = v_select(v_reinterpret_as_s16(cmpMax), data, valMax);
idx += inc;
idx = v_add(idx, inc);
}
}
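
These hunks also replace the compile-time nlanes member with VTraits<T>::vlanes(), which works on scalable backends where the lane count is only known at run time. A minimal sketch of a stride loop written that way (hypothetical helper, assuming CV_SIMD128):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // Add 1 to every element; the vector stride comes from VTraits instead of ::nlanes.
    static void add_one_s16(short* buf, int len)
    {
        const int step = VTraits<v_int16x8>::vlanes();   // was: v_int16x8::nlanes
        int i = 0;
        for (; i <= len - step; i += step)
            v_store(buf + i, v_add(v_load(buf + i), v_setall_s16((short)1)));
        for (; i < len; ++i)                             // scalar tail
            buf[i] = (short)(buf[i] + 1);
    }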
@ -518,14 +518,14 @@ static void minMaxIdx_32s(const int* src, const uchar* mask, int* minval, int* m
size_t* minidx, size_t* maxidx, int len, size_t startidx )
{
#if CV_SIMD128
if ( len >= 2 * v_int32x4::nlanes )
if ( len >= 2 * VTraits<v_int32x4>::vlanes() )
{
int j = 0, len0 = len & -(2 * v_int32x4::nlanes);
int j = 0, len0 = len & -(2 * VTraits<v_int32x4>::vlanes());
int minVal = *minval, maxVal = *maxval;
size_t minIdx = *minidx, maxIdx = *maxidx;
{
v_uint32x4 inc = v_setall_u32(v_int32x4::nlanes);
v_uint32x4 inc = v_setall_u32(VTraits<v_int32x4>::vlanes());
v_uint32x4 none = v_reinterpret_as_u32(v_setall_s32(-1));
v_uint32x4 idxStart(0, 1, 2, 3);
@ -539,49 +539,49 @@ static void minMaxIdx_32s(const int* src, const uchar* mask, int* minval, int* m
if ( !mask )
{
for( ; k < std::min(len0, j + 32766 * 2 * v_int32x4::nlanes); k += 2 * v_int32x4::nlanes )
for( ; k < std::min(len0, j + 32766 * 2 * VTraits<v_int32x4>::vlanes()); k += 2 * VTraits<v_int32x4>::vlanes() )
{
v_int32x4 data = v_load(src + k);
v_uint32x4 cmpMin = v_reinterpret_as_u32(data < valMin);
v_uint32x4 cmpMax = v_reinterpret_as_u32(data > valMax);
v_uint32x4 cmpMin = v_reinterpret_as_u32(v_lt(data, valMin));
v_uint32x4 cmpMax = v_reinterpret_as_u32(v_gt(data, valMax));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_min(data, valMin);
valMax = v_max(data, valMax);
idx += inc;
data = v_load(src + k + v_int32x4::nlanes);
cmpMin = v_reinterpret_as_u32(data < valMin);
cmpMax = v_reinterpret_as_u32(data > valMax);
idx = v_add(idx, inc);
data = v_load(src + k + VTraits<v_int32x4>::vlanes());
cmpMin = v_reinterpret_as_u32(v_lt(data, valMin));
cmpMax = v_reinterpret_as_u32(v_gt(data, valMax));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_min(data, valMin);
valMax = v_max(data, valMax);
idx += inc;
idx = v_add(idx, inc);
}
}
else
{
for( ; k < std::min(len0, j + 32766 * 2 * v_int32x4::nlanes); k += 2 * v_int32x4::nlanes )
for( ; k < std::min(len0, j + 32766 * 2 * VTraits<v_int32x4>::vlanes()); k += 2 * VTraits<v_int32x4>::vlanes() )
{
v_int32x4 data = v_load(src + k);
v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16();
v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16());
v_int32x4 maskVal1, maskVal2;
v_expand(v_reinterpret_as_s16(maskVal), maskVal1, maskVal2);
v_uint32x4 cmpMin = v_reinterpret_as_u32((data < valMin) & maskVal1);
v_uint32x4 cmpMax = v_reinterpret_as_u32((data > valMax) & maskVal1);
v_uint32x4 cmpMin = v_reinterpret_as_u32(v_and(v_lt(data, valMin), maskVal1));
v_uint32x4 cmpMax = v_reinterpret_as_u32(v_and(v_gt(data, valMax), maskVal1));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_select(v_reinterpret_as_s32(cmpMin), data, valMin);
valMax = v_select(v_reinterpret_as_s32(cmpMax), data, valMax);
idx += inc;
data = v_load(src + k + v_int32x4::nlanes);
cmpMin = v_reinterpret_as_u32((data < valMin) & maskVal2);
cmpMax = v_reinterpret_as_u32((data > valMax) & maskVal2);
idx = v_add(idx, inc);
data = v_load(src + k + VTraits<v_int32x4>::vlanes());
cmpMin = v_reinterpret_as_u32(v_and(v_lt(data, valMin), maskVal2));
cmpMax = v_reinterpret_as_u32(v_and(v_gt(data, valMax), maskVal2));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_select(v_reinterpret_as_s32(cmpMin), data, valMin);
valMax = v_select(v_reinterpret_as_s32(cmpMax), data, valMax);
idx += inc;
idx = v_add(idx, inc);
}
}
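
The 32-bit variants above widen the 8-bit mask twice before applying it: v_load_expand gives a 16-bit mask, and v_expand of the reinterpreted signed vector sign-extends it to two 32-bit masks. A sketch of just that widening (illustrative name, assuming CV_SIMD128):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // Turn 8 mask bytes into two 4-lane 32-bit masks (0 or all-ones per lane).
    static void widen_mask8(const uchar* mask, v_int32x4& m_lo, v_int32x4& m_hi)
    {
        v_uint16x8 m16 = v_ne(v_load_expand(mask), v_setzero_u16()); // was: ... != v_setzero_u16()
        v_expand(v_reinterpret_as_s16(m16), m_lo, m_hi);             // sign-extension keeps all-ones lanes
    }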
@ -609,18 +609,18 @@ static void minMaxIdx_32f(const float* src, const uchar* mask, float* minval, fl
size_t* minidx, size_t* maxidx, int len, size_t startidx )
{
#if CV_SIMD128
if ( len >= 2 * v_float32x4::nlanes )
if ( len >= 2 * VTraits<v_float32x4>::vlanes() )
{
int j, len0;
float minVal, maxVal;
size_t minIdx, maxIdx;
minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx,
FLT_MIN, FLT_MAX, 2 * v_float32x4::nlanes, len, startidx, j, len0 );
FLT_MIN, FLT_MAX, 2 * VTraits<v_float32x4>::vlanes(), len, startidx, j, len0 );
if ( j <= len0 - 2 * v_float32x4::nlanes )
if ( j <= len0 - 2 * VTraits<v_float32x4>::vlanes() )
{
v_uint32x4 inc = v_setall_u32(v_float32x4::nlanes);
v_uint32x4 inc = v_setall_u32(VTraits<v_float32x4>::vlanes());
v_uint32x4 none = v_reinterpret_as_u32(v_setall_s32(-1));
v_uint32x4 idxStart(0, 1, 2, 3);
@ -634,49 +634,49 @@ static void minMaxIdx_32f(const float* src, const uchar* mask, float* minval, fl
if ( !mask )
{
for( ; k < std::min(len0, j + 32766 * 2 * v_float32x4::nlanes); k += 2 * v_float32x4::nlanes )
for( ; k < std::min(len0, j + 32766 * 2 * VTraits<v_float32x4>::vlanes()); k += 2 * VTraits<v_float32x4>::vlanes() )
{
v_float32x4 data = v_load(src + k);
v_uint32x4 cmpMin = v_reinterpret_as_u32(data < valMin);
v_uint32x4 cmpMax = v_reinterpret_as_u32(data > valMax);
v_uint32x4 cmpMin = v_reinterpret_as_u32(v_lt(data, valMin));
v_uint32x4 cmpMax = v_reinterpret_as_u32(v_gt(data, valMax));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_min(data, valMin);
valMax = v_max(data, valMax);
idx += inc;
data = v_load(src + k + v_float32x4::nlanes);
cmpMin = v_reinterpret_as_u32(data < valMin);
cmpMax = v_reinterpret_as_u32(data > valMax);
idx = v_add(idx, inc);
data = v_load(src + k + VTraits<v_float32x4>::vlanes());
cmpMin = v_reinterpret_as_u32(v_lt(data, valMin));
cmpMax = v_reinterpret_as_u32(v_gt(data, valMax));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_min(data, valMin);
valMax = v_max(data, valMax);
idx += inc;
idx = v_add(idx, inc);
}
}
else
{
for( ; k < std::min(len0, j + 32766 * 2 * v_float32x4::nlanes); k += 2 * v_float32x4::nlanes )
for( ; k < std::min(len0, j + 32766 * 2 * VTraits<v_float32x4>::vlanes()); k += 2 * VTraits<v_float32x4>::vlanes() )
{
v_float32x4 data = v_load(src + k);
v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16();
v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16());
v_int32x4 maskVal1, maskVal2;
v_expand(v_reinterpret_as_s16(maskVal), maskVal1, maskVal2);
v_uint32x4 cmpMin = v_reinterpret_as_u32(v_reinterpret_as_s32(data < valMin) & maskVal1);
v_uint32x4 cmpMax = v_reinterpret_as_u32(v_reinterpret_as_s32(data > valMax) & maskVal1);
v_uint32x4 cmpMin = v_reinterpret_as_u32(v_and(v_reinterpret_as_s32(v_lt(data, valMin)), maskVal1));
v_uint32x4 cmpMax = v_reinterpret_as_u32(v_and(v_reinterpret_as_s32(v_gt(data, valMax)), maskVal1));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_select(v_reinterpret_as_f32(cmpMin), data, valMin);
valMax = v_select(v_reinterpret_as_f32(cmpMax), data, valMax);
idx += inc;
data = v_load(src + k + v_float32x4::nlanes);
cmpMin = v_reinterpret_as_u32(v_reinterpret_as_s32(data < valMin) & maskVal2);
cmpMax = v_reinterpret_as_u32(v_reinterpret_as_s32(data > valMax) & maskVal2);
idx = v_add(idx, inc);
data = v_load(src + k + VTraits<v_float32x4>::vlanes());
cmpMin = v_reinterpret_as_u32(v_and(v_reinterpret_as_s32(v_lt(data, valMin)), maskVal2));
cmpMax = v_reinterpret_as_u32(v_and(v_reinterpret_as_s32(v_gt(data, valMax)), maskVal2));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_select(v_reinterpret_as_f32(cmpMin), data, valMin);
valMax = v_select(v_reinterpret_as_f32(cmpMax), data, valMax);
idx += inc;
idx = v_add(idx, inc);
}
}
@ -704,18 +704,18 @@ static void minMaxIdx_64f(const double* src, const uchar* mask, double* minval,
size_t* minidx, size_t* maxidx, int len, size_t startidx )
{
#if CV_SIMD128_64F
if ( len >= 4 * v_float64x2::nlanes )
if ( len >= 4 * VTraits<v_float64x2>::vlanes() )
{
int j, len0;
double minVal, maxVal;
size_t minIdx, maxIdx;
minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx,
DBL_MIN, DBL_MAX, 4 * v_float64x2::nlanes, len, startidx, j, len0 );
DBL_MIN, DBL_MAX, 4 * VTraits<v_float64x2>::vlanes(), len, startidx, j, len0 );
if ( j <= len0 - 4 * v_float64x2::nlanes )
if ( j <= len0 - 4 * VTraits<v_float64x2>::vlanes() )
{
v_uint64x2 inc = v_setall_u64(v_float64x2::nlanes);
v_uint64x2 inc = v_setall_u64(VTraits<v_float64x2>::vlanes());
v_uint64x2 none = v_reinterpret_as_u64(v_setall_s64(-1));
v_uint64x2 idxStart(0, 1);
@ -729,84 +729,84 @@ static void minMaxIdx_64f(const double* src, const uchar* mask, double* minval,
if ( !mask )
{
for( ; k < std::min(len0, j + 32764 * 4 * v_float64x2::nlanes); k += 4 * v_float64x2::nlanes )
for( ; k < std::min(len0, j + 32764 * 4 * VTraits<v_float64x2>::vlanes()); k += 4 * VTraits<v_float64x2>::vlanes() )
{
v_float64x2 data = v_load(src + k);
v_uint64x2 cmpMin = v_reinterpret_as_u64(data < valMin);
v_uint64x2 cmpMax = v_reinterpret_as_u64(data > valMax);
v_uint64x2 cmpMin = v_reinterpret_as_u64(v_lt(data, valMin));
v_uint64x2 cmpMax = v_reinterpret_as_u64(v_gt(data, valMax));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_min(data, valMin);
valMax = v_max(data, valMax);
idx += inc;
data = v_load(src + k + v_float64x2::nlanes);
cmpMin = v_reinterpret_as_u64(data < valMin);
cmpMax = v_reinterpret_as_u64(data > valMax);
idx = v_add(idx, inc);
data = v_load(src + k + VTraits<v_float64x2>::vlanes());
cmpMin = v_reinterpret_as_u64(v_lt(data, valMin));
cmpMax = v_reinterpret_as_u64(v_gt(data, valMax));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_min(data, valMin);
valMax = v_max(data, valMax);
idx += inc;
data = v_load(src + k + 2 * v_float64x2::nlanes);
cmpMin = v_reinterpret_as_u64(data < valMin);
cmpMax = v_reinterpret_as_u64(data > valMax);
idx = v_add(idx, inc);
data = v_load(src + k + 2 * VTraits<v_float64x2>::vlanes());
cmpMin = v_reinterpret_as_u64(v_lt(data, valMin));
cmpMax = v_reinterpret_as_u64(v_gt(data, valMax));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_min(data, valMin);
valMax = v_max(data, valMax);
idx += inc;
data = v_load(src + k + 3 * v_float64x2::nlanes);
cmpMin = v_reinterpret_as_u64(data < valMin);
cmpMax = v_reinterpret_as_u64(data > valMax);
idx = v_add(idx, inc);
data = v_load(src + k + 3 * VTraits<v_float64x2>::vlanes());
cmpMin = v_reinterpret_as_u64(v_lt(data, valMin));
cmpMax = v_reinterpret_as_u64(v_gt(data, valMax));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_min(data, valMin);
valMax = v_max(data, valMax);
idx += inc;
idx = v_add(idx, inc);
}
}
else
{
for( ; k < std::min(len0, j + 32764 * 4 * v_float64x2::nlanes); k += 4 * v_float64x2::nlanes )
for( ; k < std::min(len0, j + 32764 * 4 * VTraits<v_float64x2>::vlanes()); k += 4 * VTraits<v_float64x2>::vlanes() )
{
v_float64x2 data = v_load(src + k);
v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16();
v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16());
v_int32x4 maskVal1, maskVal2;
v_expand(v_reinterpret_as_s16(maskVal), maskVal1, maskVal2);
v_int64x2 maskVal3, maskVal4;
v_expand(maskVal1, maskVal3, maskVal4);
v_uint64x2 cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal3);
v_uint64x2 cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal3);
v_uint64x2 cmpMin = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_lt(data, valMin)), maskVal3));
v_uint64x2 cmpMax = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_gt(data, valMax)), maskVal3));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin);
valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax);
idx += inc;
data = v_load(src + k + v_float64x2::nlanes);
cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal4);
cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal4);
idx = v_add(idx, inc);
data = v_load(src + k + VTraits<v_float64x2>::vlanes());
cmpMin = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_lt(data, valMin)), maskVal4));
cmpMax = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_gt(data, valMax)), maskVal4));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin);
valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax);
idx += inc;
data = v_load(src + k + 2 * v_float64x2::nlanes);
idx = v_add(idx, inc);
data = v_load(src + k + 2 * VTraits<v_float64x2>::vlanes());
v_expand(maskVal2, maskVal3, maskVal4);
cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal3);
cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal3);
cmpMin = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_lt(data, valMin)), maskVal3));
cmpMax = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_gt(data, valMax)), maskVal3));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin);
valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax);
idx += inc;
data = v_load(src + k + 3 * v_float64x2::nlanes);
cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal4);
cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal4);
idx = v_add(idx, inc);
data = v_load(src + k + 3 * VTraits<v_float64x2>::vlanes());
cmpMin = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_lt(data, valMin)), maskVal4));
cmpMax = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_gt(data, valMax)), maskVal4));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin);
valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax);
idx += inc;
idx = v_add(idx, inc);
}
}
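
In the double-precision hunks the comparison result is a v_float64x2 mask, so it is reinterpreted to a 64-bit integer type for v_and with the expanded mask and back to f64 for v_select. A sketch of one such step (illustrative name, requires CV_SIMD128_64F):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // Masked update of the running minimum for doubles.
    static v_float64x2 masked_min_f64(const v_float64x2& data, const v_float64x2& valMin,
                                      const v_int64x2& mask)
    {
        v_uint64x2 cmp = v_reinterpret_as_u64(
            v_and(v_reinterpret_as_s64(v_lt(data, valMin)), mask));  // was: (data < valMin) & mask
        return v_select(v_reinterpret_as_f64(cmp), data, valMin);
    }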


@ -1745,13 +1745,8 @@ template<typename R> struct TheTest
R a = dataA;
R b = dataB;
#if CV_SIMD_SCALABLE
Data<R> dataEQ = v_eq(a, b);
Data<R> dataNE = v_ne(a, b);
#else
Data<R> dataEQ = (a == b);
Data<R> dataNE = (a != b);
#endif
for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{


@ -29,10 +29,10 @@ static inline void v_expand_mul_add(const v_int8x16& a, const v_int8x16& b,
v_int32x4 t0, t1;
v_mul_expand(a0, b0, t0, t1);
out0 += t0; out1 += t1;
out0 = v_add(out0, t0); out1 = v_add(out1, t1);
v_mul_expand(a1, b1, t0, t1);
out2 += t0; out3 += t1;
out2 = v_add(out2, t0); out3 = v_add(out3, t1);
}
#endif
@ -1055,10 +1055,10 @@ public:
v_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3);
v_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3);
vout0 = voutzp + v_round(v_cvt_f32(vout0)*vmult);
vout1 = voutzp + v_round(v_cvt_f32(vout1)*vmult);
vout2 = voutzp + v_round(v_cvt_f32(vout2)*vmult);
vout3 = voutzp + v_round(v_cvt_f32(vout3)*vmult);
vout0 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout0), vmult)));
vout1 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout1), vmult)));
vout2 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout2), vmult)));
vout3 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout3), vmult)));
vout0 = v_min(v_max(vout0, outmin), outmax);
vout1 = v_min(v_max(vout1, outmin), outmax);
@ -1408,12 +1408,12 @@ public:
vs12 = v_dotprod_expand_fast(w1, r2, vs12);
vs13 = v_dotprod_expand_fast(w1, r3, vs13);
}
s0 += v_int32x4(v_reduce_sum(vs00), v_reduce_sum(vs01), v_reduce_sum(vs02), v_reduce_sum(vs03));
s1 += v_int32x4(v_reduce_sum(vs10), v_reduce_sum(vs11), v_reduce_sum(vs12), v_reduce_sum(vs13));
s0 = v_add(s0, v_int32x4(v_reduce_sum(vs00), v_reduce_sum(vs01), v_reduce_sum(vs02), v_reduce_sum(vs03)));
s1 = v_add(s1, v_int32x4(v_reduce_sum(vs10), v_reduce_sum(vs11), v_reduce_sum(vs12), v_reduce_sum(vs13)));
if( cn1 == inpCn )
{
s0 = voutzp + v_round(v_cvt_f32(s0)*vmult0);
s1 = voutzp + v_round(v_cvt_f32(s1)*vmult1);
s0 = v_add(voutzp, v_round(v_mul(v_cvt_f32(s0), vmult0)));
s1 = v_add(voutzp, v_round(v_mul(v_cvt_f32(s1), vmult1)));
s0 = v_min(v_max(s0, outmin), outmax);
s1 = v_min(v_max(s1, outmin), outmax);
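
Both int8 convolution hunks above end with the same requantization: convert the accumulator to float, scale, round, add the output zero point, then clamp. A sketch of that step (illustrative name, assuming CV_SIMD128):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    static v_int32x4 requantize(const v_int32x4& acc, const v_float32x4& mult,
                                const v_int32x4& outzp, const v_int32x4& outmin, const v_int32x4& outmax)
    {
        // was: outzp + v_round(v_cvt_f32(acc)*mult)
        v_int32x4 out = v_add(outzp, v_round(v_mul(v_cvt_f32(acc), mult)));
        return v_min(v_max(out, outmin), outmax);
    }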


@ -323,8 +323,8 @@ public:
vs3 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep*3 + k), vs3);
}
s += v_int32x4(v_reduce_sum(vs0), v_reduce_sum(vs1), v_reduce_sum(vs2), v_reduce_sum(vs3));
v_int32x4 out = outzp + v_round(v_cvt_f32(s)*mult);
s = v_add(s, v_int32x4(v_reduce_sum(vs0), v_reduce_sum(vs1), v_reduce_sum(vs2), v_reduce_sum(vs3)));
v_int32x4 out = v_add(outzp, v_round(v_mul(v_cvt_f32(s), mult)));
v_store(dptr + i, v_min(v_max(out, outmin), outmax));
}
#endif


@ -631,17 +631,17 @@ public:
(int)srcData[index + stride_w*10], (int)srcData[index + stride_w*11]);
v_int32x4 v3((int)srcData[index + stride_w*12], (int)srcData[index + stride_w*13],
(int)srcData[index + stride_w*14], (int)srcData[index + stride_w*15]);
sum_val0 += v0;
sum_val1 += v1;
sum_val2 += v2;
sum_val3 += v3;
sum_val0 = v_add(sum_val0, v0);
sum_val1 = v_add(sum_val1, v1);
sum_val2 = v_add(sum_val2, v2);
sum_val3 = v_add(sum_val3, v3);
}
}
sum_val0 = v_round(v_cvt_f32(sum_val0)*ikarea) + voutzp;
sum_val1 = v_round(v_cvt_f32(sum_val1)*ikarea) + voutzp;
sum_val2 = v_round(v_cvt_f32(sum_val2)*ikarea) + voutzp;
sum_val3 = v_round(v_cvt_f32(sum_val3)*ikarea) + voutzp;
sum_val0 = v_add(v_round(v_mul(v_cvt_f32(sum_val0), ikarea)), voutzp);
sum_val1 = v_add(v_round(v_mul(v_cvt_f32(sum_val1), ikarea)), voutzp);
sum_val2 = v_add(v_round(v_mul(v_cvt_f32(sum_val2), ikarea)), voutzp);
sum_val3 = v_add(v_round(v_mul(v_cvt_f32(sum_val3), ikarea)), voutzp);
v_store(dstData + x0, v_pack(v_pack(sum_val0, sum_val1), v_pack(sum_val2, sum_val3)));
x0 += 15;
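
The store above narrows four 32-bit accumulators down to one 8-bit vector through two levels of saturating v_pack. A sketch (illustrative name, assuming CV_SIMD128):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // 4 x v_int32x4 -> 2 x v_int16x8 -> 1 x v_int8x16, saturating at each step.
    static void pack_s32_to_s8(const v_int32x4& a, const v_int32x4& b,
                               const v_int32x4& c, const v_int32x4& d, schar* dst)
    {
        v_store(dst, v_pack(v_pack(a, b), v_pack(c, d)));
    }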


@ -236,13 +236,11 @@ void depthWiseBlockConv2D(const float* wptr,
v21 = v_load(imgptr2 + in_j + dilation_w),
v22 = v_load(imgptr2 + in_j + dilation_w*2);
v_float32x4 vout = v00*vw00 + v01*vw01 + v02*vw02 +
v10*vw10 + v11*vw11 + v12*vw12 +
v20*vw20 + v21*vw21 + v22*vw22 + vbias;
v_float32x4 vout = v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_mul(v00, vw00), v_mul(v01, vw01)), v_mul(v02, vw02)), v_mul(v10, vw10)), v_mul(v11, vw11)), v_mul(v12, vw12)), v_mul(v20, vw20)), v_mul(v21, vw21)), v_mul(v22, vw22)), vbias);
if (fusedAdd)
vout = v_load(outptr + out_j) + vout;
vout = v_add(v_load(outptr + out_j), vout);
if (relu)
vout = v_select(vout > z, vout, vout*vrc);
vout = v_select(v_gt(vout, z), vout, v_mul(vout, vrc));
v_store(outptr + out_j, vout);
}
}
@ -268,14 +266,12 @@ void depthWiseBlockConv2D(const float* wptr,
v_load_deinterleave(imgptr2 + in_j, v20, v21);
v_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
v_float32x4 vout = v00 * vw00 + v01 * vw01 + v02 * vw02 +
v10 * vw10 + v11 * vw11 + v12 * vw12 +
v20 * vw20 + v21 * vw21 + v22 * vw22 + vbias;
v_float32x4 vout = v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_mul(v00, vw00), v_mul(v01, vw01)), v_mul(v02, vw02)), v_mul(v10, vw10)), v_mul(v11, vw11)), v_mul(v12, vw12)), v_mul(v20, vw20)), v_mul(v21, vw21)), v_mul(v22, vw22)), vbias);
if (fusedAdd)
vout = v_load(outptr + out_j) + vout;
vout = v_add(v_load(outptr + out_j), vout);
if (relu)
vout = v_select(vout > z, vout, vout*vrc);
vout = v_select(v_gt(vout, z), vout, v_mul(vout, vrc));
v_store(outptr + out_j, vout);
}
}
@ -381,11 +377,11 @@ void depthWiseBlockConv1D(const float* wptr,
v01 = v_load(imgptr0 + in_j + dilation_w),
v02 = v_load(imgptr0 + in_j + dilation_w*2);
v_float32x4 vout = v00*vw00 + v01*vw01 + v02*vw02 + vbias;
v_float32x4 vout = v_add(v_add(v_add(v_mul(v00, vw00), v_mul(v01, vw01)), v_mul(v02, vw02)), vbias);
if (fusedAdd)
vout = v_load(outptr + out_j) + vout;
vout = v_add(v_load(outptr + out_j), vout);
if (relu)
vout = v_select(vout > z, vout, vout*vrc);
vout = v_select(v_gt(vout, z), vout, v_mul(vout, vrc));
v_store(outptr + out_j, vout);
}
}
@ -407,13 +403,13 @@ void depthWiseBlockConv1D(const float* wptr,
v_load_deinterleave(imgptr0 + in_j, v00, v01);
v_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
v_float32x4 vout = v00 * vw00 + v01 * vw01 + v02 * vw02 + vbias;
v_float32x4 vout = v_add(v_add(v_add(v_mul(v00, vw00), v_mul(v01, vw01)), v_mul(v02, vw02)), vbias);
if (fusedAdd)
vout = v_load(outptr + out_j) + vout;
vout = v_add(v_load(outptr + out_j), vout);
if (relu)
vout = v_select(vout > z, vout, vout*vrc);
vout = v_select(v_gt(vout, z), vout, v_mul(vout, vrc));
v_store(outptr + out_j, vout);
}
}
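
The long v_add/v_mul chains above are a mechanical translation of the old operator expressions. Where readability matters more than matching the diff exactly, the same accumulation can be written with v_fma; a sketch of the 1-D 3-tap case (illustrative name, assuming CV_SIMD128):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // bias + v0*w0 + v1*w1 + v2*w2, expressed as fused multiply-adds.
    static v_float32x4 conv3_taps(const v_float32x4& v0, const v_float32x4& w0,
                                  const v_float32x4& v1, const v_float32x4& w1,
                                  const v_float32x4& v2, const v_float32x4& w2,
                                  const v_float32x4& vbias)
    {
        return v_fma(v2, w2, v_fma(v1, w1, v_fma(v0, w0, vbias)));
    }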


@ -430,32 +430,32 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
/* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */
/* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */
v_float32x4 q5_25 = v_setall_f32(5.25f), t00, t01, t10, t11;
t00 = x40 - x20;
t01 = x41 - x21;
t10 = x30 - x50;
t11 = x31 - x51;
v_float32x4 y00 = v_fma(t00, q5_25, x00 - x60);
v_float32x4 y01 = v_fma(t01, q5_25, x01 - x61);
v_float32x4 y70 = v_fma(t10, q5_25, x70 - x10);
v_float32x4 y71 = v_fma(t11, q5_25, x71 - x11);
t00 = v_sub(x40, x20);
t01 = v_sub(x41, x21);
t10 = v_sub(x30, x50);
t11 = v_sub(x31, x51);
v_float32x4 y00 = v_fma(t00, q5_25, v_sub(x00, x60));
v_float32x4 y01 = v_fma(t01, q5_25, v_sub(x01, x61));
v_float32x4 y70 = v_fma(t10, q5_25, v_sub(x70, x10));
v_float32x4 y71 = v_fma(t11, q5_25, v_sub(x71, x11));
/* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */
/* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */
v_float32x4 qm4_25 = v_setall_f32(-4.25f);
t00 = v_fma(x30, qm4_25, x10 + x50);
t01 = v_fma(x31, qm4_25, x11 + x51);
t10 = v_fma(x40, qm4_25, x20 + x60);
t11 = v_fma(x41, qm4_25, x21 + x61);
t00 = v_fma(x30, qm4_25, v_add(x10, x50));
t01 = v_fma(x31, qm4_25, v_add(x11, x51));
t10 = v_fma(x40, qm4_25, v_add(x20, x60));
t11 = v_fma(x41, qm4_25, v_add(x21, x61));
v_float32x4 y10 = t00 + t10, y11 = t01 + t11;
v_float32x4 y20 = t10 - t00, y21 = t11 - t01;
v_float32x4 y10 = v_add(t00, t10), y11 = v_add(t01, t11);
v_float32x4 y20 = v_sub(t10, t00), y21 = v_sub(t11, t01);
/* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */
/* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */
v_float32x4 q0_5 = v_setall_f32(0.5f), q0_25 = v_setall_f32(0.25f);
v_float32x4 qm2_5 = v_setall_f32(-2.5f), qm1_25 = v_setall_f32(-1.25f);
t00 = v_fma(x10, q0_5, x50 + x50);
t01 = v_fma(x11, q0_5, x51 + x51);
t00 = v_fma(x10, q0_5, v_add(x50, x50));
t01 = v_fma(x11, q0_5, v_add(x51, x51));
t10 = v_fma(x20, q0_25, x60);
t11 = v_fma(x21, q0_25, x61);
t00 = v_fma(x30, qm2_5, t00);
@ -463,14 +463,14 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
t10 = v_fma(x40, qm1_25, t10);
t11 = v_fma(x41, qm1_25, t11);
v_float32x4 y30 = t00 + t10, y31 = t01 + t11;
v_float32x4 y40 = t10 - t00, y41 = t11 - t01;
v_float32x4 y30 = v_add(t00, t10), y31 = v_add(t01, t11);
v_float32x4 y40 = v_sub(t10, t00), y41 = v_sub(t11, t01);
/* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */
/* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */
v_float32x4 q4 = v_setall_f32(4.f), qm5 = v_setall_f32(-5.f);
t00 = v_fma(x50, q0_5, x10 + x10);
t01 = v_fma(x51, q0_5, x11 + x11);
t00 = v_fma(x50, q0_5, v_add(x10, x10));
t01 = v_fma(x51, q0_5, v_add(x11, x11));
t10 = v_fma(x20, q4 , x60);
t11 = v_fma(x21, q4 , x61);
t00 = v_fma(x30, qm2_5, t00);
@ -478,8 +478,8 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
t10 = v_fma(x40, qm5 , t10);
t11 = v_fma(x41, qm5 , t11);
v_float32x4 y50 = t00 + t10, y51 = t01 + t11;
v_float32x4 y60 = t10 - t00, y61 = t11 - t01;
v_float32x4 y50 = v_add(t00, t10), y51 = v_add(t01, t11);
v_float32x4 y60 = v_sub(t10, t00), y61 = v_sub(t11, t01);
/* transpose 8x8 matrix with v_transpose4x4 */
@ -491,29 +491,29 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
/* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */
/* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */
t00 = y010 - y200;
t01 = y410 - y600;
t10 = y300 - y110;
t11 = y700 - y510;
z00 = v_fma(t00, q5_25, y000 - y210);
z01 = v_fma(t01, q5_25, y400 - y610);
z70 = v_fma(t10, q5_25, y310 - y100);
z71 = v_fma(t11, q5_25, y710 - y500);
t00 = v_sub(y010, y200);
t01 = v_sub(y410, y600);
t10 = v_sub(y300, y110);
t11 = v_sub(y700, y510);
z00 = v_fma(t00, q5_25, v_sub(y000, y210));
z01 = v_fma(t01, q5_25, v_sub(y400, y610));
z70 = v_fma(t10, q5_25, v_sub(y310, y100));
z71 = v_fma(t11, q5_25, v_sub(y710, y500));
/* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */
/* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */
t00 = v_fma(y300, qm4_25, y100 + y110);
t01 = v_fma(y700, qm4_25, y500 + y510);
t10 = v_fma(y010, qm4_25, y200 + y210);
t11 = v_fma(y410, qm4_25, y600 + y610);
t00 = v_fma(y300, qm4_25, v_add(y100, y110));
t01 = v_fma(y700, qm4_25, v_add(y500, y510));
t10 = v_fma(y010, qm4_25, v_add(y200, y210));
t11 = v_fma(y410, qm4_25, v_add(y600, y610));
z10 = t00 + t10; z11 = t01 + t11;
z20 = t10 - t00; z21 = t11 - t01;
z10 = v_add(t00, t10); z11 = v_add(t01, t11);
z20 = v_sub(t10, t00); z21 = v_sub(t11, t01);
/* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */
/* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */
t00 = v_fma(y100, q0_5, y110 + y110);
t01 = v_fma(y500, q0_5, y510 + y510);
t00 = v_fma(y100, q0_5, v_add(y110, y110));
t01 = v_fma(y500, q0_5, v_add(y510, y510));
t10 = v_fma(y200, q0_25, y210);
t11 = v_fma(y600, q0_25, y610);
t00 = v_fma(y300, qm2_5, t00);
@ -521,13 +521,13 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
t10 = v_fma(y010, qm1_25, t10);
t11 = v_fma(y410, qm1_25, t11);
z30 = t00 + t10; z31 = t01 + t11;
z40 = t10 - t00; z41 = t11 - t01;
z30 = v_add(t00, t10); z31 = v_add(t01, t11);
z40 = v_sub(t10, t00); z41 = v_sub(t11, t01);
/* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */
/* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */
t00 = v_fma(y110, q0_5, y100 + y100);
t01 = v_fma(y510, q0_5, y500 + y500);
t00 = v_fma(y110, q0_5, v_add(y100, y100));
t01 = v_fma(y510, q0_5, v_add(y500, y500));
t10 = v_fma(y200, q4, y210);
t11 = v_fma(y600, q4, y610);
t00 = v_fma(y300, qm2_5, t00);
@ -535,8 +535,8 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
t10 = v_fma(y010, qm5, t10);
t11 = v_fma(y410, qm5, t11);
z50 = t00 + t10; z51 = t01 + t11;
z60 = t10 - t00; z61 = t11 - t01;
z50 = v_add(t00, t10); z51 = v_add(t01, t11);
z60 = v_sub(t10, t00); z61 = v_sub(t11, t01);
}
const int outstep = winoIblock*winoAtomF32*Cg;
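
The winofunc_BtXB_8x8_f32 transform above also notes that the 8x8 tile is transposed with v_transpose4x4, assembled from four 4x4 register blocks. A sketch of one block (illustrative name, assuming CV_SIMD128):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // Transpose four v_float32x4 rows in registers.
    static void transpose4x4_block(v_float32x4& r0, v_float32x4& r1, v_float32x4& r2, v_float32x4& r3)
    {
        v_float32x4 t0, t1, t2, t3;
        v_transpose4x4(r0, r1, r2, r3, t0, t1, t2, t3);
        r0 = t0; r1 = t1; r2 = t2; r3 = t3;
    }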
@ -601,12 +601,12 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
{
v_float32x4 s12_0, s12_1, s34_0, s34_1, s56_0, s56_1;
s12_0 = x10 + x20; s12_1 = x11 + x21;
s34_0 = x30 + x40; s34_1 = x31 + x41;
s56_0 = x50 + x60; s56_1 = x51 + x61;
s12_0 = v_add(x10, x20); s12_1 = v_add(x11, x21);
s34_0 = v_add(x30, x40); s34_1 = v_add(x31, x41);
s56_0 = v_add(x50, x60); s56_1 = v_add(x51, x61);
v_float32x4 y00 = x00 + s12_0 + s34_0 + s56_0;
v_float32x4 y01 = x01 + s12_1 + s34_1 + s56_1;
v_float32x4 y00 = v_add(v_add(v_add(x00, s12_0), s34_0), s56_0);
v_float32x4 y01 = v_add(v_add(v_add(x01, s12_1), s34_1), s56_1);
v_float32x4 a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f);
v_float32x4 y20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
@ -616,13 +616,13 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
v_float32x4 y40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
v_float32x4 y41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
s12_0 = x10 - x20; s12_1 = x11 - x21;
s34_0 = x30 - x40; s34_1 = x31 - x41;
s56_0 = x50 - x60; s56_1 = x51 - x61;
s12_0 = v_sub(x10, x20); s12_1 = v_sub(x11, x21);
s34_0 = v_sub(x30, x40); s34_1 = v_sub(x31, x41);
s56_0 = v_sub(x50, x60); s56_1 = v_sub(x51, x61);
a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.f);
v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, x70 + s12_0));
v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, x71 + s12_1));
v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, v_add(x70, s12_0)));
v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, v_add(x71, s12_1)));
a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.f);
v_float32x4 y10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
@ -642,12 +642,12 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
v_transpose4x4(y40, y50, y60, y70, y400, y500, y600, y700);
v_transpose4x4(y41, y51, y61, y71, y410, y510, y610, y710);
s12_0 = y100 + y200; s12_1 = y500 + y600;
s34_0 = y300 + y010; s34_1 = y700 + y410;
s56_0 = y110 + y210; s56_1 = y510 + y610;
s12_0 = v_add(y100, y200); s12_1 = v_add(y500, y600);
s34_0 = v_add(y300, y010); s34_1 = v_add(y700, y410);
s56_0 = v_add(y110, y210); s56_1 = v_add(y510, y610);
z00 = y000 + s12_0 + s34_0 + s56_0;
z01 = y400 + s12_1 + s34_1 + s56_1;
z00 = v_add(v_add(v_add(y000, s12_0), s34_0), s56_0);
z01 = v_add(v_add(v_add(y400, s12_1), s34_1), s56_1);
a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f);
z20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
@ -657,13 +657,13 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
z40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
z41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
s12_0 = y100 - y200; s12_1 = y500 - y600;
s34_0 = y300 - y010; s34_1 = y700 - y410;
s56_0 = y110 - y210; s56_1 = y510 - y610;
s12_0 = v_sub(y100, y200); s12_1 = v_sub(y500, y600);
s34_0 = v_sub(y300, y010); s34_1 = v_sub(y700, y410);
s56_0 = v_sub(y110, y210); s56_1 = v_sub(y510, y610);
a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.0f);
z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, y310 + s12_0));
z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, y710 + s12_1));
z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, v_add(y310, s12_0)));
z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, v_add(y710, s12_1)));
a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.0f);
z10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
z11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
@ -673,34 +673,34 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
z31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
v_float32x4 vbias = v_setall_f32(bias);
z00 += vbias;
z01 += vbias;
z10 += vbias;
z11 += vbias;
z20 += vbias;
z21 += vbias;
z30 += vbias;
z31 += vbias;
z40 += vbias;
z41 += vbias;
z50 += vbias;
z51 += vbias;
z00 = v_add(z00, vbias);
z01 = v_add(z01, vbias);
z10 = v_add(z10, vbias);
z11 = v_add(z11, vbias);
z20 = v_add(z20, vbias);
z21 = v_add(z21, vbias);
z30 = v_add(z30, vbias);
z31 = v_add(z31, vbias);
z40 = v_add(z40, vbias);
z41 = v_add(z41, vbias);
z50 = v_add(z50, vbias);
z51 = v_add(z51, vbias);
}
if (bpptr)
{
z00 += v_load(bpptr);
z01 += v_load_low(bpptr + 4);
z10 += v_load(bpptr + bpstep);
z11 += v_load_low(bpptr + bpstep + 4);
z20 += v_load(bpptr + bpstep*2);
z21 += v_load_low(bpptr + bpstep*2 + 4);
z30 += v_load(bpptr + bpstep*3);
z31 += v_load_low(bpptr + bpstep*3 + 4);
z40 += v_load(bpptr + bpstep*4);
z41 += v_load_low(bpptr + bpstep*4 + 4);
z50 += v_load(bpptr + bpstep*5);
z51 += v_load_low(bpptr + bpstep*5 + 4);
z00 = v_add(z00, v_load(bpptr));
z01 = v_add(z01, v_load_low(bpptr + 4));
z10 = v_add(z10, v_load(bpptr + bpstep));
z11 = v_add(z11, v_load_low(bpptr + bpstep + 4));
z20 = v_add(z20, v_load(bpptr + bpstep * 2));
z21 = v_add(z21, v_load_low(bpptr + bpstep * 2 + 4));
z30 = v_add(z30, v_load(bpptr + bpstep * 3));
z31 = v_add(z31, v_load_low(bpptr + bpstep * 3 + 4));
z40 = v_add(z40, v_load(bpptr + bpstep * 4));
z41 = v_add(z41, v_load_low(bpptr + bpstep * 4 + 4));
z50 = v_add(z50, v_load(bpptr + bpstep * 5));
z51 = v_add(z51, v_load_low(bpptr + bpstep * 5 + 4));
}
if (ifMinMaxAct)
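
In the residual-add block above, each z*1 register only carries the last two output columns, so the residual is loaded with v_load_low, which reads just two floats from memory into the low lanes. A sketch (illustrative name, assuming CV_SIMD128):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // Add a 2-element residual tail to the low half of an accumulator.
    static v_float32x4 add_residual_tail(const v_float32x4& z, const float* bpptr)
    {
        return v_add(z, v_load_low(bpptr + 4));   // was: z += v_load_low(bpptr + 4)
    }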


@ -370,10 +370,10 @@ struct ReLUFunctor : public BaseFunctor
v_float32x4 x1 = v_load(srcptr + i + 4);
v_float32x4 x2 = v_load(srcptr + i + 8);
v_float32x4 x3 = v_load(srcptr + i + 12);
x0 = v_select(x0 >= z, x0, x0*s4);
x1 = v_select(x1 >= z, x1, x1*s4);
x2 = v_select(x2 >= z, x2, x2*s4);
x3 = v_select(x3 >= z, x3, x3*s4);
x0 = v_select(v_ge(x0, z), x0, v_mul(x0, s4));
x1 = v_select(v_ge(x1, z), x1, v_mul(x1, s4));
x2 = v_select(v_ge(x2, z), x2, v_mul(x2, s4));
x3 = v_select(v_ge(x3, z), x3, v_mul(x3, s4));
v_store(dstptr + i, x0);
v_store(dstptr + i + 4, x1);
v_store(dstptr + i + 8, x2);
@ -2493,10 +2493,10 @@ struct ChannelsPReLUFunctor : public BaseFunctor
v_float32x4 x1 = v_load(srcptr + i + 4);
v_float32x4 x2 = v_load(srcptr + i + 8);
v_float32x4 x3 = v_load(srcptr + i + 12);
x0 = v_select(x0 >= z, x0, x0*s4);
x1 = v_select(x1 >= z, x1, x1*s4);
x2 = v_select(x2 >= z, x2, x2*s4);
x3 = v_select(x3 >= z, x3, x3*s4);
x0 = v_select(v_ge(x0, z), x0, v_mul(x0, s4));
x1 = v_select(v_ge(x1, z), x1, v_mul(x1, s4));
x2 = v_select(v_ge(x2, z), x2, v_mul(x2, s4));
x3 = v_select(v_ge(x3, z), x3, v_mul(x3, s4));
v_store(dstptr + i, x0);
v_store(dstptr + i + 4, x1);
v_store(dstptr + i + 8, x2);
@ -2649,10 +2649,10 @@ struct PReLUFunctor : public ChannelsPReLUFunctor
v_float32x4 s1 = v_load(scaleptr + i + 4);
v_float32x4 s2 = v_load(scaleptr + i + 8);
v_float32x4 s3 = v_load(scaleptr + i + 12);
x0 = v_select(x0 >= z, x0, x0*s0);
x1 = v_select(x1 >= z, x1, x1*s1);
x2 = v_select(x2 >= z, x2, x2*s2);
x3 = v_select(x3 >= z, x3, x3*s3);
x0 = v_select(v_ge(x0, z), x0, v_mul(x0, s0));
x1 = v_select(v_ge(x1, z), x1, v_mul(x1, s1));
x2 = v_select(v_ge(x2, z), x2, v_mul(x2, s2));
x3 = v_select(v_ge(x3, z), x3, v_mul(x3, s3));
v_store(dstptr + i, x0);
v_store(dstptr + i + 4, x1);
v_store(dstptr + i + 8, x2);
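
The activation hunks above all follow the same leaky/parametric ReLU shape: keep x where it is non-negative, otherwise multiply by the slope. A sketch of one lane group (illustrative name, assuming CV_SIMD128):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    static v_float32x4 prelu(const v_float32x4& x, const v_float32x4& slope)
    {
        v_float32x4 z = v_setzero_f32();
        return v_select(v_ge(x, z), x, v_mul(x, slope));   // was: v_select(x >= z, x, x*slope)
    }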


@ -308,7 +308,7 @@ public:
}
v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);
s += v_load(biasptr + i);
s = v_add(s, v_load(biasptr + i));
v_store(dptr + i, s);
}
#endif
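
v_reduce_sum4(a, b, c, d) packs the horizontal sums of four accumulators into one v_float32x4, which the hunk above then biases with v_add instead of operator+=. A sketch (illustrative name, assuming CV_SIMD128):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // Horizontal sums of four dot-product accumulators plus a 4-element bias.
    static v_float32x4 finish_four_outputs(const v_float32x4& vs0, const v_float32x4& vs1,
                                           const v_float32x4& vs2, const v_float32x4& vs3,
                                           const float* biasptr)
    {
        return v_add(v_reduce_sum4(vs0, vs1, vs2, vs3), v_load(biasptr));
    }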


@ -898,25 +898,25 @@ public:
v_float32x4 max_idx0 = v_setall_f32(-1.f);
v_float32x4 max_idx1 = max_idx0;
int index0 = ystart * inp_width + xstart;
v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));
v_float32x4 idx0 = v_add(idx00, v_setall_f32((float)index0));
v_float32x4 idx1 = v_add(idx0, v_setall_f32((float)(stride_w * 4)));
for (int y = ystart; y < yend; ++y)
{
for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
for (int x = xstart; x < xend; ++x, idx0 = v_add(idx0, ones), idx1 = v_add(idx1, ones))
{
const int index = y * inp_width + x;
v_float32x4 v0(srcData[index], srcData[index + stride_w],
srcData[index + stride_w*2], srcData[index + stride_w*3]);
v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
srcData[index + stride_w*6], srcData[index + stride_w*7]);
max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
max_idx0 = v_select(v_gt(v0, max_val0), idx0, max_idx0);
max_idx1 = v_select(v_gt(v1, max_val1), idx1, max_idx1);
max_val0 = v_max(max_val0, v0);
max_val1 = v_max(max_val1, v1);
}
idx0 += idx_delta;
idx1 += idx_delta;
idx0 = v_add(idx0, idx_delta);
idx1 = v_add(idx1, idx_delta);
}
v_store(dstData + x0, max_val0);
v_store(dstData + x0 + 4, max_val1);
@ -1069,12 +1069,12 @@ public:
srcData[index + stride_w*2], srcData[index + stride_w*3]);
v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
srcData[index + stride_w*6], srcData[index + stride_w*7]);
sum_val0 += v0;
sum_val1 += v1;
sum_val0 = v_add(sum_val0, v0);
sum_val1 = v_add(sum_val1, v1);
}
}
v_store(dstData + x0, sum_val0*ikarea);
v_store(dstData + x0 + 4, sum_val1*ikarea);
v_store(dstData + x0, v_mul(sum_val0, ikarea));
v_store(dstData + x0 + 4, v_mul(sum_val1, ikarea));
x0 += 7;
}
else
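
The max-pooling hunk above keeps the index of the winning element in a separate float register: whenever a new value beats the running maximum, v_select copies the current index in. A sketch of one step (illustrative name, assuming CV_SIMD128):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    static void argmax_step(const v_float32x4& v, const v_float32x4& idx,
                            v_float32x4& max_val, v_float32x4& max_idx)
    {
        max_idx = v_select(v_gt(v, max_val), idx, max_idx);   // was: v_select(v > max_val, idx, max_idx)
        max_val = v_max(max_val, v);
    }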


@ -120,8 +120,8 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
for (; j < img.cols - 16 - 3; j += 16, ptr += 16)
{
v_uint8x16 v = v_load(ptr);
v_int8x16 v0 = v_reinterpret_as_s8((v + t) ^ delta);
v_int8x16 v1 = v_reinterpret_as_s8((v - t) ^ delta);
v_int8x16 v0 = v_reinterpret_as_s8(v_xor(v_add(v, t), delta));
v_int8x16 v1 = v_reinterpret_as_s8(v_xor(v_sub(v, t), delta));
v_int8x16 x0 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[0]), delta));
v_int8x16 x1 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[quarterPatternSize]), delta));
@ -129,15 +129,15 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
v_int8x16 x3 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[3*quarterPatternSize]), delta));
v_int8x16 m0, m1;
m0 = (v0 < x0) & (v0 < x1);
m1 = (x0 < v1) & (x1 < v1);
m0 = m0 | ((v0 < x1) & (v0 < x2));
m1 = m1 | ((x1 < v1) & (x2 < v1));
m0 = m0 | ((v0 < x2) & (v0 < x3));
m1 = m1 | ((x2 < v1) & (x3 < v1));
m0 = m0 | ((v0 < x3) & (v0 < x0));
m1 = m1 | ((x3 < v1) & (x0 < v1));
m0 = m0 | m1;
m0 = v_and(v_lt(v0, x0), v_lt(v0, x1));
m1 = v_and(v_lt(x0, v1), v_lt(x1, v1));
m0 = v_or(m0, v_and(v_lt(v0, x1), v_lt(v0, x2)));
m1 = v_or(m1, v_and(v_lt(x1, v1), v_lt(x2, v1)));
m0 = v_or(m0, v_and(v_lt(v0, x2), v_lt(v0, x3)));
m1 = v_or(m1, v_and(v_lt(x2, v1), v_lt(x3, v1)));
m0 = v_or(m0, v_and(v_lt(v0, x3), v_lt(v0, x0)));
m1 = v_or(m1, v_and(v_lt(x3, v1), v_lt(x0, v1)));
m0 = v_or(m0, m1);
if( !v_check_any(m0) )
continue;
@ -154,18 +154,18 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
v_uint8x16 max1 = v_setzero_u8();
for( k = 0; k < N; k++ )
{
v_int8x16 x = v_reinterpret_as_s8(v_load((ptr + pixel[k])) ^ delta);
m0 = v0 < x;
m1 = x < v1;
v_int8x16 x = v_reinterpret_as_s8(v_xor(v_load((ptr + pixel[k])), delta));
m0 = v_lt(v0, x);
m1 = v_lt(x, v1);
c0 = v_sub_wrap(c0, m0) & m0;
c1 = v_sub_wrap(c1, m1) & m1;
c0 = v_and(v_sub_wrap(c0, m0), m0);
c1 = v_and(v_sub_wrap(c1, m1), m1);
max0 = v_max(max0, v_reinterpret_as_u8(c0));
max1 = v_max(max1, v_reinterpret_as_u8(c1));
}
max0 = K16 < v_max(max0, max1);
max0 = v_lt(K16, v_max(max0, max1));
unsigned int m = v_signmask(v_reinterpret_as_s8(max0));
for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
@ -190,7 +190,7 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
a1 = v_min(a1, v_nms);
b1 = v_max(b1, v_nms);
}
curr[j + k] = (uchar)(v_reduce_max(v_max(v_max(a0, a1), v_setzero_s16() - v_min(b0, b1))) - 1);
curr[j + k] = (uchar)(v_reduce_max(v_max(v_max(a0, a1), v_sub(v_setzero_s16(), v_min(b0, b1)))) - 1);
}
}
}
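
The FAST loop above counts consecutive brighter/darker pixels with a mask trick: the comparison mask is 0 or all-ones (-1), so v_sub_wrap(counter, mask) increments the counter where the test holds, and the final v_and clears it where it does not. A sketch of that step (illustrative name, assuming CV_SIMD128):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // counter lanes grow by 1 where mask == -1 and are reset where mask == 0.
    static v_int8x16 run_length_step(const v_int8x16& counter, const v_int8x16& mask)
    {
        return v_and(v_sub_wrap(counter, mask), mask);   // was: v_sub_wrap(counter, mask) & mask
    }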


@ -160,7 +160,7 @@ int cornerScore<16>(const uchar* ptr, const int pixel[], int threshold)
q0 = v_max(q0, v_min(a, v0));
q1 = v_min(q1, v_max(b, v0));
}
q0 = v_max(q0, v_setzero_s16() - q1);
q0 = v_max(q0, v_sub(v_setzero_s16(), q1));
threshold = v_reduce_max(q0) - 1;
}
else
@ -251,7 +251,7 @@ int cornerScore<12>(const uchar* ptr, const int pixel[], int threshold)
q0 = v_max(q0, v_min(a, v0));
q1 = v_min(q1, v_max(b, v0));
}
q0 = v_max(q0, v_setzero_s16() - q1);
q0 = v_max(q0, v_sub(v_setzero_s16(), q1));
threshold = v_reduce_max(q0) - 1;
}
else
@ -323,7 +323,7 @@ int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold)
v0 = v_load(d + 5);
q0 = v_max(q0, v_min(a, v0));
q1 = v_min(q1, v_max(b, v0));
q0 = v_max(q0, v_setzero_s16() - q1);
q0 = v_max(q0, v_sub(v_setzero_s16(), q1));
threshold = v_reduce_max(q0) - 1;
}
else
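
With the subtraction operator gone, negating a vector is spelled as a subtraction from zero, as in the three cornerScore hunks above. A sketch (assuming CV_SIMD128):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    static v_int16x8 negate_s16(const v_int16x8& q)
    {
        return v_sub(v_setzero_s16(), q);   // was: v_setzero_s16() - q
    }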


@ -335,7 +335,7 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[],
// divide and calculate s according to above feature
v_uint32x4 ss[4];
v_uint32x4 vadd = v_setall_u32(1) << (hsv_shift - 1);
v_uint32x4 vadd = v_shl(v_setall_u32(1), (hsv_shift - 1));
v_uint32x4 v_diff_exp[4];
v_diff_exp[0] = v_reinterpret_as_u32(v_and(v_reinterpret_as_u8(v_diff), mask1));
@ -406,16 +406,16 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[],
// start computing H-ch
//h = (_vr & (g - b)) + (~_vr & ((_vg & (b - r + 2 * diff)) + ((~_vg) & (r - g + 4 * diff))));
v_int32x4 hh[4];
hh[0] = v_reinterpret_as_s32(v_select(e[0], v_reinterpret_as_s32(gg[0] - bb[0]),
hh[0] = v_reinterpret_as_s32(v_select(e[0], v_reinterpret_as_s32(v_sub(gg[0], bb[0])),
v_select(p[0], v_reinterpret_as_s32(v_add(v_sub(bb[0], rr[0]), v_mul(v_setall_u32(2), vdd[0]))),
v_reinterpret_as_s32(v_add(v_sub(rr[0], gg[0]), v_mul(v_setall_u32(4), vdd[0]))))));
hh[1] = v_reinterpret_as_s32(v_select(e[1], v_reinterpret_as_s32(gg[1] - bb[1]),
hh[1] = v_reinterpret_as_s32(v_select(e[1], v_reinterpret_as_s32(v_sub(gg[1], bb[1])),
v_select(p[1], v_reinterpret_as_s32(v_add(v_sub(bb[1], rr[1]), v_mul(v_setall_u32(2), vdd[1]))),
v_reinterpret_as_s32(v_add(v_sub(rr[1], gg[1]), v_mul(v_setall_u32(4), vdd[1]))))));
hh[2] = v_reinterpret_as_s32(v_select(e[2], v_reinterpret_as_s32(gg[2] - bb[2]),
hh[2] = v_reinterpret_as_s32(v_select(e[2], v_reinterpret_as_s32(v_sub(gg[2], bb[2])),
v_select(p[2], v_reinterpret_as_s32(v_add(v_sub(bb[2], rr[2]), v_mul(v_setall_u32(2), vdd[2]))),
v_reinterpret_as_s32(v_add(v_sub(rr[2], gg[2]), v_mul(v_setall_u32(4), vdd[2]))))));
hh[3] = v_reinterpret_as_s32(v_select(e[3], v_reinterpret_as_s32(gg[3] - bb[3]),
hh[3] = v_reinterpret_as_s32(v_select(e[3], v_reinterpret_as_s32(v_sub(gg[3], bb[3])),
v_select(p[3], v_reinterpret_as_s32(v_add(v_sub(bb[3], rr[3]), v_mul(v_setall_u32(2), vdd[3]))),
v_reinterpret_as_s32(v_add(v_sub(rr[3], gg[3]), v_mul(v_setall_u32(4), vdd[3]))))));
@ -433,16 +433,16 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[],
// check for negative H
v_int32x4 v_h_less_0[4];
v_h_less_0[0] = (hh[0] < v_setall_s32(0));
v_h_less_0[1] = (hh[1] < v_setall_s32(0));
v_h_less_0[2] = (hh[2] < v_setall_s32(0));
v_h_less_0[3] = (hh[3] < v_setall_s32(0));
v_h_less_0[0] = (v_lt(hh[0], v_setall_s32(0)));
v_h_less_0[1] = (v_lt(hh[1], v_setall_s32(0)));
v_h_less_0[2] = (v_lt(hh[2], v_setall_s32(0)));
v_h_less_0[3] = (v_lt(hh[3], v_setall_s32(0)));
v_int32x4 v_h_180[4];
v_h_180[0] = hh[0] + v_setall_s32(180);
v_h_180[1] = hh[1] + v_setall_s32(180);
v_h_180[2] = hh[2] + v_setall_s32(180);
v_h_180[3] = hh[3] + v_setall_s32(180);
v_h_180[0] = v_add(hh[0], v_setall_s32(180));
v_h_180[1] = v_add(hh[1], v_setall_s32(180));
v_h_180[2] = v_add(hh[2], v_setall_s32(180));
v_h_180[3] = v_add(hh[3], v_setall_s32(180));
hh[0] = v_select(v_h_less_0[0], v_h_180[0], hh[0]);
hh[1] = v_select(v_h_less_0[1], v_h_180[1], hh[1]);
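
The hue fixup above adds 180 only where h came out negative, using a comparison mask and v_select. A sketch of one 4-lane group (illustrative name, assuming CV_SIMD128):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    static v_int32x4 wrap_negative_hue(const v_int32x4& h)
    {
        v_int32x4 neg = v_lt(h, v_setall_s32(0));               // was: h < v_setall_s32(0)
        return v_select(neg, v_add(h, v_setall_s32(180)), h);   // was: h + v_setall_s32(180)
    }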


@ -64,7 +64,7 @@ CV_ALWAYS_INLINE void calcRowLinear32FC1Impl(float *dst[],
bool xRatioEq1 = inSz.width == outSz.width;
bool yRatioEq1 = inSz.height == outSz.height;
constexpr int nlanes = v_float32x8::nlanes;
const int nlanes = VTraits<v_float32x8>::vlanes();
if (!xRatioEq1 && !yRatioEq1)
{


@ -140,9 +140,9 @@ public:
#if CV_SIMD128
v_uint32x4 rval = v_setall_u32(sptr[j]);
v_uint32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]);
v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
v_float32x4 w = v_mul(kweight4, v_lut(this->color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
wsum[j] += v_reduce_sum(w);
sum[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(val)) * w);
sum[j] += v_reduce_sum(v_mul(v_cvt_f32(v_reinterpret_as_s32(val)), w));
#else
int rval = sptr[j];
@ -407,11 +407,11 @@ public:
v_uint32x4 b(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]);
v_uint32x4 g(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]);
v_uint32x4 r(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]);
v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(b, rb) + v_absdiff(g, rg) + v_absdiff(r, rr)));
v_float32x4 w = v_mul(kweight4, v_lut(this->color_weight, v_reinterpret_as_s32(v_add(v_add(v_absdiff(b, rb), v_absdiff(g, rg)), v_absdiff(r, rr)))));
wsum[j] += v_reduce_sum(w);
sum_b[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(b)) * w);
sum_g[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(g)) * w);
sum_r[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(r)) * w);
sum_b[j] += v_reduce_sum(v_mul(v_cvt_f32(v_reinterpret_as_s32(b)), w));
sum_g[j] += v_reduce_sum(v_mul(v_cvt_f32(v_reinterpret_as_s32(g)), w));
sum_r[j] += v_reduce_sum(v_mul(v_cvt_f32(v_reinterpret_as_s32(r)), w));
#else
int rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
@ -661,12 +661,12 @@ public:
v_float32x4 rval = v_setall_f32(sptr[j]);
v_float32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]);
v_float32x4 knan = v_not_nan(val);
v_float32x4 alpha = (v_absdiff(val, rval) * sindex4) & v_not_nan(rval) & knan;
v_float32x4 alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex4), v_not_nan(rval)), knan);
v_int32x4 idx = v_trunc(alpha);
alpha -= v_cvt_f32(idx);
v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan;
alpha = v_sub(alpha, v_cvt_f32(idx));
v_float32x4 w = v_and(v_mul(kweight4, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one4, alpha)))), knan);
wsum[j] += v_reduce_sum(w);
sum[j] += v_reduce_sum((val & knan) * w);
sum[j] += v_reduce_sum(v_mul(v_and(val, knan), w));
#else
float rval = sptr[j];
@ -862,15 +862,15 @@ public:
v_float32x4 kb(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]);
v_float32x4 kg(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]);
v_float32x4 kr(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]);
v_float32x4 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
v_float32x4 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex4) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
v_float32x4 knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr));
v_float32x4 alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex4), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan);
v_int32x4 idx = v_trunc(alpha);
alpha -= v_cvt_f32(idx);
v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan;
alpha = v_sub(alpha, v_cvt_f32(idx));
v_float32x4 w = v_and(v_mul(kweight4, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one4, alpha)))), knan);
wsum[j] += v_reduce_sum(w);
sum_b[j] += v_reduce_sum((kb & knan) * w);
sum_g[j] += v_reduce_sum((kg & knan) * w);
sum_r[j] += v_reduce_sum((kr & knan) * w);
sum_b[j] += v_reduce_sum(v_mul(v_and(kb, knan), w));
sum_g[j] += v_reduce_sum(v_mul(v_and(kg, knan), w));
sum_r[j] += v_reduce_sum(v_mul(v_and(kr, knan), w));
#else
float rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
bool r_NAN = cvIsNaN(rb) || cvIsNaN(rg) || cvIsNaN(rr);
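
The float bilateral-filter hunks above interpolate the range kernel from a lookup table: alpha is split into an integer LUT index and a fractional part, and the two neighbouring entries are blended. A sketch of that lookup, with expLUT standing for the float table used in the surrounding code (illustrative name, assuming CV_SIMD128):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    static v_float32x4 range_weight(const v_float32x4& alpha_in, const float* expLUT)
    {
        v_int32x4 idx = v_trunc(alpha_in);
        v_float32x4 alpha = v_sub(alpha_in, v_cvt_f32(idx));   // was: alpha -= v_cvt_f32(idx)
        v_float32x4 one = v_setall_f32(1.f);
        // blend expLUT[idx] and expLUT[idx+1] by the fractional part
        return v_muladd(v_lut(expLUT + 1, idx), alpha, v_mul(v_lut(expLUT, idx), v_sub(one, alpha)));
    }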


@ -315,7 +315,7 @@ struct ColumnSum<int, uchar> :
v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
for (; i <= width - VTraits<v_int32x4>::vlanes(); i += VTraits<v_int32x4>::vlanes())
{
v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
}
@ -357,10 +357,10 @@ struct ColumnSum<int, uchar> :
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
v_float32x4 v_scale = v_setall_f32((float)_scale);
for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
for( ; i <= width-VTraits<v_uint16x8>::vlanes(); i+=VTraits<v_uint16x8>::vlanes() )
{
v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits<v_int32x4>::vlanes()), v_load(Sp + i + VTraits<v_int32x4>::vlanes()));
v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), v_scale)));
v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), v_scale)));
@ -369,7 +369,7 @@ struct ColumnSum<int, uchar> :
v_pack_store(D + i, v_dst);
v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
v_store(SUM + i + VTraits<v_int32x4>::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits<v_int32x4>::vlanes())));
}
#endif
#endif
@ -396,16 +396,16 @@ struct ColumnSum<int, uchar> :
v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
for( ; i <= width-VTraits<v_uint16x8>::vlanes(); i+=VTraits<v_uint16x8>::vlanes() )
{
v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits<v_int32x4>::vlanes()), v_load(Sp + i + VTraits<v_int32x4>::vlanes()));
v_uint16x8 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01));
v_pack_store(D + i, v_dst);
v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
v_store(SUM + i + VTraits<v_int32x4>::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits<v_int32x4>::vlanes())));
}
#endif
#endif
@ -486,7 +486,7 @@ public BaseColumnFilter
v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for( ; i <= width - v_uint16x8::nlanes; i += v_uint16x8::nlanes )
for( ; i <= width - VTraits<v_uint16x8>::vlanes(); i += VTraits<v_uint16x8>::vlanes() )
{
v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
}
@ -546,13 +546,13 @@ public BaseColumnFilter
v_uint32x4 ds4 = v_setall_u32((unsigned)ds);
v_uint16x8 dd8 = v_setall_u16((ushort)dd);
for( ; i <= width-v_uint8x16::nlanes; i+=v_uint8x16::nlanes )
for( ; i <= width-VTraits<v_uint8x16>::vlanes(); i+=VTraits<v_uint8x16>::vlanes() )
{
v_uint16x8 _sm0 = v_load(Sm + i);
v_uint16x8 _sm1 = v_load(Sm + i + v_uint16x8::nlanes);
v_uint16x8 _sm1 = v_load(Sm + i + VTraits<v_uint16x8>::vlanes());
v_uint16x8 _s0 = v_add_wrap(v_load(SUM + i), v_load(Sp + i));
v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + v_uint16x8::nlanes), v_load(Sp + i + v_uint16x8::nlanes));
v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + VTraits<v_uint16x8>::vlanes()), v_load(Sp + i + VTraits<v_uint16x8>::vlanes()));
v_uint32x4 _s00, _s01, _s10, _s11;
@ -572,7 +572,7 @@ public BaseColumnFilter
v_store(D + i, v_pack_u(r0, r1));
v_store(SUM + i, _s0);
v_store(SUM + i + v_uint16x8::nlanes, _s1);
v_store(SUM + i + VTraits<v_uint16x8>::vlanes(), _s1);
}
#endif
#endif
@ -649,7 +649,7 @@ struct ColumnSum<int, short> :
v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
for( ; i <= width - VTraits<v_int32x4>::vlanes(); i+=VTraits<v_int32x4>::vlanes() )
{
v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
}
@ -689,17 +689,17 @@ struct ColumnSum<int, short> :
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
v_float32x4 v_scale = v_setall_f32((float)_scale);
for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes )
for( ; i <= width-VTraits<v_int16x8>::vlanes(); i+=VTraits<v_int16x8>::vlanes() )
{
v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits<v_int32x4>::vlanes()), v_load(Sp + i + VTraits<v_int32x4>::vlanes()));
v_int32x4 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), v_scale));
v_int32x4 v_s01d = v_round(v_mul(v_cvt_f32(v_s01), v_scale));
v_store(D + i, v_pack(v_s0d, v_s01d));
v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
v_store(SUM + i + VTraits<v_int32x4>::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits<v_int32x4>::vlanes())));
}
#endif
#endif
@ -725,15 +725,15 @@ struct ColumnSum<int, short> :
v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes )
for( ; i <= width-VTraits<v_int16x8>::vlanes(); i+=VTraits<v_int16x8>::vlanes() )
{
v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits<v_int32x4>::vlanes()), v_load(Sp + i + VTraits<v_int32x4>::vlanes()));
v_store(D + i, v_pack(v_s0, v_s01));
v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
v_store(SUM + i + VTraits<v_int32x4>::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits<v_int32x4>::vlanes())));
}
#endif
#endif
@ -798,7 +798,7 @@ struct ColumnSum<int, ushort> :
v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
for (; i <= width - VTraits<v_int32x4>::vlanes(); i += VTraits<v_int32x4>::vlanes())
{
v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
}
@ -838,17 +838,17 @@ struct ColumnSum<int, ushort> :
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
v_float32x4 v_scale = v_setall_f32((float)_scale);
for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
for( ; i <= width-VTraits<v_uint16x8>::vlanes(); i+=VTraits<v_uint16x8>::vlanes() )
{
v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits<v_int32x4>::vlanes()), v_load(Sp + i + VTraits<v_int32x4>::vlanes()));
v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), v_scale)));
v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), v_scale)));
v_store(D + i, v_pack(v_s0d, v_s01d));
v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
v_store(SUM + i + VTraits<v_int32x4>::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits<v_int32x4>::vlanes())));
}
#endif
#endif
@ -874,15 +874,15 @@ struct ColumnSum<int, ushort> :
v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
for( ; i <= width-VTraits<v_uint16x8>::vlanes(); i+=VTraits<v_uint16x8>::vlanes() )
{
v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits<v_int32x4>::vlanes()), v_load(Sp + i + VTraits<v_int32x4>::vlanes()));
v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)));
v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
v_store(SUM + i + VTraits<v_int32x4>::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits<v_int32x4>::vlanes())));
}
#endif
#endif
@ -945,7 +945,7 @@ struct ColumnSum<int, int> :
v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
for( ; i <= width - VTraits<v_int32x4>::vlanes(); i+=VTraits<v_int32x4>::vlanes() )
{
v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
}
@ -981,7 +981,7 @@ struct ColumnSum<int, int> :
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
v_float32x4 v_scale = v_setall_f32((float)_scale);
for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
for( ; i <= width-VTraits<v_int32x4>::vlanes(); i+=VTraits<v_int32x4>::vlanes() )
{
v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
v_int32x4 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), v_scale));
@ -1010,7 +1010,7 @@ struct ColumnSum<int, int> :
v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
for( ; i <= width-VTraits<v_int32x4>::vlanes(); i+=VTraits<v_int32x4>::vlanes() )
{
v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
@ -1079,7 +1079,7 @@ struct ColumnSum<int, float> :
v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
for( ; i <= width - VTraits<v_int32x4>::vlanes(); i+=VTraits<v_int32x4>::vlanes() )
{
v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
}
@ -1115,7 +1115,7 @@ struct ColumnSum<int, float> :
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
v_float32x4 v_scale = v_setall_f32((float)_scale);
for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
for (; i <= width - VTraits<v_int32x4>::vlanes(); i += VTraits<v_int32x4>::vlanes())
{
v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
v_store(D + i, v_mul(v_cvt_f32(v_s0), v_scale));
@ -1142,7 +1142,7 @@ struct ColumnSum<int, float> :
v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
for( ; i <= width-VTraits<v_int32x4>::vlanes(); i+=VTraits<v_int32x4>::vlanes() )
{
v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
v_store(D + i, v_cvt_f32(v_s0));
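
All ColumnSum specializations above follow the same sliding-window update: add the incoming row Sp, emit the scaled sum, then subtract the outgoing row Sm. A sketch of one 4-lane step of the float output path (illustrative name, assuming CV_SIMD128):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    static void column_sum_step(int* SUM, const int* Sp, const int* Sm, float* D, float scale)
    {
        v_int32x4 s = v_add(v_load(SUM), v_load(Sp));           // was: SUM[i] + Sp[i]
        v_store(D, v_mul(v_cvt_f32(s), v_setall_f32(scale)));   // scaled output row
        v_store(SUM, v_sub(s, v_load(Sm)));                     // slide the window
    }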


@ -66,7 +66,7 @@ template<typename _Tp> static inline cv::v_float32 splineInterpolate(const cv::v
ix = v_shl<2>(ix);
v_float32 t0, t1, t2, t3;
// assume that v_float32::nlanes == v_int32::nlanes
// assume that VTraits<v_float32>::vlanes() == VTraits<v_int32>::vlanes()
if(VTraits<v_float32>::vlanes() == 4)
{
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx[4];
@ -1388,16 +1388,16 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin
v_uint16x8& outA, v_uint16x8& outB, v_uint16x8& outC)
{
//LUT idx of origin pt of cube
v_uint16x8 idxsX = inX >> (lab_base_shift - lab_lut_shift);
v_uint16x8 idxsY = inY >> (lab_base_shift - lab_lut_shift);
v_uint16x8 idxsZ = inZ >> (lab_base_shift - lab_lut_shift);
v_uint16x8 idxsX = v_shr<lab_base_shift - lab_lut_shift>(inX);
v_uint16x8 idxsY = v_shr<lab_base_shift - lab_lut_shift>(inY);
v_uint16x8 idxsZ = v_shr<lab_base_shift - lab_lut_shift>(inZ);
//x, y, z are [0; TRILINEAR_BASE)
const uint16_t bitMask = (1 << trilinear_shift) - 1;
v_uint16x8 bitMaskReg = v_setall_u16(bitMask);
v_uint16x8 fracX = (inX >> (lab_base_shift - 8 - 1)) & bitMaskReg;
v_uint16x8 fracY = (inY >> (lab_base_shift - 8 - 1)) & bitMaskReg;
v_uint16x8 fracZ = (inZ >> (lab_base_shift - 8 - 1)) & bitMaskReg;
v_uint16x8 fracX = v_and(v_shr<lab_base_shift - 8 - 1>(inX), bitMaskReg);
v_uint16x8 fracY = v_and(v_shr<lab_base_shift - 8 - 1>(inY), bitMaskReg);
v_uint16x8 fracZ = v_and(v_shr<lab_base_shift - 8 - 1>(inZ), bitMaskReg);
//load values to interpolate for pix0, pix1, .., pix7
v_int16x8 a0, a1, a2, a3, a4, a5, a6, a7;
@ -1407,9 +1407,9 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin
v_uint32x4 addrDw0, addrDw1, addrDw10, addrDw11;
v_mul_expand(v_setall_u16(3*8), idxsX, addrDw0, addrDw1);
v_mul_expand(v_setall_u16(3*8*LAB_LUT_DIM), idxsY, addrDw10, addrDw11);
addrDw0 += addrDw10; addrDw1 += addrDw11;
addrDw0 = v_add(addrDw0, addrDw10); addrDw1 = v_add(addrDw1, addrDw11);
v_mul_expand(v_setall_u16(3*8*LAB_LUT_DIM*LAB_LUT_DIM), idxsZ, addrDw10, addrDw11);
addrDw0 += addrDw10; addrDw1 += addrDw11;
addrDw0 = v_add(addrDw0, addrDw10); addrDw1 = v_add(addrDw1, addrDw11);
uint32_t CV_DECL_ALIGNED(16) addrofs[8];
v_store_aligned(addrofs, addrDw0);
@ -1431,9 +1431,9 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin
v_int16x8 w0, w1, w2, w3, w4, w5, w6, w7;
v_mul_expand(v_setall_u16(8), fracX, addrDw0, addrDw1);
v_mul_expand(v_setall_u16(8*TRILINEAR_BASE), fracY, addrDw10, addrDw11);
addrDw0 += addrDw10; addrDw1 += addrDw11;
addrDw0 = v_add(addrDw0, addrDw10); addrDw1 = v_add(addrDw1, addrDw11);
v_mul_expand(v_setall_u16(8*TRILINEAR_BASE*TRILINEAR_BASE), fracZ, addrDw10, addrDw11);
addrDw0 += addrDw10; addrDw1 += addrDw11;
addrDw0 = v_add(addrDw0, addrDw10); addrDw1 = v_add(addrDw1, addrDw11);
v_store_aligned(addrofs, addrDw0);
v_store_aligned(addrofs + 4, addrDw1);
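The address arithmetic above builds the 3-D LUT offset from widening multiplies (v_mul_expand) and v_add; in scalar form the offset of an interpolation cube origin is simply the following (hypothetical helper, shown only to make the memory layout explicit):

#include <cstdint>

// Offset (in int16 elements) of the cube that starts at LUT cell (x, y, z);
// each cell stores 8 corners x 3 channels, hence the 3*8 factor.
static uint32_t cubeOriginOffset(uint32_t x, uint32_t y, uint32_t z, uint32_t lutDim)
{
    return 3u*8u*x + 3u*8u*lutDim*y + 3u*8u*lutDim*lutDim*z;
}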
@ -1476,7 +1476,8 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1
const int16_t* LUT,
v_uint16& outA, v_uint16& outB, v_uint16& outC)
{
const int vsize = VTraits<v_uint16>::max_nlanes;
const int vsize = VTraits<v_uint16>::vlanes();
const int vsize_max = VTraits<v_uint16>::max_nlanes;
// LUT idx of origin pt of cube
v_uint16 tx = v_shr<lab_base_shift - lab_lut_shift>(inX);
@ -1492,7 +1493,7 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1
baseIdx0 = v_add(v_add(btmp00, btmp10), btmp20);
baseIdx1 = v_add(v_add(btmp01, btmp11), btmp21);
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vbaseIdx[vsize];
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vbaseIdx[vsize_max];
v_store_aligned(vbaseIdx + 0*vsize/2, baseIdx0);
v_store_aligned(vbaseIdx + 1*vsize/2, baseIdx1);
@ -1513,13 +1514,13 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1
trilinearIdx0 = v_add(v_add(v_shl<3>(fracX0), v_shl<3 + trilinear_shift>(fracY0)), v_shl<3 + trilinear_shift * 2>(fracZ0));
trilinearIdx1 = v_add(v_add(v_shl<3>(fracX1), v_shl<3 + trilinear_shift>(fracY1)), v_shl<3 + trilinear_shift * 2>(fracZ1));
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vtrilinearIdx[vsize];
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vtrilinearIdx[vsize_max];
v_store_aligned(vtrilinearIdx + 0*vsize/2, trilinearIdx0);
v_store_aligned(vtrilinearIdx + 1*vsize/2, trilinearIdx1);
v_uint32 a0, a1, b0, b1, c0, c1;
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) va[vsize], vb[vsize], vc[vsize];
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) va[vsize_max], vb[vsize_max], vc[vsize_max];
for(int j = 0; j < vsize; j++)
{
const int16_t* baseLUT = LUT + vbaseIdx[j];
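The vsize / vsize_max split above is the general rule for scalable backends: VTraits<T>::vlanes() may only be known at run time (e.g. RVV), so loop bounds use vlanes() while stack buffers must be sized with the compile-time VTraits<T>::max_nlanes. A minimal sketch of the same pattern, assuming a SIMD-enabled build (illustration only):

#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

static void spill_lanes(const v_uint16& v, ushort* dst)
{
    const int vsize = VTraits<v_uint16>::vlanes();                  // run-time lane count
    ushort CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[VTraits<v_uint16>::max_nlanes]; // compile-time bound
    v_store_aligned(buf, v);
    for (int j = 0; j < vsize; j++)                                 // copy only the live lanes
        dst[j] = buf[j];
}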
@ -1649,11 +1650,11 @@ struct RGB2Lab_b
vL = v_shr<lab_shift2>(vL);
/* int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );*/
va = v_fma(vfX - vfY, v_setall_s32(500), v_setall_s32(abShift+labDescaleShift));
va = v_fma(v_sub(vfX, vfY), v_setall_s32(500), v_setall_s32(abShift+labDescaleShift));
va = v_shr<lab_shift2>(va);
/* int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );*/
vb = v_fma(vfY - vfZ, v_setall_s32(200), v_setall_s32(abShift+labDescaleShift));
vb = v_fma(v_sub(vfY, vfZ), v_setall_s32(200), v_setall_s32(abShift+labDescaleShift));
vb = v_shr<lab_shift2>(vb);
}
#endif // CV_NEON
@ -1675,8 +1676,8 @@ struct RGB2Lab_b
#if CV_NEON
// On each loop, we load nlanes of RGB/A v_uint8s and store nlanes of
// Lab v_uint8s
for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes,
src += scn*v_uint8::nlanes, dst += 3*v_uint8::nlanes )
for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes(),
src += scn*VTraits<v_uint8>::vlanes(), dst += 3*VTraits<v_uint8>::vlanes() )
{
// Load 4 batches of 4 src
v_uint8 vRi, vGi, vBi;
@ -1712,7 +1713,7 @@ struct RGB2Lab_b
#endif // CV_NEON
#if CV_SIMD
const int vsize = v_uint8::nlanes;
const int vsize = VTraits<v_uint8>::vlanes();
const int xyzDescaleShift = 1 << (lab_shift - 1);
v_int16 vXYZdescale = vx_setall_s16(xyzDescaleShift);
v_int16 cxrg, cxb1, cyrg, cyb1, czrg, czb1;
@ -1752,7 +1753,7 @@ struct RGB2Lab_b
v_expand(drgb[k], qrgb[k*2+0], qrgb[k*2+1]);
}
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vdrgb[vsize*3];
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vdrgb[VTraits<v_uint8>::max_nlanes*3];
for(int k = 0; k < 12; k++)
{
v_store_aligned(vdrgb + k*vsize/4, qrgb[k]);
@ -1784,14 +1785,14 @@ struct RGB2Lab_b
v_uint32 x[4], y[4], z[4];
for(int j = 0; j < 4; j++)
{
x[j] = v_reinterpret_as_u32(v_dotprod(rg[j], cxrg) + v_dotprod(bd[j], cxb1)) >> lab_shift;
y[j] = v_reinterpret_as_u32(v_dotprod(rg[j], cyrg) + v_dotprod(bd[j], cyb1)) >> lab_shift;
z[j] = v_reinterpret_as_u32(v_dotprod(rg[j], czrg) + v_dotprod(bd[j], czb1)) >> lab_shift;
x[j] = v_shr<xyz_shift>(v_reinterpret_as_u32(v_add(v_dotprod(rg[j], cxrg), v_dotprod(bd[j], cxb1))));
y[j] = v_shr<xyz_shift>(v_reinterpret_as_u32(v_add(v_dotprod(rg[j], cyrg), v_dotprod(bd[j], cyb1))));
z[j] = v_shr<xyz_shift>(v_reinterpret_as_u32(v_add(v_dotprod(rg[j], czrg), v_dotprod(bd[j], czb1))));
}
// [fX, fY, fZ] = LabCbrtTab_b[vx, vy, vz]
// [4 per X, 4 per Y, 4 per Z]
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vxyz[vsize*3];
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vxyz[VTraits<v_uint8>::max_nlanes*3];
for(int j = 0; j < 4; j++)
{
v_store_aligned(vxyz + (0*4+j)*vsize/4, x[j]);
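The pairing above is the usual v_dotprod layout: rg holds interleaved (r, g) int16 lanes multiplied against (C0, C1), and bd holds (b, descale) against (C2, 1), so the two dot products sum to one descaled matrix row before the shift. The per-lane scalar equivalent (illustration only):

// One XYZ matrix row in fixed point, as produced by the two v_dotprod calls.
static inline int xyzRowFixed(int r, int g, int b,
                              int C0, int C1, int C2,
                              int descale, int shift)
{
    return (r*C0 + g*C1 + b*C2 + descale) >> shift;
}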
@ -1822,7 +1823,7 @@ struct RGB2Lab_b
v_uint32 vLshift = vx_setall_u32((uint32_t)(Lshift + labDescaleShift));
for(int k = 0; k < 4; k++)
{
vL[k] = (vL[k] + vLshift) >> lab_shift2;
vL[k] = v_shr<lab_shift2>(v_add(vL[k], vLshift));
}
v_uint16 L0, L1;
L0 = v_pack(vL[0], vL[1]);
@ -1846,7 +1847,7 @@ struct RGB2Lab_b
v_int32 abShift = vx_setall_s32(128*(1 << lab_shift2) + labDescaleShift);
for(int k = 0; k < 8; k++)
{
ab[k] = (ab[k] + abShift) >> lab_shift2;
ab[k] = v_shr<lab_shift2>(v_add(ab[k], abShift));
}
v_int16 a0, a1, b0, b1;
a0 = v_pack(ab[0], ab[1]); a1 = v_pack(ab[2], ab[3]);
@ -1941,7 +1942,7 @@ struct RGB2Lab_f
#if CV_SIMD
if(enablePackedLab)
{
const int vsize = v_float32::nlanes;
const int vsize = VTraits<v_float32>::vlanes();
static const int nPixels = vsize*2;
for(; i < n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
{
@ -1973,8 +1974,8 @@ struct RGB2Lab_f
#undef clipv
/* int iR = R*LAB_BASE, iG = G*LAB_BASE, iB = B*LAB_BASE, iL, ia, ib; */
v_float32 basef = vx_setall_f32(LAB_BASE);
rvec0 *= basef, gvec0 *= basef, bvec0 *= basef;
rvec1 *= basef, gvec1 *= basef, bvec1 *= basef;
rvec0 = v_mul(rvec0, basef), gvec0 = v_mul(gvec0, basef), bvec0 = v_mul(bvec0, basef);
rvec1 = v_mul(rvec1, basef), gvec1 = v_mul(gvec1, basef), bvec1 = v_mul(bvec1, basef);
v_int32 irvec0, igvec0, ibvec0, irvec1, igvec1, ibvec1;
irvec0 = v_round(rvec0); irvec1 = v_round(rvec1);
@ -2004,8 +2005,8 @@ struct RGB2Lab_f
/* dst[i] = L*100.0f */
v_float32 v100dBase = vx_setall_f32(100.0f/LAB_BASE);
l_vec0 = l_vec0*v100dBase;
l_vec1 = l_vec1*v100dBase;
l_vec0 = v_mul(l_vec0, v100dBase);
l_vec1 = v_mul(l_vec1, v100dBase);
/*
dst[i + 1] = a*256.0f - 128.0f;
dst[i + 2] = b*256.0f - 128.0f;
@ -2043,8 +2044,8 @@ struct RGB2Lab_f
static const float _a = (softfloat(16) / softfloat(116));
int i = 0;
#if CV_SIMD
const int vsize = v_float32::nlanes;
const int nrepeats = vsize == 4 ? 2 : 1;
const int vsize = VTraits<v_float32>::vlanes();
const int nrepeats = VTraits<v_float32>::nlanes == 4 ? 2 : 1;
v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2);
v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5);
v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8);
@ -2080,9 +2081,9 @@ struct RGB2Lab_f
v_float32 vgscale = vx_setall_f32(gscale);
for (int k = 0; k < nrepeats; k++)
{
R[k] = splineInterpolate(R[k]*vgscale, gammaTab, GAMMA_TAB_SIZE);
G[k] = splineInterpolate(G[k]*vgscale, gammaTab, GAMMA_TAB_SIZE);
B[k] = splineInterpolate(B[k]*vgscale, gammaTab, GAMMA_TAB_SIZE);
R[k] = splineInterpolate(v_mul(R[k], vgscale), gammaTab, GAMMA_TAB_SIZE);
G[k] = splineInterpolate(v_mul(G[k], vgscale), gammaTab, GAMMA_TAB_SIZE);
B[k] = splineInterpolate(v_mul(B[k], vgscale), gammaTab, GAMMA_TAB_SIZE);
}
}
@ -2090,26 +2091,26 @@ struct RGB2Lab_f
v_float32 FX[nrepeats], FY[nrepeats], FZ[nrepeats];
for (int k = 0; k < nrepeats; k++)
{
X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, B[k]*vc2));
Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, B[k]*vc5));
Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, B[k]*vc8));
X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, v_mul(B[k], vc2)));
Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, v_mul(B[k], vc5)));
Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, v_mul(B[k], vc8)));
// use spline interpolation instead of direct calculation
v_float32 vTabScale = vx_setall_f32(LabCbrtTabScale);
FX[k] = splineInterpolate(X[k]*vTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
FY[k] = splineInterpolate(Y[k]*vTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
FZ[k] = splineInterpolate(Z[k]*vTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
FX[k] = splineInterpolate(v_mul(X[k], vTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE);
FY[k] = splineInterpolate(v_mul(Y[k], vTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE);
FZ[k] = splineInterpolate(v_mul(Z[k], vTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE);
}
v_float32 L[nrepeats], a[nrepeats], b[nrepeats];
for (int k = 0; k < nrepeats; k++)
{
// 7.787f = (29/3)^3/(29*4), 0.008856f = (6/29)^3, 903.3 = (29/3)^3
v_float32 mask = Y[k] > (vx_setall_f32(0.008856f));
v_float32 mask = v_gt(Y[k], (vx_setall_f32(0.008856f)));
v_float32 v116 = vx_setall_f32(116.f), vm16 = vx_setall_f32(-16.f);
L[k] = v_select(mask, v_fma(v116, FY[k], vm16), vx_setall_f32(903.3f)*Y[k]);
a[k] = vx_setall_f32(500.f) * (FX[k] - FY[k]);
b[k] = vx_setall_f32(200.f) * (FY[k] - FZ[k]);
L[k] = v_select(mask, v_fma(v116, FY[k], vm16), v_mul(vx_setall_f32(903.3f),Y[k]));
a[k] = v_mul(vx_setall_f32(500.F), v_sub(FX[k], FY[k]));
b[k] = v_mul(vx_setall_f32(200.F), v_sub(FY[k], FZ[k]));
v_store_interleave(dst + k*3*vsize, L[k], a[k], b[k]);
}
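The v_select above keeps, per lane, the two branches of CIE L*: for Y above (6/29)^3 it uses 116*f(Y) - 16, otherwise the linear (29/3)^3 * Y term. A scalar reference with the same constants, where fY stands for the spline-interpolated cube-root value (illustration only):

// 0.008856f = (6/29)^3, 903.3f = (29/3)^3
static float labLightness(float Y, float fY)
{
    return (Y > 0.008856f) ? 116.f*fY - 16.f : 903.3f*Y;
}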
@ -2204,7 +2205,7 @@ struct Lab2RGBfloat
float alpha = ColorChannel<float>::max();
#if CV_SIMD
const int vsize = v_float32::nlanes;
const int vsize = VTraits<v_float32>::vlanes();
const int nrepeats = 2;
v_float32 v16_116 = vx_setall_f32(16.0f / 116.0f);
for( ; i <= n-vsize*nrepeats;
@ -2221,14 +2222,14 @@ struct Lab2RGBfloat
v_float32 vlThresh = vx_setall_f32(lThresh);
for(int k = 0; k < nrepeats; k++)
{
limask[k] = li[k] <= vlThresh;
limask[k] = v_le(li[k], vlThresh);
}
v_float32 ylo[nrepeats], yhi[nrepeats], fylo[nrepeats], fyhi[nrepeats];
// 903.3 = (29/3)^3, 7.787 = (29/3)^3/(29*4)
v_float32 vinv903 = vx_setall_f32(1.f/903.3f);
for(int k = 0; k < nrepeats; k++)
{
ylo[k] = li[k] * vinv903;
ylo[k] = v_mul(li[k], vinv903);
}
v_float32 v7787 = vx_setall_f32(7.787f);
for(int k = 0; k < nrepeats; k++)
@ -2238,11 +2239,11 @@ struct Lab2RGBfloat
v_float32 v16 = vx_setall_f32(16.0f), vinv116 = vx_setall_f32(1.f/116.0f);
for(int k = 0; k < nrepeats; k++)
{
fyhi[k] = (li[k] + v16) * vinv116;
fyhi[k] = v_mul(v_add(li[k], v16), vinv116);
}
for(int k = 0; k < nrepeats; k++)
{
yhi[k] = fyhi[k] * fyhi[k] * fyhi[k];
yhi[k] = v_mul(fyhi[k], fyhi[k], fyhi[k]);
}
for(int k = 0; k < nrepeats; k++)
{
@ -2265,9 +2266,9 @@ struct Lab2RGBfloat
for (int j = 0; j < 2; j++)
{
v_float32 f = fxz[k*2+j];
v_float32 fmask = f <= vfTresh;
v_float32 flo = (f - v16_116) * vinv7787;
v_float32 fhi = f*f*f;
v_float32 fmask = v_le(f, vfTresh);
v_float32 flo = v_mul(v_sub(f, v16_116), vinv7787);
v_float32 fhi = v_mul(v_mul(f, f), f);
fxz[k*2+j] = v_select(fmask, flo, fhi);
}
}
@ -2281,9 +2282,9 @@ struct Lab2RGBfloat
v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8);
for(int k = 0; k < nrepeats; k++)
{
ro[k] = v_fma(vc0, x[k], v_fma(vc1, y[k], vc2 * z[k]));
go[k] = v_fma(vc3, x[k], v_fma(vc4, y[k], vc5 * z[k]));
bo[k] = v_fma(vc6, x[k], v_fma(vc7, y[k], vc8 * z[k]));
ro[k] = v_fma(vc0, x[k], v_fma(vc1, y[k], v_mul(vc2, z[k])));
go[k] = v_fma(vc3, x[k], v_fma(vc4, y[k], v_mul(vc5, z[k])));
bo[k] = v_fma(vc6, x[k], v_fma(vc7, y[k], v_mul(vc8, z[k])));
}
v_float32 one = vx_setall_f32(1.f), zero = vx_setzero_f32();
for(int k = 0; k < nrepeats; k++)
@ -2298,9 +2299,9 @@ struct Lab2RGBfloat
v_float32 vgscale = vx_setall_f32(gscale);
for(int k = 0; k < nrepeats; k++)
{
ro[k] *= vgscale;
go[k] *= vgscale;
bo[k] *= vgscale;
ro[k] = v_mul(ro[k], vgscale);
go[k] = v_mul(go[k], vgscale);
bo[k] = v_mul(bo[k], vgscale);
}
for(int k = 0; k < nrepeats; k++)
@ -2500,8 +2501,8 @@ struct Lab2RGBinteger
for(int k = 0; k < 4; k++)
{
yf[k] = v_lut((const int*)LabToYF_b, lq[k]);
y[k] = yf[k] & mask16;
ify[k] = v_reinterpret_as_s32(v_reinterpret_as_u32(yf[k]) >> 16);
y[k] = v_and(yf[k], mask16);
ify[k] = v_reinterpret_as_s32(v_shr(v_reinterpret_as_u32(yf[k]), 16));
}
v_int16 ify0, ify1;
@ -2516,18 +2517,18 @@ struct Lab2RGBinteger
v_uint16 mulA = vx_setall_u16(53687);
v_uint32 ma[4];
v_uint32 addA = vx_setall_u32(1 << 7);
v_mul_expand((a0 + (a0 << 2)), mulA, ma[0], ma[1]);
v_mul_expand((a1 + (a1 << 2)), mulA, ma[2], ma[3]);
adiv0 = v_reinterpret_as_s16(v_pack(((ma[0] + addA) >> 13), ((ma[1] + addA) >> 13)));
adiv1 = v_reinterpret_as_s16(v_pack(((ma[2] + addA) >> 13), ((ma[3] + addA) >> 13)));
v_mul_expand((v_add(a0, v_shl<2>(a0))), mulA, ma[0], ma[1]);
v_mul_expand((v_add(a1, v_shl<2>(a1))), mulA, ma[2], ma[3]);
adiv0 = v_reinterpret_as_s16(v_pack((v_shr<13>(v_add(ma[0], addA))), (v_shr<13>(v_add(ma[1], addA)))));
adiv1 = v_reinterpret_as_s16(v_pack((v_shr<13>(v_add(ma[2], addA))), (v_shr<13>(v_add(ma[3], addA)))));
v_uint16 mulB = vx_setall_u16(41943);
v_uint32 mb[4];
v_uint32 addB = vx_setall_u32(1 << 4);
v_mul_expand(b0, mulB, mb[0], mb[1]);
v_mul_expand(b1, mulB, mb[2], mb[3]);
bdiv0 = v_reinterpret_as_s16(v_pack((mb[0] + addB) >> 9, (mb[1] + addB) >> 9));
bdiv1 = v_reinterpret_as_s16(v_pack((mb[2] + addB) >> 9, (mb[3] + addB) >> 9));
bdiv0 = v_reinterpret_as_s16(v_pack(v_shr<9>(v_add(mb[0], addB)), v_shr<9>(v_add(mb[1], addB))));
bdiv1 = v_reinterpret_as_s16(v_pack(v_shr<9>(v_add(mb[2], addB)), v_shr<9>(v_add(mb[3], addB))));
// 0 <= adiv <= 8356, 0 <= bdiv <= 20890
/* x = ifxz[0]; y = y; z = ifxz[1]; */
@ -2570,7 +2571,7 @@ struct Lab2RGBinteger
{
bool srgb = issRGB;
ushort* tab = sRGBInvGammaTab_b;
const int vsize = v_uint8::nlanes;
const int vsize = VTraits<v_uint8>::vlanes();
v_uint8 valpha = vx_setall_u8(alpha);
v_int32 vc[9];
for(int k = 0; k < 9; k++)
@ -2592,9 +2593,9 @@ struct Lab2RGBinteger
v_int32 rq[4], gq[4], bq[4];
for(int k = 0; k < 4; k++)
{
rq[k] = (vc[0] * xq[k] + vc[1] * yq[k] + vc[2] * zq[k] + vdescale) >> shift;
gq[k] = (vc[3] * xq[k] + vc[4] * yq[k] + vc[5] * zq[k] + vdescale) >> shift;
bq[k] = (vc[6] * xq[k] + vc[7] * yq[k] + vc[8] * zq[k] + vdescale) >> shift;
rq[k] = v_shr<shift>(v_add(v_add(v_add(v_mul(vc[0], xq[k]), v_mul(vc[1], yq[k])), v_mul(vc[2], zq[k])), vdescale));
gq[k] = v_shr<shift>(v_add(v_add(v_add(v_mul(vc[3], xq[k]), v_mul(vc[4], yq[k])), v_mul(vc[5], zq[k])), vdescale));
bq[k] = v_shr<shift>(v_add(v_add(v_add(v_mul(vc[6], xq[k]), v_mul(vc[7], yq[k])), v_mul(vc[8], zq[k])), vdescale));
}
//limit indices in table and then substitute
@ -2611,7 +2612,7 @@ struct Lab2RGBinteger
if(srgb)
{
// [RRR... , GGG... , BBB...]
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vidx[vsize*3];
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vidx[VTraits<v_uint8>::max_nlanes*3];
for (int k = 0; k < 4; k++)
v_store_aligned(vidx + 0*vsize + k*vsize/4, rq[k]);
for (int k = 0; k < 4; k++)
@ -2631,9 +2632,9 @@ struct Lab2RGBinteger
// rgb = (rgb*255) >> inv_gamma_shift
for(int k = 0; k < 4; k++)
{
rq[k] = ((rq[k] << 8) - rq[k]) >> inv_gamma_shift;
gq[k] = ((gq[k] << 8) - gq[k]) >> inv_gamma_shift;
bq[k] = ((bq[k] << 8) - bq[k]) >> inv_gamma_shift;
rq[k] = v_shr((v_sub(v_shl(rq[k], 8), rq[k])), inv_gamma_shift);
gq[k] = v_shr((v_sub(v_shl(gq[k], 8), gq[k])), inv_gamma_shift);
bq[k] = v_shr((v_sub(v_shl(bq[k], 8), bq[k])), inv_gamma_shift);
}
rgb[0] = v_reinterpret_as_u16(v_pack(rq[0], rq[1]));
rgb[1] = v_reinterpret_as_u16(v_pack(rq[2], rq[3]));
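The (rq[k] << 8) - rq[k] form above relies on the identity x*255 == (x << 8) - x, so the inverse-gamma scaling stays shift-and-subtract only. Scalar form of the identity (illustration only):

#include <cstdint>

static inline int32_t mul255(int32_t x)
{
    return (x << 8) - x;   // equals x*255 when the shifted value fits in 32 bits
}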
@ -2730,13 +2731,13 @@ struct Lab2RGB_b
static const softfloat fl = softfloat(100)/f255;
#if CV_SIMD
const int fsize = v_float32::nlanes;
const int fsize = VTraits<v_float32>::vlanes();
v_float32 vl = vx_setall_f32((float)fl);
v_float32 va = vx_setall_f32(1.f);
v_float32 vb = vx_setall_f32(1.f);
v_float32 vaLow = vx_setall_f32(-128.f), vbLow = vx_setall_f32(-128.f);
//TODO: fix that when v_interleave is available
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[fsize*3], interTmpA[fsize*3];
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits<v_float32>::max_nlanes*3], interTmpA[VTraits<v_float32>::max_nlanes*3];
v_store_interleave(interTmpM, vl, va, vb);
v_store_interleave(interTmpA, vx_setzero_f32(), vaLow, vbLow);
v_float32 mluv[3], aluv[3];
@ -2754,7 +2755,7 @@ struct Lab2RGB_b
j = 0;
#if CV_SIMD
const int vsize = v_uint8::nlanes;
const int vsize = VTraits<v_uint8>::vlanes();
for( ; j <= (dn - vsize)*3; j += 3*vsize )
{
v_uint8 s0, s1, s2;
@ -2808,7 +2809,7 @@ struct Lab2RGB_b
v_int32 vi[4*3];
for(int k = 0; k < 4*3; k++)
{
vi[k] = v_round(vf[k]*v255);
vi[k] = v_round(v_mul(vf[k], v255));
}
v_uint8 rgb[3];
@ -2830,7 +2831,7 @@ struct Lab2RGB_b
for(int k = 0; k < 4; k++)
{
vf[k] = vx_load_aligned(buf + j + k*fsize);
vi[k] = v_round(vf[k]*v255);
vi[k] = v_round(v_mul(vf[k], v255));
}
v_store(dst, v_pack_u(v_pack(vi[0], vi[1]), v_pack(vi[2], vi[3])));
}
@ -2910,8 +2911,8 @@ struct RGB2Luvfloat
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
#if CV_SIMD
const int vsize = v_float32::nlanes;
const int nrepeats = vsize == 4 ? 2 : 1;
const int vsize = VTraits<v_float32>::vlanes();
const int nrepeats = VTraits<v_float32>::nlanes == 4 ? 2 : 1;
for( ; i <= n-vsize*nrepeats;
i+= vsize*nrepeats, src += scn*vsize*nrepeats, dst += 3*vsize*nrepeats)
{
@ -2944,9 +2945,9 @@ struct RGB2Luvfloat
v_float32 vgscale = vx_setall_f32(gscale);
for (int k = 0; k < nrepeats; k++)
{
R[k] *= vgscale;
G[k] *= vgscale;
B[k] *= vgscale;
R[k] = v_mul(R[k], vgscale);
G[k] = v_mul(G[k], vgscale);
B[k] = v_mul(B[k], vgscale);
}
for (int k = 0; k < nrepeats; k++)
@ -2963,27 +2964,27 @@ struct RGB2Luvfloat
v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8);
for (int k = 0; k < nrepeats; k++)
{
X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, B[k]*vc2));
Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, B[k]*vc5));
Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, B[k]*vc8));
X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, v_mul(B[k], vc2)));
Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, v_mul(B[k], vc5)));
Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, v_mul(B[k], vc8)));
}
v_float32 L[nrepeats], u[nrepeats], v[nrepeats];
v_float32 vmun = vx_setall_f32(-un), vmvn = vx_setall_f32(-vn);
for (int k = 0; k < nrepeats; k++)
{
L[k] = splineInterpolate(Y[k]*vx_setall_f32(LabCbrtTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE);
L[k] = splineInterpolate(v_mul(Y[k], vx_setall_f32(LabCbrtTabScale)), LabCbrtTab, LAB_CBRT_TAB_SIZE);
// L = 116.f*L - 16.f;
L[k] = v_fma(L[k], vx_setall_f32(116.f), vx_setall_f32(-16.f));
v_float32 d;
// d = (4*13) / max(X + 15 * Y + 3 * Z, FLT_EPSILON)
d = v_fma(Y[k], vx_setall_f32(15.f), v_fma(Z[k], vx_setall_f32(3.f), X[k]));
d = vx_setall_f32(4.f*13.f) / v_max(d, vx_setall_f32(FLT_EPSILON));
d = v_div(vx_setall_f32(4.F * 13.F), v_max(d, vx_setall_f32(FLT_EPSILON)));
// u = L*(X*d - un)
u[k] = L[k]*v_fma(X[k], d, vmun);
u[k] = v_mul(L[k], v_fma(X[k], d, vmun));
// v = L*((9*0.25f)*Y*d - vn);
v[k] = L[k]*v_fma(vx_setall_f32(9.f*0.25f)*Y[k], d, vmvn);
v[k] = v_mul(L[k], v_fma(v_mul(vx_setall_f32(9.F * 0.25F), Y[k]), d, vmvn));
}
for (int k = 0; k < nrepeats; k++)
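The rewritten u/v block above matches its own comments; in scalar form it reads as follows (illustration only; un and vn are the white-point chromaticity constants):

#include <algorithm>
#include <cfloat>

static void luvFromXYZ(float X, float Y, float Z, float L,
                       float un, float vn, float& u, float& v)
{
    float d = (4.f*13.f) / std::max(X + 15.f*Y + 3.f*Z, FLT_EPSILON);
    u = L * (X*d - un);
    v = L * ((9.f*0.25f)*Y*d - vn);
}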
@ -3099,8 +3100,8 @@ struct Luv2RGBfloat
float _un = un, _vn = vn;
#if CV_SIMD
const int vsize = v_float32::nlanes;
const int nrepeats = vsize == 4 ? 2 : 1;
const int vsize = VTraits<v_float32>::vlanes();
const int nrepeats = VTraits<v_float32>::nlanes == 4 ? 2 : 1;
for( ; i <= n - vsize*nrepeats;
i += vsize*nrepeats, src += vsize*3*nrepeats, dst += dcn*vsize*nrepeats)
{
@ -3120,13 +3121,13 @@ struct Luv2RGBfloat
v_float32 Ylo, Yhi;
// ((L + 16)/116)^3
Ylo = (L[k] + v16) * v116inv;
Ylo = Ylo*Ylo*Ylo;
Ylo = v_mul(v_add(L[k], v16), v116inv);
Ylo = v_mul(v_mul(Ylo, Ylo), Ylo);
// L*(3./29.)^3
Yhi = L[k] * v903inv;
Yhi = v_mul(L[k], v903inv);
// Y = (L <= 8) ? Y0 : Y1;
Y[k] = v_select(L[k] >= vx_setall_f32(8.f), Ylo, Yhi);
Y[k] = v_select(v_ge(L[k], vx_setall_f32(8.f)), Ylo, Yhi);
}
v_float32 v4inv = vx_setall_f32(0.25f), v3 = vx_setall_f32(3.f);
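The Ylo/Yhi selection above inverts the L* branch shown earlier: above L = 8 the cube of (L + 16)/116 is used, below it the linear term. A scalar reference (illustration only; 903.3 = (29/3)^3):

static float labLightnessToY(float L)
{
    float ylo = (L + 16.f) / 116.f;
    ylo = ylo*ylo*ylo;              // ((L + 16)/116)^3
    float yhi = L / 903.3f;         // L*(3/29)^3
    return (L >= 8.f) ? ylo : yhi;
}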
@ -3135,18 +3136,18 @@ struct Luv2RGBfloat
v_float32 up, vp;
// up = 3*(u + L*_un);
up = v3*(v_fma(L[k], vx_setall_f32(_un), u[k]));
up = v_mul(v3, v_fma(L[k], vx_setall_f32(_un), u[k]));
// vp = 0.25/(v + L*_vn);
vp = v4inv/(v_fma(L[k], vx_setall_f32(_vn), v[k]));
vp = v_div(v4inv, v_fma(L[k], vx_setall_f32(_vn), v[k]));
// vp = max(-0.25, min(0.25, vp));
vp = v_max(vx_setall_f32(-0.25f), v_min(v4inv, vp));
//X = 3*up*vp; // (*Y) is done later
X[k] = v3*up*vp;
X[k] = v_mul(v_mul(v3, up), vp);
//Z = ((12*13*L - up)*vp - 5); // (*Y) is done later
// xor flips the sign, works like unary minus
Z[k] = v_fma(v_fma(L[k], vx_setall_f32(12.f*13.f), (vx_setall_f32(-0.f) ^ up)), vp, vx_setall_f32(-5.f));
Z[k] = v_fma(v_fma(L[k], vx_setall_f32(12.f*13.f), (v_xor(vx_setall_f32(-0.F), up))), vp, vx_setall_f32(-5.f));
}
v_float32 R[nrepeats], G[nrepeats], B[nrepeats];
@ -3156,9 +3157,9 @@ struct Luv2RGBfloat
for(int k = 0; k < nrepeats; k++)
{
// R = (X*C0 + C1 + Z*C2)*Y; // here (*Y) is done
R[k] = v_fma(Z[k], vc2, v_fma(X[k], vc0, vc1))*Y[k];
G[k] = v_fma(Z[k], vc5, v_fma(X[k], vc3, vc4))*Y[k];
B[k] = v_fma(Z[k], vc8, v_fma(X[k], vc6, vc7))*Y[k];
R[k] = v_mul(v_fma(Z[k], vc2, v_fma(X[k], vc0, vc1)), Y[k]);
G[k] = v_mul(v_fma(Z[k], vc5, v_fma(X[k], vc3, vc4)), Y[k]);
B[k] = v_mul(v_fma(Z[k], vc8, v_fma(X[k], vc6, vc7)), Y[k]);
}
v_float32 vzero = vx_setzero_f32(), v1 = vx_setall_f32(1.f);
@ -3174,9 +3175,9 @@ struct Luv2RGBfloat
v_float32 vgscale = vx_setall_f32(gscale);
for(int k = 0; k < nrepeats; k++)
{
R[k] *= vgscale;
G[k] *= vgscale;
B[k] *= vgscale;
R[k] = v_mul(R[k], vgscale);
G[k] = v_mul(G[k], vgscale);
B[k] = v_mul(B[k], vgscale);
}
for(int k = 0; k < nrepeats; k++)
{
@ -3285,7 +3286,7 @@ struct RGB2Luvinterpolate
#if CV_SIMD
if(enablePackedRGB2Luv)
{
const int vsize = v_uint16::nlanes;
const int vsize = VTraits<v_uint16>::vlanes();
static const int nPixels = vsize*2;
for(; i < n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
{
@ -3315,9 +3316,9 @@ struct RGB2Luvinterpolate
v_expand(r, r0, r1);
v_expand(g, g0, g1);
v_expand(b, b0, b1);
r0 = r0 << (lab_base_shift - 8); r1 = r1 << (lab_base_shift - 8);
g0 = g0 << (lab_base_shift - 8); g1 = g1 << (lab_base_shift - 8);
b0 = b0 << (lab_base_shift - 8); b1 = b1 << (lab_base_shift - 8);
r0 = v_shl<lab_base_shift - 8>(r0); r1 = v_shl<lab_base_shift - 8>(r1);
g0 = v_shl<lab_base_shift - 8>(g0); g1 = v_shl<lab_base_shift - 8>(g1);
b0 = v_shl<lab_base_shift - 8>(b0); b1 = v_shl<lab_base_shift - 8>(b1);
/*
int L, u, v;
@ -3332,9 +3333,9 @@ struct RGB2Luvinterpolate
dst[i+1] = saturate_cast<uchar>(u/baseDiv);
dst[i+2] = saturate_cast<uchar>(v/baseDiv);
*/
l0 = l0 >> (lab_base_shift - 8); l1 = l1 >> (lab_base_shift - 8);
u0 = u0 >> (lab_base_shift - 8); u1 = u1 >> (lab_base_shift - 8);
v0 = v0 >> (lab_base_shift - 8); v1 = v1 >> (lab_base_shift - 8);
l0 = v_shr<lab_base_shift - 8>(l0); l1 = v_shr<lab_base_shift - 8>(l1);
u0 = v_shr<lab_base_shift - 8>(u0); u1 = v_shr<lab_base_shift - 8>(u1);
v0 = v_shr<lab_base_shift - 8>(v0); v1 = v_shr<lab_base_shift - 8>(v1);
v_uint8 l = v_pack(l0, l1);
v_uint8 u = v_pack(u0, u1);
v_uint8 v = v_pack(v0, v1);
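The shift rewrites above only change notation: 8-bit channels are promoted to the LAB_BASE fixed-point scale by shifting left by (lab_base_shift - 8), and the interpolated L/u/v are demoted the same way. Scalar form (illustration only; assumes lab_base_shift > 8 and that the promoted value fits in 16 bits):

#include <cstdint>

static inline uint16_t toLabBase(uint8_t c, int lab_base_shift)
{
    return (uint16_t)(c << (lab_base_shift - 8));
}

static inline uint8_t fromLabBase(uint16_t v, int lab_base_shift)
{
    return (uint8_t)(v >> (lab_base_shift - 8));
}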
@ -3405,12 +3406,12 @@ struct RGB2Luv_b
static const softfloat su = -uLow*f255/uRange;
static const softfloat sv = -vLow*f255/vRange;
#if CV_SIMD
const int fsize = v_float32::nlanes;
const int fsize = VTraits<v_float32>::vlanes();
v_float32 ml = vx_setall_f32((float)fL), al = vx_setzero_f32();
v_float32 mu = vx_setall_f32((float)fu), au = vx_setall_f32((float)su);
v_float32 mv = vx_setall_f32((float)fv), av = vx_setall_f32((float)sv);
//TODO: fix that when v_interleave is available
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[fsize*3], interTmpA[fsize*3];
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits<v_float32>::max_nlanes*3], interTmpA[VTraits<v_float32>::max_nlanes*3];
v_store_interleave(interTmpM, ml, mu, mv);
v_store_interleave(interTmpA, al, au, av);
v_float32 mluv[3], aluv[3];
@ -3452,7 +3453,7 @@ struct RGB2Luv_b
v_float32 f[3*4];
for(int k = 0; k < 3*4; k++)
{
f[k] = v_cvt_f32(q[k])*v255inv;
f[k] = v_mul(v_cvt_f32(q[k]), v255inv);
}
for(int k = 0; k < 4; k++)
@ -3478,8 +3479,8 @@ struct RGB2Luv_b
v_int32 q0, q1;
v_expand(v_reinterpret_as_s16(d), q0, q1);
v_store_aligned(buf + j + 0*fsize, v_cvt_f32(q0)*v255inv);
v_store_aligned(buf + j + 1*fsize, v_cvt_f32(q1)*v255inv);
v_store_aligned(buf + j + 0*fsize, v_mul(v_cvt_f32(q0), v255inv));
v_store_aligned(buf + j + 1*fsize, v_mul(v_cvt_f32(q1), v255inv));
}
for( ; j < dn*bufChannels; j++, src++ )
{
@ -3633,7 +3634,8 @@ struct Luv2RGBinteger
inline void processLuvToXYZ(const v_uint8& lv, const v_uint8& uv, const v_uint8& vv,
v_int32 (&x)[4], v_int32 (&y)[4], v_int32 (&z)[4]) const
{
const int vsize = v_uint8::nlanes;
const int vsize = VTraits<v_uint8>::vlanes();
const int vsize_max = VTraits<v_uint8>::max_nlanes;
v_uint16 lv0, lv1;
v_expand(lv, lv0, lv1);
@ -3646,7 +3648,7 @@ struct Luv2RGBinteger
v_int32 mask16 = vx_setall_s32(0xFFFF);
for(int k = 0; k < 4; k++)
{
y[k] = v_lut((const int*)LabToYF_b, v_reinterpret_as_s32(lq[k])) & mask16;
y[k] = v_and(v_lut((const int *)LabToYF_b, v_reinterpret_as_s32(lq[k])), mask16);
}
v_int32 up[4], vp[4];
@ -3657,10 +3659,10 @@ struct Luv2RGBinteger
v_expand(vv, vv0, vv1);
// LL*256
v_uint16 ll0, ll1;
ll0 = lv0 << 8; ll1 = lv1 << 8;
ll0 = v_shl<8>(lv0); ll1 = v_shl<8>(lv1);
v_uint16 upidx0, upidx1, vpidx0, vpidx1;
upidx0 = ll0 + uv0; upidx1 = ll1 + uv1;
vpidx0 = ll0 + vv0; vpidx1 = ll1 + vv1;
upidx0 = v_add(ll0, uv0); upidx1 = v_add(ll1, uv1);
vpidx0 = v_add(ll0, vv0); vpidx1 = v_add(ll1, vv1);
v_uint32 upidx[4], vpidx[4];
v_expand(upidx0, upidx[0], upidx[1]); v_expand(upidx1, upidx[2], upidx[3]);
v_expand(vpidx0, vpidx[0], vpidx[1]); v_expand(vpidx1, vpidx[2], vpidx[3]);
@ -3672,7 +3674,7 @@ struct Luv2RGBinteger
// long long int vpl = LUVLUT.LvToVpl_b[LL*256+v];
v_int64 vpl[8];
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vpidxstore[vsize];
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vpidxstore[vsize_max];
for(int k = 0; k < 4; k++)
{
v_store_aligned(vpidxstore + k*vsize/4, v_reinterpret_as_s32(vpidx[k]));
@ -3684,12 +3686,13 @@ struct Luv2RGBinteger
// not all 64-bit arithmetic is available in univ. intrinsics
// need to handle it with scalar code
int64_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vvpl[vsize];
int64_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vvpl[vsize_max];
for(int k = 0; k < 8; k++)
{
v_store_aligned(vvpl + k*vsize/8, vpl[k]);
}
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vup[vsize], vvp[vsize], vx[vsize], vy[vsize], vzm[vsize];
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vup[vsize_max], vvp[vsize_max],
vx[vsize_max], vy[vsize_max], vzm[vsize_max];
for(int k = 0; k < 4; k++)
{
v_store_aligned(vup + k*vsize/4, up[k]);
@ -3724,7 +3727,7 @@ struct Luv2RGBinteger
// z = zm/256 + zm/65536;
for (int k = 0; k < 4; k++)
{
z[k] = (zm[k] >> 8) + (zm[k] >> 16);
z[k] = v_add(v_shr<8>(zm[k]), v_shr<16>(zm[k]));
}
// (x, z) = clip((x, z), min=0, max=2*BASE)
@ -3751,7 +3754,7 @@ struct Luv2RGBinteger
{
ushort* tab = sRGBInvGammaTab_b;
bool srgb = issRGB;
static const int vsize = v_uint8::nlanes;
static const int vsize = VTraits<v_uint8>::vlanes();
const int descaleShift = 1 << (shift-1);
v_int16 vdescale = vx_setall_s16(descaleShift);
v_int16 vc[9];
@ -3771,12 +3774,12 @@ struct Luv2RGBinteger
// fixing 16bit signed multiplication
// by subtracting 2^(base_shift-1) and then adding result back
v_int32 dummy32, fm[3];
v_expand(vc[0]+vc[1]+vc[2], fm[0], dummy32);
v_expand(vc[3]+vc[4]+vc[5], fm[1], dummy32);
v_expand(vc[6]+vc[7]+vc[8], fm[2], dummy32);
fm[0] = fm[0] << (base_shift-1);
fm[1] = fm[1] << (base_shift-1);
fm[2] = fm[2] << (base_shift-1);
v_expand(v_add(vc[0],vc[1],vc[2]), fm[0], dummy32);
v_expand(v_add(vc[3],vc[4],vc[5]), fm[1], dummy32);
v_expand(v_add(vc[6],vc[7],vc[8]), fm[2], dummy32);
fm[0] = v_shl(fm[0], (base_shift-1));
fm[1] = v_shl(fm[1], (base_shift-1));
fm[2] = v_shl(fm[2], (base_shift-1));
for (; i <= n-vsize; i += vsize, src += 3*vsize, dst += dcn*vsize)
{
@ -3816,15 +3819,15 @@ struct Luv2RGBinteger
// a bit faster than one loop for all
for(int k = 0; k < 4; k++)
{
i_rgb[k+4*0] = (v_dotprod(xy[k], crxy) + v_dotprod(zd[k], crz1) + fm[0]) >> shift;
i_rgb[k+4*0] = v_shr<shift>(v_add(v_add(v_dotprod(xy[k], crxy), v_dotprod(zd[k], crz1)), fm[0]));
}
for(int k = 0; k < 4; k++)
{
i_rgb[k+4*1] = (v_dotprod(xy[k], cgxy) + v_dotprod(zd[k], cgz1) + fm[1]) >> shift;
i_rgb[k+4*1] = v_shr<shift>(v_add(v_add(v_dotprod(xy[k], cgxy), v_dotprod(zd[k], cgz1)), fm[1]));
}
for(int k = 0; k < 4; k++)
{
i_rgb[k+4*2] = (v_dotprod(xy[k], cbxy) + v_dotprod(zd[k], cbz1) + fm[2]) >> shift;
i_rgb[k+4*2] = v_shr<shift>(v_add(v_add(v_dotprod(xy[k], cbxy), v_dotprod(zd[k], cbz1)), fm[2]));
}
// [rrggbb]
@ -3842,7 +3845,7 @@ struct Luv2RGBinteger
if(srgb)
{
// [rr.., gg.., bb..]
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) rgbshifts[3*vsize];
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) rgbshifts[3*VTraits<v_uint8>::max_nlanes];
for(int k = 0; k < 12; k++)
{
v_store_aligned(rgbshifts + k*vsize/4, i_rgb[k]);
@ -3857,7 +3860,7 @@ struct Luv2RGBinteger
// rgb = (rgb*255) >> inv_gamma_shift
for(int k = 0; k < 12; k++)
{
i_rgb[k] = ((i_rgb[k] << 8) - i_rgb[k]) >> inv_gamma_shift;
i_rgb[k] = v_shr((v_sub((v_shl(i_rgb[k], 8)), i_rgb[k])), inv_gamma_shift);
}
for(int k = 0; k < 6; k++)
@ -3940,13 +3943,13 @@ struct Luv2RGB_b
static const softfloat fv = vRange/f255;
#if CV_SIMD
const int fsize = v_float32::nlanes;
const int fsize = VTraits<v_float32>::vlanes();
v_float32 vl = vx_setall_f32((float)fl);
v_float32 vu = vx_setall_f32((float)fu);
v_float32 vv = vx_setall_f32((float)fv);
v_float32 vuLow = vx_setall_f32((float)uLow), vvLow = vx_setall_f32((float)vLow);
//TODO: fix that when v_interleave is available
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[fsize*3], interTmpA[fsize*3];
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits<v_float32>::max_nlanes*3], interTmpA[VTraits<v_float32>::max_nlanes*3];
v_store_interleave(interTmpM, vl, vu, vv);
v_store_interleave(interTmpA, vx_setzero_f32(), vuLow, vvLow);
v_float32 mluv[3], aluv[3];
@ -3964,7 +3967,7 @@ struct Luv2RGB_b
j = 0;
#if CV_SIMD
const int vsize = v_uint8::nlanes;
const int vsize = VTraits<v_uint8>::vlanes();
for( ; j <= (dn - vsize)*3; j += 3*vsize )
{
v_uint8 s0, s1, s2;
@ -4017,7 +4020,7 @@ struct Luv2RGB_b
v_int32 vi[4*3];
for(int k = 0; k < 4*3; k++)
{
vi[k] = v_round(vf[k]*v255);
vi[k] = v_round(v_mul(vf[k], v255));
}
v_uint8 rgb[3];
@ -4039,7 +4042,7 @@ struct Luv2RGB_b
for(int k = 0; k < 4; k++)
{
vf[k] = vx_load_aligned(buf + j + k*fsize);
vi[k] = v_round(vf[k]*v255);
vi[k] = v_round(v_mul(vf[k], v255));
}
v_store(dst, v_pack_u(v_pack(vi[0], vi[1]), v_pack(vi[2], vi[3])));
}

View File

@ -882,7 +882,7 @@ struct RGBA2mRGBA<uchar>
int i = 0;
#if CV_SIMD
const int vsize = v_uint8::nlanes;
const int vsize = VTraits<v_uint8>::vlanes();
v_uint8 amask = v_reinterpret_as_u8(vx_setall_u32(0xFF000000));
v_uint16 vh = vx_setall_u16(half_val+1);
@ -901,27 +901,27 @@ struct RGBA2mRGBA<uchar>
v_uint16 a16[4];
for(int j = 0; j < 4; j++)
a16[j] = v_reinterpret_as_u16(v[j] & amask);
a16[j] = v_reinterpret_as_u16(v_and(v[j], amask));
v_uint32 a32[4];
for(int j = 0; j < 4; j++)
a32[j] = v_reinterpret_as_u32(a16[j] | (a16[j] >> 8));
a32[j] = v_reinterpret_as_u32(v_or(a16[j], (v_shr(a16[j], 8))));
v_uint8 a[4];
for(int j = 0; j < 4; j++)
a[j] = v_reinterpret_as_u8(a32[j] | (a32[j] >> 16));
a[j] = v_reinterpret_as_u8(v_or(a32[j], (v_shr(a32[j], 16))));
v_uint16 m[8];
for(int j = 0; j < 4; j++)
v_mul_expand(v[j], a[j], m[j], m[j+4]);
for(int j = 0; j < 8; j++)
m[j] += vh;
m[j] = v_add(m[j], vh);
// div 255: (v+1+(v>>8))>>8
// +1 is in vh, has no effect on (v>>8)
for(int j = 0; j < 8; j++)
m[j] = (m[j] + (m[j] >> 8)) >> 8;
m[j] = v_shr((v_add(m[j], (v_shr(m[j], 8)))), 8);
v_uint8 d[4];
for(int j = 0; j < 4; j++)
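The div-255 comment above uses the exact identity v/255 == (v + 1 + (v >> 8)) >> 8 for 16-bit products; the +1 is folded into vh together with the rounding half. A quick scalar check over the relevant range (illustration only):

#include <cstdint>
#include <cstdio>

int main()
{
    for (uint32_t v = 0; v <= 255u*255u + 255u; v++)
    {
        if (v/255u != ((v + 1u + (v >> 8)) >> 8))
        {
            std::printf("mismatch at %u\n", v);
            return 1;
        }
    }
    std::printf("identity holds on [0, 65280]\n");
    return 0;
}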

View File

@ -188,21 +188,21 @@ public:
v_uint16x8 r1 = v_reinterpret_as_u16(v_load(bayer+bayer_step));
v_uint16x8 r2 = v_reinterpret_as_u16(v_load(bayer+bayer_step*2));
v_uint16x8 b1 = ((r0 << 8) >> 7) + ((r2 << 8) >> 7);
v_uint16x8 b0 = v_rotate_right<1>(b1) + b1;
b1 = v_rotate_right<1>(b1) << 1;
v_uint16x8 b1 = v_add(v_shr<7>(v_shl<8>(r0)), v_shr<7>(v_shl<8>(r2)));
v_uint16x8 b0 = v_add(v_rotate_right<1>(b1), b1);
b1 = v_shl<1>(v_rotate_right<1>(b1));
v_uint16x8 g0 = (r0 >> 7) + (r2 >> 7);
v_uint16x8 g1 = (r1 << 8) >> 7;
g0 += v_rotate_right<1>(g1) + g1;
g1 = v_rotate_right<1>(g1) << 2;
v_uint16x8 g0 = v_add(v_shr<7>(r0), v_shr<7>(r2));
v_uint16x8 g1 = v_shr<7>(v_shl<8>(r1));
g0 = v_add(g0, v_add(v_rotate_right<1>(g1), g1));
g1 = v_shl<2>(v_rotate_right<1>(g1));
r0 = r1 >> 8;
r1 = (v_rotate_right<1>(r0) + r0) << 2;
r0 = r0 << 3;
r0 = v_shr<8>(r1);
r1 = v_shl<2>(v_add(v_rotate_right<1>(r0), r0));
r0 = v_shl<3>(r0);
g0 = (v_mul_hi(b0, _b2y) + v_mul_hi(g0, _g2y) + v_mul_hi(r0, _r2y)) >> 2;
g1 = (v_mul_hi(b1, _b2y) + v_mul_hi(g1, _g2y) + v_mul_hi(r1, _r2y)) >> 2;
g0 = v_shr<2>(v_add(v_add(v_mul_hi(b0, _b2y), v_mul_hi(g0, _g2y)), v_mul_hi(r0, _r2y)));
g1 = v_shr<2>(v_add(v_add(v_mul_hi(b1, _b2y), v_mul_hi(g1, _g2y)), v_mul_hi(r1, _r2y)));
v_uint8x16 pack_lo, pack_hi;
v_zip(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g0)),
v_pack_u(v_reinterpret_as_s16(g1), v_reinterpret_as_s16(g1)),
@ -269,31 +269,31 @@ public:
v_uint16x8 r1 = v_reinterpret_as_u16(v_load(bayer+bayer_step));
v_uint16x8 r2 = v_reinterpret_as_u16(v_load(bayer+bayer_step*2));
v_uint16x8 b1 = (r0 & masklo) + (r2 & masklo);
v_uint16x8 b1 = v_add(v_and(r0, masklo), v_and(r2, masklo));
v_uint16x8 nextb1 = v_rotate_right<1>(b1);
v_uint16x8 b0 = b1 + nextb1;
b1 = (nextb1 + delta1) >> 1;
b0 = (b0 + delta2) >> 2;
v_uint16x8 b0 = v_add(b1, nextb1);
b1 = v_shr<1>(v_add(nextb1, delta1));
b0 = v_shr<2>(v_add(b0, delta2));
// b0 b2 ... b14 b1 b3 ... b15
b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1)));
v_uint16x8 g0 = (r0 >> 8) + (r2 >> 8);
v_uint16x8 g1 = r1 & masklo;
g0 += v_rotate_right<1>(g1) + g1;
v_uint16x8 g0 = v_add(v_shr<8>(r0), v_shr<8>(r2));
v_uint16x8 g1 = v_and(r1, masklo);
g0 = v_add(g0, v_add(v_rotate_right<1>(g1), g1));
g1 = v_rotate_right<1>(g1);
g0 = (g0 + delta2) >> 2;
g0 = v_shr<2>(v_add(g0, delta2));
// g0 g2 ... g14 g1 g3 ... g15
g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g1)));
r0 = r1 >> 8;
r1 = v_rotate_right<1>(r0) + r0;
r1 = (r1 + delta1) >> 1;
r0 = v_shr<8>(r1);
r1 = v_add(v_rotate_right<1>(r0), r0);
r1 = v_shr<1>(v_add(r1, delta1));
// r0 r2 ... r14 r1 r3 ... r15
r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1)));
b1 = (b0 ^ r0) & mask;
b0 = b0 ^ b1;
r0 = r0 ^ b1;
b1 = v_and(v_xor(b0, r0), mask);
b0 = v_xor(b0, b1);
r0 = v_xor(r0, b1);
// b1 g1 b3 g3 b5 g5...
v_uint8x16 pack_lo, pack_hi;
@ -402,31 +402,31 @@ public:
v_uint16x8 r1 = v_reinterpret_as_u16(v_load(bayer+bayer_step));
v_uint16x8 r2 = v_reinterpret_as_u16(v_load(bayer+bayer_step*2));
v_uint16x8 b1 = (r0 & masklo) + (r2 & masklo);
v_uint16x8 b1 = v_add(v_and(r0, masklo), v_and(r2, masklo));
v_uint16x8 nextb1 = v_rotate_right<1>(b1);
v_uint16x8 b0 = b1 + nextb1;
b1 = (nextb1 + delta1) >> 1;
b0 = (b0 + delta2) >> 2;
v_uint16x8 b0 = v_add(b1, nextb1);
b1 = v_shr<1>(v_add(nextb1, delta1));
b0 = v_shr<2>(v_add(b0, delta2));
// b0 b2 ... b14 b1 b3 ... b15
b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1)));
v_uint16x8 g0 = (r0 >> 8) + (r2 >> 8);
v_uint16x8 g1 = r1 & masklo;
g0 += v_rotate_right<1>(g1) + g1;
v_uint16x8 g0 = v_add(v_shr<8>(r0), v_shr<8>(r2));
v_uint16x8 g1 = v_and(r1, masklo);
g0 = v_add(g0, v_add(v_rotate_right<1>(g1), g1));
g1 = v_rotate_right<1>(g1);
g0 = (g0 + delta2) >> 2;
g0 = v_shr<2>(v_add(g0, delta2));
// g0 g2 ... g14 g1 g3 ... g15
g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g1)));
r0 = r1 >> 8;
r1 = v_rotate_right<1>(r0) + r0;
r1 = (r1 + delta1) >> 1;
r0 = v_shr<8>(r1);
r1 = v_add(v_rotate_right<1>(r0), r0);
r1 = v_shr<1>(v_add(r1, delta1));
// r0 r2 ... r14 r1 r3 ... r15
r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1)));
b1 = (b0 ^ r0) & mask;
b0 = b0 ^ b1;
r0 = r0 ^ b1;
b1 = v_and(v_xor(b0, r0), mask);
b0 = v_xor(b0, b1);
r0 = v_xor(r0, b1);
// b1 g1 b3 g3 b5 g5...
v_uint8x16 pack_lo, pack_hi;
@ -498,40 +498,40 @@ public:
v_uint16x8 r1 = v_reinterpret_as_u16(v_load(bayer+bayer_step));
v_uint16x8 r2 = v_reinterpret_as_u16(v_load(bayer+bayer_step*2));
v_uint16x8 b1 = (r0 & masklow) + (r2 & masklow);
v_uint16x8 b1 = v_add(v_and(r0, masklow), v_and(r2, masklow));
v_uint16x8 nextb1 = v_rotate_right<1>(b1);
v_uint16x8 b0 = b1 + nextb1;
b1 = (nextb1 + delta1) >> 1;
b0 = (b0 + delta2) >> 2;
v_uint16x8 b0 = v_add(b1, nextb1);
b1 = v_shr<1>(v_add(nextb1, delta1));
b0 = v_shr<2>(v_add(b0, delta2));
// b0 b2 ... b14 b1 b3 ... b15
b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1)));
// vertical sum
v_uint16x8 r0g = r0 >> 8;
v_uint16x8 r2g = r2 >> 8;
v_uint16x8 sumv = ((r0g + r2g) + delta1) >> 1;
v_uint16x8 r0g = v_shr<8>(r0);
v_uint16x8 r2g = v_shr<8>(r2);
v_uint16x8 sumv = v_shr<1>(v_add(v_add(r0g, r2g), delta1));
// horizontal sum
v_uint16x8 g1 = r1 & masklow;
v_uint16x8 g1 = v_and(r1, masklow);
v_uint16x8 nextg1 = v_rotate_right<1>(g1);
v_uint16x8 sumg = (g1 + nextg1 + delta1) >> 1;
v_uint16x8 sumg = v_shr<1>(v_add(v_add(g1, nextg1), delta1));
// gradients
v_uint16x8 gradv = (r0g - r2g) + (r2g - r0g);
v_uint16x8 gradg = (nextg1 - g1) + (g1 - nextg1);
v_uint16x8 gmask = gradg > gradv;
v_uint16x8 g0 = (gmask & sumv) + (sumg & (gmask ^ full));
v_uint16x8 gradv = v_add(v_sub(r0g, r2g), v_sub(r2g, r0g));
v_uint16x8 gradg = v_add(v_sub(nextg1, g1), v_sub(g1, nextg1));
v_uint16x8 gmask = v_gt(gradg, gradv);
v_uint16x8 g0 = v_add(v_and(gmask, sumv), v_and(sumg, v_xor(gmask, full)));
// g0 g2 ... g14 g1 g3 ...
g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(nextg1)));
r0 = r1 >> 8;
r1 = v_rotate_right<1>(r0) + r0;
r1 = (r1 + delta1) >> 1;
r0 = v_shr<8>(r1);
r1 = v_add(v_rotate_right<1>(r0), r0);
r1 = v_shr<1>(v_add(r1, delta1));
// r0 r2 ... r14 r1 r3 ... r15
r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1)));
b1 = (b0 ^ r0) & mask;
b0 = b0 ^ b1;
r0 = r0 ^ b1;
b1 = v_and(v_xor(b0, r0), mask);
b0 = v_xor(b0, b1);
r0 = v_xor(r0, b1);
// b1 g1 b3 g3 b5 g5...
v_uint8x16 pack_lo, pack_hi;
@ -1060,19 +1060,19 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
v_uint16x8 b0, b1, b2, b3, b4, b5, b6;
b0 = (v_absdiff(s2, s8)<<1) + v_absdiff(s1, s7) + v_absdiff(s3, s9);
b1 = (v_absdiff(s4, s6)<<1) + v_absdiff(s1, s3) + v_absdiff(s7, s9);
b2 = v_absdiff(s3, s7)<<1;
b3 = v_absdiff(s1, s9)<<1;
b0 = v_add(v_add(v_shl<1>(v_absdiff(s2, s8)), v_absdiff(s1, s7)), v_absdiff(s3, s9));
b1 = v_add(v_add(v_shl<1>(v_absdiff(s4, s6)), v_absdiff(s1, s3)), v_absdiff(s7, s9));
b2 = v_shl<1>(v_absdiff(s3, s7));
b3 = v_shl<1>(v_absdiff(s1, s9));
v_store(brow, b0);
v_store(brow + N, b1);
v_store(brow + N2, b2);
v_store(brow + N3, b3);
b4 = b2 + v_absdiff(s2, s4) + v_absdiff(s6, s8);
b5 = b3 + v_absdiff(s2, s6) + v_absdiff(s4, s8);
b6 = (s2 + s4 + s6 + s8)>>1;
b4 = v_add(v_add(b2, v_absdiff(s2, s4)), v_absdiff(s6, s8));
b5 = v_add(v_add(b3, v_absdiff(s2, s6)), v_absdiff(s4, s8));
b6 = v_shr<1>(v_add(v_add(v_add(s2, s4), s6), s8));
v_store(brow + N4, b4);
v_store(brow + N5, b5);
@ -1279,7 +1279,7 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
v_uint16x8 one = v_setall_u16(1), z = v_setzero_u16();
v_float32x4 _0_5 = v_setall_f32(0.5f);
#define v_merge_u16(a, b) (((a) & v_reinterpret_as_u16(emask)) | ((b) & v_reinterpret_as_u16(omask))) //(aA_aA_aA_aA) * (bB_bB_bB_bB) => (bA_bA_bA_bA)
#define v_merge_u16(a, b) (v_or((v_and((a), v_reinterpret_as_u16(emask))), (v_and((b), v_reinterpret_as_u16(omask))))) //(aA_aA_aA_aA) * (bB_bB_bB_bB) => (bA_bA_bA_bA)
#define v_cvt_s16f32_lo(a) v_cvt_f32(v_expand_low(v_reinterpret_as_s16(a))) //(1,2,3,4,5,6,7,8) => (1f,2f,3f,4f)
#define v_cvt_s16f32_hi(a) v_cvt_f32(v_expand_high(v_reinterpret_as_s16(a))) //(1,2,3,4,5,6,7,8) => (5f,6f,7f,8f)
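v_merge_u16 above is a lane-wise select between its two arguments: each output lane comes from a where emask is all ones and from b where omask is all ones (the two masks are complementary, so the inputs are interleaved lane by lane). Scalar model (illustration only):

#include <cstdint>

static void mergeU16(const uint16_t a[8], const uint16_t b[8],
                     const uint16_t emask[8], const uint16_t omask[8],
                     uint16_t out[8])
{
    for (int i = 0; i < 8; i++)
        out[i] = (uint16_t)((a[i] & emask[i]) | (b[i] & omask[i]));
}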
@ -1287,16 +1287,16 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
for( ; i <= N - 10; i += 8, srow += 8, brow0 += 8, brow1 += 8, brow2 += 8 )
{
//int gradN = brow0[0] + brow1[0];
v_uint16x8 gradN = v_load(brow0) + v_load(brow1);
v_uint16x8 gradN = v_add(v_load(brow0), v_load(brow1));
//int gradS = brow1[0] + brow2[0];
v_uint16x8 gradS = v_load(brow1) + v_load(brow2);
v_uint16x8 gradS = v_add(v_load(brow1), v_load(brow2));
//int gradW = brow1[N-1] + brow1[N];
v_uint16x8 gradW = v_load(brow1+N-1) + v_load(brow1+N);
v_uint16x8 gradW = v_add(v_load(brow1 + N - 1), v_load(brow1 + N));
//int gradE = brow1[N+1] + brow1[N];
v_uint16x8 gradE = v_load(brow1+N+1) + v_load(brow1+N);
v_uint16x8 gradE = v_add(v_load(brow1 + N + 1), v_load(brow1 + N));
//int minGrad = std::min(std::min(std::min(gradN, gradS), gradW), gradE);
//int maxGrad = std::max(std::max(std::max(gradN, gradS), gradW), gradE);
@ -1307,14 +1307,14 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
//int gradNE = brow0[N4+1] + brow1[N4];
//int gradNE = brow0[N2] + brow0[N2+1] + brow1[N2] + brow1[N2+1];
grad0 = v_load(brow0+N4+1) + v_load(brow1+N4);
grad1 = v_load(brow0+N2) + v_load(brow0+N2+1) + v_load(brow1+N2) + v_load(brow1+N2+1);
grad0 = v_add(v_load(brow0 + N4 + 1), v_load(brow1 + N4));
grad1 = v_add(v_add(v_add(v_load(brow0 + N2), v_load(brow0 + N2 + 1)), v_load(brow1 + N2)), v_load(brow1 + N2 + 1));
v_uint16x8 gradNE = v_merge_u16(grad0, grad1);
//int gradSW = brow1[N4] + brow2[N4-1];
//int gradSW = brow1[N2] + brow1[N2-1] + brow2[N2] + brow2[N2-1];
grad0 = v_load(brow2+N4-1) + v_load(brow1+N4);
grad1 = v_load(brow2+N2) + v_load(brow2+N2-1) + v_load(brow1+N2) + v_load(brow1+N2-1);
grad0 = v_add(v_load(brow2 + N4 - 1), v_load(brow1 + N4));
grad1 = v_add(v_add(v_add(v_load(brow2 + N2), v_load(brow2 + N2 - 1)), v_load(brow1 + N2)), v_load(brow1 + N2 - 1));
v_uint16x8 gradSW = v_merge_u16(grad0, grad1);
minGrad = v_min(v_min(minGrad, gradNE), gradSW);
@ -1322,21 +1322,21 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
//int gradNW = brow0[N5-1] + brow1[N5];
//int gradNW = brow0[N3] + brow0[N3-1] + brow1[N3] + brow1[N3-1];
grad0 = v_load(brow0+N5-1) + v_load(brow1+N5);
grad1 = v_load(brow0+N3) + v_load(brow0+N3-1) + v_load(brow1+N3) + v_load(brow1+N3-1);
grad0 = v_add(v_load(brow0 + N5 - 1), v_load(brow1 + N5));
grad1 = v_add(v_add(v_add(v_load(brow0 + N3), v_load(brow0 + N3 - 1)), v_load(brow1 + N3)), v_load(brow1 + N3 - 1));
v_uint16x8 gradNW = v_merge_u16(grad0, grad1);
//int gradSE = brow1[N5] + brow2[N5+1];
//int gradSE = brow1[N3] + brow1[N3+1] + brow2[N3] + brow2[N3+1];
grad0 = v_load(brow2+N5+1) + v_load(brow1+N5);
grad1 = v_load(brow2+N3) + v_load(brow2+N3+1) + v_load(brow1+N3) + v_load(brow1+N3+1);
grad0 = v_add(v_load(brow2 + N5 + 1), v_load(brow1 + N5));
grad1 = v_add(v_add(v_add(v_load(brow2 + N3), v_load(brow2 + N3 + 1)), v_load(brow1 + N3)), v_load(brow1 + N3 + 1));
v_uint16x8 gradSE = v_merge_u16(grad0, grad1);
minGrad = v_min(v_min(minGrad, gradNW), gradSE);
maxGrad = v_max(v_max(maxGrad, gradNW), gradSE);
//int T = minGrad + maxGrad/2;
v_uint16x8 T = v_max((maxGrad >> 1), one) + minGrad;
v_uint16x8 T = v_add(v_max((v_shr<1>(maxGrad)), one), minGrad);
v_uint16x8 RGs = z, GRs = z, Bs = z, ng = z;
@ -1361,133 +1361,135 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
v_uint16x8 t0, t1, mask;
// gradN ***********************************************
mask = (T > gradN); // mask = T>gradN
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradN)
mask = (v_gt(T, gradN)); // mask = T>gradN
ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradN)
t0 = (x3 << 1); // srow[-bstep]*2
t1 = v_load_expand(srow - bstep*2) + x0; // srow[-bstep*2] + srow[0]
t0 = (v_shl<1>(x3)); // srow[-bstep]*2
t1 = v_add(v_load_expand(srow - bstep * 2), x0); // srow[-bstep*2] + srow[0]
// RGs += (srow[-bstep*2] + srow[0]) * (T>gradN)
RGs += (t1 & mask);
RGs = v_add(RGs, v_and(t1, mask));
// GRs += {srow[-bstep]*2; (srow[-bstep*2-1] + srow[-bstep*2+1])} * (T>gradN)
GRs += (v_merge_u16(t0, x2 + x4) & mask);
GRs = v_add(GRs, (v_and(v_merge_u16(t0, v_add(x2, x4)), mask)));
// Bs += {(srow[-bstep-1]+srow[-bstep+1]); srow[-bstep]*2 } * (T>gradN)
Bs += (v_merge_u16(x1 + x5, t0) & mask);
Bs = v_add(Bs, v_and(v_merge_u16(v_add(x1, x5), t0), mask));
// gradNE **********************************************
mask = (T > gradNE); // mask = T>gradNE
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradNE)
mask = (v_gt(T, gradNE)); // mask = T>gradNE
ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradNE)
t0 = (x5 << 1); // srow[-bstep+1]*2
t1 = v_load_expand(srow - bstep*2+2) + x0; // srow[-bstep*2+2] + srow[0]
t0 = (v_shl<1>(x5)); // srow[-bstep+1]*2
t1 = v_add(v_load_expand(srow - bstep * 2 + 2), x0); // srow[-bstep*2+2] + srow[0]
// RGs += {(srow[-bstep*2+2] + srow[0]); srow[-bstep+1]*2} * (T>gradNE)
RGs += (v_merge_u16(t1, t0) & mask);
RGs = v_add(RGs, v_and(v_merge_u16(t1, t0), mask));
// GRs += {brow0[N6+1]; (srow[-bstep*2+1] + srow[1])} * (T>gradNE)
GRs += (v_merge_u16(v_load(brow0+N6+1), x4 + x7) & mask);
GRs = v_add(GRs, v_and(v_merge_u16(v_load(brow0+N6+1), v_add(x4, x7)), mask));
// Bs += {srow[-bstep+1]*2; (srow[-bstep] + srow[-bstep+2])} * (T>gradNE)
Bs += (v_merge_u16(t0, x3 + x6) & mask);
Bs = v_add(Bs, v_and(v_merge_u16(t0, v_add(x3, x6)), mask));
// gradE ***********************************************
mask = (T > gradE); // mask = T>gradE
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradE)
mask = (v_gt(T, gradE)); // mask = T>gradE
ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradE)
t0 = (x7 << 1); // srow[1]*2
t1 = v_load_expand(srow +2) + x0; // srow[2] + srow[0]
t0 = (v_shl<1>(x7)); // srow[1]*2
t1 = v_add(v_load_expand(srow + 2), x0); // srow[2] + srow[0]
// RGs += (srow[2] + srow[0]) * (T>gradE)
RGs += (t1 & mask);
RGs = v_add(RGs, v_and(t1, mask));
// GRs += (srow[1]*2) * (T>gradE)
GRs += (t0 & mask);
GRs = v_add(GRs, v_and(t0, mask));
// Bs += {(srow[-bstep+1]+srow[bstep+1]); (srow[-bstep+2]+srow[bstep+2])} * (T>gradE)
Bs += (v_merge_u16(x5 + x9, x6 + x8) & mask);
Bs = v_add(Bs, v_and(v_merge_u16(v_add(x5, x9), v_add(x6, x8)), mask));
// gradSE **********************************************
mask = (T > gradSE); // mask = T>gradSE
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradSE)
mask = (v_gt(T, gradSE)); // mask = T>gradSE
ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradSE)
t0 = (x9 << 1); // srow[bstep+1]*2
t1 = v_load_expand(srow + bstep*2+2) + x0; // srow[bstep*2+2] + srow[0]
t0 = (v_shl<1>(x9)); // srow[bstep+1]*2
t1 = v_add(v_load_expand(srow + bstep * 2 + 2), x0); // srow[bstep*2+2] + srow[0]
// RGs += {(srow[bstep*2+2] + srow[0]); srow[bstep+1]*2} * (T>gradSE)
RGs += (v_merge_u16(t1, t0) & mask);
RGs = v_add(RGs, v_and(v_merge_u16(t1, t0), mask));
// GRs += {brow2[N6+1]; (srow[1]+srow[bstep*2+1])} * (T>gradSE)
GRs += (v_merge_u16(v_load(brow2+N6+1), x7 + x10) & mask);
GRs = v_add(GRs, v_and(v_merge_u16(v_load(brow2+N6+1), v_add(x7, x10)), mask));
// Bs += {srow[bstep+1]*2; (srow[bstep+2]+srow[bstep])} * (T>gradSE)
Bs += (v_merge_u16((x9 << 1), x8 + x11) & mask);
Bs = v_add(Bs, v_and(v_merge_u16((v_shl<1>(x9)), v_add(x8, x11)), mask));
// gradS ***********************************************
mask = (T > gradS); // mask = T>gradS
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradS)
mask = (v_gt(T, gradS)); // mask = T>gradS
ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradS)
t0 = (x11 << 1); // srow[bstep]*2
t1 = v_load_expand(srow + bstep*2) + x0; // srow[bstep*2]+srow[0]
t0 = (v_shl<1>(x11)); // srow[bstep]*2
t1 = v_add(v_load_expand(srow + bstep * 2), x0); // srow[bstep*2]+srow[0]
// RGs += (srow[bstep*2]+srow[0]) * (T>gradS)
RGs += (t1 & mask);
RGs = v_add(RGs, v_and(t1, mask));
// GRs += {srow[bstep]*2; (srow[bstep*2+1]+srow[bstep*2-1])} * (T>gradS)
GRs += (v_merge_u16(t0, x10 + x12) & mask);
GRs = v_add(GRs, v_and(v_merge_u16(t0, v_add(x10, x12)), mask));
// Bs += {(srow[bstep+1]+srow[bstep-1]); srow[bstep]*2} * (T>gradS)
Bs += (v_merge_u16(x9 + x13, t0) & mask);
Bs = v_add(Bs, v_and(v_merge_u16(v_add(x9, x13), t0), mask));
// gradSW **********************************************
mask = (T > gradSW); // mask = T>gradSW
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradSW)
mask = (v_gt(T, gradSW)); // mask = T>gradSW
ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradSW)
t0 = (x13 << 1); // srow[bstep-1]*2
t1 = v_load_expand(srow + bstep*2-2) + x0; // srow[bstep*2-2]+srow[0]
t0 = (v_shl<1>(x13)); // srow[bstep-1]*2
t1 = v_add(v_load_expand(srow + bstep * 2 - 2), x0); // srow[bstep*2-2]+srow[0]
// RGs += {(srow[bstep*2-2]+srow[0]); srow[bstep-1]*2} * (T>gradSW)
RGs += (v_merge_u16(t1, t0) & mask);
RGs = v_add(RGs, v_and(v_merge_u16(t1, t0), mask));
// GRs += {brow2[N6-1]; (srow[bstep*2-1]+srow[-1])} * (T>gradSW)
GRs += (v_merge_u16(v_load(brow2+N6-1), x12 + x15) & mask);
GRs = v_add(GRs, v_and(v_merge_u16(v_load(brow2+N6-1), v_add(x12, x15)), mask));
// Bs += {srow[bstep-1]*2; (srow[bstep]+srow[bstep-2])} * (T>gradSW)
Bs += (v_merge_u16(t0, x11 + x14) & mask);
Bs = v_add(Bs, v_and(v_merge_u16(t0, v_add(x11, x14)), mask));
// gradW ***********************************************
mask = (T > gradW); // mask = T>gradW
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradW)
mask = (v_gt(T, gradW)); // mask = T>gradW
ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradW)
t0 = (x15 << 1); // srow[-1]*2
t1 = v_load_expand(srow -2) + x0; // srow[-2]+srow[0]
t0 = (v_shl<1>(x15)); // srow[-1]*2
t1 = v_add(v_load_expand(srow - 2), x0); // srow[-2]+srow[0]
// RGs += (srow[-2]+srow[0]) * (T>gradW)
RGs += (t1 & mask);
RGs = v_add(RGs, v_and(t1, mask));
// GRs += (srow[-1]*2) * (T>gradW)
GRs += (t0 & mask);
GRs = v_add(GRs, v_and(t0, mask));
// Bs += {(srow[-bstep-1]+srow[bstep-1]); (srow[bstep-2]+srow[-bstep-2])} * (T>gradW)
Bs += (v_merge_u16(x1 + x13, x14 + x16) & mask);
Bs = v_add(Bs, v_and(v_merge_u16(v_add(x1, x13), v_add(x14, x16)), mask));
// gradNW **********************************************
mask = (T > gradNW); // mask = T>gradNW
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradNW)
mask = (v_gt(T, gradNW)); // mask = T>gradNW
ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradNW)
t0 = (x1 << 1); // srow[-bstep-1]*2
t1 = v_load_expand(srow -bstep*2-2) + x0; // srow[-bstep*2-2]+srow[0]
t0 = (v_shl<1>(x1)); // srow[-bstep-1]*2
t1 = v_add(v_load_expand(srow - bstep * 2 - 2), x0); // srow[-bstep*2-2]+srow[0]
// RGs += {(srow[-bstep*2-2]+srow[0]); srow[-bstep-1]*2} * (T>gradNW)
RGs += (v_merge_u16(t1, t0) & mask);
RGs = v_add(RGs, v_and(v_merge_u16(t1, t0), mask));
// GRs += {brow0[N6-1]; (srow[-bstep*2-1]+srow[-1])} * (T>gradNW)
GRs += (v_merge_u16(v_load(brow0+N6-1), x2 + x15) & mask);
GRs = v_add(GRs, v_and(v_merge_u16(v_load(brow0+N6-1), v_add(x2, x15)), mask));
// Bs += {srow[-bstep-1]*2; (srow[-bstep]+srow[-bstep-2])} * (T>gradNW)
Bs += (v_merge_u16((x1 << 1), x3 + x16) & mask);
Bs = v_add(Bs, v_and(v_merge_u16(v_shl<1>(x1), v_add(x3, x16)), mask));
v_float32x4 ngf0 = _0_5 / v_cvt_s16f32_lo(ng);
v_float32x4 ngf1 = _0_5 / v_cvt_s16f32_hi(ng);
v_float32x4 ngf0 = v_div(_0_5, v_cvt_s16f32_lo(ng));
v_float32x4 ngf1 = v_div(_0_5, v_cvt_s16f32_hi(ng));
// now interpolate r, g & b
t0 = v_reinterpret_as_u16(v_reinterpret_as_s16(GRs) - v_reinterpret_as_s16(RGs));
t1 = v_reinterpret_as_u16(v_reinterpret_as_s16(Bs) - v_reinterpret_as_s16(RGs));
t0 = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(GRs), v_reinterpret_as_s16(RGs)));
t1 = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(Bs), v_reinterpret_as_s16(RGs)));
t0 = v_reinterpret_as_u16(v_reinterpret_as_s16(x0) +
t0 = v_reinterpret_as_u16(
v_add(v_reinterpret_as_s16(x0),
v_pack(
v_round(v_cvt_s16f32_lo(t0) * ngf0),
v_round(v_cvt_s16f32_hi(t0) * ngf1)));
v_round(v_mul(v_cvt_s16f32_lo(t0), ngf0)),
v_round(v_mul(v_cvt_s16f32_hi(t0), ngf1)))));
t1 = v_reinterpret_as_u16(v_reinterpret_as_s16(x0) +
t1 = v_reinterpret_as_u16(
v_add(v_reinterpret_as_s16(x0),
v_pack(
v_round(v_cvt_s16f32_lo(t1) * ngf0),
v_round(v_cvt_s16f32_hi(t1) * ngf1)));
v_round(v_mul(v_cvt_s16f32_lo(t1), ngf0)),
v_round(v_mul(v_cvt_s16f32_hi(t1), ngf1)))));
x1 = v_merge_u16(x0, t0);
x2 = v_merge_u16(t0, x0);
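Throughout the block above, a true lane of v_gt is all ones, i.e. -1 when reinterpreted as int16; subtracting the mask therefore counts matching gradient directions in ng, and v_and with the mask accumulates a term only where the test passed. Scalar model of one such step (illustration only):

#include <cstdint>
#include <cstdio>

int main()
{
    uint16_t T = 100, grad = 80, term = 37;
    uint16_t mask = (T > grad) ? 0xFFFFu : 0u;     // one lane of v_gt
    int16_t  ng   = 0;
    ng  = (int16_t)(ng - (int16_t)mask);           // ng += (T > grad)
    uint16_t acc = 0;
    acc = (uint16_t)(acc + (term & mask));         // acc += term * (T > grad)
    std::printf("%d %d\n", (int)ng, (int)acc);     // prints "1 37"
    return 0;
}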

View File

@ -1084,9 +1084,9 @@ struct SymmColumnVec_32s8u
i += VTraits<v_uint16>::vlanes();
}
#if CV_SIMD_WIDTH > 16
while( i <= width - 4 /*v_int32x4::nlanes*/ )
while( i <= width - 4 /*VTraits<v_int32x4>::vlanes()*/ )
#else
if( i <= width - v_int32::nlanes )
if( i <= width - VTraits<v_int32>::vlanes() )
#endif
{
v_float32 s0 = v_muladd(v_cvt_f32(vx_load(src[0] + i)), vx_setall_f32(ky[0]), vx_setall_f32(delta));
@ -1140,9 +1140,9 @@ struct SymmColumnVec_32s8u
i += VTraits<v_uint16>::vlanes();
}
#if CV_SIMD_WIDTH > 16
while( i <= width - 4 /*v_int32x4::nlanes*/ )
while( i <= width - 4 /*VTraits<v_int32x4>::vlanes()*/ )
#else
if( i <= width - v_int32::nlanes )
if( i <= width - VTraits<v_int32>::vlanes() )
#endif
{
v_float32 s0 = v_muladd(v_cvt_f32(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i))), vx_setall_f32(ky[1]), vx_setall_f32(delta));
@ -1321,23 +1321,23 @@ struct SymmColumnSmallVec_32s16s
{
v_int32 k0 = vx_setall_s32((int)ky[0]), k1 = vx_setall_s32((int)ky[1]);
v_int32 d4 = vx_setall_s32(d);
for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
for( ; i <= width - 2*VTraits<v_int16>::vlanes(); i += 2*VTraits<v_int16>::vlanes() )
{
v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)),
v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4))));
v_store(dst + i + v_int16::nlanes, v_pack(v_muladd(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 2*v_int32::nlanes), k0, d4)),
v_muladd(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 3*v_int32::nlanes), k0, d4))));
v_store(dst + i, v_pack(v_muladd(v_add(vx_load(S0 + i), vx_load(S2 + i)), k1, v_muladd(vx_load(S1 + i), k0, d4)),
v_muladd(v_add(vx_load(S0 + i + VTraits<v_int32>::vlanes()), vx_load(S2 + i + VTraits<v_int32>::vlanes())), k1, v_muladd(vx_load(S1 + i + VTraits<v_int32>::vlanes()), k0, d4))));
v_store(dst + i + VTraits<v_int16>::vlanes(), v_pack(v_muladd(v_add(vx_load(S0 + i + 2 * VTraits<v_int32>::vlanes()), vx_load(S2 + i + 2 * VTraits<v_int32>::vlanes())), k1, v_muladd(vx_load(S1 + i + 2*VTraits<v_int32>::vlanes()), k0, d4)),
v_muladd(v_add(vx_load(S0 + i + 3 * VTraits<v_int32>::vlanes()), vx_load(S2 + i + 3 * VTraits<v_int32>::vlanes())), k1, v_muladd(vx_load(S1 + i + 3*VTraits<v_int32>::vlanes()), k0, d4))));
}
if( i <= width - v_int16::nlanes )
if( i <= width - VTraits<v_int16>::vlanes() )
{
v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)),
v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4))));
i += v_int16::nlanes;
v_store(dst + i, v_pack(v_muladd(v_add(vx_load(S0 + i), vx_load(S2 + i)), k1, v_muladd(vx_load(S1 + i), k0, d4)),
v_muladd(v_add(vx_load(S0 + i + VTraits<v_int32>::vlanes()), vx_load(S2 + i + VTraits<v_int32>::vlanes())), k1, v_muladd(vx_load(S1 + i + VTraits<v_int32>::vlanes()), k0, d4))));
i += VTraits<v_int16>::vlanes();
}
if( i <= width - v_int32::nlanes )
if( i <= width - VTraits<v_int32>::vlanes() )
{
v_pack_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)));
i += v_int32::nlanes;
v_pack_store(dst + i, v_muladd(v_add(vx_load(S0 + i), vx_load(S2 + i)), k1, v_muladd(vx_load(S1 + i), k0, d4)));
i += VTraits<v_int32>::vlanes();
}
}
#endif
@ -2237,9 +2237,9 @@ struct FilterVec_8u
i += VTraits<v_uint16>::vlanes();
}
#if CV_SIMD_WIDTH > 16
while( i <= width - 4 /*v_int32x4::nlanes*/ )
while( i <= width - 4 /*VTraits<v_int32x4>::vlanes()*/ )
#else
if( i <= width - v_int32::nlanes )
if( i <= width - VTraits<v_int32>::vlanes() )
#endif
{
v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[0] + i))), vx_setall_f32(kf[0]), vx_setall_f32(delta));
@ -2248,7 +2248,7 @@ struct FilterVec_8u
v_int32 s32 = v_round(s0);
v_int16 s16 = v_pack(s32, s32);
*(unaligned_int*)(dst + i) = v_get0(v_reinterpret_as_s32(v_pack_u(s16, s16)));
i += 4 /*v_int32x4::nlanes*/ ;
i += 4 /*VTraits<v_int32x4>::vlanes()*/ ;
}
return i;
}
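These filter hunks also swap the removed .get0() member for the free function v_get0() when storing only the low 32 bits of a packed result. A small sketch of that partial store, assuming CV_SIMD; store_low4 is a made-up helper name.

#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

static void store_low4(uchar* dst, const v_int16x8& s16)
{
    typedef int CV_DECL_ALIGNED(1) unaligned_int;
    // pack to 8-bit, then store only the 4 bytes held in the lowest 32 bits;
    // v_get0() provides that scalar instead of the removed .get0() member
    *(unaligned_int*)dst = v_get0(v_reinterpret_as_s32(v_pack_u(s16, s16)));
}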

@ -2093,7 +2093,7 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
v_float32 v_s11 = vx_setzero_f32();
v_float32 v_s12 = vx_setzero_f32();
v_float32 v_s22 = vx_setzero_f32();
for (; j <= len - v_float32::nlanes; j += v_float32::nlanes)
for (; j <= len - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
{
v_float32 v_a = vx_load(h1 + j);
v_float32 v_b = vx_load(h2 + j);
@ -2134,10 +2134,10 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
result += v_reduce_sum(v_result);
#elif CV_SIMD
v_float32 v_result = vx_setzero_f32();
for (; j <= len - v_float32::nlanes; j += v_float32::nlanes)
for (; j <= len - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
{
v_float32 v_src = v_min(vx_load(h1 + j), vx_load(h2 + j));
v_result += v_src;
v_result = v_add(v_result, v_src);
}
result += v_reduce_sum(v_result);
#endif
@ -2174,7 +2174,7 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
v_float32 v_s1 = vx_setzero_f32();
v_float32 v_s2 = vx_setzero_f32();
v_float32 v_result = vx_setzero_f32();
for (; j <= len - v_float32::nlanes; j += v_float32::nlanes)
for (; j <= len - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
{
v_float32 v_a = vx_load(h1 + j);
v_float32 v_b = vx_load(h2 + j);
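The compareHist loops keep their structure; only the accumulation is now spelled with v_add (and v_mul where products are needed) and the lane count comes from VTraits<>::vlanes(). A sketch of such a reduction, assuming CV_SIMD; dot_product is an illustrative helper, not OpenCV API.

#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

static float dot_product(const float* a, const float* b, int len)
{
    v_float32 v_acc = vx_setzero_f32();
    int j = 0;
    for (; j <= len - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
        v_acc = v_add(v_acc, v_mul(vx_load(a + j), vx_load(b + j)));  // was: v_acc += va * vb
    float result = v_reduce_sum(v_acc);
    for (; j < len; ++j)                  // scalar tail
        result += a[j] * b[j];
    return result;
}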

@ -455,7 +455,7 @@ struct RemapVec_8u
v_int32x4 delta = v_setall_s32(INTER_REMAP_COEF_SCALE / 2);
v_int16x8 xy2ofs = v_reinterpret_as_s16(v_setall_s32(cn + (sstep << 16)));
int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4];
const uchar* src_limit_8bytes = _src.datalimit - v_int16x8::nlanes;
const uchar* src_limit_8bytes = _src.datalimit - VTraits<v_int16x8>::vlanes();
#define CV_PICK_AND_PACK_RGB(ptr, offset, result) \
{ \
const uchar* const p = ((const uchar*)ptr) + (offset); \
@ -483,7 +483,7 @@ struct RemapVec_8u
v_uint8x16 rrggbbaa, dummy; \
v_uint16x8 rrggbbaa8, dummy8; \
v_uint8x16 rgba0 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p), 0, 0, 0)); \
v_uint8x16 rgba1 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p + v_int32x4::nlanes), 0, 0, 0)); \
v_uint8x16 rgba1 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p + VTraits<v_int32x4>::vlanes()), 0, 0, 0)); \
v_zip(rgba0, rgba1, rrggbbaa, dummy); \
v_expand(rrggbbaa, rrggbbaa8, dummy8); \
result = v_reinterpret_as_s16(rrggbbaa8); \
@ -534,8 +534,8 @@ struct RemapVec_8u
v3 = v_dotprod(v_reinterpret_as_s16(v3), v_reinterpret_as_s16(d2), delta);
v2 = v_dotprod(v_reinterpret_as_s16(v2), v_reinterpret_as_s16(c2), v3);
v0 = v0 >> INTER_REMAP_COEF_BITS;
v2 = v2 >> INTER_REMAP_COEF_BITS;
v0 = v_shr<INTER_REMAP_COEF_BITS>(v0);
v2 = v_shr<INTER_REMAP_COEF_BITS>(v2);
v_pack_u_store(D + x, v_pack(v0, v2));
}
}
@ -563,8 +563,8 @@ struct RemapVec_8u
CV_PICK_AND_PACK_RGB(S0, iofs0[1], u1);
CV_PICK_AND_PACK_RGB(S1, iofs0[1], v1);
v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
v_int32x4 result0 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u0, w00, v_dotprod(v0, w01, delta)));
v_int32x4 result1 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u1, w10, v_dotprod(v1, w11, delta)));
result0 = v_rotate_left<1>(result0);
v_int16x8 result8 = v_pack(result0, result1);
@ -581,8 +581,8 @@ struct RemapVec_8u
CV_PICK_AND_PACK_RGB(S0, iofs0[3], u1);
CV_PICK_AND_PACK_RGB(S1, iofs0[3], v1);
result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
result0 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u0, w00, v_dotprod(v0, w01, delta)));
result1 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u1, w10, v_dotprod(v1, w11, delta)));
result0 = v_rotate_left<1>(result0);
result8 = v_pack(result0, result1);
@ -613,8 +613,8 @@ struct RemapVec_8u
CV_PICK_AND_PACK_RGBA(S0, iofs0[1], u1);
CV_PICK_AND_PACK_RGBA(S1, iofs0[1], v1);
v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
v_int32x4 result0 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u0, w00, v_dotprod(v0, w01, delta)));
v_int32x4 result1 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u1, w10, v_dotprod(v1, w11, delta)));
v_int16x8 result8 = v_pack(result0, result1);
v_pack_u_store(D, result8);
@ -627,8 +627,8 @@ struct RemapVec_8u
CV_PICK_AND_PACK_RGBA(S0, iofs0[3], u1);
CV_PICK_AND_PACK_RGBA(S1, iofs0[3], v1);
result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
result0 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u0, w00, v_dotprod(v0, w01, delta)));
result1 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u1, w10, v_dotprod(v1, w11, delta)));
result8 = v_pack(result0, result1);
v_pack_u_store(D + 8, result8);
}
@ -1164,7 +1164,7 @@ public:
#if CV_SIMD128
{
int span = v_float32x4::nlanes;
int span = VTraits<v_float32x4>::vlanes();
for( ; x1 <= bcols - span * 2; x1 += span * 2 )
{
v_int32x4 ix0 = v_round(v_load(sX + x1));
@ -1206,9 +1206,9 @@ public:
#if CV_SIMD128
{
v_uint16x8 v_scale = v_setall_u16(INTER_TAB_SIZE2 - 1);
int span = v_uint16x8::nlanes;
int span = VTraits<v_uint16x8>::vlanes();
for( ; x1 <= bcols - span; x1 += span )
v_store((unsigned short*)(A + x1), v_load(sA + x1) & v_scale);
v_store((unsigned short*)(A + x1), v_and(v_load(sA + x1), v_scale));
}
#endif
for( ; x1 < bcols; x1++ )
@ -1224,16 +1224,16 @@ public:
{
v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1);
int span = v_float32x4::nlanes;
int span = VTraits<v_float32x4>::vlanes();
for( ; x1 <= bcols - span * 2; x1 += span * 2 )
{
v_int32x4 v_sx0 = v_round(v_scale * v_load(sX + x1));
v_int32x4 v_sy0 = v_round(v_scale * v_load(sY + x1));
v_int32x4 v_sx1 = v_round(v_scale * v_load(sX + x1 + span));
v_int32x4 v_sy1 = v_round(v_scale * v_load(sY + x1 + span));
v_uint16x8 v_sx8 = v_reinterpret_as_u16(v_pack(v_sx0 & v_scale2, v_sx1 & v_scale2));
v_uint16x8 v_sy8 = v_reinterpret_as_u16(v_pack(v_sy0 & v_scale2, v_sy1 & v_scale2));
v_uint16x8 v_v = v_shl<INTER_BITS>(v_sy8) | (v_sx8);
v_int32x4 v_sx0 = v_round(v_mul(v_scale, v_load(sX + x1)));
v_int32x4 v_sy0 = v_round(v_mul(v_scale, v_load(sY + x1)));
v_int32x4 v_sx1 = v_round(v_mul(v_scale, v_load(sX + x1 + span)));
v_int32x4 v_sy1 = v_round(v_mul(v_scale, v_load(sY + x1 + span)));
v_uint16x8 v_sx8 = v_reinterpret_as_u16(v_pack(v_and(v_sx0, v_scale2), v_and(v_sx1, v_scale2)));
v_uint16x8 v_sy8 = v_reinterpret_as_u16(v_pack(v_and(v_sy0, v_scale2), v_and(v_sy1, v_scale2)));
v_uint16x8 v_v = v_or(v_shl<INTER_BITS>(v_sy8), v_sx8);
v_store(A + x1, v_v);
v_int16x8 v_d0 = v_pack(v_shr<INTER_BITS>(v_sx0), v_shr<INTER_BITS>(v_sx1));
@ -1261,18 +1261,18 @@ public:
{
v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1), v_scale3 = v_setall_s32(INTER_TAB_SIZE);
int span = v_float32x4::nlanes;
int span = VTraits<v_float32x4>::vlanes();
for( ; x1 <= bcols - span * 2; x1 += span * 2 )
{
v_float32x4 v_fx, v_fy;
v_load_deinterleave(sXY + (x1 << 1), v_fx, v_fy);
v_int32x4 v_sx0 = v_round(v_fx * v_scale);
v_int32x4 v_sy0 = v_round(v_fy * v_scale);
v_int32x4 v_sx0 = v_round(v_mul(v_fx, v_scale));
v_int32x4 v_sy0 = v_round(v_mul(v_fy, v_scale));
v_load_deinterleave(sXY + ((x1 + span) << 1), v_fx, v_fy);
v_int32x4 v_sx1 = v_round(v_fx * v_scale);
v_int32x4 v_sy1 = v_round(v_fy * v_scale);
v_int32x4 v_v0 = v_muladd(v_scale3, (v_sy0 & v_scale2), (v_sx0 & v_scale2));
v_int32x4 v_v1 = v_muladd(v_scale3, (v_sy1 & v_scale2), (v_sx1 & v_scale2));
v_int32x4 v_sx1 = v_round(v_mul(v_fx, v_scale));
v_int32x4 v_sy1 = v_round(v_mul(v_fy, v_scale));
v_int32x4 v_v0 = v_muladd(v_scale3, (v_and(v_sy0, v_scale2)), (v_and(v_sx0, v_scale2)));
v_int32x4 v_v1 = v_muladd(v_scale3, (v_and(v_sy1, v_scale2)), (v_and(v_sx1, v_scale2)));
v_uint16x8 v_v8 = v_reinterpret_as_u16(v_pack(v_v0, v_v1));
v_store(A + x1, v_v8);
v_int16x8 v_dx = v_pack(v_shr<INTER_BITS>(v_sx0), v_shr<INTER_BITS>(v_sx1));
@ -1941,7 +1941,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
{
#if CV_SIMD128
{
int span = v_int16x8::nlanes;
int span = VTraits<v_int16x8>::vlanes();
for( ; x <= size.width - span; x += span )
{
v_int16x8 v_dst[2];
@ -1973,21 +1973,21 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
v_int32x4 v_scale3 = v_setall_s32(INTER_TAB_SIZE);
int span = v_float32x4::nlanes;
int span = VTraits<v_float32x4>::vlanes();
for( ; x <= size.width - span * 2; x += span * 2 )
{
v_int32x4 v_ix0 = v_round(v_scale * (v_load(src1f + x)));
v_int32x4 v_ix1 = v_round(v_scale * (v_load(src1f + x + span)));
v_int32x4 v_iy0 = v_round(v_scale * (v_load(src2f + x)));
v_int32x4 v_iy1 = v_round(v_scale * (v_load(src2f + x + span)));
v_int32x4 v_ix0 = v_round(v_mul(v_scale, v_load(src1f + x)));
v_int32x4 v_ix1 = v_round(v_mul(v_scale, v_load(src1f + x + span)));
v_int32x4 v_iy0 = v_round(v_mul(v_scale, v_load(src2f + x)));
v_int32x4 v_iy1 = v_round(v_mul(v_scale, v_load(src2f + x + span)));
v_int16x8 v_dst[2];
v_dst[0] = v_pack(v_shr<INTER_BITS>(v_ix0), v_shr<INTER_BITS>(v_ix1));
v_dst[1] = v_pack(v_shr<INTER_BITS>(v_iy0), v_shr<INTER_BITS>(v_iy1));
v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]);
v_int32x4 v_dst0 = v_muladd(v_scale3, (v_iy0 & v_mask), (v_ix0 & v_mask));
v_int32x4 v_dst1 = v_muladd(v_scale3, (v_iy1 & v_mask), (v_ix1 & v_mask));
v_int32x4 v_dst0 = v_muladd(v_scale3, (v_and(v_iy0, v_mask)), (v_and(v_ix0, v_mask)));
v_int32x4 v_dst1 = v_muladd(v_scale3, (v_and(v_iy1, v_mask)), (v_and(v_ix1, v_mask)));
v_store(dst2 + x, v_pack_u(v_dst0, v_dst1));
}
}
@ -2008,7 +2008,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
if( nninterpolate )
{
#if CV_SIMD128
int span = v_float32x4::nlanes;
int span = VTraits<v_float32x4>::vlanes();
{
for( ; x <= (size.width << 1) - span * 2; x += span * 2 )
v_store(dst1 + x, v_pack(v_round(v_load(src1f + x)),
@ -2034,16 +2034,16 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
v_int32x4 v_scale3 = v_setall_s32(INTER_TAB_SIZE);
int span = v_uint16x8::nlanes;
int span = VTraits<v_uint16x8>::vlanes();
for (; x <= size.width - span; x += span )
{
v_float32x4 v_src0[2], v_src1[2];
v_load_deinterleave(src1f + (x << 1), v_src0[0], v_src0[1]);
v_load_deinterleave(src1f + (x << 1) + span, v_src1[0], v_src1[1]);
v_int32x4 v_ix0 = v_round(v_src0[0] * v_scale);
v_int32x4 v_ix1 = v_round(v_src1[0] * v_scale);
v_int32x4 v_iy0 = v_round(v_src0[1] * v_scale);
v_int32x4 v_iy1 = v_round(v_src1[1] * v_scale);
v_int32x4 v_ix0 = v_round(v_mul(v_src0[0], v_scale));
v_int32x4 v_ix1 = v_round(v_mul(v_src1[0], v_scale));
v_int32x4 v_iy0 = v_round(v_mul(v_src0[1], v_scale));
v_int32x4 v_iy1 = v_round(v_mul(v_src1[1], v_scale));
v_int16x8 v_dst[2];
v_dst[0] = v_pack(v_shr<INTER_BITS>(v_ix0), v_shr<INTER_BITS>(v_ix1));
@ -2051,8 +2051,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]);
v_store(dst2 + x, v_pack_u(
v_muladd(v_scale3, (v_iy0 & v_mask), (v_ix0 & v_mask)),
v_muladd(v_scale3, (v_iy1 & v_mask), (v_ix1 & v_mask))));
v_muladd(v_scale3, (v_and(v_iy0, v_mask)), (v_and(v_ix0, v_mask))),
v_muladd(v_scale3, (v_and(v_iy1, v_mask)), (v_and(v_ix1, v_mask)))));
}
}
#endif
@ -2074,13 +2074,13 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
v_uint16x8 v_mask2 = v_setall_u16(INTER_TAB_SIZE2-1);
v_uint32x4 v_zero = v_setzero_u32(), v_mask = v_setall_u32(INTER_TAB_SIZE-1);
v_float32x4 v_scale = v_setall_f32(scale);
int span = v_float32x4::nlanes;
int span = VTraits<v_float32x4>::vlanes();
for( ; x <= size.width - span * 2; x += span * 2 )
{
v_uint32x4 v_fxy1, v_fxy2;
if ( src2 )
{
v_uint16x8 v_src2 = v_load(src2 + x) & v_mask2;
v_uint16x8 v_src2 = v_and(v_load(src2 + x), v_mask2);
v_expand(v_src2, v_fxy1, v_fxy2);
}
else
@ -2091,9 +2091,9 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
v_load_deinterleave(src1 + (x << 1), v_src[0], v_src[1]);
v_expand(v_src[0], v_src0[0], v_src0[1]);
v_expand(v_src[1], v_src1[0], v_src1[1]);
#define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32((FXY) & v_mask)),\
#define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32(v_and((FXY), v_mask))),\
v_cvt_f32(v_reinterpret_as_s32(X)))
#define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32((FXY) >> INTER_BITS)),\
#define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32(v_shr<INTER_BITS>((FXY)))),\
v_cvt_f32(v_reinterpret_as_s32(Y)))
v_float32x4 v_dst1 = CV_COMPUTE_MAP_X(v_src0[0], v_fxy1);
v_float32x4 v_dst2 = CV_COMPUTE_MAP_Y(v_src1[0], v_fxy1);
@ -2123,13 +2123,13 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
v_int16x8 v_mask2 = v_setall_s16(INTER_TAB_SIZE2-1);
v_int32x4 v_zero = v_setzero_s32(), v_mask = v_setall_s32(INTER_TAB_SIZE-1);
v_float32x4 v_scale = v_setall_f32(scale);
int span = v_int16x8::nlanes;
int span = VTraits<v_int16x8>::vlanes();
for( ; x <= size.width - span; x += span )
{
v_int32x4 v_fxy1, v_fxy2;
if (src2)
{
v_int16x8 v_src2 = v_load((short *)src2 + x) & v_mask2;
v_int16x8 v_src2 = v_and(v_load((short *)src2 + x), v_mask2);
v_expand(v_src2, v_fxy1, v_fxy2);
}
else
@ -2142,8 +2142,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
v_expand(v_src[0], v_src0[0], v_src0[1]);
v_expand(v_src[1], v_src1[0], v_src1[1]);
#define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32((FXY) & v_mask), v_cvt_f32(X))
#define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32((FXY) >> INTER_BITS), v_cvt_f32(Y))
#define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32(v_and((FXY), v_mask)), v_cvt_f32(X))
#define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32(v_shr<INTER_BITS>((FXY))), v_cvt_f32(Y))
v_dst[0] = CV_COMPUTE_MAP_X(v_src0[0], v_fxy1);
v_dst[1] = CV_COMPUTE_MAP_Y(v_src1[0], v_fxy1);
v_store_interleave(dst1f + (x << 1), v_dst[0], v_dst[1]);
@ -2234,12 +2234,12 @@ public:
#if CV_SIMD128
{
v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
int span = v_uint16x8::nlanes;
int span = VTraits<v_uint16x8>::vlanes();
for( ; x1 <= bw - span; x1 += span )
{
v_int16x8 v_dst[2];
#define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(shift+v_load(ptr + offset)),\
v_shr<AB_BITS>(shift+v_load(ptr + offset + 4)))
#define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset))),\
v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset + 4))))
v_dst[0] = CV_CONVERT_MAP(adelta, x+x1, v_X0);
v_dst[1] = CV_CONVERT_MAP(bdelta, x+x1, v_Y0);
#undef CV_CONVERT_MAP
@ -2272,21 +2272,21 @@ public:
{
v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0);
v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
int span = v_float32x4::nlanes;
int span = VTraits<v_float32x4>::vlanes();
for( ; x1 <= bw - span * 2; x1 += span * 2 )
{
v_int32x4 v_X0 = v_shr<AB_BITS - INTER_BITS>(v__X0 + v_load(adelta + x + x1));
v_int32x4 v_Y0 = v_shr<AB_BITS - INTER_BITS>(v__Y0 + v_load(bdelta + x + x1));
v_int32x4 v_X1 = v_shr<AB_BITS - INTER_BITS>(v__X0 + v_load(adelta + x + x1 + span));
v_int32x4 v_Y1 = v_shr<AB_BITS - INTER_BITS>(v__Y0 + v_load(bdelta + x + x1 + span));
v_int32x4 v_X0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(this->adelta + x + x1)));
v_int32x4 v_Y0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(this->bdelta + x + x1)));
v_int32x4 v_X1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(this->adelta + x + x1 + span)));
v_int32x4 v_Y1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(this->bdelta + x + x1 + span)));
v_int16x8 v_xy[2];
v_xy[0] = v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1));
v_xy[1] = v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1));
v_store_interleave(xy + (x1 << 1), v_xy[0], v_xy[1]);
v_int32x4 v_alpha0 = v_shl<INTER_BITS>(v_Y0 & v_mask) | (v_X0 & v_mask);
v_int32x4 v_alpha1 = v_shl<INTER_BITS>(v_Y1 & v_mask) | (v_X1 & v_mask);
v_int32x4 v_alpha0 = v_or(v_shl<INTER_BITS>(v_and(v_Y0, v_mask)), v_and(v_X0, v_mask));
v_int32x4 v_alpha1 = v_or(v_shl<INTER_BITS>(v_and(v_Y1, v_mask)), v_and(v_X1, v_mask));
v_store(alpha + x1, v_pack(v_alpha0, v_alpha1));
}
}
@ -2866,16 +2866,16 @@ void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0
v_int32x4 v_X0, v_Y0;
{
v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);
v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);
v_X0 = v_round(v_fX0, v_fX1);
v_Y0 = v_round(v_fY0, v_fY1);
@ -2885,16 +2885,16 @@ void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0
v_int32x4 v_X1, v_Y1;
{
v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);
v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);
v_X1 = v_round(v_fX0, v_fX1);
v_Y1 = v_round(v_fY0, v_fY1);
@ -2904,16 +2904,16 @@ void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0
v_int32x4 v_X2, v_Y2;
{
v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);
v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);
v_X2 = v_round(v_fX0, v_fX1);
v_Y2 = v_round(v_fY0, v_fY1);
@ -2923,16 +2923,16 @@ void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0
v_int32x4 v_X3, v_Y3;
{
v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);
v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);
v_X3 = v_round(v_fX0, v_fX1);
v_Y3 = v_round(v_fY0, v_fY1);
@ -2987,16 +2987,16 @@ void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alph
v_int32x4 v_X0, v_Y0;
{
v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);
v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);
v_X0 = v_round(v_fX0, v_fX1);
v_Y0 = v_round(v_fY0, v_fY1);
@ -3006,16 +3006,16 @@ void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alph
v_int32x4 v_X1, v_Y1;
{
v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);
v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);
v_X1 = v_round(v_fX0, v_fX1);
v_Y1 = v_round(v_fY0, v_fY1);
@ -3025,16 +3025,16 @@ void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alph
v_int32x4 v_X2, v_Y2;
{
v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);
v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);
v_X2 = v_round(v_fX0, v_fX1);
v_Y2 = v_round(v_fY0, v_fY1);
@ -3044,35 +3044,35 @@ void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alph
v_int32x4 v_X3, v_Y3;
{
v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);
v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);
v_X3 = v_round(v_fX0, v_fX1);
v_Y3 = v_round(v_fY0, v_fY1);
}
// store alpha
v_int32x4 v_alpha0 = ((v_Y0 & v_itsi1) << INTER_BITS) + (v_X0 & v_itsi1);
v_int32x4 v_alpha1 = ((v_Y1 & v_itsi1) << INTER_BITS) + (v_X1 & v_itsi1);
v_int32x4 v_alpha0 = v_add(v_shl<INTER_BITS>(v_and(v_Y0, v_itsi1)), v_and(v_X0, v_itsi1));
v_int32x4 v_alpha1 = v_add(v_shl<INTER_BITS>(v_and(v_Y1, v_itsi1)), v_and(v_X1, v_itsi1));
v_store((alpha + x1), v_pack(v_alpha0, v_alpha1));
v_alpha0 = ((v_Y2 & v_itsi1) << INTER_BITS) + (v_X2 & v_itsi1);
v_alpha1 = ((v_Y3 & v_itsi1) << INTER_BITS) + (v_X3 & v_itsi1);
v_alpha0 = v_add(v_shl<INTER_BITS>(v_and(v_Y2, v_itsi1)), v_and(v_X2, v_itsi1));
v_alpha1 = v_add(v_shl<INTER_BITS>(v_and(v_Y3, v_itsi1)), v_and(v_X3, v_itsi1));
v_store((alpha + x1 + 8), v_pack(v_alpha0, v_alpha1));
// convert to 16s
v_X0 = v_reinterpret_as_s32(v_pack(v_X0 >> INTER_BITS, v_X1 >> INTER_BITS));
v_X1 = v_reinterpret_as_s32(v_pack(v_X2 >> INTER_BITS, v_X3 >> INTER_BITS));
v_Y0 = v_reinterpret_as_s32(v_pack(v_Y0 >> INTER_BITS, v_Y1 >> INTER_BITS));
v_Y1 = v_reinterpret_as_s32(v_pack(v_Y2 >> INTER_BITS, v_Y3 >> INTER_BITS));
v_X0 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1)));
v_X1 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_X2), v_shr<INTER_BITS>(v_X3)));
v_Y0 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1)));
v_Y1 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_Y2), v_shr<INTER_BITS>(v_Y3)));
v_store_interleave(xy + x1 * 2, (v_reinterpret_as_s16)(v_X0), (v_reinterpret_as_s16)(v_Y0));
v_store_interleave(xy + x1 * 2 + 16, (v_reinterpret_as_s16)(v_X1), (v_reinterpret_as_s16)(v_Y1));
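In the remap/warp hunks the shift and mask arithmetic becomes v_shl<...>, v_shr<...>, v_and and v_or, and the guarded reciprocal uses v_select with v_ne and v_div. A compact sketch of the fraction-packing step, assuming CV_SIMD; kBits stands in for INTER_BITS and pack_fraction is not a real OpenCV function.

#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

constexpr int kBits = 5;   // stand-in for INTER_BITS

// old spelling: ((y & mask) << kBits) | (x & mask)
static v_int32x4 pack_fraction(const v_int32x4& x, const v_int32x4& y)
{
    v_int32x4 mask = v_setall_s32((1 << kBits) - 1);
    return v_or(v_shl<kBits>(v_and(y, mask)), v_and(x, mask));
}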

@ -179,10 +179,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
for (k = 0; k < 16; ++k)
{
#if CV_SIMD256
v_store(H.fine[k], v_mul_wrap(v256_load(h_fine + 16 * n*(16 * c + k)), v256_setall_u16(2 * r + 1)) + v256_load(H.fine[k]));
v_store(H.fine[k], v_add(v_mul_wrap(v256_load(h_fine + 16 * n*(16 * c + k)), v256_setall_u16(2 * r + 1)), v256_load(H.fine[k])));
#elif CV_SIMD128
v_store(H.fine[k], v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k)), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k]));
v_store(H.fine[k] + 8, v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k] + 8));
v_store(H.fine[k], v_add(v_mul_wrap(v_load(h_fine + 16 * n * (16 * c + k)), v_setall_u16((ushort)(2 * r + 1))), v_load(H.fine[k])));
v_store(H.fine[k] + 8, v_add(v_mul_wrap(v_load(h_fine + 16 * n * (16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))), v_load(H.fine[k] + 8)));
#else
for (int ind = 0; ind < 16; ++ind)
H.fine[k][ind] = (HT)(H.fine[k][ind] + (2 * r + 1) * h_fine[16 * n*(16 * c + k) + ind]);
@ -199,10 +199,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
for( j = 0; j < 2*r; ++j, px += 16 )
{
#if CV_SIMD256
v_coarse += v256_load(px);
v_coarse = v_add(v_coarse, v256_load(px));
#elif CV_SIMD128
v_coarsel += v_load(px);
v_coarseh += v_load(px + 8);
v_coarsel = v_add(v_coarsel, v_load(px));
v_coarseh = v_add(v_coarseh, v_load(px + 8));
#else
for (int ind = 0; ind < 16; ++ind)
H.coarse[ind] += px[ind];
@ -216,11 +216,11 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
px = h_coarse + 16 * (n*c + std::min(j + r, n - 1));
#if CV_SIMD256
v_coarse += v256_load(px);
v_coarse = v_add(v_coarse, v256_load(px));
v_store(H.coarse, v_coarse);
#elif CV_SIMD128
v_coarsel += v_load(px);
v_coarseh += v_load(px + 8);
v_coarsel = v_add(v_coarsel, v_load(px));
v_coarseh = v_add(v_coarseh, v_load(px + 8));
v_store(H.coarse, v_coarsel);
v_store(H.coarse + 8, v_coarseh);
#else
@ -261,10 +261,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
for (luc[k] = HT(j - r); luc[k] < MIN(j + r + 1, n); ++luc[k], px += 16)
{
#if CV_SIMD256
v_fine += v256_load(px);
v_fine = v_add(v_fine, v256_load(px));
#elif CV_SIMD128
v_finel += v_load(px);
v_fineh += v_load(px + 8);
v_finel = v_add(v_finel, v_load(px));
v_fineh = v_add(v_fineh, v_load(px + 8));
#else
for (int ind = 0; ind < 16; ++ind)
H.fine[k][ind] += px[ind];
@ -275,10 +275,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
{
px = h_fine + 16 * (n*(16 * c + k) + (n - 1));
#if CV_SIMD256
v_fine += v_mul_wrap(v256_load(px), v256_setall_u16(j + r + 1 - n));
v_fine = v_add(v_fine, v_mul_wrap(v256_load(px), v256_setall_u16(j + r + 1 - n)));
#elif CV_SIMD128
v_finel += v_mul_wrap(v_load(px), v_setall_u16((ushort)(j + r + 1 - n)));
v_fineh += v_mul_wrap(v_load(px + 8), v_setall_u16((ushort)(j + r + 1 - n)));
v_finel = v_add(v_finel, v_mul_wrap(v_load(px), v_setall_u16((ushort)(j + r + 1 - n))));
v_fineh = v_add(v_fineh, v_mul_wrap(v_load(px + 8), v_setall_u16((ushort)(j + r + 1 - n))));
#else
for (int ind = 0; ind < 16; ++ind)
H.fine[k][ind] = (HT)(H.fine[k][ind] + (j + r + 1 - n) * px[ind]);
@ -298,10 +298,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
for ( ; luc[k] < j+r+1; ++luc[k] )
{
#if CV_SIMD256
v_fine = v_fine + v256_load(px + 16 * MIN(luc[k], n - 1)) - v256_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0));
v_fine = v_sub(v_add(v_fine, v256_load(px + 16 * MIN(luc[k], n - 1))), v256_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0)));
#elif CV_SIMD128
v_finel = v_finel + v_load(px + 16 * MIN(luc[k], n - 1) ) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0));
v_fineh = v_fineh + v_load(px + 16 * MIN(luc[k], n - 1) + 8) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0) + 8);
v_finel = v_sub(v_add(v_finel, v_load(px + 16 * MIN(luc[k], n - 1) )), v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0)));
v_fineh = v_sub(v_add(v_fineh, v_load(px + 16 * MIN(luc[k], n - 1) + 8)), v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0) + 8));
#else
for (int ind = 0; ind < 16; ++ind)
H.fine[k][ind] += px[16 * MIN(luc[k], n - 1) + ind] - px[16 * MAX(luc[k] - 2 * r - 1, 0) + ind];
@ -312,12 +312,12 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
px = h_coarse + 16 * (n*c + MAX(j - r, 0));
#if CV_SIMD256
v_store(H.fine[k], v_fine);
v_coarse -= v256_load(px);
v_coarse = v_sub(v_coarse, v256_load(px));
#elif CV_SIMD128
v_store(H.fine[k], v_finel);
v_store(H.fine[k] + 8, v_fineh);
v_coarsel -= v_load(px);
v_coarseh -= v_load(px + 8);
v_coarsel = v_sub(v_coarsel, v_load(px));
v_coarseh = v_sub(v_coarseh, v_load(px + 8));
#else
for (int ind = 0; ind < 16; ++ind)
H.coarse[ind] -= px[ind];
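The O(1) median-filter hunks update per-column histograms by adding the incoming column and subtracting the outgoing one; only the spelling changes to v_add/v_sub. A minimal sketch of one such update, assuming CV_SIMD and that the histogram segment spans at least one full v_uint16 register; slide_histogram is an illustrative name.

#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

static void slide_histogram(ushort* hist, const ushort* incoming, const ushort* outgoing)
{
    v_uint16 h = vx_load(hist);
    h = v_sub(v_add(h, vx_load(incoming)), vx_load(outgoing));  // was: h + in - out
    v_store(hist, h);
}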

@ -236,12 +236,12 @@ struct MomentsInTile_SIMD<uchar, int, int>
v_int16x8 p = v_reinterpret_as_s16(v_load_expand(ptr + x));
v_int16x8 sx = v_mul_wrap(qx, qx);
qx0 += v_reinterpret_as_u32(p);
qx0 = v_add(qx0, v_reinterpret_as_u32(p));
qx1 = v_reinterpret_as_u32(v_dotprod(p, qx, v_reinterpret_as_s32(qx1)));
qx2 = v_reinterpret_as_u32(v_dotprod(p, sx, v_reinterpret_as_s32(qx2)));
qx3 = v_reinterpret_as_u32(v_dotprod(v_mul_wrap(p, qx), sx, v_reinterpret_as_s32(qx3)));
qx += dx;
qx = v_add(qx, dx);
}
x0 = v_reduce_sum(qx0);
@ -276,19 +276,19 @@ struct MomentsInTile_SIMD<ushort, int, int64>
{
v_int32x4 v_src = v_reinterpret_as_s32(v_load_expand(ptr + x));
v_x0 += v_reinterpret_as_u32(v_src);
v_x1 += v_reinterpret_as_u32(v_src * v_ix0);
v_x0 = v_add(v_x0, v_reinterpret_as_u32(v_src));
v_x1 = v_add(v_x1, v_reinterpret_as_u32(v_mul(v_src, v_ix0)));
v_int32x4 v_ix1 = v_ix0 * v_ix0;
v_x2 += v_reinterpret_as_u32(v_src * v_ix1);
v_int32x4 v_ix1 = v_mul(v_ix0, v_ix0);
v_x2 = v_add(v_x2, v_reinterpret_as_u32(v_mul(v_src, v_ix1)));
v_ix1 = v_ix0 * v_ix1;
v_src = v_src * v_ix1;
v_ix1 = v_mul(v_ix0, v_ix1);
v_src = v_mul(v_src, v_ix1);
v_uint64x2 v_lo, v_hi;
v_expand(v_reinterpret_as_u32(v_src), v_lo, v_hi);
v_x3 += v_lo + v_hi;
v_x3 = v_add(v_x3, v_add(v_lo, v_hi));
v_ix0 += v_delta;
v_ix0 = v_add(v_ix0, v_delta);
}
x0 = v_reduce_sum(v_x0);
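The moments hunks replace '*' and '+=' on the fixed-size vector types with v_mul and v_add while keeping the coordinate-increment scheme. A simplified sketch of accumulating the zeroth and first order sums of one row, assuming CV_SIMD128; row_moments is a made-up helper and the tail is scalar.

#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

static void row_moments(const int* p, int n, int& m00, int& m10)
{
    v_int32x4 acc0 = v_setzero_s32(), acc1 = v_setzero_s32();
    v_int32x4 x = v_int32x4(0, 1, 2, 3), dx = v_setall_s32(4);
    int i = 0;
    for (; i <= n - 4; i += 4)
    {
        v_int32x4 v = v_load(p + i);
        acc0 = v_add(acc0, v);             // m00: sum of p
        acc1 = v_add(acc1, v_mul(v, x));   // m10: sum of p * x
        x = v_add(x, dx);
    }
    m00 = v_reduce_sum(acc0);
    m10 = v_reduce_sum(acc1);
    for (; i < n; ++i) { m00 += p[i]; m10 += p[i] * i; }
}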

@ -463,7 +463,7 @@ template<> int PyrDownVecV<int, uchar>(int** src, uchar* dst, int width)
}
#if CV_SIMD128
typedef int CV_DECL_ALIGNED(1) unaligned_int;
for ( ; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
for ( ; x <= width - VTraits<v_int32x4>::vlanes(); x += VTraits<v_int32x4>::vlanes())
{
v_int32x4 r0, r1, r2, r3, r4, t0;
r0 = v_load(row0 + x);
@ -473,7 +473,7 @@ template<> int PyrDownVecV<int, uchar>(int** src, uchar* dst, int width)
r4 = v_load(row4 + x);
t0 = v_add(v_add(v_add(r0, r4), v_add(r2, r2)), v_shl<2>(v_add(v_add(r1, r3), r2)));
*((unaligned_int*) (dst + x)) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0();
*((unaligned_int*) (dst + x)) = v_get0(v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())));
}
#else
for (; x <= width - 1; x += 1)
@ -615,15 +615,15 @@ template <> int PyrUpVecV<int, uchar>(int** src, uchar** dst, int width)
}
#if CV_SIMD128
typedef int CV_DECL_ALIGNED(1) unaligned_int;
for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
for (; x <= width - VTraits<v_int32x4>::vlanes(); x += VTraits<v_int32x4>::vlanes())
{
v_int32 v_r00 = vx_load(row0 + x),
v_r10 = vx_load(row1 + x),
v_r20 = vx_load(row2 + x);
v_int32 v_2r10 = v_add(v_r10, v_r10);
v_int16 d = v_pack(v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)), v_shl<2>(v_add(v_r10, v_r20)));
*(unaligned_int*)(dst0 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0();
*(unaligned_int*)(dst1 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16())).get0();
*(unaligned_int*)(dst0 + x) = v_get0(v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())));
*(unaligned_int*)(dst1 + x) = v_get0(v_reinterpret_as_s32(v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16())));
}
#else
for (; x <= width - 1; x += 1)
@ -754,14 +754,14 @@ template <> int PyrUpVecVOneRow<int, uchar>(int** src, uchar* dst, int width)
}
#if CV_SIMD128
typedef int CV_DECL_ALIGNED(1) unaligned_int;
for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
for (; x <= width - VTraits<v_int32x4>::vlanes(); x += VTraits<v_int32x4>::vlanes())
{
v_int32 v_r00 = vx_load(row0 + x),
v_r10 = vx_load(row1 + x),
v_r20 = vx_load(row2 + x);
v_int32 v_2r10 = v_add(v_r10, v_r10);
v_int16 d = v_pack(v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)), v_shl<2>(v_add(v_r10, v_r20)));
*(unaligned_int*)(dst + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0();
*(unaligned_int*)(dst + x) = v_get0(v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())));
}
#else
for (; x <= width - 1; x += 1)

@ -2473,7 +2473,7 @@ public:
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
v_uint16 bl, gl, rl;
#if CV_SIMD_WIDTH == 16
bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
bl = v_add(t0, t3); gl = v_add(t1, t4); rl = v_add(t2, t5);
#elif CV_SIMD_WIDTH == 32
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
bl = v_add(s0, s3); gl = v_add(s1, s4); rl = v_add(s2, s5);
@ -2493,7 +2493,7 @@ public:
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
v_uint16 bh, gh, rh;
#if CV_SIMD_WIDTH == 16
bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
bh = v_add(t0, t3); gh = v_add(t1, t4); rh = v_add(t2, t5);
#elif CV_SIMD_WIDTH == 32
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
bh = v_add(s0, s3); gh = v_add(s1, s4); rh = v_add(s2, s5);
@ -2566,7 +2566,7 @@ public:
v_rshr_pack_store<2>(D, r0 + v_rotate_left<1>(r1, r0));
}
#else
v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3));
v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_load_expand(S0), v_load_expand(S0 + 3)), v_load_expand(S1)), v_load_expand(S1 + 3)));
#endif
#elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64
for ( ; dx <= w - 3*VTraits<v_uint16>::vlanes(); dx += 3*VTraits<v_uint16>::vlanes(), S0 += 6*VTraits<v_uint16>::vlanes(), S1 += 6*VTraits<v_uint16>::vlanes(), D += 3*VTraits<v_uint16>::vlanes())
@ -2609,7 +2609,7 @@ public:
}
#elif CV_SIMD_WIDTH >= 64
v_uint32 masklow = vx_setall_u32(0x0000ffff);
for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes)
for ( ; dx <= w - 3*VTraits<v_uint16>::vlanes(); dx += 3*VTraits<v_uint16>::vlanes(), S0 += 6*VTraits<v_uint16>::vlanes(), S1 += 6*VTraits<v_uint16>::vlanes(), D += 3*VTraits<v_uint16>::vlanes())
{
v_uint16 b0, g0, r0, b1, g1, r1;
v_load_deinterleave(S0, b0, g0, r0);
@ -2617,8 +2617,8 @@ public:
v_uint32 bl = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow);
v_uint32 gl = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow);
v_uint32 rl = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow);
v_load_deinterleave(S0 + 3*v_uint16::nlanes, b0, g0, r0);
v_load_deinterleave(S1 + 3*v_uint16::nlanes, b1, g1, r1);
v_load_deinterleave(S0 + 3*VTraits<v_uint16>::vlanes(), b0, g0, r0);
v_load_deinterleave(S1 + 3*VTraits<v_uint16>::vlanes(), b1, g1, r1);
v_uint32 bh = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow);
v_uint32 gh = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow);
v_uint32 rh = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow);
@ -2630,7 +2630,7 @@ public:
{
CV_Assert(cn == 4);
#if CV_SIMD_WIDTH >= 64
for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += 2*v_uint16::nlanes, S1 += 2*v_uint16::nlanes, D += v_uint16::nlanes)
for ( ; dx <= w - VTraits<v_uint16>::vlanes(); dx += VTraits<v_uint16>::vlanes(), S0 += 2*VTraits<v_uint16>::vlanes(), S1 += 2*VTraits<v_uint16>::vlanes(), D += VTraits<v_uint16>::vlanes())
{
v_uint64 r00, r01, r10, r11;
v_load_deinterleave((uint64_t*)S0, r00, r01);
@ -2652,7 +2652,7 @@ public:
r0 = v_add(r0, r2); r1 = v_add(r1, r3);
v_uint32 v_d;
#if CV_SIMD_WIDTH == 16
v_d = r0 + r1;
v_d = v_add(r0, r1);
#elif CV_SIMD_WIDTH == 32
v_uint32 t0, t1;
v_recombine(r0, r1, t0, t1);
@ -2697,7 +2697,7 @@ public:
{
#if CV_SIMD_WIDTH == 16
for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3));
v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_load_expand(S0), v_load_expand(S0 + 3)), v_load_expand(S1)), v_load_expand(S1 + 3)));
#elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64
for ( ; dx <= w - 3*VTraits<v_int16>::vlanes(); dx += 3*VTraits<v_int16>::vlanes(), S0 += 6*VTraits<v_int16>::vlanes(), S1 += 6*VTraits<v_int16>::vlanes(), D += 3*VTraits<v_int16>::vlanes())
{
@ -2738,7 +2738,7 @@ public:
v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
}
#elif CV_SIMD_WIDTH >= 64
for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes)
for ( ; dx <= w - 3*VTraits<v_int16>::vlanes(); dx += 3*VTraits<v_int16>::vlanes(), S0 += 6*VTraits<v_int16>::vlanes(), S1 += 6*VTraits<v_int16>::vlanes(), D += 3*VTraits<v_int16>::vlanes())
{
v_int16 b0, g0, r0, b1, g1, r1;
v_load_deinterleave(S0, b0, g0, r0);
@ -2746,8 +2746,8 @@ public:
v_int32 bl = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16);
v_int32 gl = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16);
v_int32 rl = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16);
v_load_deinterleave(S0 + 3*v_int16::nlanes, b0, g0, r0);
v_load_deinterleave(S1 + 3*v_int16::nlanes, b1, g1, r1);
v_load_deinterleave(S0 + 3*VTraits<v_int16>::vlanes(), b0, g0, r0);
v_load_deinterleave(S1 + 3*VTraits<v_int16>::vlanes(), b1, g1, r1);
v_int32 bh = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16);
v_int32 gh = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16);
v_int32 rh = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16);
@ -2779,7 +2779,7 @@ public:
r3 = v_add(vx_load_expand(S0 + 3 * VTraits<v_int32>::vlanes()), vx_load_expand(S1 + 3 * VTraits<v_int32>::vlanes()));
v_int32 dl, dh;
#if CV_SIMD_WIDTH == 16
dl = r0 + r1; dh = r2 + r3;
dl = v_add(r0, r1); dh = v_add(r2, r3);
#elif CV_SIMD_WIDTH == 32
v_int32 t0, t1, t2, t3;
v_recombine(r0, r1, t0, t1); v_recombine(r2, r3, t2, t3);
@ -2829,14 +2829,14 @@ struct ResizeAreaFastVec_SIMD_32f
{
#if CV_SIMD_WIDTH == 16
v_float32 v_025 = vx_setall_f32(0.25f);
for (; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes)
v_store(D, ((vx_load(S0) + vx_load(S0 + v_float32::nlanes)) + (vx_load(S1) + vx_load(S1 + v_float32::nlanes))) * v_025);
for (; dx <= w - VTraits<v_float32>::vlanes(); dx += VTraits<v_float32>::vlanes(), S0 += 2*VTraits<v_float32>::vlanes(), S1 += 2*VTraits<v_float32>::vlanes(), D += VTraits<v_float32>::vlanes())
v_store(D, v_mul(v_add(v_add(vx_load(S0), vx_load(S0 + VTraits<v_float32>::vlanes())), v_add(vx_load(S1), vx_load(S1 + VTraits<v_float32>::vlanes()))), v_025));
#elif CV_SIMD256
v_float32x8 v_025 = v256_setall_f32(0.25f);
for (; dx <= w - v_float32x8::nlanes; dx += v_float32x8::nlanes, S0 += 2*v_float32x8::nlanes, S1 += 2*v_float32x8::nlanes, D += v_float32x8::nlanes)
for (; dx <= w - VTraits<v_float32x8>::vlanes(); dx += VTraits<v_float32x8>::vlanes(), S0 += 2*VTraits<v_float32x8>::vlanes(), S1 += 2*VTraits<v_float32x8>::vlanes(), D += VTraits<v_float32x8>::vlanes())
{
v_float32x8 dst0, dst1;
v_recombine(v_add(v256_load(S0), v256_load(S1)), v_add(v256_load(S0 + v_float32x8::nlanes), v256_load(S1 + v_float32x8::nlanes)), dst0, dst1);
v_recombine(v_add(v256_load(S0), v256_load(S1)), v_add(v256_load(S0 + VTraits<v_float32x8>::vlanes()), v256_load(S1 + VTraits<v_float32x8>::vlanes())), dst0, dst1);
v_store(D, v_mul(v_add(dst0, dst1), v_025));
}
#endif
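The area-resize hunks keep the arithmetic but express it as v_add followed by a v_mul with the 0.25f constant, and take the lane count from VTraits<>::vlanes(). A short sketch of the same add-then-scale pattern for averaging two float rows, assuming CV_SIMD; average_rows is illustrative, not part of the patch.

#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

static void average_rows(const float* S0, const float* S1, float* D, int w)
{
    const int step = VTraits<v_float32>::vlanes();
    const v_float32 half = vx_setall_f32(0.5f);
    int x = 0;
    for (; x <= w - step; x += step)
        v_store(D + x, v_mul(v_add(vx_load(S0 + x), vx_load(S1 + x)), half));  // was: (a + b) * 0.5f
    for (; x < w; ++x)
        D[x] = (S0[x] + S1[x]) * 0.5f;
}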

@ -114,7 +114,7 @@ struct Integral_SIMD<uchar, int, double>
v_int32 prev = vx_setzero_s32();
int j = 0;
for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
for ( ; j + VTraits<v_uint16>::vlanes() <= width; j += VTraits<v_uint16>::vlanes())
{
v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
v_int32 el4l, el4h;
@ -127,8 +127,8 @@ struct Integral_SIMD<uchar, int, double>
el4h.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum)), _mm256_permutevar8x32_epi32(el4l.val, shmask));
prev.val = _mm256_permutevar8x32_epi32(el4h.val, shmask);
#else
el8 += v_rotate_left<1>(el8);
el8 += v_rotate_left<2>(el8);
el8 = v_add(el8, v_rotate_left<1>(el8));
el8 = v_add(el8, v_rotate_left<2>(el8));
#if CV_SIMD_WIDTH >= 32
el8 += v_rotate_left<4>(el8);
#if CV_SIMD_WIDTH == 64
@ -136,12 +136,12 @@ struct Integral_SIMD<uchar, int, double>
#endif
#endif
v_expand(el8, el4l, el4h);
el4l += prev;
el4h += el4l;
prev = v_broadcast_element<v_int32::nlanes - 1>(el4h);
el4l = v_add(el4l, prev);
el4h = v_add(el4h, el4l);
prev = v_broadcast_highest(el4h);
#endif
v_store(sum_row + j , el4l + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes));
v_store(sum_row + j , v_add(el4l, vx_load(prev_sum_row + j)));
v_store(sum_row + j + VTraits<v_int32>::vlanes(), v_add(el4h, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes())));
}
for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
@ -162,11 +162,11 @@ struct Integral_SIMD<uchar, int, double>
v_int32 prev_1 = vx_setzero_s32(), prev_2 = vx_setzero_s32();
int j = 0;
for ( ; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn)
for ( ; j + VTraits<v_uint16>::vlanes() * cn <= width; j += VTraits<v_uint16>::vlanes() * cn)
{
v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j));
v_int16 el8_1 = v_src_row & mask;
v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8);
v_int16 el8_1 = v_and(v_src_row, mask);
v_int16 el8_2 = v_reinterpret_as_s16(v_shr<8>(v_reinterpret_as_u16(v_src_row)));
v_int32 el4l_1, el4h_1, el4l_2, el4h_2;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2));
@ -183,10 +183,10 @@ struct Integral_SIMD<uchar, int, double>
prev_1.val = _mm256_permutevar8x32_epi32(el4h_1.val, shmask);
prev_2.val = _mm256_permutevar8x32_epi32(el4h_2.val, shmask);
#else
el8_1 += v_rotate_left<1>(el8_1);
el8_2 += v_rotate_left<1>(el8_2);
el8_1 += v_rotate_left<2>(el8_1);
el8_2 += v_rotate_left<2>(el8_2);
el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2));
el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2));
#if CV_SIMD_WIDTH >= 32
el8_1 += v_rotate_left<4>(el8_1);
el8_2 += v_rotate_left<4>(el8_2);
@ -197,20 +197,20 @@ struct Integral_SIMD<uchar, int, double>
#endif
v_expand(el8_1, el4l_1, el4h_1);
v_expand(el8_2, el4l_2, el4h_2);
el4l_1 += prev_1;
el4l_2 += prev_2;
el4h_1 += el4l_1;
el4h_2 += el4l_2;
prev_1 = v_broadcast_element<v_int32::nlanes - 1>(el4h_1);
prev_2 = v_broadcast_element<v_int32::nlanes - 1>(el4h_2);
el4l_1 = v_add(el4l_1, prev_1);
el4l_2 = v_add(el4l_2, prev_2);
el4h_1 = v_add(el4h_1, el4l_1);
el4h_2 = v_add(el4h_2, el4l_2);
prev_1 = v_broadcast_highest(el4h_1);
prev_2 = v_broadcast_highest(el4h_2);
#endif
v_int32 el4_1, el4_2, el4_3, el4_4;
v_zip(el4l_1, el4l_2, el4_1, el4_2);
v_zip(el4h_1, el4h_2, el4_3, el4_4);
v_store(sum_row + j , el4_1 + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_int32::nlanes , el4_2 + vx_load(prev_sum_row + j + v_int32::nlanes ));
v_store(sum_row + j + v_int32::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 2));
v_store(sum_row + j + v_int32::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_int32::nlanes * 3));
v_store(sum_row + j , v_add(el4_1, vx_load(prev_sum_row + j)));
v_store(sum_row + j + VTraits<v_int32>::vlanes() , v_add(el4_2, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes())));
v_store(sum_row + j + VTraits<v_int32>::vlanes() * 2, v_add(el4_3, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes() * 2)));
v_store(sum_row + j + VTraits<v_int32>::vlanes() * 3, v_add(el4_4, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes() * 3)));
}
for (int v2 = sum_row[j - 1] - prev_sum_row[j - 1],
@ -230,7 +230,7 @@ struct Integral_SIMD<uchar, int, double>
const uchar * src_row = src + _srcstep * i;
int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + cn;
int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + cn;
int row_cache[v_int32::nlanes * 6];
int row_cache[VTraits<v_int32>::max_nlanes * 6];
sum_row[-1] = sum_row[-2] = sum_row[-3] = 0;
@ -238,10 +238,10 @@ struct Integral_SIMD<uchar, int, double>
prev_3 = vx_setzero_s32();
int j = 0;
const int j_max =
((_srcstep * i + (width - v_uint16::nlanes * cn + v_uint8::nlanes * cn)) >= _srcstep * height)
? width - v_uint8::nlanes * cn // uint8 in v_load_deinterleave()
: width - v_uint16::nlanes * cn; // v_expand_low
for ( ; j <= j_max; j += v_uint16::nlanes * cn)
((_srcstep * i + (width - VTraits<v_uint16>::vlanes() * cn + VTraits<v_uint8>::vlanes() * cn)) >= _srcstep * height)
? width - VTraits<v_uint8>::vlanes() * cn // uint8 in v_load_deinterleave()
: width - VTraits<v_uint16>::vlanes() * cn; // v_expand_low
for ( ; j <= j_max; j += VTraits<v_uint16>::vlanes() * cn)
{
v_uint8 v_src_row_1, v_src_row_2, v_src_row_3;
v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3);
@ -270,49 +270,49 @@ struct Integral_SIMD<uchar, int, double>
prev_2.val = _mm256_permutevar8x32_epi32(el4h_2.val, shmask);
prev_3.val = _mm256_permutevar8x32_epi32(el4h_3.val, shmask);
#else
el8_1 += v_rotate_left<1>(el8_1);
el8_2 += v_rotate_left<1>(el8_2);
el8_3 += v_rotate_left<1>(el8_3);
el8_1 += v_rotate_left<2>(el8_1);
el8_2 += v_rotate_left<2>(el8_2);
el8_3 += v_rotate_left<2>(el8_3);
el8_1 = v_add(el8_1,v_rotate_left<1>(el8_1));
el8_2 = v_add(el8_2,v_rotate_left<1>(el8_2));
el8_3 = v_add(el8_3,v_rotate_left<1>(el8_3));
el8_1 = v_add(el8_1,v_rotate_left<2>(el8_1));
el8_2 = v_add(el8_2,v_rotate_left<2>(el8_2));
el8_3 = v_add(el8_3,v_rotate_left<2>(el8_3));
#if CV_SIMD_WIDTH >= 32
el8_1 += v_rotate_left<4>(el8_1);
el8_2 += v_rotate_left<4>(el8_2);
el8_3 += v_rotate_left<4>(el8_3);
el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2));
el8_3 = v_add(el8_3, v_rotate_left<4>(el8_3));
#if CV_SIMD_WIDTH == 64
el8_1 += v_rotate_left<8>(el8_1);
el8_2 += v_rotate_left<8>(el8_2);
el8_3 += v_rotate_left<8>(el8_3);
el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2));
el8_3 = v_add(el8_3, v_rotate_left<8>(el8_3));
#endif
#endif
v_expand(el8_1, el4l_1, el4h_1);
v_expand(el8_2, el4l_2, el4h_2);
v_expand(el8_3, el4l_3, el4h_3);
el4l_1 += prev_1;
el4l_2 += prev_2;
el4l_3 += prev_3;
el4h_1 += el4l_1;
el4h_2 += el4l_2;
el4h_3 += el4l_3;
prev_1 = v_broadcast_element<v_int32::nlanes - 1>(el4h_1);
prev_2 = v_broadcast_element<v_int32::nlanes - 1>(el4h_2);
prev_3 = v_broadcast_element<v_int32::nlanes - 1>(el4h_3);
el4l_1 = v_add(el4l_1, prev_1);
el4l_2 = v_add(el4l_2, prev_2);
el4l_3 = v_add(el4l_3, prev_3);
el4h_1 = v_add(el4h_1, el4l_1);
el4h_2 = v_add(el4h_2, el4l_2);
el4h_3 = v_add(el4h_3, el4l_3);
prev_1 = v_broadcast_highest(el4h_1);
prev_2 = v_broadcast_highest(el4h_2);
prev_3 = v_broadcast_highest(el4h_3);
#endif
v_store_interleave(row_cache , el4l_1, el4l_2, el4l_3);
v_store_interleave(row_cache + v_int32::nlanes * 3, el4h_1, el4h_2, el4h_3);
v_store_interleave(row_cache + VTraits<v_int32>::vlanes() * 3, el4h_1, el4h_2, el4h_3);
el4l_1 = vx_load(row_cache );
el4l_2 = vx_load(row_cache + v_int32::nlanes );
el4l_3 = vx_load(row_cache + v_int32::nlanes * 2);
el4h_1 = vx_load(row_cache + v_int32::nlanes * 3);
el4h_2 = vx_load(row_cache + v_int32::nlanes * 4);
el4h_3 = vx_load(row_cache + v_int32::nlanes * 5);
v_store(sum_row + j , el4l_1 + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_int32::nlanes , el4l_2 + vx_load(prev_sum_row + j + v_int32::nlanes ));
v_store(sum_row + j + v_int32::nlanes * 2, el4l_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 2));
v_store(sum_row + j + v_int32::nlanes * 3, el4h_1 + vx_load(prev_sum_row + j + v_int32::nlanes * 3));
v_store(sum_row + j + v_int32::nlanes * 4, el4h_2 + vx_load(prev_sum_row + j + v_int32::nlanes * 4));
v_store(sum_row + j + v_int32::nlanes * 5, el4h_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 5));
el4l_2 = vx_load(row_cache + VTraits<v_int32>::vlanes() );
el4l_3 = vx_load(row_cache + VTraits<v_int32>::vlanes() * 2);
el4h_1 = vx_load(row_cache + VTraits<v_int32>::vlanes() * 3);
el4h_2 = vx_load(row_cache + VTraits<v_int32>::vlanes() * 4);
el4h_3 = vx_load(row_cache + VTraits<v_int32>::vlanes() * 5);
v_store(sum_row + j , v_add(el4l_1, vx_load(prev_sum_row + j )));
v_store(sum_row + j + VTraits<v_int32>::vlanes() , v_add(el4l_2, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes() )));
v_store(sum_row + j + VTraits<v_int32>::vlanes() * 2, v_add(el4l_3, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes() * 2)));
v_store(sum_row + j + VTraits<v_int32>::vlanes() * 3, v_add(el4h_1, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes() * 3)));
v_store(sum_row + j + VTraits<v_int32>::vlanes() * 4, v_add(el4h_2, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes() * 4)));
v_store(sum_row + j + VTraits<v_int32>::vlanes() * 5, v_add(el4h_3, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes() * 5)));
}
for (int v3 = sum_row[j - 1] - prev_sum_row[j - 1],
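With the operator overloads removed, the partial-sum step above goes through v_add and v_rotate_left. The hunk stops the 16-bit scan after the <2> rotation on 128-bit registers and finishes it on the expanded 32-bit halves; the sketch below shows the full in-register form for one 8-lane vector. It assumes a CV_SIMD128 build, and the helper name is illustrative, not part of the patch.

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD128
// inclusive prefix sum over the 8 signed 16-bit lanes of one register
static inline v_int16x8 inclusive_scan_s16(v_int16x8 v)
{
    v = v_add(v, v_rotate_left<1>(v));  // lane i += lane i-1
    v = v_add(v, v_rotate_left<2>(v));  // lane i += lanes i-2, i-3
    v = v_add(v, v_rotate_left<4>(v));  // three steps cover all 8 lanes
    return v;
}
#endif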
@ -339,7 +339,7 @@ struct Integral_SIMD<uchar, int, double>
v_int32 prev = vx_setzero_s32();
int j = 0;
for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
for ( ; j + VTraits<v_uint16>::vlanes() <= width; j += VTraits<v_uint16>::vlanes())
{
v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
v_int32 el4l, el4h;
@ -356,8 +356,8 @@ struct Integral_SIMD<uchar, int, double>
#endif
#endif
v_expand(el8, el4l, el4h);
el4l += prev;
el4h += el4l;
el4l = v_add(el4l, prev);
el4h = v_add(el4h, el4l);
#if CV_SIMD_WIDTH == 16
prev = el4h;
#elif CV_SIMD_WIDTH == 32
@ -368,8 +368,8 @@ struct Integral_SIMD<uchar, int, double>
prev = v_combine_low(t, t);
#endif
#endif
v_store(sum_row + j , el4l + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes));
v_store(sum_row + j , v_add(el4l, vx_load(prev_sum_row + j)));
v_store(sum_row + j + VTraits<v_int32>::vlanes(), v_add(el4h, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes())));
}
for (int v4 = sum_row[j - 1] - prev_sum_row[j - 1],
@ -426,7 +426,7 @@ struct Integral_SIMD<uchar, float, double>
v_float32 prev = vx_setzero_f32();
int j = 0;
for (; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
for (; j + VTraits<v_uint16>::vlanes() <= width; j += VTraits<v_uint16>::vlanes())
{
v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
v_float32 el4l, el4h;
@ -439,8 +439,8 @@ struct Integral_SIMD<uchar, float, double>
el4h.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum))), _mm256_permutevar8x32_ps(el4l.val, shmask));
prev.val = _mm256_permutevar8x32_ps(el4h.val, shmask);
#else
el8 += v_rotate_left<1>(el8);
el8 += v_rotate_left<2>(el8);
el8 = v_add(el8, v_rotate_left<1>(el8));
el8 = v_add(el8, v_rotate_left<2>(el8));
#if CV_SIMD_WIDTH >= 32
el8 += v_rotate_left<4>(el8);
#if CV_SIMD_WIDTH == 64
@ -449,12 +449,12 @@ struct Integral_SIMD<uchar, float, double>
#endif
v_int32 el4li, el4hi;
v_expand(el8, el4li, el4hi);
el4l = v_cvt_f32(el4li) + prev;
el4h = v_cvt_f32(el4hi) + el4l;
prev = v_broadcast_element<v_float32::nlanes - 1>(el4h);
el4l = v_add(v_cvt_f32(el4li), prev);
el4h = v_add(v_cvt_f32(el4hi), el4l);
prev = v_broadcast_highest(el4h);
#endif
v_store(sum_row + j , el4l + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes));
v_store(sum_row + j , v_add(el4l, vx_load(prev_sum_row + j)));
v_store(sum_row + j + VTraits<v_float32>::vlanes(), v_add(el4h, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes())));
}
for (float v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
@ -475,11 +475,11 @@ struct Integral_SIMD<uchar, float, double>
v_float32 prev_1 = vx_setzero_f32(), prev_2 = vx_setzero_f32();
int j = 0;
for (; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn)
for (; j + VTraits<v_uint16>::vlanes() * cn <= width; j += VTraits<v_uint16>::vlanes() * cn)
{
v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j));
v_int16 el8_1 = v_src_row & mask;
v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8);
v_int16 el8_1 = v_and(v_src_row, mask);
v_int16 el8_2 = v_reinterpret_as_s16(v_shr<8>(v_reinterpret_as_u16(v_src_row)));
v_float32 el4l_1, el4h_1, el4l_2, el4h_2;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2));
@ -496,10 +496,10 @@ struct Integral_SIMD<uchar, float, double>
prev_1.val = _mm256_permutevar8x32_ps(el4h_1.val, shmask);
prev_2.val = _mm256_permutevar8x32_ps(el4h_2.val, shmask);
#else
el8_1 += v_rotate_left<1>(el8_1);
el8_2 += v_rotate_left<1>(el8_2);
el8_1 += v_rotate_left<2>(el8_1);
el8_2 += v_rotate_left<2>(el8_2);
el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2));
el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2));
#if CV_SIMD_WIDTH >= 32
el8_1 += v_rotate_left<4>(el8_1);
el8_2 += v_rotate_left<4>(el8_2);
@ -511,20 +511,20 @@ struct Integral_SIMD<uchar, float, double>
v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2;
v_expand(el8_1, el4li_1, el4hi_1);
v_expand(el8_2, el4li_2, el4hi_2);
el4l_1 = v_cvt_f32(el4li_1) + prev_1;
el4l_2 = v_cvt_f32(el4li_2) + prev_2;
el4h_1 = v_cvt_f32(el4hi_1) + el4l_1;
el4h_2 = v_cvt_f32(el4hi_2) + el4l_2;
prev_1 = v_broadcast_element<v_float32::nlanes - 1>(el4h_1);
prev_2 = v_broadcast_element<v_float32::nlanes - 1>(el4h_2);
el4l_1 = v_add(v_cvt_f32(el4li_1), prev_1);
el4l_2 = v_add(v_cvt_f32(el4li_2), prev_2);
el4h_1 = v_add(v_cvt_f32(el4hi_1), el4l_1);
el4h_2 = v_add(v_cvt_f32(el4hi_2), el4l_2);
prev_1 = v_broadcast_highest(el4h_1);
prev_2 = v_broadcast_highest(el4h_2);
#endif
v_float32 el4_1, el4_2, el4_3, el4_4;
v_zip(el4l_1, el4l_2, el4_1, el4_2);
v_zip(el4h_1, el4h_2, el4_3, el4_4);
v_store(sum_row + j , el4_1 + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_float32::nlanes , el4_2 + vx_load(prev_sum_row + j + v_float32::nlanes ));
v_store(sum_row + j + v_float32::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 2));
v_store(sum_row + j + v_float32::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_float32::nlanes * 3));
v_store(sum_row + j , v_add(el4_1, vx_load(prev_sum_row + j)));
v_store(sum_row + j + VTraits<v_float32>::vlanes() , v_add(el4_2, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes())));
v_store(sum_row + j + VTraits<v_float32>::vlanes() * 2, v_add(el4_3, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes() * 2)));
v_store(sum_row + j + VTraits<v_float32>::vlanes() * 3, v_add(el4_4, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes() * 3)));
}
for (float v2 = sum_row[j - 1] - prev_sum_row[j - 1],
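The two-channel rows split even and odd bytes with a mask and an immediate shift; the named forms are v_and and v_shr<imm>. Standalone, the split looks like this (a sketch assuming CV_SIMD128; the mask constant mirrors the one used by the surrounding code):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD128
static inline void split_even_odd_u8(const uchar* p, v_int16x8& even, v_int16x8& odd)
{
    v_int16x8 v = v_reinterpret_as_s16(v_load(p));                   // c0 c1 c0 c1 ... packed bytes
    even = v_and(v, v_setall_s16(0x00FF));                           // keep the low byte of each pair
    odd  = v_reinterpret_as_s16(v_shr<8>(v_reinterpret_as_u16(v)));  // keep the high byte of each pair
}
#endif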
@ -543,7 +543,7 @@ struct Integral_SIMD<uchar, float, double>
const uchar * src_row = src + _srcstep * i;
float * prev_sum_row = (float *)((uchar *)sum + _sumstep * i) + cn;
float * sum_row = (float *)((uchar *)sum + _sumstep * (i + 1)) + cn;
float row_cache[v_float32::nlanes * 6];
float row_cache[VTraits<v_float32>::max_nlanes * 6];
sum_row[-1] = sum_row[-2] = sum_row[-3] = 0;
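Stack buffers such as row_cache need a compile-time size, which is what VTraits<T>::max_nlanes provides, while the loop strides use the possibly runtime VTraits<T>::vlanes(). A minimal sketch of that split, assuming a CV_SIMD build; the function and the scratch round-trip (mirroring the row_cache idiom above) are illustrative only:

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD
static void add_rows(float* dst, const float* src, int n)
{
    float cache[VTraits<v_float32>::max_nlanes];    // array size: compile-time upper bound
    const int step = VTraits<v_float32>::vlanes();  // loop stride: actual lane count
    int i = 0;
    for (; i + step <= n; i += step)
    {
        v_store(cache, v_add(vx_load(dst + i), vx_load(src + i)));
        v_store(dst + i, vx_load(cache));           // round-trip through the scratch buffer
    }
    for (; i < n; ++i)                              // scalar tail
        dst[i] += src[i];
}
#endif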
@ -551,10 +551,10 @@ struct Integral_SIMD<uchar, float, double>
prev_3 = vx_setzero_f32();
int j = 0;
const int j_max =
((_srcstep * i + (width - v_uint16::nlanes * cn + v_uint8::nlanes * cn)) >= _srcstep * height)
? width - v_uint8::nlanes * cn // uint8 in v_load_deinterleave()
: width - v_uint16::nlanes * cn; // v_expand_low
for ( ; j <= j_max; j += v_uint16::nlanes * cn)
((_srcstep * i + (width - VTraits<v_uint16>::vlanes() * cn + VTraits<v_uint8>::vlanes() * cn)) >= _srcstep * height)
? width - VTraits<v_uint8>::vlanes() * cn // uint8 in v_load_deinterleave()
: width - VTraits<v_uint16>::vlanes() * cn; // v_expand_low
for ( ; j <= j_max; j += VTraits<v_uint16>::vlanes() * cn)
{
v_uint8 v_src_row_1, v_src_row_2, v_src_row_3;
v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3);
@ -583,12 +583,12 @@ struct Integral_SIMD<uchar, float, double>
prev_2.val = _mm256_permutevar8x32_ps(el4h_2.val, shmask);
prev_3.val = _mm256_permutevar8x32_ps(el4h_3.val, shmask);
#else
el8_1 += v_rotate_left<1>(el8_1);
el8_2 += v_rotate_left<1>(el8_2);
el8_3 += v_rotate_left<1>(el8_3);
el8_1 += v_rotate_left<2>(el8_1);
el8_2 += v_rotate_left<2>(el8_2);
el8_3 += v_rotate_left<2>(el8_3);
el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2));
el8_3 = v_add(el8_3, v_rotate_left<1>(el8_3));
el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2));
el8_3 = v_add(el8_3, v_rotate_left<2>(el8_3));
#if CV_SIMD_WIDTH >= 32
el8_1 += v_rotate_left<4>(el8_1);
el8_2 += v_rotate_left<4>(el8_2);
@ -603,30 +603,30 @@ struct Integral_SIMD<uchar, float, double>
v_expand(el8_1, el4li_1, el4hi_1);
v_expand(el8_2, el4li_2, el4hi_2);
v_expand(el8_3, el4li_3, el4hi_3);
el4l_1 = v_cvt_f32(el4li_1) + prev_1;
el4l_2 = v_cvt_f32(el4li_2) + prev_2;
el4l_3 = v_cvt_f32(el4li_3) + prev_3;
el4h_1 = v_cvt_f32(el4hi_1) + el4l_1;
el4h_2 = v_cvt_f32(el4hi_2) + el4l_2;
el4h_3 = v_cvt_f32(el4hi_3) + el4l_3;
prev_1 = v_broadcast_element<v_float32::nlanes - 1>(el4h_1);
prev_2 = v_broadcast_element<v_float32::nlanes - 1>(el4h_2);
prev_3 = v_broadcast_element<v_float32::nlanes - 1>(el4h_3);
el4l_1 = v_add(v_cvt_f32(el4li_1), prev_1);
el4l_2 = v_add(v_cvt_f32(el4li_2), prev_2);
el4l_3 = v_add(v_cvt_f32(el4li_3), prev_3);
el4h_1 = v_add(v_cvt_f32(el4hi_1), el4l_1);
el4h_2 = v_add(v_cvt_f32(el4hi_2), el4l_2);
el4h_3 = v_add(v_cvt_f32(el4hi_3), el4l_3);
prev_1 = v_broadcast_highest(el4h_1);
prev_2 = v_broadcast_highest(el4h_2);
prev_3 = v_broadcast_highest(el4h_3);
#endif
v_store_interleave(row_cache , el4l_1, el4l_2, el4l_3);
v_store_interleave(row_cache + v_float32::nlanes * 3, el4h_1, el4h_2, el4h_3);
v_store_interleave(row_cache + VTraits<v_float32>::vlanes() * 3, el4h_1, el4h_2, el4h_3);
el4l_1 = vx_load(row_cache );
el4l_2 = vx_load(row_cache + v_float32::nlanes );
el4l_3 = vx_load(row_cache + v_float32::nlanes * 2);
el4h_1 = vx_load(row_cache + v_float32::nlanes * 3);
el4h_2 = vx_load(row_cache + v_float32::nlanes * 4);
el4h_3 = vx_load(row_cache + v_float32::nlanes * 5);
v_store(sum_row + j , el4l_1 + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_float32::nlanes , el4l_2 + vx_load(prev_sum_row + j + v_float32::nlanes ));
v_store(sum_row + j + v_float32::nlanes * 2, el4l_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 2));
v_store(sum_row + j + v_float32::nlanes * 3, el4h_1 + vx_load(prev_sum_row + j + v_float32::nlanes * 3));
v_store(sum_row + j + v_float32::nlanes * 4, el4h_2 + vx_load(prev_sum_row + j + v_float32::nlanes * 4));
v_store(sum_row + j + v_float32::nlanes * 5, el4h_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 5));
el4l_2 = vx_load(row_cache + VTraits<v_float32>::vlanes() );
el4l_3 = vx_load(row_cache + VTraits<v_float32>::vlanes() * 2);
el4h_1 = vx_load(row_cache + VTraits<v_float32>::vlanes() * 3);
el4h_2 = vx_load(row_cache + VTraits<v_float32>::vlanes() * 4);
el4h_3 = vx_load(row_cache + VTraits<v_float32>::vlanes() * 5);
v_store(sum_row + j , v_add(el4l_1, vx_load(prev_sum_row + j)));
v_store(sum_row + j + VTraits<v_float32>::vlanes() , v_add(el4l_2, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes())));
v_store(sum_row + j + VTraits<v_float32>::vlanes() * 2, v_add(el4l_3, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes() * 2)));
v_store(sum_row + j + VTraits<v_float32>::vlanes() * 3, v_add(el4h_1, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes() * 3)));
v_store(sum_row + j + VTraits<v_float32>::vlanes() * 4, v_add(el4h_2, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes() * 4)));
v_store(sum_row + j + VTraits<v_float32>::vlanes() * 5, v_add(el4h_3, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes() * 5)));
}
for (float v3 = sum_row[j - 1] - prev_sum_row[j - 1],
@ -652,7 +652,7 @@ struct Integral_SIMD<uchar, float, double>
v_float32 prev = vx_setzero_f32();
int j = 0;
for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
for ( ; j + VTraits<v_uint16>::vlanes() <= width; j += VTraits<v_uint16>::vlanes())
{
v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
v_float32 el4l, el4h;
@ -670,8 +670,8 @@ struct Integral_SIMD<uchar, float, double>
#endif
v_int32 el4li, el4hi;
v_expand(el8, el4li, el4hi);
el4l = v_cvt_f32(el4li) + prev;
el4h = v_cvt_f32(el4hi) + el4l;
el4l = v_add(v_cvt_f32(el4li), prev);
el4h = v_add(v_cvt_f32(el4hi), el4l);
#if CV_SIMD_WIDTH == 16
prev = el4h;
#elif CV_SIMD_WIDTH == 32
@ -682,8 +682,8 @@ struct Integral_SIMD<uchar, float, double>
prev = v_combine_low(t, t);
#endif
#endif
v_store(sum_row + j , el4l + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes));
v_store(sum_row + j , v_add(el4l, vx_load(prev_sum_row + j)));
v_store(sum_row + j + VTraits<v_float32>::vlanes(), v_add(el4h, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes())));
}
for (float v4 = sum_row[j - 1] - prev_sum_row[j - 1],
@ -750,7 +750,7 @@ struct Integral_SIMD<uchar, double, double>
v_float64 prev = vx_setzero_f64();
int j = 0;
for (; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
for (; j + VTraits<v_uint16>::vlanes() <= width; j += VTraits<v_uint16>::vlanes())
{
v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
v_float64 el4ll, el4lh, el4hl, el4hh;
@ -767,8 +767,8 @@ struct Integral_SIMD<uchar, double, double>
el4hh.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h_32)), el4d);
prev.val = _mm256_permute4x64_pd(el4hh.val, 0xff);
#else
el8 += v_rotate_left<1>(el8);
el8 += v_rotate_left<2>(el8);
el8 = v_add(el8, v_rotate_left<1>(el8));
el8 = v_add(el8, v_rotate_left<2>(el8));
#if CV_SIMD_WIDTH >= 32
el8 += v_rotate_left<4>(el8);
#if CV_SIMD_WIDTH == 64
@ -777,17 +777,17 @@ struct Integral_SIMD<uchar, double, double>
#endif
v_int32 el4li, el4hi;
v_expand(el8, el4li, el4hi);
el4ll = v_cvt_f64(el4li) + prev;
el4lh = v_cvt_f64_high(el4li) + prev;
el4hl = v_cvt_f64(el4hi) + el4ll;
el4hh = v_cvt_f64_high(el4hi) + el4lh;
prev = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh));
// prev = v_broadcast_element<v_float64::nlanes - 1>(el4hh);
el4ll = v_add(v_cvt_f64(el4li), prev);
el4lh = v_add(v_cvt_f64_high(el4li), prev);
el4hl = v_add(v_cvt_f64(el4hi), el4ll);
el4hh = v_add(v_cvt_f64_high(el4hi), el4lh);
prev = vx_setall_f64(v_extract_highest(el4hh));
// prev = v_broadcast_highest(el4hh);
#endif
v_store(sum_row + j , el4ll + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_float64::nlanes , el4lh + vx_load(prev_sum_row + j + v_float64::nlanes ));
v_store(sum_row + j + v_float64::nlanes * 2, el4hl + vx_load(prev_sum_row + j + v_float64::nlanes * 2));
v_store(sum_row + j + v_float64::nlanes * 3, el4hh + vx_load(prev_sum_row + j + v_float64::nlanes * 3));
v_store(sum_row + j , v_add(el4ll, vx_load(prev_sum_row + j)));
v_store(sum_row + j + VTraits<v_float64>::vlanes() , v_add(el4lh, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes())));
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 2, v_add(el4hl, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 2)));
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 3, v_add(el4hh, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 3)));
}
for (double v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
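For the float64 accumulators the patch keeps the scalar splat, with v_extract_highest replacing v_extract_n<nlanes - 1> (the vector-side v_broadcast_highest stays commented out, as above). Standalone, the carry update is (a sketch assuming CV_SIMD and CV_SIMD_64F; the helper name is illustrative):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD && CV_SIMD_64F
// splat the highest lane of the running sum into the next iteration's carry
static inline v_float64 carry_highest(const v_float64& acc)
{
    return vx_setall_f64(v_extract_highest(acc));
}
#endif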
@ -808,11 +808,11 @@ struct Integral_SIMD<uchar, double, double>
v_float64 prev_1 = vx_setzero_f64(), prev_2 = vx_setzero_f64();
int j = 0;
for (; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn)
for (; j + VTraits<v_uint16>::vlanes() * cn <= width; j += VTraits<v_uint16>::vlanes() * cn)
{
v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j));
v_int16 el8_1 = v_src_row & mask;
v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8);
v_int16 el8_1 = v_and(v_src_row, mask);
v_int16 el8_2 = v_reinterpret_as_s16(v_shr<8>(v_reinterpret_as_u16(v_src_row)));
v_float64 el4ll_1, el4lh_1, el4hl_1, el4hh_1, el4ll_2, el4lh_2, el4hl_2, el4hh_2;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2));
@ -838,10 +838,10 @@ struct Integral_SIMD<uchar, double, double>
prev_1.val = _mm256_permute4x64_pd(el4hh_1.val, 0xff);
prev_2.val = _mm256_permute4x64_pd(el4hh_2.val, 0xff);
#else
el8_1 += v_rotate_left<1>(el8_1);
el8_2 += v_rotate_left<1>(el8_2);
el8_1 += v_rotate_left<2>(el8_1);
el8_2 += v_rotate_left<2>(el8_2);
el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2));
el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2));
#if CV_SIMD_WIDTH >= 32
el8_1 += v_rotate_left<4>(el8_1);
el8_2 += v_rotate_left<4>(el8_2);
@ -853,32 +853,32 @@ struct Integral_SIMD<uchar, double, double>
v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2;
v_expand(el8_1, el4li_1, el4hi_1);
v_expand(el8_2, el4li_2, el4hi_2);
el4ll_1 = v_cvt_f64(el4li_1) + prev_1;
el4ll_2 = v_cvt_f64(el4li_2) + prev_2;
el4lh_1 = v_cvt_f64_high(el4li_1) + prev_1;
el4lh_2 = v_cvt_f64_high(el4li_2) + prev_2;
el4hl_1 = v_cvt_f64(el4hi_1) + el4ll_1;
el4hl_2 = v_cvt_f64(el4hi_2) + el4ll_2;
el4hh_1 = v_cvt_f64_high(el4hi_1) + el4lh_1;
el4hh_2 = v_cvt_f64_high(el4hi_2) + el4lh_2;
prev_1 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_1));
prev_2 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_2));
// prev_1 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_1);
// prev_2 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_2);
el4ll_1 = v_add(v_cvt_f64(el4li_1), prev_1);
el4ll_2 = v_add(v_cvt_f64(el4li_2), prev_2);
el4lh_1 = v_add(v_cvt_f64_high(el4li_1), prev_1);
el4lh_2 = v_add(v_cvt_f64_high(el4li_2), prev_2);
el4hl_1 = v_add(v_cvt_f64(el4hi_1), el4ll_1);
el4hl_2 = v_add(v_cvt_f64(el4hi_2), el4ll_2);
el4hh_1 = v_add(v_cvt_f64_high(el4hi_1), el4lh_1);
el4hh_2 = v_add(v_cvt_f64_high(el4hi_2), el4lh_2);
prev_1 = vx_setall_f64(v_extract_highest(el4hh_1));
prev_2 = vx_setall_f64(v_extract_highest(el4hh_2));
// prev_1 = v_broadcast_highest(el4hh_1);
// prev_2 = v_broadcast_highest(el4hh_2);
#endif
v_float64 el4_1, el4_2, el4_3, el4_4, el4_5, el4_6, el4_7, el4_8;
v_zip(el4ll_1, el4ll_2, el4_1, el4_2);
v_zip(el4lh_1, el4lh_2, el4_3, el4_4);
v_zip(el4hl_1, el4hl_2, el4_5, el4_6);
v_zip(el4hh_1, el4hh_2, el4_7, el4_8);
v_store(sum_row + j , el4_1 + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_float64::nlanes , el4_2 + vx_load(prev_sum_row + j + v_float64::nlanes ));
v_store(sum_row + j + v_float64::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 2));
v_store(sum_row + j + v_float64::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_float64::nlanes * 3));
v_store(sum_row + j + v_float64::nlanes * 4, el4_5 + vx_load(prev_sum_row + j + v_float64::nlanes * 4));
v_store(sum_row + j + v_float64::nlanes * 5, el4_6 + vx_load(prev_sum_row + j + v_float64::nlanes * 5));
v_store(sum_row + j + v_float64::nlanes * 6, el4_7 + vx_load(prev_sum_row + j + v_float64::nlanes * 6));
v_store(sum_row + j + v_float64::nlanes * 7, el4_8 + vx_load(prev_sum_row + j + v_float64::nlanes * 7));
v_store(sum_row + j , v_add(el4_1, vx_load(prev_sum_row + j)));
v_store(sum_row + j + VTraits<v_float64>::vlanes() , v_add(el4_2, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes())));
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 2, v_add(el4_3, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 2)));
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 3, v_add(el4_4, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 3)));
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 4, v_add(el4_5, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 4)));
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 5, v_add(el4_6, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 5)));
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 6, v_add(el4_7, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 6)));
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 7, v_add(el4_8, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 7)));
}
for (double v2 = sum_row[j - 1] - prev_sum_row[j - 1],
@ -897,7 +897,7 @@ struct Integral_SIMD<uchar, double, double>
const uchar * src_row = src + _srcstep * i;
double * prev_sum_row = (double *)((uchar *)sum + _sumstep * i) + cn;
double * sum_row = (double *)((uchar *)sum + _sumstep * (i + 1)) + cn;
double row_cache[v_float64::nlanes * 12];
double row_cache[VTraits<v_float64>::max_nlanes * 12];
sum_row[-1] = sum_row[-2] = sum_row[-3] = 0;
@ -905,10 +905,10 @@ struct Integral_SIMD<uchar, double, double>
prev_3 = vx_setzero_f64();
int j = 0;
const int j_max =
((_srcstep * i + (width - v_uint16::nlanes * cn + v_uint8::nlanes * cn)) >= _srcstep * height)
? width - v_uint8::nlanes * cn // uint8 in v_load_deinterleave()
: width - v_uint16::nlanes * cn; // v_expand_low
for ( ; j <= j_max; j += v_uint16::nlanes * cn)
((_srcstep * i + (width - VTraits<v_uint16>::vlanes() * cn + VTraits<v_uint8>::vlanes() * cn)) >= _srcstep * height)
? width - VTraits<v_uint8>::vlanes() * cn // uint8 in v_load_deinterleave()
: width - VTraits<v_uint16>::vlanes() * cn; // v_expand_low
for ( ; j <= j_max; j += VTraits<v_uint16>::vlanes() * cn)
{
v_uint8 v_src_row_1, v_src_row_2, v_src_row_3;
v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3);
@ -951,12 +951,12 @@ struct Integral_SIMD<uchar, double, double>
prev_2.val = _mm256_permute4x64_pd(el4hh_2.val, 0xff);
prev_3.val = _mm256_permute4x64_pd(el4hh_3.val, 0xff);
#else
el8_1 += v_rotate_left<1>(el8_1);
el8_2 += v_rotate_left<1>(el8_2);
el8_3 += v_rotate_left<1>(el8_3);
el8_1 += v_rotate_left<2>(el8_1);
el8_2 += v_rotate_left<2>(el8_2);
el8_3 += v_rotate_left<2>(el8_3);
el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2));
el8_3 = v_add(el8_3, v_rotate_left<1>(el8_3));
el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2));
el8_3 = v_add(el8_3, v_rotate_left<2>(el8_3));
#if CV_SIMD_WIDTH >= 32
el8_1 += v_rotate_left<4>(el8_1);
el8_2 += v_rotate_left<4>(el8_2);
@ -971,53 +971,53 @@ struct Integral_SIMD<uchar, double, double>
v_expand(el8_1, el4li_1, el4hi_1);
v_expand(el8_2, el4li_2, el4hi_2);
v_expand(el8_3, el4li_3, el4hi_3);
el4ll_1 = v_cvt_f64(el4li_1) + prev_1;
el4ll_2 = v_cvt_f64(el4li_2) + prev_2;
el4ll_3 = v_cvt_f64(el4li_3) + prev_3;
el4lh_1 = v_cvt_f64_high(el4li_1) + prev_1;
el4lh_2 = v_cvt_f64_high(el4li_2) + prev_2;
el4lh_3 = v_cvt_f64_high(el4li_3) + prev_3;
el4hl_1 = v_cvt_f64(el4hi_1) + el4ll_1;
el4hl_2 = v_cvt_f64(el4hi_2) + el4ll_2;
el4hl_3 = v_cvt_f64(el4hi_3) + el4ll_3;
el4hh_1 = v_cvt_f64_high(el4hi_1) + el4lh_1;
el4hh_2 = v_cvt_f64_high(el4hi_2) + el4lh_2;
el4hh_3 = v_cvt_f64_high(el4hi_3) + el4lh_3;
prev_1 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_1));
prev_2 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_2));
prev_3 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_3));
// prev_1 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_1);
// prev_2 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_2);
// prev_3 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_3);
el4ll_1 = v_add(v_cvt_f64(el4li_1), prev_1);
el4ll_2 = v_add(v_cvt_f64(el4li_2), prev_2);
el4ll_3 = v_add(v_cvt_f64(el4li_3), prev_3);
el4lh_1 = v_add(v_cvt_f64_high(el4li_1), prev_1);
el4lh_2 = v_add(v_cvt_f64_high(el4li_2), prev_2);
el4lh_3 = v_add(v_cvt_f64_high(el4li_3), prev_3);
el4hl_1 = v_add(v_cvt_f64(el4hi_1), el4ll_1);
el4hl_2 = v_add(v_cvt_f64(el4hi_2), el4ll_2);
el4hl_3 = v_add(v_cvt_f64(el4hi_3), el4ll_3);
el4hh_1 = v_add(v_cvt_f64_high(el4hi_1), el4lh_1);
el4hh_2 = v_add(v_cvt_f64_high(el4hi_2), el4lh_2);
el4hh_3 = v_add(v_cvt_f64_high(el4hi_3), el4lh_3);
prev_1 = vx_setall_f64(v_extract_highest(el4hh_1));
prev_2 = vx_setall_f64(v_extract_highest(el4hh_2));
prev_3 = vx_setall_f64(v_extract_highest(el4hh_3));
// prev_1 = v_broadcast_highest(el4hh_1);
// prev_2 = v_broadcast_highest(el4hh_2);
// prev_3 = v_broadcast_highest(el4hh_3);
#endif
v_store_interleave(row_cache , el4ll_1, el4ll_2, el4ll_3);
v_store_interleave(row_cache + v_float64::nlanes * 3, el4lh_1, el4lh_2, el4lh_3);
v_store_interleave(row_cache + v_float64::nlanes * 6, el4hl_1, el4hl_2, el4hl_3);
v_store_interleave(row_cache + v_float64::nlanes * 9, el4hh_1, el4hh_2, el4hh_3);
v_store_interleave(row_cache + VTraits<v_float64>::vlanes() * 3, el4lh_1, el4lh_2, el4lh_3);
v_store_interleave(row_cache + VTraits<v_float64>::vlanes() * 6, el4hl_1, el4hl_2, el4hl_3);
v_store_interleave(row_cache + VTraits<v_float64>::vlanes() * 9, el4hh_1, el4hh_2, el4hh_3);
el4ll_1 = vx_load(row_cache );
el4ll_2 = vx_load(row_cache + v_float64::nlanes );
el4ll_3 = vx_load(row_cache + v_float64::nlanes * 2 );
el4lh_1 = vx_load(row_cache + v_float64::nlanes * 3 );
el4lh_2 = vx_load(row_cache + v_float64::nlanes * 4 );
el4lh_3 = vx_load(row_cache + v_float64::nlanes * 5 );
el4hl_1 = vx_load(row_cache + v_float64::nlanes * 6 );
el4hl_2 = vx_load(row_cache + v_float64::nlanes * 7 );
el4hl_3 = vx_load(row_cache + v_float64::nlanes * 8 );
el4hh_1 = vx_load(row_cache + v_float64::nlanes * 9 );
el4hh_2 = vx_load(row_cache + v_float64::nlanes * 10);
el4hh_3 = vx_load(row_cache + v_float64::nlanes * 11);
v_store(sum_row + j , el4ll_1 + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_float64::nlanes , el4ll_2 + vx_load(prev_sum_row + j + v_float64::nlanes ));
v_store(sum_row + j + v_float64::nlanes * 2 , el4ll_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 2 ));
v_store(sum_row + j + v_float64::nlanes * 3 , el4lh_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 3 ));
v_store(sum_row + j + v_float64::nlanes * 4 , el4lh_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 4 ));
v_store(sum_row + j + v_float64::nlanes * 5 , el4lh_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 5 ));
v_store(sum_row + j + v_float64::nlanes * 6 , el4hl_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 6 ));
v_store(sum_row + j + v_float64::nlanes * 7 , el4hl_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 7 ));
v_store(sum_row + j + v_float64::nlanes * 8 , el4hl_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 8 ));
v_store(sum_row + j + v_float64::nlanes * 9 , el4hh_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 9 ));
v_store(sum_row + j + v_float64::nlanes * 10, el4hh_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 10));
v_store(sum_row + j + v_float64::nlanes * 11, el4hh_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 11));
el4ll_2 = vx_load(row_cache + VTraits<v_float64>::vlanes() );
el4ll_3 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 2 );
el4lh_1 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 3 );
el4lh_2 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 4 );
el4lh_3 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 5 );
el4hl_1 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 6 );
el4hl_2 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 7 );
el4hl_3 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 8 );
el4hh_1 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 9 );
el4hh_2 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 10);
el4hh_3 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 11);
v_store(sum_row + j , v_add(el4ll_1, vx_load(prev_sum_row + j)));
v_store(sum_row + j + VTraits<v_float64>::vlanes() , v_add(el4ll_2, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes())));
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 2 , v_add(el4ll_3, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 2)));
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 3 , v_add(el4lh_1, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 3)));
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 4 , v_add(el4lh_2, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 4)));
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 5 , v_add(el4lh_3, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 5)));
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 6 , v_add(el4hl_1, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 6)));
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 7 , v_add(el4hl_2, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 7)));
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 8 , v_add(el4hl_3, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 8)));
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 9 , v_add(el4hh_1, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 9)));
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 10, v_add(el4hh_2, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 10)));
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 11, v_add(el4hh_3, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 11)));
}
for (double v3 = sum_row[j - 1] - prev_sum_row[j - 1],
@ -1043,7 +1043,7 @@ struct Integral_SIMD<uchar, double, double>
v_float64 prev_1 = vx_setzero_f64(), prev_2 = vx_setzero_f64();
int j = 0;
for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
for ( ; j + VTraits<v_uint16>::vlanes() <= width; j += VTraits<v_uint16>::vlanes())
{
v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
v_float64 el4ll, el4lh, el4hl, el4hh;
@ -1065,10 +1065,10 @@ struct Integral_SIMD<uchar, double, double>
#endif
v_int32 el4li, el4hi;
v_expand(el8, el4li, el4hi);
el4ll = v_cvt_f64(el4li) + prev_1;
el4lh = v_cvt_f64_high(el4li) + prev_2;
el4hl = v_cvt_f64(el4hi) + el4ll;
el4hh = v_cvt_f64_high(el4hi) + el4lh;
el4ll = v_add(v_cvt_f64(el4li), prev_1);
el4lh = v_add(v_cvt_f64_high(el4li), prev_2);
el4hl = v_add(v_cvt_f64(el4hi), el4ll);
el4hh = v_add(v_cvt_f64_high(el4hi), el4lh);
#if CV_SIMD_WIDTH == 16
prev_1 = el4hl;
prev_2 = el4hh;
@ -1078,10 +1078,10 @@ struct Integral_SIMD<uchar, double, double>
prev_1 = prev_2 = v_combine_high(el4hh, el4hh);
#endif
#endif
v_store(sum_row + j , el4ll + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_float64::nlanes , el4lh + vx_load(prev_sum_row + j + v_float64::nlanes ));
v_store(sum_row + j + v_float64::nlanes * 2, el4hl + vx_load(prev_sum_row + j + v_float64::nlanes * 2));
v_store(sum_row + j + v_float64::nlanes * 3, el4hh + vx_load(prev_sum_row + j + v_float64::nlanes * 3));
v_store(sum_row + j , v_add(el4ll, vx_load(prev_sum_row + j)));
v_store(sum_row + j + VTraits<v_float64>::vlanes() , v_add(el4lh, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes())));
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 2, v_add(el4hl, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 2)));
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 3, v_add(el4hh, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 3)));
}
for (double v4 = sum_row[j - 1] - prev_sum_row[j - 1],

View File

@ -268,13 +268,13 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp
for ( i = 0; i < 256; i += 4)
{
v_store(_data + i, v_sqrt(idx));
idx += ifour;
idx = v_add(idx, ifour);
}
else
for ( i = 0; i < 256; i += 4)
{
v_store(_data + i, idx);
idx += ifour;
idx = v_add(idx, ifour);
}
#else
if( gammaCorrection )
@ -320,7 +320,7 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp
for ( ; x <= end - 4; x += 4)
{
v_int32x4 mul_res = v_load(xmap + x);
mul_res += mul_res + mul_res;
mul_res = v_add(mul_res, v_add(mul_res, mul_res));
v_store(xmap + x, mul_res);
}
#endif
@ -444,34 +444,34 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp
{
int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3];
v_float32x4 _dx0 = v_load(lutCurr+x+widthP2*0+2) - v_load(lutCurr+x+widthP2*0);
v_float32x4 _dx1 = v_load(lutCurr+x+widthP2*1+2) - v_load(lutCurr+x+widthP2*1);
v_float32x4 _dx2 = v_load(lutCurr+x+widthP2*2+2) - v_load(lutCurr+x+widthP2*2);
v_float32x4 _dx0 = v_sub(v_load(lutCurr + x + widthP2 * 0 + 2), v_load(lutCurr + x + widthP2 * 0));
v_float32x4 _dx1 = v_sub(v_load(lutCurr + x + widthP2 * 1 + 2), v_load(lutCurr + x + widthP2 * 1));
v_float32x4 _dx2 = v_sub(v_load(lutCurr + x + widthP2 * 2 + 2), v_load(lutCurr + x + widthP2 * 2));
v_float32x4 _dy00 = v_float32x4(lut[nextPtr[x0+0]], lut[nextPtr[x1+0]], lut[nextPtr[x2+0]], lut[nextPtr[x3+0]]);
v_float32x4 _dy0 = _dy00 - v_load(lutPrev+x+widthP2*0+1);
v_float32x4 _dy0 = v_sub(_dy00, v_load(lutPrev + x + widthP2 * 0 + 1));
v_store(lutNext+x+widthP2*0+1, _dy00);
v_float32x4 _dy10 = v_float32x4(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]);
v_float32x4 _dy1 = _dy10 - v_load(lutPrev+x+widthP2*1+1);
v_float32x4 _dy1 = v_sub(_dy10, v_load(lutPrev + x + widthP2 * 1 + 1));
v_store(lutNext+x+widthP2*1+1, _dy10);
v_float32x4 _dy20 = v_float32x4(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]);
v_float32x4 _dy2 = _dy20 - v_load(lutPrev+x+widthP2*2+1);
v_float32x4 _dy2 = v_sub(_dy20, v_load(lutPrev + x + widthP2 * 2 + 1));
v_store(lutNext+x+widthP2*2+1, _dy20);
v_float32x4 _mag0 = (_dx0 * _dx0) + (_dy0 * _dy0);
v_float32x4 _mag1 = (_dx1 * _dx1) + (_dy1 * _dy1);
v_float32x4 _mag2 = (_dx2 * _dx2) + (_dy2 * _dy2);
v_float32x4 _mag0 = v_add(v_mul(_dx0, _dx0), v_mul(_dy0, _dy0));
v_float32x4 _mag1 = v_add(v_mul(_dx1, _dx1), v_mul(_dy1, _dy1));
v_float32x4 _mag2 = v_add(v_mul(_dx2, _dx2), v_mul(_dy2, _dy2));
v_float32x4 mask = v_reinterpret_as_f32(_mag2 > _mag1);
v_float32x4 mask = v_reinterpret_as_f32(v_gt(_mag2, _mag1));
_dx2 = v_select(mask, _dx2, _dx1);
_dy2 = v_select(mask, _dy2, _dy1);
mask = v_reinterpret_as_f32(v_max(_mag2, _mag1) > _mag0);
mask = v_reinterpret_as_f32(v_gt(v_max(_mag2, _mag1), _mag0));
_dx2 = v_select(mask, _dx2, _dx0);
_dy2 = v_select(mask, _dy2, _dy0);
@ -537,25 +537,25 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp
int x2 = x << 1;
v_float32x4 _mag = v_load(dbuf + x + (width << 1));
v_float32x4 _angle = v_load(dbuf + x + width * 3);
_angle = (_angleScale * _angle) - fhalf;
_angle = v_sub(v_mul(_angleScale, _angle), fhalf);
v_int32x4 _hidx = v_floor(_angle);
_angle -= v_cvt_f32(_hidx);
_angle = v_sub(_angle, v_cvt_f32(_hidx));
v_float32x4 ft0 = _mag * (fone - _angle);
v_float32x4 ft1 = _mag * _angle;
v_float32x4 ft0 = v_mul(_mag, v_sub(fone, _angle));
v_float32x4 ft1 = v_mul(_mag, _angle);
v_store_interleave(gradPtr + x2, ft0, ft1);
v_int32x4 mask0 = _hidx >> 31;
v_int32x4 it0 = mask0 & _nbins;
mask0 = (_hidx >= _nbins);
v_int32x4 it1 = mask0 & _nbins;
_hidx += (it0 - it1);
v_int32x4 mask0 = v_shr<31>(_hidx);
v_int32x4 it0 = v_and(mask0, _nbins);
mask0 = (v_ge(_hidx, _nbins));
v_int32x4 it1 = v_and(mask0, _nbins);
_hidx = v_add(_hidx, v_sub(it0, it1));
it0 = v_reinterpret_as_s32(v_pack(v_pack(_hidx, izero), v_reinterpret_as_s16(izero)));
_hidx += ione;
_hidx &= (_hidx < _nbins);
_hidx = v_add(_hidx, ione);
_hidx = v_and(_hidx, v_lt(_hidx, _nbins));
it1 = v_reinterpret_as_s32(v_pack(v_pack(_hidx, izero), v_reinterpret_as_s16(izero)));
v_uint8x16 it2, it3;
v_zip(v_reinterpret_as_u8(it0), v_reinterpret_as_u8(it1), it2, it3);
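The histogram-bin adjustment above now spells out the mask arithmetic with v_shr, v_and, v_ge and v_lt. The first wrap step, isolated into a helper (a sketch, assuming CV_SIMD128):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD128
// wrap a signed bin index into [0, nbins), as in the first adjustment above
static inline v_int32x4 wrap_bin(const v_int32x4& hidx, const v_int32x4& nbins)
{
    v_int32x4 add_n = v_and(v_shr<31>(hidx), nbins);   // +nbins where hidx < 0
    v_int32x4 sub_n = v_and(v_ge(hidx, nbins), nbins); // -nbins where hidx >= nbins
    return v_add(hidx, v_sub(add_n, sub_n));
}
#endif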
@ -707,9 +707,9 @@ void HOGCache::init(const HOGDescriptor* _descriptor,
for (; i <= blockSize.height - 4; i += 4)
{
v_float32x4 t = idx - _bh;
t *= t;
idx += ifour;
v_float32x4 t = v_sub(idx, _bh);
t = v_mul(t, t);
idx = v_add(idx, ifour);
v_store(_di + i, t);
}
#endif
@ -725,9 +725,9 @@ void HOGCache::init(const HOGDescriptor* _descriptor,
for (; j <= blockSize.height - 4; j += 4)
{
v_float32x4 t = idx - _bw;
t *= t;
idx += ifour;
v_float32x4 t = v_sub(idx, _bw);
t = v_mul(t, t);
idx = v_add(idx, ifour);
v_store(_dj + j, t);
}
#endif
@ -936,8 +936,8 @@ const float* HOGCache::getBlock(Point pt, float* buf)
int h0 = h[0], h1 = h[1];
v_float32x4 _a0 = v_setall_f32(a[0]), _a1 = v_setall_f32(a[1]);
v_float32x4 w = v_setall_f32(pk.gradWeight) * v_load(pk.histWeights);
v_float32x4 _t0 = _a0 * w, _t1 = _a1 * w;
v_float32x4 w = v_mul(v_setall_f32(pk.gradWeight), v_load(pk.histWeights));
v_float32x4 _t0 = v_mul(_a0, w), _t1 = v_mul(_a1, w);
v_store(hist0, _t0);
v_store(hist1, _t1);
@ -984,8 +984,8 @@ const float* HOGCache::getBlock(Point pt, float* buf)
int h0 = h[0], h1 = h[1];
v_float32x4 _a0 = v_setall_f32(a[0]), _a1 = v_setall_f32(a[1]);
v_float32x4 w = v_setall_f32(pk.gradWeight) * v_load(pk.histWeights);
v_float32x4 _t0 = _a0 * w, _t1 = _a1 * w;
v_float32x4 w = v_mul(v_setall_f32(pk.gradWeight), v_load(pk.histWeights));
v_float32x4 _t0 = v_mul(_a0, w), _t1 = v_mul(_a1, w);
v_store(hist0, _t0);
v_store(hist1, _t1);
@ -1057,12 +1057,12 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
#if CV_SIMD128
v_float32x4 p0 = v_load(hist);
v_float32x4 s = p0 * p0;
v_float32x4 s = v_mul(p0, p0);
for (i = 4; i <= sz - 4; i += 4)
{
p0 = v_load(hist + i);
s += p0 * p0;
s = v_add(s, v_mul(p0, p0));
}
v_store(partSum, s);
#else
@ -1091,17 +1091,17 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
v_float32x4 _scale = v_setall_f32(scale);
static v_float32x4 _threshold = v_setall_f32(thresh);
v_float32x4 p = _scale * v_load(hist);
v_float32x4 p = v_mul(_scale, v_load(hist));
p = v_min(p, _threshold);
s = p * p;
s = v_mul(p, p);
v_store(hist, p);
for(i = 4 ; i <= sz - 4; i += 4)
{
p = v_load(hist + i);
p *= _scale;
p = v_mul(p, _scale);
p = v_min(p, _threshold);
s += p * p;
s = v_add(s, v_mul(p, p));
v_store(hist + i, p);
}
@ -1137,7 +1137,7 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
v_float32x4 _scale2 = v_setall_f32(scale);
for ( ; i <= sz - 4; i += 4)
{
v_float32x4 t = _scale2 * v_load(hist + i);
v_float32x4 t = v_mul(_scale2, v_load(hist + i));
v_store(hist + i, t);
}
#endif
@ -1593,14 +1593,14 @@ void HOGDescriptor::detect(InputArray _img,
#if CV_SIMD128
v_float32x4 _vec = v_load(vec);
v_float32x4 _svmVec = v_load(svmVec);
v_float32x4 sum = _svmVec * _vec;
v_float32x4 sum = v_mul(_svmVec, _vec);
for( k = 4; k <= blockHistogramSize - 4; k += 4 )
{
_vec = v_load(vec + k);
_svmVec = v_load(svmVec + k);
sum += _vec * _svmVec;
sum = v_add(sum, v_mul(_vec, _svmVec));
}
v_store(partSum, sum);
@ -3392,14 +3392,14 @@ void HOGDescriptor::detectROI(InputArray _img, const std::vector<cv::Point> &loc
#if CV_SIMD128
v_float32x4 _vec = v_load(vec);
v_float32x4 _svmVec = v_load(svmVec);
v_float32x4 sum = _svmVec * _vec;
v_float32x4 sum = v_mul(_svmVec, _vec);
for( k = 4; k <= blockHistogramSize - 4; k += 4 )
{
_vec = v_load(vec + k);
_svmVec = v_load(svmVec + k);
sum += _vec * _svmVec;
sum = v_add(sum, v_mul(_vec, _svmVec));
}
v_store(partSum, sum);
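The SVM response loops above all follow the same shape: multiply, accumulate with v_add, then reduce the four partial sums. Standalone, with v_reduce_sum standing in for the patch's partSum buffer (a sketch assuming CV_SIMD128 and a length that is a positive multiple of 4):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD128
static inline float dot_mult4(const float* vec, const float* svmVec, int len)
{
    v_float32x4 sum = v_mul(v_load(svmVec), v_load(vec));
    for (int k = 4; k <= len - 4; k += 4)
        sum = v_add(sum, v_mul(v_load(vec + k), v_load(svmVec + k)));
    return v_reduce_sum(sum);
}
#endif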

View File

@ -520,16 +520,16 @@ DISOpticalFlowImpl::PatchInverseSearch_ParBody::PatchInverseSearch_ParBody(DISOp
v_expand(I0_row_8, I0_row_4_left, I0_row_4_right); \
\
/* Compute diffs between I0 and bilinearly interpolated I1: */ \
I_diff_left = w00v * v_cvt_f32(v_reinterpret_as_s32(I1_row_4_left)) + \
w01v * v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_left)) + \
w10v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_left)) + \
w11v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_left)) - \
v_cvt_f32(v_reinterpret_as_s32(I0_row_4_left)); \
I_diff_right = w00v * v_cvt_f32(v_reinterpret_as_s32(I1_row_4_right)) + \
w01v * v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_right)) + \
w10v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_right)) + \
w11v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_right)) - \
v_cvt_f32(v_reinterpret_as_s32(I0_row_4_right));
I_diff_left = v_sub(v_add(v_mul(w00v, v_cvt_f32(v_reinterpret_as_s32(I1_row_4_left))), \
v_mul(w01v, v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_left))), \
v_mul(w10v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_left))), \
v_mul(w11v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_left)))), \
v_cvt_f32(v_reinterpret_as_s32(I0_row_4_left))); \
I_diff_right = v_sub(v_add(v_mul(w00v, v_cvt_f32(v_reinterpret_as_s32(I1_row_4_right))), \
v_mul(w01v, v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_right))), \
v_mul(w10v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_right))), \
v_mul(w11v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_right)))), \
v_cvt_f32(v_reinterpret_as_s32(I0_row_4_right)));
#define HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW \
I0_ptr += I0_stride; \
@ -572,9 +572,9 @@ inline float processPatch(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *
v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);
/* Update the sums: */
Ux_vec += I_diff_left * v_cvt_f32(I0x_row_4_left) + I_diff_right * v_cvt_f32(I0x_row_4_right);
Uy_vec += I_diff_left * v_cvt_f32(I0y_row_4_left) + I_diff_right * v_cvt_f32(I0y_row_4_right);
SSD_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
Ux_vec = v_add(Ux_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0x_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0x_row_4_right))));
Uy_vec = v_add(Uy_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0y_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0y_row_4_right))));
SSD_vec = v_add(SSD_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
I0x_ptr += I0_stride;
I0y_ptr += I0_stride;
@ -640,10 +640,10 @@ inline float processPatchMeanNorm(float &dst_dUx, float &dst_dUy, uchar *I0_ptr,
v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);
/* Update the sums: */
sum_I0x_mul_vec += I_diff_left * v_cvt_f32(I0x_row_4_left) + I_diff_right * v_cvt_f32(I0x_row_4_right);
sum_I0y_mul_vec += I_diff_left * v_cvt_f32(I0y_row_4_left) + I_diff_right * v_cvt_f32(I0y_row_4_right);
sum_diff_sq_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
sum_diff_vec += I_diff_left + I_diff_right;
sum_I0x_mul_vec = v_add(sum_I0x_mul_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0x_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0x_row_4_right))));
sum_I0y_mul_vec = v_add(sum_I0y_mul_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0y_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0y_row_4_right))));
sum_diff_sq_vec = v_add(sum_diff_sq_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
sum_diff_vec = v_add(sum_diff_vec, v_add(I_diff_left, I_diff_right));
I0x_ptr += I0_stride;
I0y_ptr += I0_stride;
@ -692,7 +692,7 @@ inline float computeSSD(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stri
for (int row = 0; row < 8; row++)
{
HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
SSD_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
SSD_vec = v_add(SSD_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
}
SSD = v_reduce_sum(SSD_vec);
@ -728,8 +728,8 @@ inline float computeSSDMeanNorm(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int
for (int row = 0; row < 8; row++)
{
HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
sum_diff_sq_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
sum_diff_vec += I_diff_left + I_diff_right;
sum_diff_sq_vec = v_add(sum_diff_sq_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
sum_diff_vec = v_add(sum_diff_vec, v_add(I_diff_left, I_diff_right));
HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
}
sum_diff = v_reduce_sum(sum_diff_vec);
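The rewritten macros above lean on the variadic v_add wrapper to sum the four weighted taps in one call. Isolated from the macro plumbing, the bilinear mix is (a sketch, assuming CV_SIMD128):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD128
static inline v_float32x4 bilinear_mix(const v_float32x4& w00, const v_float32x4& p00,
                                       const v_float32x4& w01, const v_float32x4& p01,
                                       const v_float32x4& w10, const v_float32x4& p10,
                                       const v_float32x4& w11, const v_float32x4& p11)
{
    return v_add(v_mul(w00, p00), v_mul(w01, p01),
                 v_mul(w10, p10), v_mul(w11, p11));    // four-operand v_add
}
#endif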

View File

@ -97,8 +97,8 @@ void cv::detail::ScharrDerivInvoker::operator()(const Range& range) const
v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x));
v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x));
v_int16x8 t1 = s2 - s0;
v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10);
v_int16x8 t1 = v_sub(s2, s0);
v_int16x8 t0 = v_add(v_mul_wrap(v_add(s0, s2), c3), v_mul_wrap(s1, c10));
v_store(trow0 + x, t0);
v_store(trow1 + x, t1);
@ -134,8 +134,8 @@ void cv::detail::ScharrDerivInvoker::operator()(const Range& range) const
v_int16x8 s3 = v_load(trow1 + x);
v_int16x8 s4 = v_load(trow1 + x + cn);
v_int16x8 t0 = s1 - s0;
v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10);
v_int16x8 t0 = v_sub(s1, s0);
v_int16x8 t1 = v_add(v_mul_wrap(v_add(s2, s4), c3), v_mul_wrap(s3, c10));
v_store_interleave((drow + x*2), t0, t1);
}
@ -293,10 +293,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, qw0, qdelta) + v_dotprod(t10, qw1);
t1 = v_dotprod(t01, qw0, qdelta) + v_dotprod(t11, qw1);
t0 = t0 >> (W_BITS1-5);
t1 = t1 >> (W_BITS1-5);
t0 = v_add(v_dotprod(t00, qw0, qdelta), v_dotprod(t10, qw1));
t1 = v_add(v_dotprod(t01, qw0, qdelta), v_dotprod(t11, qw1));
t0 = v_shr<W_BITS1 - 5>(t0);
t1 = v_shr<W_BITS1 - 5>(t1);
v_store(Iptr + x, v_pack(t0, t1));
v00 = v_reinterpret_as_s16(v_load(dsrc));
@ -307,10 +307,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, qw0, qdelta_d) + v_dotprod(t10, qw1);
t1 = v_dotprod(t01, qw0, qdelta_d) + v_dotprod(t11, qw1);
t0 = t0 >> W_BITS1;
t1 = t1 >> W_BITS1;
t0 = v_add(v_dotprod(t00, qw0, qdelta_d), v_dotprod(t10, qw1));
t1 = v_add(v_dotprod(t01, qw0, qdelta_d), v_dotprod(t11, qw1));
t0 = v_shr<W_BITS1>(t0);
t1 = v_shr<W_BITS1>(t1);
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v_store(dIptr, v00);
@ -332,10 +332,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, qw0, qdelta_d) + v_dotprod(t10, qw1);
t1 = v_dotprod(t01, qw0, qdelta_d) + v_dotprod(t11, qw1);
t0 = t0 >> W_BITS1;
t1 = t1 >> W_BITS1;
t0 = v_add(v_dotprod(t00, qw0, qdelta_d), v_dotprod(t10, qw1));
t1 = v_add(v_dotprod(t01, qw0, qdelta_d), v_dotprod(t11, qw1));
t0 = v_shr<W_BITS1>(t0);
t1 = v_shr<W_BITS1>(t1);
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v_store(dIptr + 4*2, v00);
@ -548,18 +548,18 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, qw0, qdelta) + v_dotprod(t10, qw1);
t1 = v_dotprod(t01, qw0, qdelta) + v_dotprod(t11, qw1);
t0 = t0 >> (W_BITS1-5);
t1 = t1 >> (W_BITS1-5);
diff0 = v_pack(t0, t1) - diff0;
t0 = v_add(v_dotprod(t00, qw0, qdelta), v_dotprod(t10, qw1));
t1 = v_add(v_dotprod(t01, qw0, qdelta), v_dotprod(t11, qw1));
t0 = v_shr<W_BITS1 - 5>(t0);
t1 = v_shr<W_BITS1 - 5>(t1);
diff0 = v_sub(v_pack(t0, t1), diff0);
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
v00 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
v01 = v_reinterpret_as_s16(v_load(dIptr + 8));
v_zip(v00, v01, v10, v11);
v_zip(diff2, diff1, v00, v01);
qb0 += v_cvt_f32(v_dotprod(v00, v10));
qb1 += v_cvt_f32(v_dotprod(v01, v11));
qb0 = v_add(qb0, v_cvt_f32(v_dotprod(v00, v10)));
qb1 = v_add(qb1, v_cvt_f32(v_dotprod(v01, v11)));
}
#endif
@ -647,7 +647,7 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
#if CV_SIMD128 && !CV_NEON
v_float32x4 qf0, qf1;
v_recombine(v_interleave_pairs(qb0 + qb1), v_setzero_f32(), qf0, qf1);
v_recombine(v_interleave_pairs(v_add(qb0, qb1)), v_setzero_f32(), qf0, qf1);
ib1 += v_reduce_sum(qf0);
ib2 += v_reduce_sum(qf1);
#endif
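The fixed-point descale above moves from operator>> to v_shr, whose shift amount is a template parameter and therefore still a compile-time constant. A standalone sketch (assuming CV_SIMD128; BITS stands in for the file's W_BITS1 constant):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD128
template<int BITS>
static inline v_int16x8 descale_pack(const v_int32x4& t0, const v_int32x4& t1)
{
    return v_pack(v_shr<BITS>(t0), v_shr<BITS>(t1));   // shift right, then saturate-pack
}
#endif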

View File

@ -463,22 +463,22 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1,
const float *sptr0 = srow[m], *sptr1;
v_float32x4 g4 = v_load(simd_kernel);
v_float32x4 s0, s1, s2, s3;
s0 = v_load(sptr0 + x) * g4;
s1 = v_load(sptr0 + x + 4) * g4;
s2 = v_load(sptr0 + x + 8) * g4;
s3 = v_load(sptr0 + x + 12) * g4;
s0 = v_mul(v_load(sptr0 + x), g4);
s1 = v_mul(v_load(sptr0 + x + 4), g4);
s2 = v_mul(v_load(sptr0 + x + 8), g4);
s3 = v_mul(v_load(sptr0 + x + 12), g4);
for( i = 1; i <= m; i++ )
{
v_float32x4 x0, x1;
sptr0 = srow[m+i], sptr1 = srow[m-i];
g4 = v_load(simd_kernel + i*4);
x0 = v_load(sptr0 + x) + v_load(sptr1 + x);
x1 = v_load(sptr0 + x + 4) + v_load(sptr1 + x + 4);
x0 = v_add(v_load(sptr0 + x), v_load(sptr1 + x));
x1 = v_add(v_load(sptr0 + x + 4), v_load(sptr1 + x + 4));
s0 = v_muladd(x0, g4, s0);
s1 = v_muladd(x1, g4, s1);
x0 = v_load(sptr0 + x + 8) + v_load(sptr1 + x + 8);
x1 = v_load(sptr0 + x + 12) + v_load(sptr1 + x + 12);
x0 = v_add(v_load(sptr0 + x + 8), v_load(sptr1 + x + 8));
x1 = v_add(v_load(sptr0 + x + 12), v_load(sptr1 + x + 12));
s2 = v_muladd(x0, g4, s2);
s3 = v_muladd(x1, g4, s3);
}
@ -493,13 +493,13 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1,
{
const float *sptr0 = srow[m], *sptr1;
v_float32x4 g4 = v_load(simd_kernel);
v_float32x4 s0 = v_load(sptr0 + x) * g4;
v_float32x4 s0 = v_mul(v_load(sptr0 + x), g4);
for( i = 1; i <= m; i++ )
{
sptr0 = srow[m+i], sptr1 = srow[m-i];
g4 = v_load(simd_kernel + i*4);
v_float32x4 x0 = v_load(sptr0 + x) + v_load(sptr1 + x);
v_float32x4 x0 = v_add(v_load(sptr0 + x), v_load(sptr1 + x));
s0 = v_muladd(x0, g4, s0);
}
v_store(vsum + x, s0);
@ -528,14 +528,14 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1,
for( ; x <= width*5 - 8; x += 8 )
{
v_float32x4 g4 = v_load(simd_kernel);
v_float32x4 s0 = v_load(vsum + x) * g4;
v_float32x4 s1 = v_load(vsum + x + 4) * g4;
v_float32x4 s0 = v_mul(v_load(vsum + x), g4);
v_float32x4 s1 = v_mul(v_load(vsum + x + 4), g4);
for( i = 1; i <= m; i++ )
{
g4 = v_load(simd_kernel + i*4);
v_float32x4 x0 = v_load(vsum + x - i*5) + v_load(vsum + x+ i*5);
v_float32x4 x1 = v_load(vsum + x - i*5 + 4) + v_load(vsum + x+ i*5 + 4);
v_float32x4 x0 = v_add(v_load(vsum + x - i * 5), v_load(vsum + x + i * 5));
v_float32x4 x1 = v_add(v_load(vsum + x - i * 5 + 4), v_load(vsum + x + i * 5 + 4));
s0 = v_muladd(x0, g4, s0);
s1 = v_muladd(x1, g4, s1);
}
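In the blur above only the plain sums and products change to v_add/v_mul; the fused v_muladd calls were already in the named form and stay as they are. One symmetric tap, standalone (a sketch assuming CV_SIMD128; the helper name is illustrative):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD128
static inline v_float32x4 gauss_tap(const v_float32x4& acc, const float* above,
                                    const float* below, const float* weight4)
{
    v_float32x4 g = v_load(weight4);
    v_float32x4 x = v_add(v_load(above), v_load(below)); // symmetric taps share one weight
    return v_muladd(x, g, acc);                          // acc + x * g
}
#endif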

View File

@ -651,15 +651,15 @@ void VariationalRefinementImpl::ComputeDataTerm_ParBody::operator()(const Range
pdU_vec = v_load(pdU + j);
pdV_vec = v_load(pdV + j);
derivNorm_vec = pIx_vec * pIx_vec + pIy_vec * pIy_vec + zeta_vec;
Ik1z_vec = pIz_vec + pIx_vec * pdU_vec + pIy_vec * pdV_vec;
weight_vec = (delta_vec / v_sqrt(Ik1z_vec * Ik1z_vec / derivNorm_vec + eps_vec)) / derivNorm_vec;
derivNorm_vec = v_add(v_add(v_mul(pIx_vec, pIx_vec), v_mul(pIy_vec, pIy_vec)), zeta_vec);
Ik1z_vec = v_add(v_add(pIz_vec, v_mul(pIx_vec, pdU_vec)), v_mul(pIy_vec, pdV_vec));
weight_vec = v_div(v_div(delta_vec, v_sqrt(v_add(v_div(v_mul(Ik1z_vec, Ik1z_vec), derivNorm_vec), eps_vec))), derivNorm_vec);
pa11_vec = weight_vec * (pIx_vec * pIx_vec) + zeta_vec;
pa12_vec = weight_vec * (pIx_vec * pIy_vec);
pa22_vec = weight_vec * (pIy_vec * pIy_vec) + zeta_vec;
pb1_vec = zero_vec - weight_vec * (pIz_vec * pIx_vec);
pb2_vec = zero_vec - weight_vec * (pIz_vec * pIy_vec);
pa11_vec = v_add(v_mul(weight_vec, v_mul(pIx_vec, pIx_vec)), zeta_vec);
pa12_vec = v_mul(weight_vec, v_mul(pIx_vec, pIy_vec));
pa22_vec = v_add(v_mul(weight_vec, v_mul(pIy_vec, pIy_vec)), zeta_vec);
pb1_vec = v_sub(zero_vec, v_mul(weight_vec, v_mul(pIz_vec, pIx_vec)));
pb2_vec = v_sub(zero_vec, v_mul(weight_vec, v_mul(pIz_vec, pIy_vec)));
pIxx_vec = v_load(pIxx + j);
pIxy_vec = v_load(pIxy + j);
@ -667,18 +667,17 @@ void VariationalRefinementImpl::ComputeDataTerm_ParBody::operator()(const Range
pIxz_vec = v_load(pIxz + j);
pIyz_vec = v_load(pIyz + j);
derivNorm_vec = pIxx_vec * pIxx_vec + pIxy_vec * pIxy_vec + zeta_vec;
derivNorm2_vec = pIyy_vec * pIyy_vec + pIxy_vec * pIxy_vec + zeta_vec;
Ik1zx_vec = pIxz_vec + pIxx_vec * pdU_vec + pIxy_vec * pdV_vec;
Ik1zy_vec = pIyz_vec + pIxy_vec * pdU_vec + pIyy_vec * pdV_vec;
weight_vec = gamma_vec / v_sqrt(Ik1zx_vec * Ik1zx_vec / derivNorm_vec +
Ik1zy_vec * Ik1zy_vec / derivNorm2_vec + eps_vec);
derivNorm_vec = v_add(v_add(v_mul(pIxx_vec, pIxx_vec), v_mul(pIxy_vec, pIxy_vec)), zeta_vec);
derivNorm2_vec = v_add(v_add(v_mul(pIyy_vec, pIyy_vec), v_mul(pIxy_vec, pIxy_vec)), zeta_vec);
Ik1zx_vec = v_add(v_add(pIxz_vec, v_mul(pIxx_vec, pdU_vec)), v_mul(pIxy_vec, pdV_vec));
Ik1zy_vec = v_add(v_add(pIyz_vec, v_mul(pIxy_vec, pdU_vec)), v_mul(pIyy_vec, pdV_vec));
weight_vec = v_div(gamma_vec, v_sqrt(v_add(v_add(v_div(v_mul(Ik1zx_vec, Ik1zx_vec), derivNorm_vec), v_div(v_mul(Ik1zy_vec, Ik1zy_vec), derivNorm2_vec)), eps_vec)));
pa11_vec += weight_vec * (pIxx_vec * pIxx_vec / derivNorm_vec + pIxy_vec * pIxy_vec / derivNorm2_vec);
pa12_vec += weight_vec * (pIxx_vec * pIxy_vec / derivNorm_vec + pIxy_vec * pIyy_vec / derivNorm2_vec);
pa22_vec += weight_vec * (pIxy_vec * pIxy_vec / derivNorm_vec + pIyy_vec * pIyy_vec / derivNorm2_vec);
pb1_vec -= weight_vec * (pIxx_vec * pIxz_vec / derivNorm_vec + pIxy_vec * pIyz_vec / derivNorm2_vec);
pb2_vec -= weight_vec * (pIxy_vec * pIxz_vec / derivNorm_vec + pIyy_vec * pIyz_vec / derivNorm2_vec);
pa11_vec = v_add(pa11_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxx_vec, pIxx_vec), derivNorm_vec), v_div(v_mul(pIxy_vec, pIxy_vec), derivNorm2_vec))));
pa12_vec = v_add(pa12_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxx_vec, pIxy_vec), derivNorm_vec), v_div(v_mul(pIxy_vec, pIyy_vec), derivNorm2_vec))));
pa22_vec = v_add(pa22_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxy_vec, pIxy_vec), derivNorm_vec), v_div(v_mul(pIyy_vec, pIyy_vec), derivNorm2_vec))));
pb1_vec = v_sub(pb1_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxx_vec, pIxz_vec), derivNorm_vec), v_div(v_mul(pIxy_vec, pIyz_vec), derivNorm2_vec))));
pb2_vec = v_sub(pb2_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxy_vec, pIxz_vec), derivNorm_vec), v_div(v_mul(pIyy_vec, pIyz_vec), derivNorm2_vec))));
v_store(pa11 + j, pa11_vec);
v_store(pa12 + j, pa12_vec);
@ -850,26 +849,26 @@ void VariationalRefinementImpl::ComputeSmoothnessTermHorPass_ParBody::operator()
cW_u_vec = v_load(cW_u + j);
cW_v_vec = v_load(cW_v + j);
ux_vec = v_load(cW_u_next + j) - cW_u_vec;
vx_vec = v_load(cW_v_next + j) - cW_v_vec;
uy_vec = v_load(cW_u_next_row + j) - cW_u_vec;
vy_vec = v_load(cW_v_next_row + j) - cW_v_vec;
ux_vec = v_sub(v_load(cW_u_next + j), cW_u_vec);
vx_vec = v_sub(v_load(cW_v_next + j), cW_v_vec);
uy_vec = v_sub(v_load(cW_u_next_row + j), cW_u_vec);
vy_vec = v_sub(v_load(cW_v_next_row + j), cW_v_vec);
pWeight_vec =
alpha2_vec / v_sqrt(ux_vec * ux_vec + vx_vec * vx_vec + uy_vec * uy_vec + vy_vec * vy_vec + eps_vec);
v_div(alpha2_vec, v_sqrt(v_add(v_add(v_add(v_add(v_mul(ux_vec, ux_vec), v_mul(vx_vec, vx_vec)), v_mul(uy_vec, uy_vec)), v_mul(vy_vec, vy_vec)), eps_vec)));
v_store(pWeight + j, pWeight_vec);
ux_vec = pWeight_vec * (v_load(pW_u_next + j) - v_load(pW_u + j));
vx_vec = pWeight_vec * (v_load(pW_v_next + j) - v_load(pW_v + j));
ux_vec = v_mul(pWeight_vec, v_sub(v_load(pW_u_next + j), v_load(pW_u + j)));
vx_vec = v_mul(pWeight_vec, v_sub(v_load(pW_v_next + j), v_load(pW_v + j)));
v_store(pA_u + j, v_load(pA_u + j) + pWeight_vec);
v_store(pA_v + j, v_load(pA_v + j) + pWeight_vec);
v_store(pB_u + j, v_load(pB_u + j) + ux_vec);
v_store(pB_v + j, v_load(pB_v + j) + vx_vec);
v_store(pA_u + j, v_add(v_load(pA_u + j), pWeight_vec));
v_store(pA_v + j, v_add(v_load(pA_v + j), pWeight_vec));
v_store(pB_u + j, v_add(v_load(pB_u + j), ux_vec));
v_store(pB_v + j, v_add(v_load(pB_v + j), vx_vec));
v_store(pA_u_next + j, v_load(pA_u_next + j) + pWeight_vec);
v_store(pA_v_next + j, v_load(pA_v_next + j) + pWeight_vec);
v_store(pB_u_next + j, v_load(pB_u_next + j) - ux_vec);
v_store(pB_v_next + j, v_load(pB_v_next + j) - vx_vec);
v_store(pA_u_next + j, v_add(v_load(pA_u_next + j), pWeight_vec));
v_store(pA_v_next + j, v_add(v_load(pA_v_next + j), pWeight_vec));
v_store(pB_u_next + j, v_sub(v_load(pB_u_next + j), ux_vec));
v_store(pB_v_next + j, v_sub(v_load(pB_v_next + j), vx_vec));
}
#endif
for (; j < len - 1; j++)
@ -956,18 +955,18 @@ void VariationalRefinementImpl::ComputeSmoothnessTermVertPass_ParBody::operator(
for (; j < len - 3; j += 4)
{
pWeight_vec = v_load(pWeight + j);
uy_vec = pWeight_vec * (v_load(pW_u_next_row + j) - v_load(pW_u + j));
vy_vec = pWeight_vec * (v_load(pW_v_next_row + j) - v_load(pW_v + j));
uy_vec = v_mul(pWeight_vec, v_sub(v_load(pW_u_next_row + j), v_load(pW_u + j)));
vy_vec = v_mul(pWeight_vec, v_sub(v_load(pW_v_next_row + j), v_load(pW_v + j)));
v_store(pA_u + j, v_load(pA_u + j) + pWeight_vec);
v_store(pA_v + j, v_load(pA_v + j) + pWeight_vec);
v_store(pB_u + j, v_load(pB_u + j) + uy_vec);
v_store(pB_v + j, v_load(pB_v + j) + vy_vec);
v_store(pA_u + j, v_add(v_load(pA_u + j), pWeight_vec));
v_store(pA_v + j, v_add(v_load(pA_v + j), pWeight_vec));
v_store(pB_u + j, v_add(v_load(pB_u + j), uy_vec));
v_store(pB_v + j, v_add(v_load(pB_v + j), vy_vec));
v_store(pA_u_next_row + j, v_load(pA_u_next_row + j) + pWeight_vec);
v_store(pA_v_next_row + j, v_load(pA_v_next_row + j) + pWeight_vec);
v_store(pB_u_next_row + j, v_load(pB_u_next_row + j) - uy_vec);
v_store(pB_v_next_row + j, v_load(pB_v_next_row + j) - vy_vec);
v_store(pA_u_next_row + j, v_add(v_load(pA_u_next_row + j), pWeight_vec));
v_store(pA_v_next_row + j, v_add(v_load(pA_v_next_row + j), pWeight_vec));
v_store(pB_u_next_row + j, v_sub(v_load(pB_u_next_row + j), uy_vec));
v_store(pB_v_next_row + j, v_sub(v_load(pB_v_next_row + j), vy_vec));
}
#endif
for (; j < len; j++)
@ -1084,15 +1083,13 @@ void VariationalRefinementImpl::RedBlackSOR_ParBody::operator()(const Range &ran
pdv_shifted_vec = v_reinterpret_as_f32(
v_extract<3>(v_reinterpret_as_s32(pdv_prev_vec), v_reinterpret_as_s32(pdv_next_vec)));
sigmaU_vec = pW_shifted_vec * pdu_shifted_vec + pW_vec * pdu_next_vec + pW_prev_row_vec * pdu_prev_row_vec +
pW_vec * pdu_next_row_vec;
sigmaV_vec = pW_shifted_vec * pdv_shifted_vec + pW_vec * pdv_next_vec + pW_prev_row_vec * pdv_prev_row_vec +
pW_vec * pdv_next_row_vec;
sigmaU_vec = v_add(v_add(v_add(v_mul(pW_shifted_vec, pdu_shifted_vec), v_mul(pW_vec, pdu_next_vec)), v_mul(pW_prev_row_vec, pdu_prev_row_vec)), v_mul(pW_vec, pdu_next_row_vec));
sigmaV_vec = v_add(v_add(v_add(v_mul(pW_shifted_vec, pdv_shifted_vec), v_mul(pW_vec, pdv_next_vec)), v_mul(pW_prev_row_vec, pdv_prev_row_vec)), v_mul(pW_vec, pdv_next_row_vec));
pdu_vec = v_load(pdu + j);
pdv_vec = v_load(pdv + j);
pdu_vec += omega_vec * ((sigmaU_vec + v_load(pb1 + j) - pdv_vec * pa12_vec) / v_load(pa11 + j) - pdu_vec);
pdv_vec += omega_vec * ((sigmaV_vec + v_load(pb2 + j) - pdu_vec * pa12_vec) / v_load(pa22 + j) - pdv_vec);
pdu_vec = v_add(pdu_vec, v_mul(omega_vec, v_sub(v_div(v_sub(v_add(sigmaU_vec, v_load(pb1 + j)), v_mul(pdv_vec, pa12_vec)), v_load(pa11 + j)), pdu_vec)));
pdv_vec = v_add(pdv_vec, v_mul(omega_vec, v_sub(v_div(v_sub(v_add(sigmaV_vec, v_load(pb2 + j)), v_mul(pdu_vec, pa12_vec)), v_load(pa22 + j)), pdv_vec)));
v_store(pdu + j, pdu_vec);
v_store(pdv + j, pdv_vec);
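The nested v_add/v_mul/v_div chain of the SOR update above is equivalent to the stepwise form below; the temporaries are only for readability (a sketch, assuming CV_SIMD128; parameter names mirror the surrounding code):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD128
static inline v_float32x4 sor_update(const v_float32x4& d, const v_float32x4& sigma,
                                     const v_float32x4& b, const v_float32x4& other,
                                     const v_float32x4& a12, const v_float32x4& a11,
                                     const v_float32x4& omega)
{
    v_float32x4 rhs  = v_sub(v_add(sigma, b), v_mul(other, a12)); // sigma + b - other*a12
    v_float32x4 step = v_sub(v_div(rhs, a11), d);                 // rhs / a11 - d
    return v_add(d, v_mul(omega, step));                          // d + omega * step
}
#endif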

View File

@ -38,8 +38,8 @@ int main(int /*argc*/, char** /*argv*/)
printf("================== arithm check =================\n");
v_uint8 a = vx_setall_u8(10);
v_uint8 c = a + vx_setall_u8(45);
printf("(vx_setall_u8(10) + vx_setall_u8(45)).get0() => %d\n", (int)c.get0());
v_uint8 c = v_add(a, vx_setall_u8(45));
printf("v_get0(vx_setall_u8(10) + vx_setall_u8(45)) => %d\n", (int)v_get0(c));
#else
printf("\nSIMD intrinsics are not available. Check compilation target and passed build options.\n");
#endif
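Lane zero is now read through the free function v_get0 rather than the removed .get0() member, and the same call works for any vector type (a sketch, assuming a CV_SIMD build):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD
static inline int   first_u8 (const v_uint8& v)   { return (int)v_get0(v); }
static inline float first_f32(const v_float32& v) { return v_get0(v); }
#endif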

View File

@ -85,7 +85,7 @@ void conv1dsimd(Mat src, Mat kernel, float *ans, int row = 0, int rowk = 0, int
//! [convolution-1D-main]
//! [convolution-1D-main-h1]
int step = v_float32().nlanes;
int step = VTraits<v_float32x4>::vlanes();
float *sptr = src_32.ptr<float>(row), *kptr = kernel.ptr<float>(rowk);
for (int k = 0; k < ksize; k++)
{
@ -96,7 +96,7 @@ void conv1dsimd(Mat src, Mat kernel, float *ans, int row = 0, int rowk = 0, int
for (i = 0; i + step < len; i += step)
{
v_float32 window = vx_load(sptr + i + k);
v_float32 sum = vx_load(ans + i) + kernel_wide * window;
v_float32 sum = v_add(vx_load(ans + i), v_mul(kernel_wide, window));
v_store(ans + i, sum);
}
//! [convolution-1D-main-h2]
@ -122,7 +122,7 @@ void convolute_simd(Mat src, Mat &dst, Mat kernel)
copyMakeBorder(src, src, sz, sz, 0, 0, BORDER_REPLICATE);
int step = v_float32().nlanes;
int step = VTraits<v_float32x4>::vlanes();
//! [convolution-2D-init]
//! [convolution-2D-main]
@ -135,7 +135,7 @@ void convolute_simd(Mat src, Mat &dst, Mat kernel)
int j;
for (j = 0; j + step < cols; j += step)
{
v_float32 sum = vx_load(&dst.ptr<float>(i)[j]) + vx_load(&ans[j]);
v_float32 sum = v_add(vx_load(&dst.ptr<float>(i)[j]), vx_load(&ans[j]));
v_store(&dst.ptr<float>(i)[j], sum);
}
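The inner loop above accumulates with a separate v_mul and v_add; the same update can also be written with the fused v_muladd. A standalone sketch of one kernel tap, assuming a CV_SIMD build (the function name is illustrative, not part of the tutorial):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD
static void conv_tap(float* ans, const float* sptr, float k, int len)
{
    const int step = VTraits<v_float32>::vlanes();
    v_float32 kernel_wide = vx_setall_f32(k);
    int i = 0;
    for (; i + step <= len; i += step)
        v_store(ans + i, v_muladd(kernel_wide, vx_load(sptr + i), vx_load(ans + i)));
    for (; i < len; ++i)               // scalar tail
        ans[i] += k * sptr[i];
}
#endif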