Mirror of https://github.com/opencv/opencv.git (synced 2024-11-24 03:00:14 +08:00)

Merge pull request #24371 from hanliutong:clean-up (commit 1c0ca41b6e)
Clean up the obsolete Universal Intrinsic API.
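For orientation, the sketch below (not part of the diff) shows the style this cleanup moves user code towards: named functions such as v_add/v_mul and the VTraits<> helper replace the operator overloads and the nlanes member. The kernel, its name and its arguments are invented for illustration; only the intrinsic calls are real API, and they require an OpenCV build with CV_SIMD enabled.

#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

// dst[i] = src[i]*alpha + beta, written against the post-cleanup API.
static void scaleAdd32f(const float* src, float* dst, int len, float alpha, float beta)
{
    int i = 0;
#if CV_SIMD
    const int VECSZ = VTraits<v_float32>::vlanes();          // was: v_float32::nlanes
    const v_float32 va = vx_setall_f32(alpha), vb = vx_setall_f32(beta);
    for (; i + VECSZ <= len; i += VECSZ)                     // was: vx_load(...)*va + vb
        v_store(dst + i, v_add(v_mul(vx_load(src + i), va), vb));
#endif
    for (; i < len; i++)                                     // scalar tail
        dst[i] = src[i]*alpha + beta;
}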
@@ -723,7 +723,7 @@ namespace CV__SIMD_NAMESPACE {
/** @brief SIMD processing state cleanup call */
inline void vx_cleanup() { VXPREFIX(_cleanup)(); }

#if !CV_SIMD_SCALABLE
#if !CV_SIMD_SCALABLE && !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP))
// Compatibility layer

template<typename T> struct VTraits {
@@ -1148,6 +1148,74 @@ namespace CV__SIMD_NAMESPACE {

#endif //!CV_SIMD_SCALABLE

#if (CV_NEON /* || CV_others */) && !defined(CV_FORCE_SIMD128_CPP)
// Compatibility layer for the backend that cleaned up.
#define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
return v_add(v_add(f1, f2), vf...); \
}

OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
#endif

#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
return v_mul(v_mul(f1, f2), vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
#endif

#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
{ \
return v_extract_n<VTraits<_Tpvec>::nlanes-1>(v); \
}

OPENCV_HAL_WRAP_EXTRACT(v_uint8)
OPENCV_HAL_WRAP_EXTRACT(v_int8)
OPENCV_HAL_WRAP_EXTRACT(v_uint16)
OPENCV_HAL_WRAP_EXTRACT(v_int16)
OPENCV_HAL_WRAP_EXTRACT(v_uint32)
OPENCV_HAL_WRAP_EXTRACT(v_int32)
OPENCV_HAL_WRAP_EXTRACT(v_uint64)
OPENCV_HAL_WRAP_EXTRACT(v_int64)
OPENCV_HAL_WRAP_EXTRACT(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64)
#endif

#define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \
inline _Tpvec v_broadcast_highest(const _Tpvec& v) \
{ \
return v_broadcast_element<VTraits<_Tpvec>::nlanes-1>(v); \
}

OPENCV_HAL_WRAP_BROADCAST(v_uint32)
OPENCV_HAL_WRAP_BROADCAST(v_int32)
OPENCV_HAL_WRAP_BROADCAST(v_float32)

#endif //CV_NEON

//! @cond IGNORED

// backward compatibility
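The wrappers above exist so that expressions with more than two operands stay compact once the overloaded operators are gone. A hedged usage sketch (v_add, v_extract_highest and VTraits are the names introduced above; the function and its arguments are invented):

// Sum four registers with the variadic v_add wrapper and return the last lane.
static float sum4_last_lane(const v_float32& a, const v_float32& b,
                            const v_float32& c, const v_float32& d)
{
    v_float32 s = v_add(a, b, c, d);   // expands to v_add(v_add(v_add(a, b), c), d)
    return v_extract_highest(s);       // lane VTraits<v_float32>::nlanes - 1
}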
@@ -131,13 +131,22 @@ OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2, int64x1, s64)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1,f64)
#endif

//////////// Compatibility layer ////////////
template<typename T> struct VTraits {
static inline int vlanes() { return T::nlanes; }
enum { max_nlanes = T::nlanes, nlanes = T::nlanes };
using lane_type = typename T::lane_type;
};

template<typename T>
inline typename VTraits<T>::lane_type v_get0(const T& v) \
{ \
return v.get0(); \
}
//////////// Types ////////////

struct v_uint8x16
{
typedef uchar lane_type;
enum { nlanes = 16 };

v_uint8x16() {}
explicit v_uint8x16(uint8x16_t v) : val(v) {}
v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
@@ -146,19 +155,22 @@ struct v_uint8x16
uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
val = vld1q_u8(v);
}
uint8x16_t val;

private:
friend struct VTraits<v_uint8x16>;
enum { nlanes = 16 };
typedef uchar lane_type;

friend typename VTraits<v_uint8x16>::lane_type v_get0<v_uint8x16>(const v_uint8x16& v);
uchar get0() const
{
return vgetq_lane_u8(val, 0);
}

uint8x16_t val;
};
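Every vector type in this backend gets the same treatment as v_uint8x16 above: nlanes, lane_type and get0() move into a private section, with VTraits<T> and the free function v_get0() as the public replacements. A hedged sketch of the corresponding change on the caller side (the function is invented; VTraits, v_get0, v_add and v_load are real API):

// Before: v_uint8x16::nlanes and s.get0(); after: VTraits<>::vlanes() and v_get0(s).
static uchar first_lane_of_sum(const uchar* a, const uchar* b)   // both point to >= 16 bytes
{
    CV_DbgAssert(VTraits<v_uint8x16>::vlanes() == 16);           // fixed-width NEON backend
    v_uint8x16 s = v_add(v_load(a), v_load(b));                  // saturating byte add
    return v_get0(s);                                            // replaces s.get0()
}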
|
||||
|
||||
struct v_int8x16
|
||||
{
|
||||
typedef schar lane_type;
|
||||
enum { nlanes = 16 };
|
||||
|
||||
v_int8x16() {}
|
||||
explicit v_int8x16(int8x16_t v) : val(v) {}
|
||||
v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
|
||||
@ -167,19 +179,22 @@ struct v_int8x16
|
||||
schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
|
||||
val = vld1q_s8(v);
|
||||
}
|
||||
int8x16_t val;
|
||||
|
||||
private:
|
||||
friend struct VTraits<v_int8x16>;
|
||||
enum { nlanes = 16 };
|
||||
typedef schar lane_type;
|
||||
|
||||
friend typename VTraits<v_int8x16>::lane_type v_get0<v_int8x16>(const v_int8x16& v);
|
||||
schar get0() const
|
||||
{
|
||||
return vgetq_lane_s8(val, 0);
|
||||
}
|
||||
|
||||
int8x16_t val;
|
||||
};
|
||||
|
||||
struct v_uint16x8
|
||||
{
|
||||
typedef ushort lane_type;
|
||||
enum { nlanes = 8 };
|
||||
|
||||
v_uint16x8() {}
|
||||
explicit v_uint16x8(uint16x8_t v) : val(v) {}
|
||||
v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
|
||||
@ -187,19 +202,22 @@ struct v_uint16x8
|
||||
ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
|
||||
val = vld1q_u16(v);
|
||||
}
|
||||
uint16x8_t val;
|
||||
|
||||
private:
|
||||
friend struct VTraits<v_uint16x8>;
|
||||
enum { nlanes = 8 };
|
||||
typedef ushort lane_type;
|
||||
|
||||
friend typename VTraits<v_uint16x8>::lane_type v_get0<v_uint16x8>(const v_uint16x8& v);
|
||||
ushort get0() const
|
||||
{
|
||||
return vgetq_lane_u16(val, 0);
|
||||
}
|
||||
|
||||
uint16x8_t val;
|
||||
};
|
||||
|
||||
struct v_int16x8
|
||||
{
|
||||
typedef short lane_type;
|
||||
enum { nlanes = 8 };
|
||||
|
||||
v_int16x8() {}
|
||||
explicit v_int16x8(int16x8_t v) : val(v) {}
|
||||
v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
|
||||
@ -207,19 +225,22 @@ struct v_int16x8
|
||||
short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
|
||||
val = vld1q_s16(v);
|
||||
}
|
||||
int16x8_t val;
|
||||
|
||||
private:
|
||||
friend struct VTraits<v_int16x8>;
|
||||
enum { nlanes = 8 };
|
||||
typedef short lane_type;
|
||||
|
||||
friend typename VTraits<v_int16x8>::lane_type v_get0<v_int16x8>(const v_int16x8& v);
|
||||
short get0() const
|
||||
{
|
||||
return vgetq_lane_s16(val, 0);
|
||||
}
|
||||
|
||||
int16x8_t val;
|
||||
};
|
||||
|
||||
struct v_uint32x4
|
||||
{
|
||||
typedef unsigned lane_type;
|
||||
enum { nlanes = 4 };
|
||||
|
||||
v_uint32x4() {}
|
||||
explicit v_uint32x4(uint32x4_t v) : val(v) {}
|
||||
v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
|
||||
@ -227,19 +248,22 @@ struct v_uint32x4
|
||||
unsigned v[] = {v0, v1, v2, v3};
|
||||
val = vld1q_u32(v);
|
||||
}
|
||||
uint32x4_t val;
|
||||
|
||||
private:
|
||||
friend struct VTraits<v_uint32x4>;
|
||||
enum { nlanes = 4 };
|
||||
typedef unsigned lane_type;
|
||||
|
||||
friend typename VTraits<v_uint32x4>::lane_type v_get0<v_uint32x4>(const v_uint32x4& v);
|
||||
unsigned get0() const
|
||||
{
|
||||
return vgetq_lane_u32(val, 0);
|
||||
}
|
||||
|
||||
uint32x4_t val;
|
||||
};
|
||||
|
||||
struct v_int32x4
|
||||
{
|
||||
typedef int lane_type;
|
||||
enum { nlanes = 4 };
|
||||
|
||||
v_int32x4() {}
|
||||
explicit v_int32x4(int32x4_t v) : val(v) {}
|
||||
v_int32x4(int v0, int v1, int v2, int v3)
|
||||
@ -247,18 +271,22 @@ struct v_int32x4
|
||||
int v[] = {v0, v1, v2, v3};
|
||||
val = vld1q_s32(v);
|
||||
}
|
||||
int32x4_t val;
|
||||
|
||||
private:
|
||||
friend struct VTraits<v_int32x4>;
|
||||
enum { nlanes = 4 };
|
||||
typedef int lane_type;
|
||||
|
||||
friend typename VTraits<v_int32x4>::lane_type v_get0<v_int32x4>(const v_int32x4& v);
|
||||
int get0() const
|
||||
{
|
||||
return vgetq_lane_s32(val, 0);
|
||||
}
|
||||
int32x4_t val;
|
||||
};
|
||||
|
||||
struct v_float32x4
|
||||
{
|
||||
typedef float lane_type;
|
||||
enum { nlanes = 4 };
|
||||
|
||||
v_float32x4() {}
|
||||
explicit v_float32x4(float32x4_t v) : val(v) {}
|
||||
v_float32x4(float v0, float v1, float v2, float v3)
|
||||
@ -266,18 +294,22 @@ struct v_float32x4
|
||||
float v[] = {v0, v1, v2, v3};
|
||||
val = vld1q_f32(v);
|
||||
}
|
||||
float32x4_t val;
|
||||
|
||||
private:
|
||||
friend struct VTraits<v_float32x4>;
|
||||
enum { nlanes = 4 };
|
||||
typedef float lane_type;
|
||||
|
||||
friend typename VTraits<v_float32x4>::lane_type v_get0<v_float32x4>(const v_float32x4& v);
|
||||
float get0() const
|
||||
{
|
||||
return vgetq_lane_f32(val, 0);
|
||||
}
|
||||
float32x4_t val;
|
||||
};
|
||||
|
||||
struct v_uint64x2
|
||||
{
|
||||
typedef uint64 lane_type;
|
||||
enum { nlanes = 2 };
|
||||
|
||||
v_uint64x2() {}
|
||||
explicit v_uint64x2(uint64x2_t v) : val(v) {}
|
||||
v_uint64x2(uint64 v0, uint64 v1)
|
||||
@ -285,18 +317,21 @@ struct v_uint64x2
|
||||
uint64 v[] = {v0, v1};
|
||||
val = vld1q_u64(v);
|
||||
}
|
||||
uint64x2_t val;
|
||||
private:
|
||||
friend struct VTraits<v_uint64x2>;
|
||||
enum { nlanes = 2 };
|
||||
typedef uint64 lane_type;
|
||||
|
||||
friend typename VTraits<v_uint64x2>::lane_type v_get0<v_uint64x2>(const v_uint64x2& v);
|
||||
uint64 get0() const
|
||||
{
|
||||
return vgetq_lane_u64(val, 0);
|
||||
}
|
||||
uint64x2_t val;
|
||||
};
|
||||
|
||||
struct v_int64x2
|
||||
{
|
||||
typedef int64 lane_type;
|
||||
enum { nlanes = 2 };
|
||||
|
||||
v_int64x2() {}
|
||||
explicit v_int64x2(int64x2_t v) : val(v) {}
|
||||
v_int64x2(int64 v0, int64 v1)
|
||||
@ -304,19 +339,23 @@ struct v_int64x2
|
||||
int64 v[] = {v0, v1};
|
||||
val = vld1q_s64(v);
|
||||
}
|
||||
int64x2_t val;
|
||||
|
||||
private:
|
||||
friend struct VTraits<v_int64x2>;
|
||||
enum { nlanes = 2 };
|
||||
typedef int64 lane_type;
|
||||
|
||||
friend typename VTraits<v_int64x2>::lane_type v_get0<v_int64x2>(const v_int64x2& v);
|
||||
int64 get0() const
|
||||
{
|
||||
return vgetq_lane_s64(val, 0);
|
||||
}
|
||||
int64x2_t val;
|
||||
};
|
||||
|
||||
#if CV_SIMD128_64F
|
||||
struct v_float64x2
|
||||
{
|
||||
typedef double lane_type;
|
||||
enum { nlanes = 2 };
|
||||
|
||||
v_float64x2() {}
|
||||
explicit v_float64x2(float64x2_t v) : val(v) {}
|
||||
v_float64x2(double v0, double v1)
|
||||
@ -324,11 +363,18 @@ struct v_float64x2
|
||||
double v[] = {v0, v1};
|
||||
val = vld1q_f64(v);
|
||||
}
|
||||
|
||||
float64x2_t val;
|
||||
private:
|
||||
friend struct VTraits<v_float64x2>;
|
||||
enum { nlanes = 2 };
|
||||
typedef double lane_type;
|
||||
|
||||
friend typename VTraits<v_float64x2>::lane_type v_get0<v_float64x2>(const v_float64x2& v);
|
||||
double get0() const
|
||||
{
|
||||
return vgetq_lane_f64(val, 0);
|
||||
}
|
||||
float64x2_t val;
|
||||
};
|
||||
#endif
|
||||
|
||||
@@ -460,71 +506,56 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
}

#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
a.val = intrin(a.val, b.val); \
return a; \
}

OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint8x16, vqaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint8x16, vqsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint16x8, vqsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int16x8, vqsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_int32x4, vmulq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint32x4, vaddq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint32x4, vsubq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_uint32x4, vmulq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float32x4, vaddq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float32x4, vsubq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float32x4, vmulq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int64x2, vaddq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int64x2, vsubq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint64x2, vaddq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint64x2, vsubq_u64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float32x4, vdivq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float64x2, vaddq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float64x2, vsubq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float64x2, vmulq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float64x2, vdivq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float32x4, vdivq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float64x2, vaddq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float64x2, vsubq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float64x2, vmulq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float64x2, vdivq_f64)
#else
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
inline v_float32x4 v_div (const v_float32x4& a, const v_float32x4& b)
{
float32x4_t reciprocal = vrecpeq_f32(b.val);
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
return v_float32x4(vmulq_f32(a.val, reciprocal));
}
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
{
float32x4_t reciprocal = vrecpeq_f32(b.val);
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
a.val = vmulq_f32(a.val, reciprocal);
return a;
}
#endif

// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_mul (const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpwvec c, d; \
v_mul_expand(a, b, c, d); \
return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
}

OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8)
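On 32-bit ARM (the #else branch above, where CV_SIMD128_64F is off) there is no vdivq_f32, so v_div is emulated: vrecpeq_f32 gives a coarse reciprocal estimate and each vrecpsq_f32(b, r) step returns 2 - b*r, the Newton-Raphson correction factor. A scalar model of that refinement, for illustration only (the deliberately degraded starting estimate stands in for vrecpeq_f32):

// Scalar sketch of the reciprocal refinement used by the NEON v_div fallback.
static float approx_div(float a, float b)
{
    float r = (1.0f / b) * 0.9f;      // crude estimate, playing the role of vrecpeq_f32
    r = r * (2.0f - b * r);           // first Newton-Raphson step (what vrecpsq_f32 * r does)
    r = r * (2.0f - b * r);           // second step, close to full single precision
    return a * r;                     // a / b is approximated by a * (1/b)
}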
|
||||
@ -698,7 +729,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
|
||||
const v_uint32x4& c)
|
||||
{
|
||||
return v_dotprod_expand(a, b) + c;
|
||||
return v_add(v_dotprod_expand(a, b), c);
|
||||
}
|
||||
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
|
||||
@ -715,7 +746,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
|
||||
const v_int32x4& c)
|
||||
{
|
||||
return v_dotprod_expand(a, b) + c;
|
||||
return v_add(v_dotprod_expand(a, b), c);
|
||||
}
|
||||
#endif
|
||||
// 16 >> 64
|
||||
@ -735,7 +766,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
|
||||
return v_uint64x2(vaddq_u64(s0, s1));
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
@ -752,7 +783,7 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
|
||||
const v_int64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
|
||||
// 32 >> 64f
|
||||
#if CV_SIMD128_64F
|
||||
@ -760,7 +791,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_cvt_f64(v_dotprod(a, b)); }
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
|
||||
const v_float64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
#endif
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
@ -850,7 +881,7 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
|
||||
}
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{
|
||||
return v_dotprod_expand_fast(a, b) + c;
|
||||
return v_add(v_dotprod_expand_fast(a, b), c);
|
||||
}
|
||||
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
|
||||
@ -861,7 +892,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{
|
||||
return v_dotprod_expand_fast(a, b) + c;
|
||||
return v_add(v_dotprod_expand_fast(a, b), c);
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -875,7 +906,7 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
|
||||
return v_uint64x2(vaddq_u64(s0, s1));
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
{ return v_add(v_dotprod_expand_fast(a, b), c); }
|
||||
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
@ -884,22 +915,22 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
return v_int64x2(vaddl_s32(vget_low_s32(prod), vget_high_s32(prod)));
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
{ return v_add(v_dotprod_expand_fast(a, b), c); }
|
||||
|
||||
// 32 >> 64f
|
||||
#if CV_SIMD128_64F
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_cvt_f64(v_dotprod_fast(a, b)); }
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
{ return v_add(v_dotprod_expand_fast(a, b), c); }
|
||||
#endif
|
||||
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
OPENCV_HAL_IMPL_NEON_BIN_OP(v_and, _Tpvec, vandq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(v_or, _Tpvec, vorrq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(v_xor, _Tpvec, veorq_##suffix) \
inline _Tpvec v_not (const _Tpvec& a) \
{ \
return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
}
@@ -914,21 +945,16 @@ OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)

#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
inline v_float32x4 bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
return a; \
}

OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_and, vandq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_or, vorrq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_xor, veorq_s32)

inline v_float32x4 operator ~ (const v_float32x4& a)
inline v_float32x4 v_not (const v_float32x4& a)
{
return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
}
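The renamed bit operations combine naturally with comparison masks, since a comparison yields an all-ones lane wherever it holds. A hedged sketch (assumes a CV_SIMD128 build; the function is invented, v_gt, v_and and v_setzero_f32 are real API):

// Keep only the lanes of x that are strictly positive; zero the rest.
static v_float32x4 keep_positive(const v_float32x4& x)
{
    v_float32x4 mask = v_gt(x, v_setzero_f32());   // all-ones bits in lanes where x > 0
    return v_and(x, mask);                         // bitwise AND keeps or clears each lane
}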
|
||||
@ -942,7 +968,7 @@ inline v_float32x4 v_sqrt(const v_float32x4& x)
|
||||
inline v_float32x4 v_invsqrt(const v_float32x4& x)
|
||||
{
|
||||
v_float32x4 one = v_setall_f32(1.0f);
|
||||
return one / v_sqrt(x);
|
||||
return v_div(one, v_sqrt(x));
|
||||
}
|
||||
#else
|
||||
inline v_float32x4 v_sqrt(const v_float32x4& x)
|
||||
@ -975,21 +1001,16 @@ inline v_float32x4 v_abs(v_float32x4 x)
|
||||
|
||||
#if CV_SIMD128_64F
|
||||
#define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
|
||||
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
|
||||
inline v_float64x2 bin_op (const v_float64x2& a, const v_float64x2& b) \
|
||||
{ \
|
||||
return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \
|
||||
} \
|
||||
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
|
||||
{ \
|
||||
a.val = vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val))); \
|
||||
return a; \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(&, vandq_s64)
|
||||
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(|, vorrq_s64)
|
||||
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(^, veorq_s64)
|
||||
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_and, vandq_s64)
|
||||
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_or, vorrq_s64)
|
||||
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_xor, veorq_s64)
|
||||
|
||||
inline v_float64x2 operator ~ (const v_float64x2& a)
|
||||
inline v_float64x2 v_not (const v_float64x2& a)
|
||||
{
|
||||
return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val))));
|
||||
}
|
||||
@ -1002,7 +1023,7 @@ inline v_float64x2 v_sqrt(const v_float64x2& x)
|
||||
inline v_float64x2 v_invsqrt(const v_float64x2& x)
|
||||
{
|
||||
v_float64x2 one = v_setall_f64(1.0f);
|
||||
return one / v_sqrt(x);
|
||||
return v_div(one, v_sqrt(x));
|
||||
}
|
||||
|
||||
inline v_float64x2 v_abs(v_float64x2 x)
|
||||
@ -1037,17 +1058,17 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_eq (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_ne (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_lt (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_gt (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_le (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_ge (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }

OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
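With the comparisons renamed to v_eq/v_ne/v_lt/v_gt/v_le/v_ge, conditional logic is written through v_select instead of operator expressions. A hedged sketch of a branchless per-lane clamp (clamp_lanes is invented; v_lt, v_gt and v_select are real API):

// Clamp every int32 lane of x into [lo, hi] without branches.
static v_int32x4 clamp_lanes(const v_int32x4& x, const v_int32x4& lo, const v_int32x4& hi)
{
    v_int32x4 y = v_select(v_lt(x, lo), lo, x);   // where x < lo take lo, else x
    return v_select(v_gt(y, hi), hi, y);          // where y > hi take hi, else y
}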
|
||||
@ -1065,22 +1086,22 @@ static inline uint64x2_t vmvnq_u64(uint64x2_t a)
|
||||
}
|
||||
//OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
|
||||
//OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
|
||||
static inline v_uint64x2 operator == (const v_uint64x2& a, const v_uint64x2& b)
|
||||
static inline v_uint64x2 v_eq (const v_uint64x2& a, const v_uint64x2& b)
|
||||
{ return v_uint64x2(vceqq_u64(a.val, b.val)); }
|
||||
static inline v_uint64x2 operator != (const v_uint64x2& a, const v_uint64x2& b)
|
||||
static inline v_uint64x2 v_ne (const v_uint64x2& a, const v_uint64x2& b)
|
||||
{ return v_uint64x2(vmvnq_u64(vceqq_u64(a.val, b.val))); }
|
||||
static inline v_int64x2 operator == (const v_int64x2& a, const v_int64x2& b)
|
||||
static inline v_int64x2 v_eq (const v_int64x2& a, const v_int64x2& b)
|
||||
{ return v_int64x2(vreinterpretq_s64_u64(vceqq_s64(a.val, b.val))); }
|
||||
static inline v_int64x2 operator != (const v_int64x2& a, const v_int64x2& b)
|
||||
static inline v_int64x2 v_ne (const v_int64x2& a, const v_int64x2& b)
|
||||
{ return v_int64x2(vreinterpretq_s64_u64(vmvnq_u64(vceqq_s64(a.val, b.val)))); }
|
||||
#else
|
||||
static inline v_uint64x2 operator == (const v_uint64x2& a, const v_uint64x2& b)
|
||||
static inline v_uint64x2 v_eq (const v_uint64x2& a, const v_uint64x2& b)
|
||||
{
|
||||
uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val));
|
||||
uint32x4_t swapped = vrev64q_u32(cmp);
|
||||
return v_uint64x2(vreinterpretq_u64_u32(vandq_u32(cmp, swapped)));
|
||||
}
|
||||
static inline v_uint64x2 operator != (const v_uint64x2& a, const v_uint64x2& b)
|
||||
static inline v_uint64x2 v_ne (const v_uint64x2& a, const v_uint64x2& b)
|
||||
{
|
||||
uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val));
|
||||
uint32x4_t swapped = vrev64q_u32(cmp);
|
||||
@ -1088,13 +1109,13 @@ static inline v_uint64x2 operator != (const v_uint64x2& a, const v_uint64x2& b)
|
||||
uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
|
||||
return v_uint64x2(veorq_u64(v_eq, vx));
|
||||
}
|
||||
static inline v_int64x2 operator == (const v_int64x2& a, const v_int64x2& b)
|
||||
static inline v_int64x2 v_eq (const v_int64x2& a, const v_int64x2& b)
|
||||
{
|
||||
return v_reinterpret_as_s64(v_reinterpret_as_u64(a) == v_reinterpret_as_u64(b));
|
||||
return v_reinterpret_as_s64(v_eq(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b)));
|
||||
}
|
||||
static inline v_int64x2 operator != (const v_int64x2& a, const v_int64x2& b)
|
||||
static inline v_int64x2 v_ne (const v_int64x2& a, const v_int64x2& b)
|
||||
{
|
||||
return v_reinterpret_as_s64(v_reinterpret_as_u64(a) != v_reinterpret_as_u64(b));
|
||||
return v_reinterpret_as_s64(v_ne(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b)));
|
||||
}
|
||||
#endif
|
||||
#if CV_SIMD128_64F
|
||||
@ -1207,9 +1228,9 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_
|
||||
|
||||
// trade efficiency for convenience
|
||||
#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
|
||||
inline _Tpvec operator << (const _Tpvec& a, int n) \
|
||||
inline _Tpvec v_shl (const _Tpvec& a, int n) \
|
||||
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
|
||||
inline _Tpvec operator >> (const _Tpvec& a, int n) \
|
||||
inline _Tpvec v_shr (const _Tpvec& a, int n) \
|
||||
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
|
||||
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
|
||||
{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
|
||||
@ -1231,13 +1252,13 @@ OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
|
||||
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
|
||||
{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
|
||||
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
|
||||
{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
|
||||
{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, VTraits<_Tpvec>::nlanes - n)); } \
|
||||
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
|
||||
{ return a; } \
|
||||
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
|
||||
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \
|
||||
{ return _Tpvec(vextq_##suffix(b.val, a.val, VTraits<_Tpvec>::nlanes - n)); } \
|
||||
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ CV_UNUSED(b); return a; }
|
||||
|
||||
|
@@ -358,8 +358,8 @@ static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_f

static inline void vx_load_as(const double* ptr, v_float32& a)
{
const int VECSZ = v_float32::nlanes;
float buf[VECSZ*2];
const int VECSZ = VTraits<v_float32>::vlanes();
float buf[VTraits<v_float32>::max_nlanes*2];

for( int i = 0; i < VECSZ; i++ )
buf[i] = saturate_cast<float>(ptr[i]);
@@ -369,19 +369,19 @@ static inline void vx_load_as(const double* ptr, v_float32& a)
template<typename _Tdvec>
static inline void vx_load_pair_as(const double* ptr, _Tdvec& a, _Tdvec& b)
{
const int VECSZ = _Tdvec::nlanes;
typename _Tdvec::lane_type buf[VECSZ*2];
const int VECSZ = VTraits<_Tdvec>::vlanes();
typename VTraits<_Tdvec>::lane_type buf[VTraits<_Tdvec>::max_nlanes*2];

for( int i = 0; i < VECSZ*2; i++ )
buf[i] = saturate_cast<typename _Tdvec::lane_type>(ptr[i]);
buf[i] = saturate_cast<typename VTraits<_Tdvec>::lane_type>(ptr[i]);
a = vx_load(buf);
b = vx_load(buf + VECSZ);
}

static inline void v_store_as(double* ptr, const v_float32& a)
{
const int VECSZ = v_float32::nlanes;
float buf[VECSZ];
const int VECSZ = VTraits<v_float32>::vlanes();
float buf[VTraits<v_float32>::max_nlanes];

v_store(buf, a);
for( int i = 0; i < VECSZ; i++ )
@@ -391,8 +391,8 @@ static inline void v_store_as(double* ptr, const v_float32& a)
template<typename _Tsvec>
static inline void v_store_pair_as(double* ptr, const _Tsvec& a, const _Tsvec& b)
{
const int VECSZ = _Tsvec::nlanes;
typename _Tsvec::lane_type buf[VECSZ*2];
const int VECSZ = VTraits<_Tsvec>::vlanes();
typename VTraits<_Tsvec>::lane_type buf[VTraits<_Tsvec>::max_nlanes*2];

v_store(buf, a); v_store(buf + VECSZ, b);
for( int i = 0; i < VECSZ*2; i++ )
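The buffer changes above are the standard migration for stack scratch arrays: VTraits<T>::vlanes() is the (possibly runtime) lane count used for loop bounds, while VTraits<T>::max_nlanes is a compile-time upper bound that can legally size a C array even on scalable backends such as RVV. A hedged sketch of the pattern in isolation (the function and its contract are invented; the intrinsics are real):

// Convert one vector's worth of doubles to floats through a scratch buffer.
// Caller guarantees src has at least VTraits<v_float32>::vlanes() elements
// and dst has room for the same number.
static void load_one_vector_as_f32(const double* src, float* dst)
{
    const int VECSZ = VTraits<v_float32>::vlanes();   // runtime lane count
    float buf[VTraits<v_float32>::max_nlanes];        // compile-time bound sizes the array
    for (int i = 0; i < VECSZ; i++)
        buf[i] = saturate_cast<float>(src[i]);
    v_store(dst, vx_load(buf));
}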
@@ -93,13 +93,13 @@ struct v_atan_f32
{
v_float32 ax = v_abs(x);
v_float32 ay = v_abs(y);
v_float32 c = v_min(ax, ay) / (v_max(ax, ay) + eps);
v_float32 cc = c * c;
v_float32 a = v_fma(v_fma(v_fma(cc, p7, p5), cc, p3), cc, p1)*c;
a = v_select(ax >= ay, a, val90 - a);
a = v_select(x < z, val180 - a, a);
a = v_select(y < z, val360 - a, a);
return a * s;
v_float32 c = v_div(v_min(ax, ay), v_add(v_max(ax, ay), this->eps));
v_float32 cc = v_mul(c, c);
v_float32 a = v_mul(v_fma(v_fma(v_fma(cc, this->p7, this->p5), cc, this->p3), cc, this->p1), c);
a = v_select(v_ge(ax, ay), a, v_sub(this->val90, a));
a = v_select(v_lt(x, this->z), v_sub(this->val180, a), a);
a = v_select(v_lt(y, this->z), v_sub(this->val360, a), a);
return v_mul(a, this->s);
}

v_float32 eps;
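For readers who prefer scalar code, this is the computation the rewritten operator() above performs per lane; p1/p3/p5/p7, val90/val180/val360, eps, z and s are fields of v_atan_f32 (z is zero) and their values are not reproduced here, so the sketch below only mirrors the structure:

// Scalar shape of v_atan_f32: odd polynomial in c = min/max, then branchless
// quadrant corrections (done with v_select in the SIMD code).
// Needs <cmath> and <algorithm>.
static float atan_shape(float x, float y, float p1, float p3, float p5, float p7,
                        float val90, float val180, float val360, float eps, float s)
{
    float ax = std::fabs(x), ay = std::fabs(y);
    float c  = std::min(ax, ay) / (std::max(ax, ay) + eps);
    float cc = c * c;
    float a  = (((cc * p7 + p5) * cc + p3) * cc + p1) * c;
    if (ax < ay)   a = val90  - a;
    if (x  < 0.f)  a = val180 - a;
    if (y  < 0.f)  a = val360 - a;
    return a * s;   // s selects degrees or radians
}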
|
||||
@ -125,7 +125,7 @@ static void fastAtan32f_(const float *Y, const float *X, float *angle, int len,
|
||||
float scale = angleInDegrees ? 1.f : (float)(CV_PI/180);
|
||||
int i = 0;
|
||||
#if CV_SIMD
|
||||
const int VECSZ = v_float32::nlanes;
|
||||
const int VECSZ = VTraits<v_float32>::vlanes();
|
||||
v_atan_f32 v(scale);
|
||||
|
||||
for( ; i < len; i += VECSZ*2 )
|
||||
@ -198,7 +198,7 @@ void magnitude32f(const float* x, const float* y, float* mag, int len)
|
||||
int i = 0;
|
||||
|
||||
#if CV_SIMD
|
||||
const int VECSZ = v_float32::nlanes;
|
||||
const int VECSZ = VTraits<v_float32>::vlanes();
|
||||
for( ; i < len; i += VECSZ*2 )
|
||||
{
|
||||
if( i + VECSZ*2 > len )
|
||||
@ -209,8 +209,8 @@ void magnitude32f(const float* x, const float* y, float* mag, int len)
|
||||
}
|
||||
v_float32 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ);
|
||||
v_float32 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ);
|
||||
x0 = v_sqrt(v_muladd(x0, x0, y0*y0));
|
||||
x1 = v_sqrt(v_muladd(x1, x1, y1*y1));
|
||||
x0 = v_sqrt(v_muladd(x0, x0, v_mul(y0, y0)));
|
||||
x1 = v_sqrt(v_muladd(x1, x1, v_mul(y1, y1)));
|
||||
v_store(mag + i, x0);
|
||||
v_store(mag + i + VECSZ, x1);
|
||||
}
|
||||
@ -231,7 +231,7 @@ void magnitude64f(const double* x, const double* y, double* mag, int len)
|
||||
int i = 0;
|
||||
|
||||
#if CV_SIMD_64F
|
||||
const int VECSZ = v_float64::nlanes;
|
||||
const int VECSZ = VTraits<v_float64>::vlanes();
|
||||
for( ; i < len; i += VECSZ*2 )
|
||||
{
|
||||
if( i + VECSZ*2 > len )
|
||||
@ -242,8 +242,8 @@ void magnitude64f(const double* x, const double* y, double* mag, int len)
|
||||
}
|
||||
v_float64 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ);
|
||||
v_float64 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ);
|
||||
x0 = v_sqrt(v_muladd(x0, x0, y0*y0));
|
||||
x1 = v_sqrt(v_muladd(x1, x1, y1*y1));
|
||||
x0 = v_sqrt(v_muladd(x0, x0, v_mul(y0, y0)));
|
||||
x1 = v_sqrt(v_muladd(x1, x1, v_mul(y1, y1)));
|
||||
v_store(mag + i, x0);
|
||||
v_store(mag + i + VECSZ, x1);
|
||||
}
|
||||
@ -265,7 +265,7 @@ void invSqrt32f(const float* src, float* dst, int len)
|
||||
int i = 0;
|
||||
|
||||
#if CV_SIMD
|
||||
const int VECSZ = v_float32::nlanes;
|
||||
const int VECSZ = VTraits<v_float32>::vlanes();
|
||||
for( ; i < len; i += VECSZ*2 )
|
||||
{
|
||||
if( i + VECSZ*2 > len )
|
||||
@ -293,7 +293,7 @@ void invSqrt64f(const double* src, double* dst, int len)
|
||||
int i = 0;
|
||||
|
||||
#if CV_SIMD_64F
|
||||
const int VECSZ = v_float64::nlanes;
|
||||
const int VECSZ = VTraits<v_float64>::vlanes();
|
||||
for ( ; i < len; i += VECSZ*2)
|
||||
{
|
||||
if( i + VECSZ*2 > len )
|
||||
@ -321,7 +321,7 @@ void sqrt32f(const float* src, float* dst, int len)
|
||||
int i = 0;
|
||||
|
||||
#if CV_SIMD
|
||||
const int VECSZ = v_float32::nlanes;
|
||||
const int VECSZ = VTraits<v_float32>::vlanes();
|
||||
for( ; i < len; i += VECSZ*2 )
|
||||
{
|
||||
if( i + VECSZ*2 > len )
|
||||
@ -350,7 +350,7 @@ void sqrt64f(const double* src, double* dst, int len)
|
||||
int i = 0;
|
||||
|
||||
#if CV_SIMD_64F
|
||||
const int VECSZ = v_float64::nlanes;
|
||||
const int VECSZ = VTraits<v_float64>::vlanes();
|
||||
for( ; i < len; i += VECSZ*2 )
|
||||
{
|
||||
if( i + VECSZ*2 > len )
|
||||
@ -452,7 +452,7 @@ void exp32f( const float *_x, float *y, int n )
|
||||
float postscale = (float)exp_postscale;
|
||||
|
||||
#if CV_SIMD
|
||||
const int VECSZ = v_float32::nlanes;
|
||||
const int VECSZ = VTraits<v_float32>::vlanes();
|
||||
const v_float32 vprescale = vx_setall_f32((float)exp_prescale);
|
||||
const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);
|
||||
const v_float32 vminval = vx_setall_f32(minval);
|
||||
@ -481,26 +481,26 @@ void exp32f( const float *_x, float *y, int n )
|
||||
xf0 = v_min(v_max(xf0, vminval), vmaxval);
|
||||
xf1 = v_min(v_max(xf1, vminval), vmaxval);
|
||||
|
||||
xf0 *= vprescale;
|
||||
xf1 *= vprescale;
|
||||
xf0 = v_mul(xf0, vprescale);
|
||||
xf1 = v_mul(xf1, vprescale);
|
||||
|
||||
v_int32 xi0 = v_round(xf0);
|
||||
v_int32 xi1 = v_round(xf1);
|
||||
xf0 = (xf0 - v_cvt_f32(xi0))*vpostscale;
|
||||
xf1 = (xf1 - v_cvt_f32(xi1))*vpostscale;
|
||||
xf0 = v_mul(v_sub(xf0, v_cvt_f32(xi0)), vpostscale);
|
||||
xf1 = v_mul(v_sub(xf1, v_cvt_f32(xi1)), vpostscale);
|
||||
|
||||
v_float32 yf0 = v_lut(expTab_f, xi0 & vidxmask);
|
||||
v_float32 yf1 = v_lut(expTab_f, xi1 & vidxmask);
|
||||
v_float32 yf0 = v_lut(expTab_f, v_and(xi0, vidxmask));
|
||||
v_float32 yf1 = v_lut(expTab_f, v_and(xi1, vidxmask));
|
||||
|
||||
v_int32 v0 = vx_setzero_s32(), v127 = vx_setall_s32(127), v255 = vx_setall_s32(255);
|
||||
xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v127, v0), v255);
|
||||
xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v127, v0), v255);
|
||||
xi0 = v_min(v_max(v_add(v_shr<6>(xi0), v127), v0), v255);
|
||||
xi1 = v_min(v_max(v_add(v_shr<6>(xi1), v127), v0), v255);
|
||||
|
||||
yf0 *= v_reinterpret_as_f32(v_shl<23>(xi0));
|
||||
yf1 *= v_reinterpret_as_f32(v_shl<23>(xi1));
|
||||
yf0 = v_mul(yf0, v_reinterpret_as_f32(v_shl<23>(xi0)));
|
||||
yf1 = v_mul(yf1, v_reinterpret_as_f32(v_shl<23>(xi1)));
|
||||
|
||||
v_float32 zf0 = xf0 + vA1;
|
||||
v_float32 zf1 = xf1 + vA1;
|
||||
v_float32 zf0 = v_add(xf0, vA1);
|
||||
v_float32 zf1 = v_add(xf1, vA1);
|
||||
|
||||
zf0 = v_fma(zf0, xf0, vA2);
|
||||
zf1 = v_fma(zf1, xf1, vA2);
|
||||
@ -511,8 +511,8 @@ void exp32f( const float *_x, float *y, int n )
|
||||
zf0 = v_fma(zf0, xf0, vA4);
|
||||
zf1 = v_fma(zf1, xf1, vA4);
|
||||
|
||||
zf0 *= yf0;
|
||||
zf1 *= yf1;
|
||||
zf0 = v_mul(zf0, yf0);
|
||||
zf1 = v_mul(zf1, yf1);
|
||||
|
||||
if( y_aligned )
|
||||
{
|
||||
@ -566,7 +566,7 @@ void exp64f( const double *_x, double *y, int n )
|
||||
double maxval = (exp_max_val/exp_prescale);
|
||||
|
||||
#if CV_SIMD_64F
|
||||
const int VECSZ = v_float64::nlanes;
|
||||
const int VECSZ = VTraits<v_float64>::vlanes();
|
||||
const v_float64 vprescale = vx_setall_f64(exp_prescale);
|
||||
const v_float64 vpostscale = vx_setall_f64(exp_postscale);
|
||||
const v_float64 vminval = vx_setall_f64(minval);
|
||||
@ -596,30 +596,30 @@ void exp64f( const double *_x, double *y, int n )
|
||||
xf0 = v_min(v_max(xf0, vminval), vmaxval);
|
||||
xf1 = v_min(v_max(xf1, vminval), vmaxval);
|
||||
|
||||
xf0 *= vprescale;
|
||||
xf1 *= vprescale;
|
||||
xf0 = v_mul(xf0, vprescale);
|
||||
xf1 = v_mul(xf1, vprescale);
|
||||
|
||||
v_int32 xi0 = v_round(xf0);
|
||||
v_int32 xi1 = v_round(xf1);
|
||||
xf0 = (xf0 - v_cvt_f64(xi0))*vpostscale;
|
||||
xf1 = (xf1 - v_cvt_f64(xi1))*vpostscale;
|
||||
xf0 = v_mul(v_sub(xf0, v_cvt_f64(xi0)), vpostscale);
|
||||
xf1 = v_mul(v_sub(xf1, v_cvt_f64(xi1)), vpostscale);
|
||||
|
||||
v_float64 yf0 = v_lut(expTab, xi0 & vidxmask);
|
||||
v_float64 yf1 = v_lut(expTab, xi1 & vidxmask);
|
||||
v_float64 yf0 = v_lut(expTab, v_and(xi0, vidxmask));
|
||||
v_float64 yf1 = v_lut(expTab, v_and(xi1, vidxmask));
|
||||
|
||||
v_int32 v0 = vx_setzero_s32(), v1023 = vx_setall_s32(1023), v2047 = vx_setall_s32(2047);
|
||||
xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v1023, v0), v2047);
|
||||
xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v1023, v0), v2047);
|
||||
xi0 = v_min(v_max(v_add(v_shr<6>(xi0), v1023), v0), v2047);
|
||||
xi1 = v_min(v_max(v_add(v_shr<6>(xi1), v1023), v0), v2047);
|
||||
|
||||
v_int64 xq0, xq1, dummy;
|
||||
v_expand(xi0, xq0, dummy);
|
||||
v_expand(xi1, xq1, dummy);
|
||||
|
||||
yf0 *= v_reinterpret_as_f64(v_shl<52>(xq0));
|
||||
yf1 *= v_reinterpret_as_f64(v_shl<52>(xq1));
|
||||
yf0 = v_mul(yf0, v_reinterpret_as_f64(v_shl<52>(xq0)));
|
||||
yf1 = v_mul(yf1, v_reinterpret_as_f64(v_shl<52>(xq1)));
|
||||
|
||||
v_float64 zf0 = xf0 + vA1;
|
||||
v_float64 zf1 = xf1 + vA1;
|
||||
v_float64 zf0 = v_add(xf0, vA1);
|
||||
v_float64 zf1 = v_add(xf1, vA1);
|
||||
|
||||
zf0 = v_fma(zf0, xf0, vA2);
|
||||
zf1 = v_fma(zf1, xf1, vA2);
|
||||
@ -633,8 +633,8 @@ void exp64f( const double *_x, double *y, int n )
|
||||
zf0 = v_fma(zf0, xf0, vA5);
|
||||
zf1 = v_fma(zf1, xf1, vA5);
|
||||
|
||||
zf0 *= yf0;
|
||||
zf1 *= yf1;
|
||||
zf0 = v_mul(zf0, yf0);
|
||||
zf1 = v_mul(zf1, yf1);
|
||||
|
||||
if( y_aligned )
|
||||
{
|
||||
@ -696,7 +696,7 @@ void log32f( const float *_x, float *y, int n )
|
||||
const int* x = (const int*)_x;
|
||||
|
||||
#if CV_SIMD
|
||||
const int VECSZ = v_float32::nlanes;
|
||||
const int VECSZ = VTraits<v_float32>::vlanes();
|
||||
const v_float32 vln2 = vx_setall_f32((float)ln_2);
|
||||
const v_float32 v1 = vx_setall_f32(1.f);
|
||||
const v_float32 vshift = vx_setall_f32(-1.f/512);
|
||||
@ -715,18 +715,18 @@ void log32f( const float *_x, float *y, int n )
|
||||
}
|
||||
|
||||
v_int32 h0 = vx_load(x + i);
|
||||
v_int32 yi0 = (v_shr<23>(h0) & vx_setall_s32(255)) - vx_setall_s32(127);
|
||||
v_int32 xi0 = (h0 & vx_setall_s32(LOGTAB_MASK2_32F)) | vx_setall_s32(127 << 23);
|
||||
v_int32 yi0 = v_sub(v_and(v_shr<23>(h0), vx_setall_s32(255)), vx_setall_s32(127));
|
||||
v_int32 xi0 = v_or(v_and(h0, vx_setall_s32(LOGTAB_MASK2_32F)), vx_setall_s32(127 << 23));
|
||||
|
||||
h0 = v_shr<23 - LOGTAB_SCALE - 1>(h0) & vx_setall_s32(LOGTAB_MASK*2);
|
||||
h0 = v_and(v_shr<23 - 8 - 1>(h0), vx_setall_s32(((1 << 8) - 1) * 2));
|
||||
v_float32 yf0, xf0;
|
||||
|
||||
v_lut_deinterleave(logTab_f, h0, yf0, xf0);
|
||||
|
||||
yf0 = v_fma(v_cvt_f32(yi0), vln2, yf0);
|
||||
|
||||
v_float32 delta = v_select(v_reinterpret_as_f32(h0 == vx_setall_s32(510)), vshift, vx_setall<float>(0));
|
||||
xf0 = v_fma((v_reinterpret_as_f32(xi0) - v1), xf0, delta);
|
||||
v_float32 delta = v_select(v_reinterpret_as_f32(v_eq(h0, vx_setall_s32(510))), vshift, vx_setall<float>(0));
|
||||
xf0 = v_fma((v_sub(v_reinterpret_as_f32(xi0), v1)), xf0, delta);
|
||||
|
||||
v_float32 zf0 = v_fma(xf0, vA0, vA1);
|
||||
zf0 = v_fma(zf0, xf0, vA2);
|
||||
@ -771,7 +771,7 @@ void log64f( const double *x, double *y, int n )
|
||||
int i = 0;
|
||||
|
||||
#if CV_SIMD_64F
|
||||
const int VECSZ = v_float64::nlanes;
|
||||
const int VECSZ = VTraits<v_float64>::vlanes();
|
||||
const v_float64 vln2 = vx_setall_f64(ln_2);
|
||||
|
||||
const v_float64
|
||||
@ -791,20 +791,20 @@ void log64f( const double *x, double *y, int n )
|
||||
|
||||
v_int64 h0 = vx_load((const int64*)x + i);
|
||||
v_int32 yi0 = v_pack(v_shr<52>(h0), vx_setzero_s64());
|
||||
yi0 = (yi0 & vx_setall_s32(0x7ff)) - vx_setall_s32(1023);
|
||||
yi0 = v_sub(v_and(yi0, vx_setall_s32(2047)), vx_setall_s32(1023));
|
||||
|
||||
v_int64 xi0 = (h0 & vx_setall_s64(LOGTAB_MASK2_64F)) | vx_setall_s64((int64)1023 << 52);
|
||||
v_int64 xi0 = v_or(v_and(h0, vx_setall_s64(LOGTAB_MASK2_64F)), vx_setall_s64((int64)1023 << 52));
|
||||
h0 = v_shr<52 - LOGTAB_SCALE - 1>(h0);
|
||||
v_int32 idx = v_pack(h0, h0) & vx_setall_s32(LOGTAB_MASK*2);
|
||||
v_int32 idx = v_and(v_pack(h0, h0), vx_setall_s32(((1 << 8) - 1) * 2));
|
||||
|
||||
v_float64 xf0, yf0;
|
||||
v_lut_deinterleave(logTab, idx, yf0, xf0);
|
||||
|
||||
yf0 = v_fma(v_cvt_f64(yi0), vln2, yf0);
|
||||
v_float64 delta = v_cvt_f64(idx == vx_setall_s32(510))*vx_setall_f64(1./512);
|
||||
xf0 = v_fma(v_reinterpret_as_f64(xi0) - vx_setall_f64(1.), xf0, delta);
|
||||
v_float64 delta = v_mul(v_cvt_f64(v_eq(idx, vx_setall_s32(510))), vx_setall_f64(1. / 512));
|
||||
xf0 = v_fma(v_sub(v_reinterpret_as_f64(xi0), vx_setall_f64(1.)), xf0, delta);
|
||||
|
||||
v_float64 xq = xf0*xf0;
|
||||
v_float64 xq = v_mul(xf0, xf0);
|
||||
v_float64 zf0 = v_fma(xq, vA0, vA2);
|
||||
v_float64 zf1 = v_fma(xq, vA1, vA3);
|
||||
zf0 = v_fma(zf0, xq, vA4);
|
||||
|
@ -1584,7 +1584,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
|
||||
v_float32x4 _m2h = v_rotate_left<1>(_m2l);
|
||||
v_float32x4 _m3h = v_rotate_left<1>(_m3l);
|
||||
v_int16x8 _delta(0, -32768, -32768, -32768, -32768, -32768, -32768, 0);
|
||||
for( ; x <= len*3 - v_uint16x8::nlanes; x += 3*v_uint16x8::nlanes/4 )
|
||||
for( ; x <= len*3 - VTraits<v_uint16x8>::vlanes(); x += 3*VTraits<v_uint16x8>::vlanes()/4 )
|
||||
v_store(dst + x, v_rotate_right<1>(v_reinterpret_as_u16(v_add_wrap(v_pack(
|
||||
v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x ))), _m0h, _m1h, _m2h, _m3h)),
|
||||
v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x + 3))), _m0l, _m1l, _m2l, _m3l))), _delta))));
|
||||
@ -1664,10 +1664,10 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
|
||||
v_float32x4 _m2 = v_load(m + 10);
|
||||
v_float32x4 _m3 = v_load(m + 15);
|
||||
v_float32x4 _m4(m[4], m[9], m[14], m[19]);
|
||||
for( ; x < len*4; x += v_float32x4::nlanes )
|
||||
for( ; x < len*4; x += VTraits<v_float32x4>::vlanes() )
|
||||
{
|
||||
v_float32x4 v_src = v_load(src + x);
|
||||
v_store(dst + x, v_reduce_sum4(v_src * _m0, v_src * _m1, v_src * _m2, v_src * _m3) + _m4);
|
||||
v_store(dst + x, v_add(v_reduce_sum4(v_mul(v_src, _m0), v_mul(v_src, _m1), v_mul(v_src, _m2), v_mul(v_src, _m3)), _m4));
|
||||
}
|
||||
#else // CV_SIMD_WIDTH >= 16 && !CV_SIMD128
|
||||
for( ; x < len*4; x += 4 )
|
||||
@ -2113,12 +2113,12 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
|
||||
for( k = 0; k < size.height; k++, tsrc += srcstep )
|
||||
{
|
||||
v_float64x2 a = v_setall_f64((double)col_buf[k]);
|
||||
s0 += a * v_load(tsrc+0);
|
||||
s1 += a * v_load(tsrc+2);
|
||||
s0 = v_add(s0, v_mul(a, v_load(tsrc + 0)));
|
||||
s1 = v_add(s1, v_mul(a, v_load(tsrc + 2)));
|
||||
}
|
||||
|
||||
v_store((double*)(tdst+j), s0*v_scale);
|
||||
v_store((double*)(tdst+j+2), s1*v_scale);
|
||||
v_store((double*)(tdst+j), v_mul(s0, v_scale));
|
||||
v_store((double*)(tdst+j+2), v_mul(s1, v_scale));
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
@ -2174,12 +2174,12 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
|
||||
for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep )
|
||||
{
|
||||
v_float64x2 a = v_setall_f64((double)col_buf[k]);
|
||||
s0 += a * (v_load(tsrc+0) - v_load(d+0));
|
||||
s1 += a * (v_load(tsrc+2) - v_load(d+2));
|
||||
s0 = v_add(s0, v_mul(a, v_sub(v_load(tsrc + 0), v_load(d + 0))));
|
||||
s1 = v_add(s1, v_mul(a, v_sub(v_load(tsrc + 2), v_load(d + 2))));
|
||||
}
|
||||
|
||||
v_store((double*)(tdst+j), s0*v_scale);
|
||||
v_store((double*)(tdst+j+2), s1*v_scale);
|
||||
v_store((double*)(tdst+j), v_mul(s0, v_scale));
|
||||
v_store((double*)(tdst+j+2), v_mul(s1, v_scale));
|
||||
}
|
||||
else
|
||||
#endif
|
||||
@ -2249,8 +2249,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
|
||||
v_float64x2 v_s = v_setzero_f64();
|
||||
|
||||
for( k = 0; k <= size.width - 4; k += 4 )
|
||||
v_s += (v_load(v_tsrc1+k) * v_load(v_tsrc2+k)) +
|
||||
(v_load(v_tsrc1+k+2) * v_load(v_tsrc2+k+2));
|
||||
v_s = v_add(v_s, v_add(v_mul(v_load(v_tsrc1 + k), v_load(v_tsrc2 + k)), v_mul(v_load(v_tsrc1 + k + 2), v_load(v_tsrc2 + k + 2))));
|
||||
s += v_reduce_sum(v_s);
|
||||
}
|
||||
else
|
||||
@ -2303,8 +2302,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
|
||||
v_float64x2 v_s = v_setzero_f64();
|
||||
|
||||
for( k = 0; k <= size.width - 4; k += 4, v_tdelta2 += delta_shift )
|
||||
v_s += ((v_load(v_tsrc2+k) - v_load(v_tdelta2)) * v_load(v_row_buf+k)) +
|
||||
((v_load(v_tsrc2+k+2) - v_load(v_tdelta2+2)) * v_load(v_row_buf+k+2));
|
||||
v_s = v_add(v_s, v_add(v_mul(v_sub(v_load(v_tsrc2 + k), v_load(v_tdelta2)), v_load(v_row_buf + k)), v_mul(v_sub(v_load(v_tsrc2 + k + 2), v_load(v_tdelta2 + 2)), v_load(v_row_buf + k + 2))));
|
||||
s += v_reduce_sum(v_s);
|
||||
|
||||
tdelta2 = (const dT *)(v_tdelta2);
|
||||
@ -2566,7 +2564,7 @@ double dotProd_32s(const int* src1, const int* src2, int len)
|
||||
v_sum0 = v_dotprod_expand_fast(v_src10, v_src20, v_sum0);
|
||||
v_sum1 = v_dotprod_expand_fast(v_src11, v_src21, v_sum1);
|
||||
}
|
||||
v_sum0 += v_sum1;
|
||||
v_sum0 = v_add(v_sum0, v_sum1);
|
||||
#endif
|
||||
for (; i < len - step; i += step, src1 += step, src2 += step)
|
||||
{
|
||||
|
@ -356,10 +356,10 @@ void transposeND(InputArray src_, const std::vector<int>& order, OutputArray dst
|
||||
#if CV_SIMD128
|
||||
template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
|
||||
{
|
||||
typedef typename V::lane_type T;
|
||||
typedef typename VTraits<V>::lane_type T;
|
||||
int end = (int)(size.width*esz);
|
||||
int width = (end + 1)/2;
|
||||
int width_1 = width & -v_uint8x16::nlanes;
|
||||
int width_1 = width & -VTraits<v_uint8x16>::vlanes();
|
||||
int i, j;
|
||||
|
||||
#if CV_STRONG_ALIGNMENT
|
||||
@ -368,15 +368,15 @@ template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, s
|
||||
|
||||
for( ; size.height--; src += sstep, dst += dstep )
|
||||
{
|
||||
for( i = 0, j = end; i < width_1; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
|
||||
for( i = 0, j = end; i < width_1; i += VTraits<v_uint8x16>::vlanes(), j -= VTraits<v_uint8x16>::vlanes() )
|
||||
{
|
||||
V t0, t1;
|
||||
|
||||
t0 = v_load((T*)((uchar*)src + i));
|
||||
t1 = v_load((T*)((uchar*)src + j - v_uint8x16::nlanes));
|
||||
t1 = v_load((T*)((uchar*)src + j - VTraits<v_uint8x16>::vlanes()));
|
||||
t0 = v_reverse(t0);
|
||||
t1 = v_reverse(t1);
|
||||
v_store((T*)(dst + j - v_uint8x16::nlanes), t0);
|
||||
v_store((T*)(dst + j - VTraits<v_uint8x16>::vlanes()), t0);
|
||||
v_store((T*)(dst + i), t1);
|
||||
}
|
||||
if (isAligned<sizeof(T)>(src, dst))
|
||||
@ -446,14 +446,14 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
|
||||
#if CV_STRONG_ALIGNMENT
|
||||
size_t alignmentMark = ((size_t)src)|((size_t)dst)|sstep|dstep;
|
||||
#endif
|
||||
if (esz == 2 * v_uint8x16::nlanes)
|
||||
if (esz == 2 * (size_t)VTraits<v_uint8x16>::vlanes())
|
||||
{
|
||||
int end = (int)(size.width*esz);
|
||||
int width = end/2;
|
||||
|
||||
for( ; size.height--; src += sstep, dst += dstep )
|
||||
{
|
||||
for( int i = 0, j = end - 2 * v_uint8x16::nlanes; i < width; i += 2 * v_uint8x16::nlanes, j -= 2 * v_uint8x16::nlanes )
|
||||
for( int i = 0, j = end - 2 * VTraits<v_uint8x16>::vlanes(); i < width; i += 2 * VTraits<v_uint8x16>::vlanes(), j -= 2 * VTraits<v_uint8x16>::vlanes() )
|
||||
{
|
||||
#if CV_SIMD256
|
||||
v_uint8x32 t0, t1;
|
||||
@ -466,25 +466,25 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
|
||||
v_uint8x16 t0, t1, t2, t3;
|
||||
|
||||
t0 = v_load((uchar*)src + i);
|
||||
t1 = v_load((uchar*)src + i + v_uint8x16::nlanes);
|
||||
t1 = v_load((uchar*)src + i + VTraits<v_uint8x16>::vlanes());
|
||||
t2 = v_load((uchar*)src + j);
|
||||
t3 = v_load((uchar*)src + j + v_uint8x16::nlanes);
|
||||
t3 = v_load((uchar*)src + j + VTraits<v_uint8x16>::vlanes());
|
||||
v_store(dst + j, t0);
|
||||
v_store(dst + j + v_uint8x16::nlanes, t1);
|
||||
v_store(dst + j + VTraits<v_uint8x16>::vlanes(), t1);
|
||||
v_store(dst + i, t2);
|
||||
v_store(dst + i + v_uint8x16::nlanes, t3);
|
||||
v_store(dst + i + VTraits<v_uint8x16>::vlanes(), t3);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (esz == v_uint8x16::nlanes)
|
||||
else if (esz == (size_t)VTraits<v_uint8x16>::vlanes())
|
||||
{
|
||||
int end = (int)(size.width*esz);
|
||||
int width = end/2;
|
||||
|
||||
for( ; size.height--; src += sstep, dst += dstep )
|
||||
{
|
||||
for( int i = 0, j = end - v_uint8x16::nlanes; i < width; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
|
||||
for( int i = 0, j = end - VTraits<v_uint8x16>::vlanes(); i < width; i += VTraits<v_uint8x16>::vlanes(), j -= VTraits<v_uint8x16>::vlanes() )
|
||||
{
|
||||
v_uint8x16 t0, t1;
|
||||
|
||||
@ -534,19 +534,19 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
|
||||
|
||||
for( ; size.height--; src += sstep, dst += dstep )
|
||||
{
|
||||
for ( int i = 0, j = end; i < width; i += v_uint8x16::nlanes + sizeof(uint64_t), j -= v_uint8x16::nlanes + sizeof(uint64_t) )
|
||||
for ( int i = 0, j = end; i < width; i += VTraits<v_uint8x16>::vlanes() + sizeof(uint64_t), j -= VTraits<v_uint8x16>::vlanes() + sizeof(uint64_t) )
|
||||
{
|
||||
v_uint8x16 t0, t1;
|
||||
uint64_t t2, t3;
|
||||
|
||||
t0 = v_load((uchar*)src + i);
|
||||
t2 = *((uint64_t*)((uchar*)src + i + v_uint8x16::nlanes));
|
||||
t1 = v_load((uchar*)src + j - v_uint8x16::nlanes - sizeof(uint64_t));
|
||||
t2 = *((uint64_t*)((uchar*)src + i + VTraits<v_uint8x16>::vlanes()));
|
||||
t1 = v_load((uchar*)src + j - VTraits<v_uint8x16>::vlanes() - sizeof(uint64_t));
|
||||
t3 = *((uint64_t*)((uchar*)src + j - sizeof(uint64_t)));
|
||||
v_store(dst + j - v_uint8x16::nlanes - sizeof(uint64_t), t0);
|
||||
v_store(dst + j - VTraits<v_uint8x16>::vlanes() - sizeof(uint64_t), t0);
|
||||
*((uint64_t*)(dst + j - sizeof(uint64_t))) = t2;
|
||||
v_store(dst + i, t1);
|
||||
*((uint64_t*)(dst + i + v_uint8x16::nlanes)) = t3;
|
||||
*((uint64_t*)(dst + i + VTraits<v_uint8x16>::vlanes())) = t3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -141,7 +141,7 @@ CV_ALWAYS_INLINE uint64_t v_reduce_min(const v_uint64x2& a)

CV_ALWAYS_INLINE v_uint64x2 v_select(const v_uint64x2& mask, const v_uint64x2& a, const v_uint64x2& b)
{
return b ^ ((a ^ b) & mask);
return v_xor(b, v_and(v_xor(a, b), mask));
}
#endif

@@ -151,16 +151,16 @@ minMaxIdx_reduce_##suffix( VT &valMin, VT &valMax, IT &idxMin, IT &idxMax, IT &n
T &minVal, T &maxVal, size_t &minIdx, size_t &maxIdx, \
size_t delta ) \
{ \
if ( v_check_any(idxMin != none) ) \
if ( v_check_any(v_ne(idxMin, none)) ) \
{ \
minVal = v_reduce_min(valMin); \
minIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_setall_##suffix((IR)minVal) == valMin), \
minIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_eq(v_setall_##suffix((IR)minVal), valMin)), \
idxMin, v_setall_##suffix2(maxLimit))) + delta; \
} \
if ( v_check_any(idxMax != none) ) \
if ( v_check_any(v_ne(idxMax, none)) ) \
{ \
maxVal = v_reduce_max(valMax); \
maxIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_setall_##suffix((IR)maxVal) == valMax), \
maxIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_eq(v_setall_##suffix((IR)maxVal), valMax)), \
idxMax, v_setall_##suffix2(maxLimit))) + delta; \
} \
}
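The macro above locates the extremum's position without branches: lanes whose value equals the reduced minimum (or maximum) keep their running index, every other lane is replaced by maxLimit, and a final v_reduce_min picks the smallest surviving index. A hedged, self-contained sketch of the same idea for 16 unmasked bytes (index_of_min16 is invented; the intrinsics are real, assuming a CV_SIMD128 build):

// Return the index (0..15) of the first minimum byte in p[0..15].
static int index_of_min16(const uchar* p)
{
    v_uint8x16 val = v_load(p);
    v_uint8x16 idx(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    uchar m = v_reduce_min(val);                                   // smallest value
    v_uint8x16 keep = v_select(v_eq(val, v_setall_u8(m)),          // keep idx where val == m
                               idx, v_setall_u8(255));             // otherwise poison with 255
    return (int)v_reduce_min(keep);                                // smallest matching index
}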
|
||||
@ -210,18 +210,18 @@ static void minMaxIdx_8u(const uchar* src, const uchar* mask, int* minval, int*
size_t* minidx, size_t* maxidx, int len, size_t startidx )
{
#if CV_SIMD128
if ( len >= v_uint8x16::nlanes )
if ( len >= VTraits<v_uint8x16>::vlanes() )
{
int j, len0;
int minVal, maxVal;
size_t minIdx, maxIdx;

minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx,
(int)0, (int)UCHAR_MAX, v_uint8x16::nlanes, len, startidx, j, len0 );
(int)0, (int)UCHAR_MAX, VTraits<v_uint8x16>::vlanes(), len, startidx, j, len0 );

if ( j <= len0 - v_uint8x16::nlanes )
if ( j <= len0 - VTraits<v_uint8x16>::vlanes() )
{
v_uint8x16 inc = v_setall_u8(v_uint8x16::nlanes);
v_uint8x16 inc = v_setall_u8((uchar)VTraits<v_uint8x16>::vlanes());
v_uint8x16 none = v_reinterpret_as_u8(v_setall_s8(-1));
v_uint8x16 idxStart(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

@ -235,31 +235,31 @@ static void minMaxIdx_8u(const uchar* src, const uchar* mask, int* minval, int*

if ( !mask )
{
for( ; k < std::min(len0, j + 15 * v_uint8x16::nlanes); k += v_uint8x16::nlanes )
for( ; k < std::min(len0, j + 15 * VTraits<v_uint8x16>::vlanes()); k += VTraits<v_uint8x16>::vlanes() )
{
v_uint8x16 data = v_load(src + k);
v_uint8x16 cmpMin = (data < valMin);
v_uint8x16 cmpMax = (data > valMax);
v_uint8x16 cmpMin = (v_lt(data, valMin));
v_uint8x16 cmpMax = (v_gt(data, valMax));
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_min(data, valMin);
valMax = v_max(data, valMax);
idx += inc;
idx = v_add(idx, inc);
}
}
else
{
for( ; k < std::min(len0, j + 15 * v_uint8x16::nlanes); k += v_uint8x16::nlanes )
for( ; k < std::min(len0, j + 15 * VTraits<v_uint8x16>::vlanes()); k += VTraits<v_uint8x16>::vlanes() )
{
v_uint8x16 data = v_load(src + k);
v_uint8x16 maskVal = v_load(mask + k) != v_setzero_u8();
v_uint8x16 cmpMin = (data < valMin) & maskVal;
v_uint8x16 cmpMax = (data > valMax) & maskVal;
v_uint8x16 maskVal = v_ne(v_load(mask + k), v_setzero_u8());
v_uint8x16 cmpMin = v_and(v_lt(data, valMin), maskVal);
v_uint8x16 cmpMax = v_and(v_gt(data, valMax), maskVal);
idxMin = v_select(cmpMin, idx, idxMin);
idxMax = v_select(cmpMax, idx, idxMax);
valMin = v_select(cmpMin, data, valMin);
valMax = v_select(cmpMax, data, valMax);
idx += inc;
idx = v_add(idx, inc);
}
}
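
The masked branch above illustrates the full rewrite: comparisons become v_lt/v_gt/v_ne, bitwise combination becomes v_and, and accumulation becomes v_add. A condensed sketch of that update step follows; it is illustrative only (not from the patch), assumes OpenCV 4.x with CV_SIMD128, and the function name masked_min is hypothetical.

#include <opencv2/core/hal/intrin.hpp>

// Keep the per-lane minimum of src, but only where the mask byte is non-zero.
static void masked_min(const uchar* src, const uchar* mask, uchar* out16)
{
    using namespace cv;
#if CV_SIMD128
    v_uint8x16 valMin = v_setall_u8(255);
    v_uint8x16 data = v_load(src);
    v_uint8x16 m    = v_ne(v_load(mask), v_setzero_u8());   // lanes with mask != 0
    v_uint8x16 cmp  = v_and(v_lt(data, valMin), m);         // masked lanes below current min
    valMin = v_select(cmp, data, valMin);
    v_store(out16, valMin);
#else
    for (int i = 0; i < 16; ++i)
        out16[i] = mask[i] ? src[i] : (uchar)255;
#endif
}
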
@ -287,18 +287,18 @@ static void minMaxIdx_8s(const schar* src, const uchar* mask, int* minval, int*
|
||||
size_t* minidx, size_t* maxidx, int len, size_t startidx )
|
||||
{
|
||||
#if CV_SIMD128
|
||||
if ( len >= v_int8x16::nlanes )
|
||||
if ( len >= VTraits<v_int8x16>::vlanes() )
|
||||
{
|
||||
int j, len0;
|
||||
int minVal, maxVal;
|
||||
size_t minIdx, maxIdx;
|
||||
|
||||
minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx,
|
||||
(int)SCHAR_MIN, (int)SCHAR_MAX, v_int8x16::nlanes, len, startidx, j, len0 );
|
||||
(int)SCHAR_MIN, (int)SCHAR_MAX, VTraits<v_int8x16>::vlanes(), len, startidx, j, len0 );
|
||||
|
||||
if ( j <= len0 - v_int8x16::nlanes )
|
||||
if ( j <= len0 - VTraits<v_int8x16>::vlanes() )
|
||||
{
|
||||
v_uint8x16 inc = v_setall_u8(v_int8x16::nlanes);
|
||||
v_uint8x16 inc = v_setall_u8((uchar)VTraits<v_int8x16>::vlanes());
|
||||
v_uint8x16 none = v_reinterpret_as_u8(v_setall_s8(-1));
|
||||
v_uint8x16 idxStart(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
@ -312,31 +312,31 @@ static void minMaxIdx_8s(const schar* src, const uchar* mask, int* minval, int*
|
||||
|
||||
if ( !mask )
|
||||
{
|
||||
for( ; k < std::min(len0, j + 15 * v_int8x16::nlanes); k += v_int8x16::nlanes )
|
||||
for( ; k < std::min(len0, j + 15 * VTraits<v_int8x16>::vlanes()); k += VTraits<v_int8x16>::vlanes() )
|
||||
{
|
||||
v_int8x16 data = v_load(src + k);
|
||||
v_uint8x16 cmpMin = v_reinterpret_as_u8(data < valMin);
|
||||
v_uint8x16 cmpMax = v_reinterpret_as_u8(data > valMax);
|
||||
v_uint8x16 cmpMin = v_reinterpret_as_u8(v_lt(data, valMin));
|
||||
v_uint8x16 cmpMax = v_reinterpret_as_u8(v_gt(data, valMax));
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_min(data, valMin);
|
||||
valMax = v_max(data, valMax);
|
||||
idx += inc;
|
||||
idx = v_add(idx, inc);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for( ; k < std::min(len0, j + 15 * v_int8x16::nlanes); k += v_int8x16::nlanes )
|
||||
for( ; k < std::min(len0, j + 15 * VTraits<v_int8x16>::vlanes()); k += VTraits<v_int8x16>::vlanes() )
|
||||
{
|
||||
v_int8x16 data = v_load(src + k);
|
||||
v_uint8x16 maskVal = v_load(mask + k) != v_setzero_u8();
|
||||
v_uint8x16 cmpMin = v_reinterpret_as_u8(data < valMin) & maskVal;
|
||||
v_uint8x16 cmpMax = v_reinterpret_as_u8(data > valMax) & maskVal;
|
||||
v_uint8x16 maskVal = v_ne(v_load(mask + k), v_setzero_u8());
|
||||
v_uint8x16 cmpMin = v_and(v_reinterpret_as_u8(v_lt(data, valMin)), maskVal);
|
||||
v_uint8x16 cmpMax = v_and(v_reinterpret_as_u8(v_gt(data, valMax)), maskVal);
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_select(v_reinterpret_as_s8(cmpMin), data, valMin);
|
||||
valMax = v_select(v_reinterpret_as_s8(cmpMax), data, valMax);
|
||||
idx += inc;
|
||||
idx = v_add(idx, inc);
|
||||
}
|
||||
}
|
||||
|
||||
@ -364,18 +364,18 @@ static void minMaxIdx_16u(const ushort* src, const uchar* mask, int* minval, int
|
||||
size_t* minidx, size_t* maxidx, int len, size_t startidx )
|
||||
{
|
||||
#if CV_SIMD128
|
||||
if ( len >= v_uint16x8::nlanes )
|
||||
if ( len >= VTraits<v_uint16x8>::vlanes() )
|
||||
{
|
||||
int j, len0;
|
||||
int minVal, maxVal;
|
||||
size_t minIdx, maxIdx;
|
||||
|
||||
minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx,
|
||||
(int)0, (int)USHRT_MAX, v_uint16x8::nlanes, len, startidx, j, len0 );
|
||||
(int)0, (int)USHRT_MAX, VTraits<v_uint16x8>::vlanes(), len, startidx, j, len0 );
|
||||
|
||||
if ( j <= len0 - v_uint16x8::nlanes )
|
||||
if ( j <= len0 - VTraits<v_uint16x8>::vlanes() )
|
||||
{
|
||||
v_uint16x8 inc = v_setall_u16(v_uint16x8::nlanes);
|
||||
v_uint16x8 inc = v_setall_u16((uchar)VTraits<v_uint16x8>::vlanes());
|
||||
v_uint16x8 none = v_reinterpret_as_u16(v_setall_s16(-1));
|
||||
v_uint16x8 idxStart(0, 1, 2, 3, 4, 5, 6, 7);
|
||||
|
||||
@ -389,31 +389,31 @@ static void minMaxIdx_16u(const ushort* src, const uchar* mask, int* minval, int
|
||||
|
||||
if ( !mask )
|
||||
{
|
||||
for( ; k < std::min(len0, j + 8191 * v_uint16x8::nlanes); k += v_uint16x8::nlanes )
|
||||
for( ; k < std::min(len0, j + 8191 * VTraits<v_uint16x8>::vlanes()); k += VTraits<v_uint16x8>::vlanes() )
|
||||
{
|
||||
v_uint16x8 data = v_load(src + k);
|
||||
v_uint16x8 cmpMin = (data < valMin);
|
||||
v_uint16x8 cmpMax = (data > valMax);
|
||||
v_uint16x8 cmpMin = (v_lt(data, valMin));
|
||||
v_uint16x8 cmpMax = (v_gt(data, valMax));
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_min(data, valMin);
|
||||
valMax = v_max(data, valMax);
|
||||
idx += inc;
|
||||
idx = v_add(idx, inc);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for( ; k < std::min(len0, j + 8191 * v_uint16x8::nlanes); k += v_uint16x8::nlanes )
|
||||
for( ; k < std::min(len0, j + 8191 * VTraits<v_uint16x8>::vlanes()); k += VTraits<v_uint16x8>::vlanes() )
|
||||
{
|
||||
v_uint16x8 data = v_load(src + k);
|
||||
v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16();
|
||||
v_uint16x8 cmpMin = (data < valMin) & maskVal;
|
||||
v_uint16x8 cmpMax = (data > valMax) & maskVal;
|
||||
v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16());
|
||||
v_uint16x8 cmpMin = v_and(v_lt(data, valMin), maskVal);
|
||||
v_uint16x8 cmpMax = v_and(v_gt(data, valMax), maskVal);
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_select(cmpMin, data, valMin);
|
||||
valMax = v_select(cmpMax, data, valMax);
|
||||
idx += inc;
|
||||
idx = v_add(idx, inc);
|
||||
}
|
||||
}
|
||||
|
||||
@ -441,18 +441,18 @@ static void minMaxIdx_16s(const short* src, const uchar* mask, int* minval, int*
|
||||
size_t* minidx, size_t* maxidx, int len, size_t startidx )
|
||||
{
|
||||
#if CV_SIMD128
|
||||
if ( len >= v_int16x8::nlanes )
|
||||
if ( len >= VTraits<v_int16x8>::vlanes() )
|
||||
{
|
||||
int j, len0;
|
||||
int minVal, maxVal;
|
||||
size_t minIdx, maxIdx;
|
||||
|
||||
minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx,
|
||||
(int)SHRT_MIN, (int)SHRT_MAX, v_int16x8::nlanes, len, startidx, j, len0 );
|
||||
(int)SHRT_MIN, (int)SHRT_MAX, VTraits<v_int16x8>::vlanes(), len, startidx, j, len0 );
|
||||
|
||||
if ( j <= len0 - v_int16x8::nlanes )
|
||||
if ( j <= len0 - VTraits<v_int16x8>::vlanes() )
|
||||
{
|
||||
v_uint16x8 inc = v_setall_u16(v_int16x8::nlanes);
|
||||
v_uint16x8 inc = v_setall_u16((uchar)VTraits<v_int16x8>::vlanes());
|
||||
v_uint16x8 none = v_reinterpret_as_u16(v_setall_s16(-1));
|
||||
v_uint16x8 idxStart(0, 1, 2, 3, 4, 5, 6, 7);
|
||||
|
||||
@ -466,31 +466,31 @@ static void minMaxIdx_16s(const short* src, const uchar* mask, int* minval, int*
|
||||
|
||||
if ( !mask )
|
||||
{
|
||||
for( ; k < std::min(len0, j + 8191 * v_int16x8::nlanes); k += v_int16x8::nlanes )
|
||||
for( ; k < std::min(len0, j + 8191 * VTraits<v_int16x8>::vlanes()); k += VTraits<v_int16x8>::vlanes() )
|
||||
{
|
||||
v_int16x8 data = v_load(src + k);
|
||||
v_uint16x8 cmpMin = v_reinterpret_as_u16(data < valMin);
|
||||
v_uint16x8 cmpMax = v_reinterpret_as_u16(data > valMax);
|
||||
v_uint16x8 cmpMin = v_reinterpret_as_u16(v_lt(data, valMin));
|
||||
v_uint16x8 cmpMax = v_reinterpret_as_u16(v_gt(data, valMax));
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_min(data, valMin);
|
||||
valMax = v_max(data, valMax);
|
||||
idx += inc;
|
||||
idx = v_add(idx, inc);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for( ; k < std::min(len0, j + 8191 * v_int16x8::nlanes); k += v_int16x8::nlanes )
|
||||
for( ; k < std::min(len0, j + 8191 * VTraits<v_int16x8>::vlanes()); k += VTraits<v_int16x8>::vlanes() )
|
||||
{
|
||||
v_int16x8 data = v_load(src + k);
|
||||
v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16();
|
||||
v_uint16x8 cmpMin = v_reinterpret_as_u16(data < valMin) & maskVal;
|
||||
v_uint16x8 cmpMax = v_reinterpret_as_u16(data > valMax) & maskVal;
|
||||
v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16());
|
||||
v_uint16x8 cmpMin = v_and(v_reinterpret_as_u16(v_lt(data, valMin)), maskVal);
|
||||
v_uint16x8 cmpMax = v_and(v_reinterpret_as_u16(v_gt(data, valMax)), maskVal);
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_select(v_reinterpret_as_s16(cmpMin), data, valMin);
|
||||
valMax = v_select(v_reinterpret_as_s16(cmpMax), data, valMax);
|
||||
idx += inc;
|
||||
idx = v_add(idx, inc);
|
||||
}
|
||||
}
|
||||
|
||||
@ -518,14 +518,14 @@ static void minMaxIdx_32s(const int* src, const uchar* mask, int* minval, int* m
|
||||
size_t* minidx, size_t* maxidx, int len, size_t startidx )
|
||||
{
|
||||
#if CV_SIMD128
|
||||
if ( len >= 2 * v_int32x4::nlanes )
|
||||
if ( len >= 2 * VTraits<v_int32x4>::vlanes() )
|
||||
{
|
||||
int j = 0, len0 = len & -(2 * v_int32x4::nlanes);
|
||||
int j = 0, len0 = len & -(2 * VTraits<v_int32x4>::vlanes());
|
||||
int minVal = *minval, maxVal = *maxval;
|
||||
size_t minIdx = *minidx, maxIdx = *maxidx;
|
||||
|
||||
{
|
||||
v_uint32x4 inc = v_setall_u32(v_int32x4::nlanes);
|
||||
v_uint32x4 inc = v_setall_u32(VTraits<v_int32x4>::vlanes());
|
||||
v_uint32x4 none = v_reinterpret_as_u32(v_setall_s32(-1));
|
||||
v_uint32x4 idxStart(0, 1, 2, 3);
|
||||
|
||||
@ -539,49 +539,49 @@ static void minMaxIdx_32s(const int* src, const uchar* mask, int* minval, int* m
|
||||
|
||||
if ( !mask )
|
||||
{
|
||||
for( ; k < std::min(len0, j + 32766 * 2 * v_int32x4::nlanes); k += 2 * v_int32x4::nlanes )
|
||||
for( ; k < std::min(len0, j + 32766 * 2 * VTraits<v_int32x4>::vlanes()); k += 2 * VTraits<v_int32x4>::vlanes() )
|
||||
{
|
||||
v_int32x4 data = v_load(src + k);
|
||||
v_uint32x4 cmpMin = v_reinterpret_as_u32(data < valMin);
|
||||
v_uint32x4 cmpMax = v_reinterpret_as_u32(data > valMax);
|
||||
v_uint32x4 cmpMin = v_reinterpret_as_u32(v_lt(data, valMin));
|
||||
v_uint32x4 cmpMax = v_reinterpret_as_u32(v_gt(data, valMax));
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_min(data, valMin);
|
||||
valMax = v_max(data, valMax);
|
||||
idx += inc;
|
||||
data = v_load(src + k + v_int32x4::nlanes);
|
||||
cmpMin = v_reinterpret_as_u32(data < valMin);
|
||||
cmpMax = v_reinterpret_as_u32(data > valMax);
|
||||
idx = v_add(idx, inc);
|
||||
data = v_load(src + k + VTraits<v_int32x4>::vlanes());
|
||||
cmpMin = v_reinterpret_as_u32(v_lt(data, valMin));
|
||||
cmpMax = v_reinterpret_as_u32(v_gt(data, valMax));
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_min(data, valMin);
|
||||
valMax = v_max(data, valMax);
|
||||
idx += inc;
|
||||
idx = v_add(idx, inc);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for( ; k < std::min(len0, j + 32766 * 2 * v_int32x4::nlanes); k += 2 * v_int32x4::nlanes )
|
||||
for( ; k < std::min(len0, j + 32766 * 2 * VTraits<v_int32x4>::vlanes()); k += 2 * VTraits<v_int32x4>::vlanes() )
|
||||
{
|
||||
v_int32x4 data = v_load(src + k);
|
||||
v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16();
|
||||
v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16());
|
||||
v_int32x4 maskVal1, maskVal2;
|
||||
v_expand(v_reinterpret_as_s16(maskVal), maskVal1, maskVal2);
|
||||
v_uint32x4 cmpMin = v_reinterpret_as_u32((data < valMin) & maskVal1);
|
||||
v_uint32x4 cmpMax = v_reinterpret_as_u32((data > valMax) & maskVal1);
|
||||
v_uint32x4 cmpMin = v_reinterpret_as_u32(v_and(v_lt(data, valMin), maskVal1));
|
||||
v_uint32x4 cmpMax = v_reinterpret_as_u32(v_and(v_gt(data, valMax), maskVal1));
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_select(v_reinterpret_as_s32(cmpMin), data, valMin);
|
||||
valMax = v_select(v_reinterpret_as_s32(cmpMax), data, valMax);
|
||||
idx += inc;
|
||||
data = v_load(src + k + v_int32x4::nlanes);
|
||||
cmpMin = v_reinterpret_as_u32((data < valMin) & maskVal2);
|
||||
cmpMax = v_reinterpret_as_u32((data > valMax) & maskVal2);
|
||||
idx = v_add(idx, inc);
|
||||
data = v_load(src + k + VTraits<v_int32x4>::vlanes());
|
||||
cmpMin = v_reinterpret_as_u32(v_and(v_lt(data, valMin), maskVal2));
|
||||
cmpMax = v_reinterpret_as_u32(v_and(v_gt(data, valMax), maskVal2));
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_select(v_reinterpret_as_s32(cmpMin), data, valMin);
|
||||
valMax = v_select(v_reinterpret_as_s32(cmpMax), data, valMax);
|
||||
idx += inc;
|
||||
idx = v_add(idx, inc);
|
||||
}
|
||||
}
|
||||
|
||||
@ -609,18 +609,18 @@ static void minMaxIdx_32f(const float* src, const uchar* mask, float* minval, fl
|
||||
size_t* minidx, size_t* maxidx, int len, size_t startidx )
|
||||
{
|
||||
#if CV_SIMD128
|
||||
if ( len >= 2 * v_float32x4::nlanes )
|
||||
if ( len >= 2 * VTraits<v_float32x4>::vlanes() )
|
||||
{
|
||||
int j, len0;
|
||||
float minVal, maxVal;
|
||||
size_t minIdx, maxIdx;
|
||||
|
||||
minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx,
|
||||
FLT_MIN, FLT_MAX, 2 * v_float32x4::nlanes, len, startidx, j, len0 );
|
||||
FLT_MIN, FLT_MAX, 2 * VTraits<v_float32x4>::vlanes(), len, startidx, j, len0 );
|
||||
|
||||
if ( j <= len0 - 2 * v_float32x4::nlanes )
|
||||
if ( j <= len0 - 2 * VTraits<v_float32x4>::vlanes() )
|
||||
{
|
||||
v_uint32x4 inc = v_setall_u32(v_float32x4::nlanes);
|
||||
v_uint32x4 inc = v_setall_u32(VTraits<v_float32x4>::vlanes());
|
||||
v_uint32x4 none = v_reinterpret_as_u32(v_setall_s32(-1));
|
||||
v_uint32x4 idxStart(0, 1, 2, 3);
|
||||
|
||||
@ -634,49 +634,49 @@ static void minMaxIdx_32f(const float* src, const uchar* mask, float* minval, fl
|
||||
|
||||
if ( !mask )
|
||||
{
|
||||
for( ; k < std::min(len0, j + 32766 * 2 * v_float32x4::nlanes); k += 2 * v_float32x4::nlanes )
|
||||
for( ; k < std::min(len0, j + 32766 * 2 * VTraits<v_float32x4>::vlanes()); k += 2 * VTraits<v_float32x4>::vlanes() )
|
||||
{
|
||||
v_float32x4 data = v_load(src + k);
|
||||
v_uint32x4 cmpMin = v_reinterpret_as_u32(data < valMin);
|
||||
v_uint32x4 cmpMax = v_reinterpret_as_u32(data > valMax);
|
||||
v_uint32x4 cmpMin = v_reinterpret_as_u32(v_lt(data, valMin));
|
||||
v_uint32x4 cmpMax = v_reinterpret_as_u32(v_gt(data, valMax));
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_min(data, valMin);
|
||||
valMax = v_max(data, valMax);
|
||||
idx += inc;
|
||||
data = v_load(src + k + v_float32x4::nlanes);
|
||||
cmpMin = v_reinterpret_as_u32(data < valMin);
|
||||
cmpMax = v_reinterpret_as_u32(data > valMax);
|
||||
idx = v_add(idx, inc);
|
||||
data = v_load(src + k + VTraits<v_float32x4>::vlanes());
|
||||
cmpMin = v_reinterpret_as_u32(v_lt(data, valMin));
|
||||
cmpMax = v_reinterpret_as_u32(v_gt(data, valMax));
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_min(data, valMin);
|
||||
valMax = v_max(data, valMax);
|
||||
idx += inc;
|
||||
idx = v_add(idx, inc);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for( ; k < std::min(len0, j + 32766 * 2 * v_float32x4::nlanes); k += 2 * v_float32x4::nlanes )
|
||||
for( ; k < std::min(len0, j + 32766 * 2 * VTraits<v_float32x4>::vlanes()); k += 2 * VTraits<v_float32x4>::vlanes() )
|
||||
{
|
||||
v_float32x4 data = v_load(src + k);
|
||||
v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16();
|
||||
v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16());
|
||||
v_int32x4 maskVal1, maskVal2;
|
||||
v_expand(v_reinterpret_as_s16(maskVal), maskVal1, maskVal2);
|
||||
v_uint32x4 cmpMin = v_reinterpret_as_u32(v_reinterpret_as_s32(data < valMin) & maskVal1);
|
||||
v_uint32x4 cmpMax = v_reinterpret_as_u32(v_reinterpret_as_s32(data > valMax) & maskVal1);
|
||||
v_uint32x4 cmpMin = v_reinterpret_as_u32(v_and(v_reinterpret_as_s32(v_lt(data, valMin)), maskVal1));
|
||||
v_uint32x4 cmpMax = v_reinterpret_as_u32(v_and(v_reinterpret_as_s32(v_gt(data, valMax)), maskVal1));
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_select(v_reinterpret_as_f32(cmpMin), data, valMin);
|
||||
valMax = v_select(v_reinterpret_as_f32(cmpMax), data, valMax);
|
||||
idx += inc;
|
||||
data = v_load(src + k + v_float32x4::nlanes);
|
||||
cmpMin = v_reinterpret_as_u32(v_reinterpret_as_s32(data < valMin) & maskVal2);
|
||||
cmpMax = v_reinterpret_as_u32(v_reinterpret_as_s32(data > valMax) & maskVal2);
|
||||
idx = v_add(idx, inc);
|
||||
data = v_load(src + k + VTraits<v_float32x4>::vlanes());
|
||||
cmpMin = v_reinterpret_as_u32(v_and(v_reinterpret_as_s32(v_lt(data, valMin)), maskVal2));
|
||||
cmpMax = v_reinterpret_as_u32(v_and(v_reinterpret_as_s32(v_gt(data, valMax)), maskVal2));
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_select(v_reinterpret_as_f32(cmpMin), data, valMin);
|
||||
valMax = v_select(v_reinterpret_as_f32(cmpMax), data, valMax);
|
||||
idx += inc;
|
||||
idx = v_add(idx, inc);
|
||||
}
|
||||
}
|
||||
|
||||
@ -704,18 +704,18 @@ static void minMaxIdx_64f(const double* src, const uchar* mask, double* minval,
|
||||
size_t* minidx, size_t* maxidx, int len, size_t startidx )
|
||||
{
|
||||
#if CV_SIMD128_64F
|
||||
if ( len >= 4 * v_float64x2::nlanes )
|
||||
if ( len >= 4 * VTraits<v_float64x2>::vlanes() )
|
||||
{
|
||||
int j, len0;
|
||||
double minVal, maxVal;
|
||||
size_t minIdx, maxIdx;
|
||||
|
||||
minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx,
|
||||
DBL_MIN, DBL_MAX, 4 * v_float64x2::nlanes, len, startidx, j, len0 );
|
||||
DBL_MIN, DBL_MAX, 4 * VTraits<v_float64x2>::vlanes(), len, startidx, j, len0 );
|
||||
|
||||
if ( j <= len0 - 4 * v_float64x2::nlanes )
|
||||
if ( j <= len0 - 4 * VTraits<v_float64x2>::vlanes() )
|
||||
{
|
||||
v_uint64x2 inc = v_setall_u64(v_float64x2::nlanes);
|
||||
v_uint64x2 inc = v_setall_u64(VTraits<v_float64x2>::vlanes());
|
||||
v_uint64x2 none = v_reinterpret_as_u64(v_setall_s64(-1));
|
||||
v_uint64x2 idxStart(0, 1);
|
||||
|
||||
@ -729,84 +729,84 @@ static void minMaxIdx_64f(const double* src, const uchar* mask, double* minval,
|
||||
|
||||
if ( !mask )
|
||||
{
|
||||
for( ; k < std::min(len0, j + 32764 * 4 * v_float64x2::nlanes); k += 4 * v_float64x2::nlanes )
|
||||
for( ; k < std::min(len0, j + 32764 * 4 * VTraits<v_float64x2>::vlanes()); k += 4 * VTraits<v_float64x2>::vlanes() )
|
||||
{
|
||||
v_float64x2 data = v_load(src + k);
|
||||
v_uint64x2 cmpMin = v_reinterpret_as_u64(data < valMin);
|
||||
v_uint64x2 cmpMax = v_reinterpret_as_u64(data > valMax);
|
||||
v_uint64x2 cmpMin = v_reinterpret_as_u64(v_lt(data, valMin));
|
||||
v_uint64x2 cmpMax = v_reinterpret_as_u64(v_gt(data, valMax));
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_min(data, valMin);
|
||||
valMax = v_max(data, valMax);
|
||||
idx += inc;
|
||||
data = v_load(src + k + v_float64x2::nlanes);
|
||||
cmpMin = v_reinterpret_as_u64(data < valMin);
|
||||
cmpMax = v_reinterpret_as_u64(data > valMax);
|
||||
idx = v_add(idx, inc);
|
||||
data = v_load(src + k + VTraits<v_float64x2>::vlanes());
|
||||
cmpMin = v_reinterpret_as_u64(v_lt(data, valMin));
|
||||
cmpMax = v_reinterpret_as_u64(v_gt(data, valMax));
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_min(data, valMin);
|
||||
valMax = v_max(data, valMax);
|
||||
idx += inc;
|
||||
data = v_load(src + k + 2 * v_float64x2::nlanes);
|
||||
cmpMin = v_reinterpret_as_u64(data < valMin);
|
||||
cmpMax = v_reinterpret_as_u64(data > valMax);
|
||||
idx = v_add(idx, inc);
|
||||
data = v_load(src + k + 2 * VTraits<v_float64x2>::vlanes());
|
||||
cmpMin = v_reinterpret_as_u64(v_lt(data, valMin));
|
||||
cmpMax = v_reinterpret_as_u64(v_gt(data, valMax));
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_min(data, valMin);
|
||||
valMax = v_max(data, valMax);
|
||||
idx += inc;
|
||||
data = v_load(src + k + 3 * v_float64x2::nlanes);
|
||||
cmpMin = v_reinterpret_as_u64(data < valMin);
|
||||
cmpMax = v_reinterpret_as_u64(data > valMax);
|
||||
idx = v_add(idx, inc);
|
||||
data = v_load(src + k + 3 * VTraits<v_float64x2>::vlanes());
|
||||
cmpMin = v_reinterpret_as_u64(v_lt(data, valMin));
|
||||
cmpMax = v_reinterpret_as_u64(v_gt(data, valMax));
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_min(data, valMin);
|
||||
valMax = v_max(data, valMax);
|
||||
idx += inc;
|
||||
idx = v_add(idx, inc);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for( ; k < std::min(len0, j + 32764 * 4 * v_float64x2::nlanes); k += 4 * v_float64x2::nlanes )
|
||||
for( ; k < std::min(len0, j + 32764 * 4 * VTraits<v_float64x2>::vlanes()); k += 4 * VTraits<v_float64x2>::vlanes() )
|
||||
{
|
||||
v_float64x2 data = v_load(src + k);
|
||||
v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16();
|
||||
v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16());
|
||||
v_int32x4 maskVal1, maskVal2;
|
||||
v_expand(v_reinterpret_as_s16(maskVal), maskVal1, maskVal2);
|
||||
v_int64x2 maskVal3, maskVal4;
|
||||
v_expand(maskVal1, maskVal3, maskVal4);
|
||||
v_uint64x2 cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal3);
|
||||
v_uint64x2 cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal3);
|
||||
v_uint64x2 cmpMin = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_lt(data, valMin)), maskVal3));
|
||||
v_uint64x2 cmpMax = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_gt(data, valMax)), maskVal3));
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin);
|
||||
valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax);
|
||||
idx += inc;
|
||||
data = v_load(src + k + v_float64x2::nlanes);
|
||||
cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal4);
|
||||
cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal4);
|
||||
idx = v_add(idx, inc);
|
||||
data = v_load(src + k + VTraits<v_float64x2>::vlanes());
|
||||
cmpMin = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_lt(data, valMin)), maskVal4));
|
||||
cmpMax = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_gt(data, valMax)), maskVal4));
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin);
|
||||
valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax);
|
||||
idx += inc;
|
||||
data = v_load(src + k + 2 * v_float64x2::nlanes);
|
||||
idx = v_add(idx, inc);
|
||||
data = v_load(src + k + 2 * VTraits<v_float64x2>::vlanes());
|
||||
v_expand(maskVal2, maskVal3, maskVal4);
|
||||
cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal3);
|
||||
cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal3);
|
||||
cmpMin = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_lt(data, valMin)), maskVal3));
|
||||
cmpMax = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_gt(data, valMax)), maskVal3));
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin);
|
||||
valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax);
|
||||
idx += inc;
|
||||
data = v_load(src + k + 3 * v_float64x2::nlanes);
|
||||
cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal4);
|
||||
cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal4);
|
||||
idx = v_add(idx, inc);
|
||||
data = v_load(src + k + 3 * VTraits<v_float64x2>::vlanes());
|
||||
cmpMin = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_lt(data, valMin)), maskVal4));
|
||||
cmpMax = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_gt(data, valMax)), maskVal4));
|
||||
idxMin = v_select(cmpMin, idx, idxMin);
|
||||
idxMax = v_select(cmpMax, idx, idxMax);
|
||||
valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin);
|
||||
valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax);
|
||||
idx += inc;
|
||||
idx = v_add(idx, inc);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1745,13 +1745,8 @@ template<typename R> struct TheTest
R a = dataA;
R b = dataB;

#if CV_SIMD_SCALABLE
Data<R> dataEQ = v_eq(a, b);
Data<R> dataNE = v_ne(a, b);
#else
Data<R> dataEQ = (a == b);
Data<R> dataNE = (a != b);
#endif

for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
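
This test hunk drops the CV_SIMD_SCALABLE branch: the v_eq/v_ne function forms work for both fixed-size and scalable backends, so the operator variants are no longer needed. A self-contained sketch (not from the patch, assuming OpenCV 4.x):

#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

int main()
{
    using namespace cv;
#if CV_SIMD128
    v_int32x4 a = v_setall_s32(7), b = v_setall_s32(7);
    // v_eq yields an all-ones lane where the operands match
    printf("all equal: %d\n", (int)v_check_all(v_eq(a, b)));
#endif
    return 0;
}
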
@ -29,10 +29,10 @@ static inline void v_expand_mul_add(const v_int8x16& a, const v_int8x16& b,

v_int32x4 t0, t1;
v_mul_expand(a0, b0, t0, t1);
out0 += t0; out1 += t1;
out0 = v_add(out0, t0); out1 = v_add(out1, t1);

v_mul_expand(a1, b1, t0, t1);
out2 += t0; out3 += t1;
out2 = v_add(out2, t0); out3 = v_add(out3, t1);
}
#endif
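
The accumulate step after v_mul_expand now uses v_add instead of +=. A minimal sketch of the widening multiply-accumulate (not part of the patch; OpenCV 4.x assumed, mac_s16 is a made-up name):

#include <opencv2/core/hal/intrin.hpp>

// acc8[0..7] += a[0..7] * b[0..7], widening s16 products to s32.
static void mac_s16(const short* a, const short* b, int* acc8)
{
    using namespace cv;
#if CV_SIMD128
    v_int16x8 va = v_load(a), vb = v_load(b);
    v_int32x4 lo, hi;
    v_mul_expand(va, vb, lo, hi);                 // 8 x s16 -> two 4 x s32 halves
    v_int32x4 acc_lo = v_load(acc8), acc_hi = v_load(acc8 + 4);
    acc_lo = v_add(acc_lo, lo);                   // was: acc_lo += lo
    acc_hi = v_add(acc_hi, hi);
    v_store(acc8, acc_lo);
    v_store(acc8 + 4, acc_hi);
#else
    for (int i = 0; i < 8; ++i)
        acc8[i] += (int)a[i] * (int)b[i];
#endif
}
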
@ -1055,10 +1055,10 @@ public:
v_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3);
v_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3);

vout0 = voutzp + v_round(v_cvt_f32(vout0)*vmult);
vout1 = voutzp + v_round(v_cvt_f32(vout1)*vmult);
vout2 = voutzp + v_round(v_cvt_f32(vout2)*vmult);
vout3 = voutzp + v_round(v_cvt_f32(vout3)*vmult);
vout0 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout0), vmult)));
vout1 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout1), vmult)));
vout2 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout2), vmult)));
vout3 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout3), vmult)));

vout0 = v_min(v_max(vout0, outmin), outmax);
vout1 = v_min(v_max(vout1, outmin), outmax);
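
The quantized-convolution hunks all rewrite the same requantization expression, zero_point + round(float(acc) * scale), with the function-style intrinsics. A sketch of that step in isolation (not from the patch; OpenCV 4.x assumed, requantize is a made-up name, and the scalar fallback's rounding may differ slightly from v_round):

#include <opencv2/core/hal/intrin.hpp>
#include <cmath>

static void requantize(const int* acc, int* out, float scale, int zeropoint)
{
    using namespace cv;
#if CV_SIMD128
    v_float32x4 vscale = v_setall_f32(scale);
    v_int32x4 vzp = v_setall_s32(zeropoint);
    v_int32x4 v = v_load(acc);
    // was: vzp + v_round(v_cvt_f32(v) * vscale)
    v_int32x4 q = v_add(vzp, v_round(v_mul(v_cvt_f32(v), vscale)));
    v_store(out, q);
#else
    for (int i = 0; i < 4; ++i)
        out[i] = zeropoint + (int)std::lround(acc[i] * scale);
#endif
}
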
@ -1408,12 +1408,12 @@ public:
|
||||
vs12 = v_dotprod_expand_fast(w1, r2, vs12);
|
||||
vs13 = v_dotprod_expand_fast(w1, r3, vs13);
|
||||
}
|
||||
s0 += v_int32x4(v_reduce_sum(vs00), v_reduce_sum(vs01), v_reduce_sum(vs02), v_reduce_sum(vs03));
|
||||
s1 += v_int32x4(v_reduce_sum(vs10), v_reduce_sum(vs11), v_reduce_sum(vs12), v_reduce_sum(vs13));
|
||||
s0 = v_add(s0, v_int32x4(v_reduce_sum(vs00), v_reduce_sum(vs01), v_reduce_sum(vs02), v_reduce_sum(vs03)));
|
||||
s1 = v_add(s1, v_int32x4(v_reduce_sum(vs10), v_reduce_sum(vs11), v_reduce_sum(vs12), v_reduce_sum(vs13)));
|
||||
if( cn1 == inpCn )
|
||||
{
|
||||
s0 = voutzp + v_round(v_cvt_f32(s0)*vmult0);
|
||||
s1 = voutzp + v_round(v_cvt_f32(s1)*vmult1);
|
||||
s0 = v_add(voutzp, v_round(v_mul(v_cvt_f32(s0), vmult0)));
|
||||
s1 = v_add(voutzp, v_round(v_mul(v_cvt_f32(s1), vmult1)));
|
||||
|
||||
s0 = v_min(v_max(s0, outmin), outmax);
|
||||
s1 = v_min(v_max(s1, outmin), outmax);
|
||||
|
@ -323,8 +323,8 @@ public:
|
||||
vs3 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep*3 + k), vs3);
|
||||
}
|
||||
|
||||
s += v_int32x4(v_reduce_sum(vs0), v_reduce_sum(vs1), v_reduce_sum(vs2), v_reduce_sum(vs3));
|
||||
v_int32x4 out = outzp + v_round(v_cvt_f32(s)*mult);
|
||||
s = v_add(s, v_int32x4(v_reduce_sum(vs0), v_reduce_sum(vs1), v_reduce_sum(vs2), v_reduce_sum(vs3)));
|
||||
v_int32x4 out = v_add(outzp, v_round(v_mul(v_cvt_f32(s), mult)));
|
||||
v_store(dptr + i, v_min(v_max(out, outmin), outmax));
|
||||
}
|
||||
#endif
|
||||
|
@ -631,17 +631,17 @@ public:
|
||||
(int)srcData[index + stride_w*10], (int)srcData[index + stride_w*11]);
|
||||
v_int32x4 v3((int)srcData[index + stride_w*12], (int)srcData[index + stride_w*13],
|
||||
(int)srcData[index + stride_w*14], (int)srcData[index + stride_w*15]);
|
||||
sum_val0 += v0;
|
||||
sum_val1 += v1;
|
||||
sum_val2 += v2;
|
||||
sum_val3 += v3;
|
||||
sum_val0 = v_add(sum_val0, v0);
|
||||
sum_val1 = v_add(sum_val1, v1);
|
||||
sum_val2 = v_add(sum_val2, v2);
|
||||
sum_val3 = v_add(sum_val3, v3);
|
||||
}
|
||||
}
|
||||
|
||||
sum_val0 = v_round(v_cvt_f32(sum_val0)*ikarea) + voutzp;
|
||||
sum_val1 = v_round(v_cvt_f32(sum_val1)*ikarea) + voutzp;
|
||||
sum_val2 = v_round(v_cvt_f32(sum_val2)*ikarea) + voutzp;
|
||||
sum_val3 = v_round(v_cvt_f32(sum_val3)*ikarea) + voutzp;
|
||||
sum_val0 = v_add(v_round(v_mul(v_cvt_f32(sum_val0), ikarea)), voutzp);
|
||||
sum_val1 = v_add(v_round(v_mul(v_cvt_f32(sum_val1), ikarea)), voutzp);
|
||||
sum_val2 = v_add(v_round(v_mul(v_cvt_f32(sum_val2), ikarea)), voutzp);
|
||||
sum_val3 = v_add(v_round(v_mul(v_cvt_f32(sum_val3), ikarea)), voutzp);
|
||||
|
||||
v_store(dstData + x0, v_pack(v_pack(sum_val0, sum_val1), v_pack(sum_val2, sum_val3)));
|
||||
x0 += 15;
|
||||
|
@ -236,13 +236,11 @@ void depthWiseBlockConv2D(const float* wptr,
v21 = v_load(imgptr2 + in_j + dilation_w),
v22 = v_load(imgptr2 + in_j + dilation_w*2);

v_float32x4 vout = v00*vw00 + v01*vw01 + v02*vw02 +
v10*vw10 + v11*vw11 + v12*vw12 +
v20*vw20 + v21*vw21 + v22*vw22 + vbias;
v_float32x4 vout = v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_mul(v00, vw00), v_mul(v01, vw01)), v_mul(v02, vw02)), v_mul(v10, vw10)), v_mul(v11, vw11)), v_mul(v12, vw12)), v_mul(v20, vw20)), v_mul(v21, vw21)), v_mul(v22, vw22)), vbias);
if (fusedAdd)
vout = v_load(outptr + out_j) + vout;
vout = v_add(v_load(outptr + out_j), vout);
if (relu)
vout = v_select(vout > z, vout, vout*vrc);
vout = v_select(v_gt(vout, z), vout, v_mul(vout, vrc));
v_store(outptr + out_j, vout);
}
}
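
The patch rewrites the long multiply-add chain literally, as nested v_add(v_mul(...)) calls. An equivalent and shorter formulation uses v_fma; the sketch below is illustrative only and is not how the patch itself expresses it (OpenCV 4.x assumed, tap3 is a made-up helper).

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// v0*w0 + v1*w1 + v2*w2 + bias, expressed as fused multiply-adds.
static cv::v_float32x4 tap3(const cv::v_float32x4& v0, const cv::v_float32x4& v1, const cv::v_float32x4& v2,
                            const cv::v_float32x4& w0, const cv::v_float32x4& w1, const cv::v_float32x4& w2,
                            const cv::v_float32x4& bias)
{
    using namespace cv;
    return v_fma(v0, w0, v_fma(v1, w1, v_fma(v2, w2, bias)));
}
#endif
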
@ -268,14 +266,12 @@ void depthWiseBlockConv2D(const float* wptr,
|
||||
v_load_deinterleave(imgptr2 + in_j, v20, v21);
|
||||
v_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
|
||||
|
||||
v_float32x4 vout = v00 * vw00 + v01 * vw01 + v02 * vw02 +
|
||||
v10 * vw10 + v11 * vw11 + v12 * vw12 +
|
||||
v20 * vw20 + v21 * vw21 + v22 * vw22 + vbias;
|
||||
v_float32x4 vout = v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_mul(v00, vw00), v_mul(v01, vw01)), v_mul(v02, vw02)), v_mul(v10, vw10)), v_mul(v11, vw11)), v_mul(v12, vw12)), v_mul(v20, vw20)), v_mul(v21, vw21)), v_mul(v22, vw22)), vbias);
|
||||
|
||||
if (fusedAdd)
|
||||
vout = v_load(outptr + out_j) + vout;
|
||||
vout = v_add(v_load(outptr + out_j), vout);
|
||||
if (relu)
|
||||
vout = v_select(vout > z, vout, vout*vrc);
|
||||
vout = v_select(v_gt(vout, z), vout, v_mul(vout, vrc));
|
||||
v_store(outptr + out_j, vout);
|
||||
}
|
||||
}
|
||||
@ -381,11 +377,11 @@ void depthWiseBlockConv1D(const float* wptr,
v01 = v_load(imgptr0 + in_j + dilation_w),
v02 = v_load(imgptr0 + in_j + dilation_w*2);

v_float32x4 vout = v00*vw00 + v01*vw01 + v02*vw02 + vbias;
v_float32x4 vout = v_add(v_add(v_add(v_mul(v00, vw00), v_mul(v01, vw01)), v_mul(v02, vw02)), vbias);
if (fusedAdd)
vout = v_load(outptr + out_j) + vout;
vout = v_add(v_load(outptr + out_j), vout);
if (relu)
vout = v_select(vout > z, vout, vout*vrc);
vout = v_select(v_gt(vout, z), vout, v_mul(vout, vrc));
v_store(outptr + out_j, vout);
}
}
@ -407,13 +403,13 @@ void depthWiseBlockConv1D(const float* wptr,
|
||||
v_load_deinterleave(imgptr0 + in_j, v00, v01);
|
||||
v_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
|
||||
|
||||
v_float32x4 vout = v00 * vw00 + v01 * vw01 + v02 * vw02 + vbias;
|
||||
v_float32x4 vout = v_add(v_add(v_add(v_mul(v00, vw00), v_mul(v01, vw01)), v_mul(v02, vw02)), vbias);
|
||||
|
||||
if (fusedAdd)
|
||||
vout = v_load(outptr + out_j) + vout;
|
||||
vout = v_add(v_load(outptr + out_j), vout);
|
||||
|
||||
if (relu)
|
||||
vout = v_select(vout > z, vout, vout*vrc);
|
||||
vout = v_select(v_gt(vout, z), vout, v_mul(vout, vrc));
|
||||
v_store(outptr + out_j, vout);
|
||||
}
|
||||
}
|
||||
|
@ -430,32 +430,32 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
|
||||
/* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */
|
||||
/* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */
|
||||
v_float32x4 q5_25 = v_setall_f32(5.25f), t00, t01, t10, t11;
|
||||
t00 = x40 - x20;
|
||||
t01 = x41 - x21;
|
||||
t10 = x30 - x50;
|
||||
t11 = x31 - x51;
|
||||
v_float32x4 y00 = v_fma(t00, q5_25, x00 - x60);
|
||||
v_float32x4 y01 = v_fma(t01, q5_25, x01 - x61);
|
||||
v_float32x4 y70 = v_fma(t10, q5_25, x70 - x10);
|
||||
v_float32x4 y71 = v_fma(t11, q5_25, x71 - x11);
|
||||
t00 = v_sub(x40, x20);
|
||||
t01 = v_sub(x41, x21);
|
||||
t10 = v_sub(x30, x50);
|
||||
t11 = v_sub(x31, x51);
|
||||
v_float32x4 y00 = v_fma(t00, q5_25, v_sub(x00, x60));
|
||||
v_float32x4 y01 = v_fma(t01, q5_25, v_sub(x01, x61));
|
||||
v_float32x4 y70 = v_fma(t10, q5_25, v_sub(x70, x10));
|
||||
v_float32x4 y71 = v_fma(t11, q5_25, v_sub(x71, x11));
|
||||
|
||||
/* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */
|
||||
/* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */
|
||||
v_float32x4 qm4_25 = v_setall_f32(-4.25f);
|
||||
t00 = v_fma(x30, qm4_25, x10 + x50);
|
||||
t01 = v_fma(x31, qm4_25, x11 + x51);
|
||||
t10 = v_fma(x40, qm4_25, x20 + x60);
|
||||
t11 = v_fma(x41, qm4_25, x21 + x61);
|
||||
t00 = v_fma(x30, qm4_25, v_add(x10, x50));
|
||||
t01 = v_fma(x31, qm4_25, v_add(x11, x51));
|
||||
t10 = v_fma(x40, qm4_25, v_add(x20, x60));
|
||||
t11 = v_fma(x41, qm4_25, v_add(x21, x61));
|
||||
|
||||
v_float32x4 y10 = t00 + t10, y11 = t01 + t11;
|
||||
v_float32x4 y20 = t10 - t00, y21 = t11 - t01;
|
||||
v_float32x4 y10 = v_add(t00, t10), y11 = v_add(t01, t11);
|
||||
v_float32x4 y20 = v_sub(t10, t00), y21 = v_sub(t11, t01);
|
||||
|
||||
/* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */
|
||||
/* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */
|
||||
v_float32x4 q0_5 = v_setall_f32(0.5f), q0_25 = v_setall_f32(0.25f);
|
||||
v_float32x4 qm2_5 = v_setall_f32(-2.5f), qm1_25 = v_setall_f32(-1.25f);
|
||||
t00 = v_fma(x10, q0_5, x50 + x50);
|
||||
t01 = v_fma(x11, q0_5, x51 + x51);
|
||||
t00 = v_fma(x10, q0_5, v_add(x50, x50));
|
||||
t01 = v_fma(x11, q0_5, v_add(x51, x51));
|
||||
t10 = v_fma(x20, q0_25, x60);
|
||||
t11 = v_fma(x21, q0_25, x61);
|
||||
t00 = v_fma(x30, qm2_5, t00);
|
||||
@ -463,14 +463,14 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
|
||||
t10 = v_fma(x40, qm1_25, t10);
|
||||
t11 = v_fma(x41, qm1_25, t11);
|
||||
|
||||
v_float32x4 y30 = t00 + t10, y31 = t01 + t11;
|
||||
v_float32x4 y40 = t10 - t00, y41 = t11 - t01;
|
||||
v_float32x4 y30 = v_add(t00, t10), y31 = v_add(t01, t11);
|
||||
v_float32x4 y40 = v_sub(t10, t00), y41 = v_sub(t11, t01);
|
||||
|
||||
/* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */
|
||||
/* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */
|
||||
v_float32x4 q4 = v_setall_f32(4.f), qm5 = v_setall_f32(-5.f);
|
||||
t00 = v_fma(x50, q0_5, x10 + x10);
|
||||
t01 = v_fma(x51, q0_5, x11 + x11);
|
||||
t00 = v_fma(x50, q0_5, v_add(x10, x10));
|
||||
t01 = v_fma(x51, q0_5, v_add(x11, x11));
|
||||
t10 = v_fma(x20, q4 , x60);
|
||||
t11 = v_fma(x21, q4 , x61);
|
||||
t00 = v_fma(x30, qm2_5, t00);
|
||||
@ -478,8 +478,8 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
|
||||
t10 = v_fma(x40, qm5 , t10);
|
||||
t11 = v_fma(x41, qm5 , t11);
|
||||
|
||||
v_float32x4 y50 = t00 + t10, y51 = t01 + t11;
|
||||
v_float32x4 y60 = t10 - t00, y61 = t11 - t01;
|
||||
v_float32x4 y50 = v_add(t00, t10), y51 = v_add(t01, t11);
|
||||
v_float32x4 y60 = v_sub(t10, t00), y61 = v_sub(t11, t01);
|
||||
|
||||
/* transpose 8x8 matrix with v_transpose4x4 */
|
||||
|
||||
@ -491,29 +491,29 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
|
||||
|
||||
/* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */
|
||||
/* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */
|
||||
t00 = y010 - y200;
|
||||
t01 = y410 - y600;
|
||||
t10 = y300 - y110;
|
||||
t11 = y700 - y510;
|
||||
z00 = v_fma(t00, q5_25, y000 - y210);
|
||||
z01 = v_fma(t01, q5_25, y400 - y610);
|
||||
z70 = v_fma(t10, q5_25, y310 - y100);
|
||||
z71 = v_fma(t11, q5_25, y710 - y500);
|
||||
t00 = v_sub(y010, y200);
|
||||
t01 = v_sub(y410, y600);
|
||||
t10 = v_sub(y300, y110);
|
||||
t11 = v_sub(y700, y510);
|
||||
z00 = v_fma(t00, q5_25, v_sub(y000, y210));
|
||||
z01 = v_fma(t01, q5_25, v_sub(y400, y610));
|
||||
z70 = v_fma(t10, q5_25, v_sub(y310, y100));
|
||||
z71 = v_fma(t11, q5_25, v_sub(y710, y500));
|
||||
|
||||
/* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */
|
||||
/* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */
|
||||
t00 = v_fma(y300, qm4_25, y100 + y110);
|
||||
t01 = v_fma(y700, qm4_25, y500 + y510);
|
||||
t10 = v_fma(y010, qm4_25, y200 + y210);
|
||||
t11 = v_fma(y410, qm4_25, y600 + y610);
|
||||
t00 = v_fma(y300, qm4_25, v_add(y100, y110));
|
||||
t01 = v_fma(y700, qm4_25, v_add(y500, y510));
|
||||
t10 = v_fma(y010, qm4_25, v_add(y200, y210));
|
||||
t11 = v_fma(y410, qm4_25, v_add(y600, y610));
|
||||
|
||||
z10 = t00 + t10; z11 = t01 + t11;
|
||||
z20 = t10 - t00; z21 = t11 - t01;
|
||||
z10 = v_add(t00, t10); z11 = v_add(t01, t11);
|
||||
z20 = v_sub(t10, t00); z21 = v_sub(t11, t01);
|
||||
|
||||
/* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */
|
||||
/* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */
|
||||
t00 = v_fma(y100, q0_5, y110 + y110);
|
||||
t01 = v_fma(y500, q0_5, y510 + y510);
|
||||
t00 = v_fma(y100, q0_5, v_add(y110, y110));
|
||||
t01 = v_fma(y500, q0_5, v_add(y510, y510));
|
||||
t10 = v_fma(y200, q0_25, y210);
|
||||
t11 = v_fma(y600, q0_25, y610);
|
||||
t00 = v_fma(y300, qm2_5, t00);
|
||||
@ -521,13 +521,13 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
|
||||
t10 = v_fma(y010, qm1_25, t10);
|
||||
t11 = v_fma(y410, qm1_25, t11);
|
||||
|
||||
z30 = t00 + t10; z31 = t01 + t11;
|
||||
z40 = t10 - t00; z41 = t11 - t01;
|
||||
z30 = v_add(t00, t10); z31 = v_add(t01, t11);
|
||||
z40 = v_sub(t10, t00); z41 = v_sub(t11, t01);
|
||||
|
||||
/* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */
|
||||
/* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */
|
||||
t00 = v_fma(y110, q0_5, y100 + y100);
|
||||
t01 = v_fma(y510, q0_5, y500 + y500);
|
||||
t00 = v_fma(y110, q0_5, v_add(y100, y100));
|
||||
t01 = v_fma(y510, q0_5, v_add(y500, y500));
|
||||
t10 = v_fma(y200, q4, y210);
|
||||
t11 = v_fma(y600, q4, y610);
|
||||
t00 = v_fma(y300, qm2_5, t00);
|
||||
@ -535,8 +535,8 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
|
||||
t10 = v_fma(y010, qm5, t10);
|
||||
t11 = v_fma(y410, qm5, t11);
|
||||
|
||||
z50 = t00 + t10; z51 = t01 + t11;
|
||||
z60 = t10 - t00; z61 = t11 - t01;
|
||||
z50 = v_add(t00, t10); z51 = v_add(t01, t11);
|
||||
z60 = v_sub(t10, t00); z61 = v_sub(t11, t01);
|
||||
}
|
||||
|
||||
const int outstep = winoIblock*winoAtomF32*Cg;
|
||||
@ -601,12 +601,12 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
|
||||
|
||||
{
|
||||
v_float32x4 s12_0, s12_1, s34_0, s34_1, s56_0, s56_1;
|
||||
s12_0 = x10 + x20; s12_1 = x11 + x21;
|
||||
s34_0 = x30 + x40; s34_1 = x31 + x41;
|
||||
s56_0 = x50 + x60; s56_1 = x51 + x61;
|
||||
s12_0 = v_add(x10, x20); s12_1 = v_add(x11, x21);
|
||||
s34_0 = v_add(x30, x40); s34_1 = v_add(x31, x41);
|
||||
s56_0 = v_add(x50, x60); s56_1 = v_add(x51, x61);
|
||||
|
||||
v_float32x4 y00 = x00 + s12_0 + s34_0 + s56_0;
|
||||
v_float32x4 y01 = x01 + s12_1 + s34_1 + s56_1;
|
||||
v_float32x4 y00 = v_add(v_add(v_add(x00, s12_0), s34_0), s56_0);
|
||||
v_float32x4 y01 = v_add(v_add(v_add(x01, s12_1), s34_1), s56_1);
|
||||
|
||||
v_float32x4 a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f);
|
||||
v_float32x4 y20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
|
||||
@ -616,13 +616,13 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
|
||||
v_float32x4 y40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
|
||||
v_float32x4 y41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
|
||||
|
||||
s12_0 = x10 - x20; s12_1 = x11 - x21;
|
||||
s34_0 = x30 - x40; s34_1 = x31 - x41;
|
||||
s56_0 = x50 - x60; s56_1 = x51 - x61;
|
||||
s12_0 = v_sub(x10, x20); s12_1 = v_sub(x11, x21);
|
||||
s34_0 = v_sub(x30, x40); s34_1 = v_sub(x31, x41);
|
||||
s56_0 = v_sub(x50, x60); s56_1 = v_sub(x51, x61);
|
||||
|
||||
a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.f);
|
||||
v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, x70 + s12_0));
|
||||
v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, x71 + s12_1));
|
||||
v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, v_add(x70, s12_0)));
|
||||
v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, v_add(x71, s12_1)));
|
||||
|
||||
a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.f);
|
||||
v_float32x4 y10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
|
||||
@ -642,12 +642,12 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
|
||||
v_transpose4x4(y40, y50, y60, y70, y400, y500, y600, y700);
|
||||
v_transpose4x4(y41, y51, y61, y71, y410, y510, y610, y710);
|
||||
|
||||
s12_0 = y100 + y200; s12_1 = y500 + y600;
|
||||
s34_0 = y300 + y010; s34_1 = y700 + y410;
|
||||
s56_0 = y110 + y210; s56_1 = y510 + y610;
|
||||
s12_0 = v_add(y100, y200); s12_1 = v_add(y500, y600);
|
||||
s34_0 = v_add(y300, y010); s34_1 = v_add(y700, y410);
|
||||
s56_0 = v_add(y110, y210); s56_1 = v_add(y510, y610);
|
||||
|
||||
z00 = y000 + s12_0 + s34_0 + s56_0;
|
||||
z01 = y400 + s12_1 + s34_1 + s56_1;
|
||||
z00 = v_add(v_add(v_add(y000, s12_0), s34_0), s56_0);
|
||||
z01 = v_add(v_add(v_add(y400, s12_1), s34_1), s56_1);
|
||||
|
||||
a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f);
|
||||
z20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
|
||||
@ -657,13 +657,13 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
|
||||
z40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
|
||||
z41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
|
||||
|
||||
s12_0 = y100 - y200; s12_1 = y500 - y600;
|
||||
s34_0 = y300 - y010; s34_1 = y700 - y410;
|
||||
s56_0 = y110 - y210; s56_1 = y510 - y610;
|
||||
s12_0 = v_sub(y100, y200); s12_1 = v_sub(y500, y600);
|
||||
s34_0 = v_sub(y300, y010); s34_1 = v_sub(y700, y410);
|
||||
s56_0 = v_sub(y110, y210); s56_1 = v_sub(y510, y610);
|
||||
|
||||
a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.0f);
|
||||
z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, y310 + s12_0));
|
||||
z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, y710 + s12_1));
|
||||
z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, v_add(y310, s12_0)));
|
||||
z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, v_add(y710, s12_1)));
|
||||
a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.0f);
|
||||
z10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
|
||||
z11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
|
||||
@ -673,34 +673,34 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
|
||||
z31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
|
||||
|
||||
v_float32x4 vbias = v_setall_f32(bias);
|
||||
z00 += vbias;
|
||||
z01 += vbias;
|
||||
z10 += vbias;
|
||||
z11 += vbias;
|
||||
z20 += vbias;
|
||||
z21 += vbias;
|
||||
z30 += vbias;
|
||||
z31 += vbias;
|
||||
z40 += vbias;
|
||||
z41 += vbias;
|
||||
z50 += vbias;
|
||||
z51 += vbias;
|
||||
z00 = v_add(z00, vbias);
|
||||
z01 = v_add(z01, vbias);
|
||||
z10 = v_add(z10, vbias);
|
||||
z11 = v_add(z11, vbias);
|
||||
z20 = v_add(z20, vbias);
|
||||
z21 = v_add(z21, vbias);
|
||||
z30 = v_add(z30, vbias);
|
||||
z31 = v_add(z31, vbias);
|
||||
z40 = v_add(z40, vbias);
|
||||
z41 = v_add(z41, vbias);
|
||||
z50 = v_add(z50, vbias);
|
||||
z51 = v_add(z51, vbias);
|
||||
}
|
||||
|
||||
if (bpptr)
|
||||
{
|
||||
z00 += v_load(bpptr);
|
||||
z01 += v_load_low(bpptr + 4);
|
||||
z10 += v_load(bpptr + bpstep);
|
||||
z11 += v_load_low(bpptr + bpstep + 4);
|
||||
z20 += v_load(bpptr + bpstep*2);
|
||||
z21 += v_load_low(bpptr + bpstep*2 + 4);
|
||||
z30 += v_load(bpptr + bpstep*3);
|
||||
z31 += v_load_low(bpptr + bpstep*3 + 4);
|
||||
z40 += v_load(bpptr + bpstep*4);
|
||||
z41 += v_load_low(bpptr + bpstep*4 + 4);
|
||||
z50 += v_load(bpptr + bpstep*5);
|
||||
z51 += v_load_low(bpptr + bpstep*5 + 4);
|
||||
z00 = v_add(z00, v_load(bpptr));
|
||||
z01 = v_add(z01, v_load_low(bpptr + 4));
|
||||
z10 = v_add(z10, v_load(bpptr + bpstep));
|
||||
z11 = v_add(z11, v_load_low(bpptr + bpstep + 4));
|
||||
z20 = v_add(z20, v_load(bpptr + bpstep * 2));
|
||||
z21 = v_add(z21, v_load_low(bpptr + bpstep * 2 + 4));
|
||||
z30 = v_add(z30, v_load(bpptr + bpstep * 3));
|
||||
z31 = v_add(z31, v_load_low(bpptr + bpstep * 3 + 4));
|
||||
z40 = v_add(z40, v_load(bpptr + bpstep * 4));
|
||||
z41 = v_add(z41, v_load_low(bpptr + bpstep * 4 + 4));
|
||||
z50 = v_add(z50, v_load(bpptr + bpstep * 5));
|
||||
z51 = v_add(z51, v_load_low(bpptr + bpstep * 5 + 4));
|
||||
}
|
||||
|
||||
if (ifMinMaxAct)
|
||||
|
@ -370,10 +370,10 @@ struct ReLUFunctor : public BaseFunctor
|
||||
v_float32x4 x1 = v_load(srcptr + i + 4);
|
||||
v_float32x4 x2 = v_load(srcptr + i + 8);
|
||||
v_float32x4 x3 = v_load(srcptr + i + 12);
|
||||
x0 = v_select(x0 >= z, x0, x0*s4);
|
||||
x1 = v_select(x1 >= z, x1, x1*s4);
|
||||
x2 = v_select(x2 >= z, x2, x2*s4);
|
||||
x3 = v_select(x3 >= z, x3, x3*s4);
|
||||
x0 = v_select(v_ge(x0, z), x0, v_mul(x0, s4));
|
||||
x1 = v_select(v_ge(x1, z), x1, v_mul(x1, s4));
|
||||
x2 = v_select(v_ge(x2, z), x2, v_mul(x2, s4));
|
||||
x3 = v_select(v_ge(x3, z), x3, v_mul(x3, s4));
|
||||
v_store(dstptr + i, x0);
|
||||
v_store(dstptr + i + 4, x1);
|
||||
v_store(dstptr + i + 8, x2);
|
||||
@ -2493,10 +2493,10 @@ struct ChannelsPReLUFunctor : public BaseFunctor
|
||||
v_float32x4 x1 = v_load(srcptr + i + 4);
|
||||
v_float32x4 x2 = v_load(srcptr + i + 8);
|
||||
v_float32x4 x3 = v_load(srcptr + i + 12);
|
||||
x0 = v_select(x0 >= z, x0, x0*s4);
|
||||
x1 = v_select(x1 >= z, x1, x1*s4);
|
||||
x2 = v_select(x2 >= z, x2, x2*s4);
|
||||
x3 = v_select(x3 >= z, x3, x3*s4);
|
||||
x0 = v_select(v_ge(x0, z), x0, v_mul(x0, s4));
|
||||
x1 = v_select(v_ge(x1, z), x1, v_mul(x1, s4));
|
||||
x2 = v_select(v_ge(x2, z), x2, v_mul(x2, s4));
|
||||
x3 = v_select(v_ge(x3, z), x3, v_mul(x3, s4));
|
||||
v_store(dstptr + i, x0);
|
||||
v_store(dstptr + i + 4, x1);
|
||||
v_store(dstptr + i + 8, x2);
|
||||
@ -2649,10 +2649,10 @@ struct PReLUFunctor : public ChannelsPReLUFunctor
|
||||
v_float32x4 s1 = v_load(scaleptr + i + 4);
|
||||
v_float32x4 s2 = v_load(scaleptr + i + 8);
|
||||
v_float32x4 s3 = v_load(scaleptr + i + 12);
|
||||
x0 = v_select(x0 >= z, x0, x0*s0);
|
||||
x1 = v_select(x1 >= z, x1, x1*s1);
|
||||
x2 = v_select(x2 >= z, x2, x2*s2);
|
||||
x3 = v_select(x3 >= z, x3, x3*s3);
|
||||
x0 = v_select(v_ge(x0, z), x0, v_mul(x0, s0));
|
||||
x1 = v_select(v_ge(x1, z), x1, v_mul(x1, s1));
|
||||
x2 = v_select(v_ge(x2, z), x2, v_mul(x2, s2));
|
||||
x3 = v_select(v_ge(x3, z), x3, v_mul(x3, s3));
|
||||
v_store(dstptr + i, x0);
|
||||
v_store(dstptr + i + 4, x1);
|
||||
v_store(dstptr + i + 8, x2);
|
||||
|
@ -308,7 +308,7 @@ public:
|
||||
}
|
||||
|
||||
v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);
|
||||
s += v_load(biasptr + i);
|
||||
s = v_add(s, v_load(biasptr + i));
|
||||
v_store(dptr + i, s);
|
||||
}
|
||||
#endif
|
||||
|
@ -898,25 +898,25 @@ public:
|
||||
v_float32x4 max_idx0 = v_setall_f32(-1.f);
|
||||
v_float32x4 max_idx1 = max_idx0;
|
||||
int index0 = ystart * inp_width + xstart;
|
||||
v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
|
||||
v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));
|
||||
v_float32x4 idx0 = v_add(idx00, v_setall_f32((float)index0));
|
||||
v_float32x4 idx1 = v_add(idx0, v_setall_f32((float)(stride_w * 4)));
|
||||
|
||||
for (int y = ystart; y < yend; ++y)
|
||||
{
|
||||
for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
|
||||
for (int x = xstart; x < xend; ++x, idx0 = v_add(idx0, ones), idx1 = v_add(idx1, ones))
|
||||
{
|
||||
const int index = y * inp_width + x;
|
||||
v_float32x4 v0(srcData[index], srcData[index + stride_w],
|
||||
srcData[index + stride_w*2], srcData[index + stride_w*3]);
|
||||
v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
|
||||
srcData[index + stride_w*6], srcData[index + stride_w*7]);
|
||||
max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
|
||||
max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
|
||||
max_idx0 = v_select(v_gt(v0, max_val0), idx0, max_idx0);
|
||||
max_idx1 = v_select(v_gt(v1, max_val1), idx1, max_idx1);
|
||||
max_val0 = v_max(max_val0, v0);
|
||||
max_val1 = v_max(max_val1, v1);
|
||||
}
|
||||
idx0 += idx_delta;
|
||||
idx1 += idx_delta;
|
||||
idx0 = v_add(idx0, idx_delta);
|
||||
idx1 = v_add(idx1, idx_delta);
|
||||
}
|
||||
v_store(dstData + x0, max_val0);
|
||||
v_store(dstData + x0 + 4, max_val1);
|
||||
@ -1069,12 +1069,12 @@ public:
|
||||
srcData[index + stride_w*2], srcData[index + stride_w*3]);
|
||||
v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
|
||||
srcData[index + stride_w*6], srcData[index + stride_w*7]);
|
||||
sum_val0 += v0;
|
||||
sum_val1 += v1;
|
||||
sum_val0 = v_add(sum_val0, v0);
|
||||
sum_val1 = v_add(sum_val1, v1);
|
||||
}
|
||||
}
|
||||
v_store(dstData + x0, sum_val0*ikarea);
|
||||
v_store(dstData + x0 + 4, sum_val1*ikarea);
|
||||
v_store(dstData + x0, v_mul(sum_val0, ikarea));
|
||||
v_store(dstData + x0 + 4, v_mul(sum_val1, ikarea));
|
||||
x0 += 7;
|
||||
}
|
||||
else
|
||||
|
@ -120,8 +120,8 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
for (; j < img.cols - 16 - 3; j += 16, ptr += 16)
{
v_uint8x16 v = v_load(ptr);
v_int8x16 v0 = v_reinterpret_as_s8((v + t) ^ delta);
v_int8x16 v1 = v_reinterpret_as_s8((v - t) ^ delta);
v_int8x16 v0 = v_reinterpret_as_s8(v_xor(v_add(v, t), delta));
v_int8x16 v1 = v_reinterpret_as_s8(v_xor(v_sub(v, t), delta));

v_int8x16 x0 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[0]), delta));
v_int8x16 x1 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[quarterPatternSize]), delta));
@ -129,15 +129,15 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
v_int8x16 x3 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[3*quarterPatternSize]), delta));

v_int8x16 m0, m1;
m0 = (v0 < x0) & (v0 < x1);
m1 = (x0 < v1) & (x1 < v1);
m0 = m0 | ((v0 < x1) & (v0 < x2));
m1 = m1 | ((x1 < v1) & (x2 < v1));
m0 = m0 | ((v0 < x2) & (v0 < x3));
m1 = m1 | ((x2 < v1) & (x3 < v1));
m0 = m0 | ((v0 < x3) & (v0 < x0));
m1 = m1 | ((x3 < v1) & (x0 < v1));
m0 = m0 | m1;
m0 = v_and(v_lt(v0, x0), v_lt(v0, x1));
m1 = v_and(v_lt(x0, v1), v_lt(x1, v1));
m0 = v_or(m0, v_and(v_lt(v0, x1), v_lt(v0, x2)));
m1 = v_or(m1, v_and(v_lt(x1, v1), v_lt(x2, v1)));
m0 = v_or(m0, v_and(v_lt(v0, x2), v_lt(v0, x3)));
m1 = v_or(m1, v_and(v_lt(x2, v1), v_lt(x3, v1)));
m0 = v_or(m0, v_and(v_lt(v0, x3), v_lt(v0, x0)));
m1 = v_or(m1, v_and(v_lt(x3, v1), v_lt(x0, v1)));
m0 = v_or(m0, m1);

if( !v_check_any(m0) )
continue;
@ -154,18 +154,18 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
v_uint8x16 max1 = v_setzero_u8();
for( k = 0; k < N; k++ )
{
v_int8x16 x = v_reinterpret_as_s8(v_load((ptr + pixel[k])) ^ delta);
m0 = v0 < x;
m1 = x < v1;
v_int8x16 x = v_reinterpret_as_s8(v_xor(v_load((ptr + pixel[k])), delta));
m0 = v_lt(v0, x);
m1 = v_lt(x, v1);

c0 = v_sub_wrap(c0, m0) & m0;
c1 = v_sub_wrap(c1, m1) & m1;
c0 = v_and(v_sub_wrap(c0, m0), m0);
c1 = v_and(v_sub_wrap(c1, m1), m1);

max0 = v_max(max0, v_reinterpret_as_u8(c0));
max1 = v_max(max1, v_reinterpret_as_u8(c1));
}

max0 = K16 < v_max(max0, max1);
max0 = v_lt(K16, v_max(max0, max1));
unsigned int m = v_signmask(v_reinterpret_as_s8(max0));

for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
@ -190,7 +190,7 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
a1 = v_min(a1, v_nms);
b1 = v_max(b1, v_nms);
}
curr[j + k] = (uchar)(v_reduce_max(v_max(v_max(a0, a1), v_setzero_s16() - v_min(b0, b1))) - 1);
curr[j + k] = (uchar)(v_reduce_max(v_max(v_max(a0, a1), v_sub(v_setzero_s16(), v_min(b0, b1)))) - 1);
}
}
}
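
The FAST hunks combine per-lane comparison masks with v_and/v_or and then test them with v_check_any. A compact sketch of that mask-combination idiom (not from the patch; OpenCV 4.x assumed, any_in_band is a made-up name and not part of the detector):

#include <opencv2/core/hal/intrin.hpp>

// True if any of the 16 signed bytes lies strictly between lo and hi.
static bool any_in_band(const schar* p, schar lo, schar hi)
{
    using namespace cv;
#if CV_SIMD128
    v_int8x16 x = v_load(p);
    v_int8x16 m = v_and(v_lt(v_setall_s8(lo), x), v_lt(x, v_setall_s8(hi)));
    return v_check_any(m);
#else
    for (int i = 0; i < 16; ++i)
        if (lo < p[i] && p[i] < hi) return true;
    return false;
#endif
}
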
@ -160,7 +160,7 @@ int cornerScore<16>(const uchar* ptr, const int pixel[], int threshold)
q0 = v_max(q0, v_min(a, v0));
q1 = v_min(q1, v_max(b, v0));
}
q0 = v_max(q0, v_setzero_s16() - q1);
q0 = v_max(q0, v_sub(v_setzero_s16(), q1));
threshold = v_reduce_max(q0) - 1;
}
else
@ -251,7 +251,7 @@ int cornerScore<12>(const uchar* ptr, const int pixel[], int threshold)
q0 = v_max(q0, v_min(a, v0));
q1 = v_min(q1, v_max(b, v0));
}
q0 = v_max(q0, v_setzero_s16() - q1);
q0 = v_max(q0, v_sub(v_setzero_s16(), q1));
threshold = v_reduce_max(q0) - 1;
}
else
@ -323,7 +323,7 @@ int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold)
v0 = v_load(d + 5);
q0 = v_max(q0, v_min(a, v0));
q1 = v_min(q1, v_max(b, v0));
q0 = v_max(q0, v_setzero_s16() - q1);
q0 = v_max(q0, v_sub(v_setzero_s16(), q1));
threshold = v_reduce_max(q0) - 1;
}
else
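
In the cornerScore hunks, vector negation is spelled v_sub(v_setzero_s16(), x) because the unary/binary minus overloads were removed. A small sketch of the negate-then-reduce step (not from the patch; OpenCV 4.x assumed, max_abs_like is a made-up name):

#include <opencv2/core/hal/intrin.hpp>
#include <algorithm>

static short max_abs_like(const short* q0p, const short* q1p)
{
    using namespace cv;
#if CV_SIMD128
    v_int16x8 q0 = v_load(q0p), q1 = v_load(q1p);
    q0 = v_max(q0, v_sub(v_setzero_s16(), q1));   // was: v_setzero_s16() - q1
    return (short)v_reduce_max(q0);
#else
    int m = std::max<int>(q0p[0], -q1p[0]);
    for (int i = 1; i < 8; ++i)
        m = std::max<int>(m, std::max<int>(q0p[i], -q1p[i]));
    return (short)m;
#endif
}
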
|
@ -335,7 +335,7 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[],
// divide and calculate s according to above feature
v_uint32x4 ss[4];

v_uint32x4 vadd = v_setall_u32(1) << (hsv_shift - 1);
v_uint32x4 vadd = v_shl(v_setall_u32(1), (hsv_shift - 1));

v_uint32x4 v_diff_exp[4];
v_diff_exp[0] = v_reinterpret_as_u32(v_and(v_reinterpret_as_u8(v_diff), mask1));
@ -406,16 +406,16 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[],
// start computing H-ch
//h = (_vr & (g - b)) + (~_vr & ((_vg & (b - r + 2 * diff)) + ((~_vg) & (r - g + 4 * diff))));
v_int32x4 hh[4];
hh[0] = v_reinterpret_as_s32(v_select(e[0], v_reinterpret_as_s32(gg[0] - bb[0]),
hh[0] = v_reinterpret_as_s32(v_select(e[0], v_reinterpret_as_s32(v_sub(gg[0], bb[0])),
v_select(p[0], v_reinterpret_as_s32(v_add(v_sub(bb[0], rr[0]), v_mul(v_setall_u32(2), vdd[0]))),
v_reinterpret_as_s32(v_add(v_sub(rr[0], gg[0]), v_mul(v_setall_u32(4), vdd[0]))))));
hh[1] = v_reinterpret_as_s32(v_select(e[1], v_reinterpret_as_s32(gg[1] - bb[1]),
hh[1] = v_reinterpret_as_s32(v_select(e[1], v_reinterpret_as_s32(v_sub(gg[1], bb[1])),
v_select(p[1], v_reinterpret_as_s32(v_add(v_sub(bb[1], rr[1]), v_mul(v_setall_u32(2), vdd[1]))),
v_reinterpret_as_s32(v_add(v_sub(rr[1], gg[1]), v_mul(v_setall_u32(4), vdd[1]))))));
hh[2] = v_reinterpret_as_s32(v_select(e[2], v_reinterpret_as_s32(gg[2] - bb[2]),
hh[2] = v_reinterpret_as_s32(v_select(e[2], v_reinterpret_as_s32(v_sub(gg[2], bb[2])),
v_select(p[2], v_reinterpret_as_s32(v_add(v_sub(bb[2], rr[2]), v_mul(v_setall_u32(2), vdd[2]))),
v_reinterpret_as_s32(v_add(v_sub(rr[2], gg[2]), v_mul(v_setall_u32(4), vdd[2]))))));
hh[3] = v_reinterpret_as_s32(v_select(e[3], v_reinterpret_as_s32(gg[3] - bb[3]),
hh[3] = v_reinterpret_as_s32(v_select(e[3], v_reinterpret_as_s32(v_sub(gg[3], bb[3])),
v_select(p[3], v_reinterpret_as_s32(v_add(v_sub(bb[3], rr[3]), v_mul(v_setall_u32(2), vdd[3]))),
v_reinterpret_as_s32(v_add(v_sub(rr[3], gg[3]), v_mul(v_setall_u32(4), vdd[3]))))));

@ -433,16 +433,16 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[],

// check for negative H
v_int32x4 v_h_less_0[4];
v_h_less_0[0] = (hh[0] < v_setall_s32(0));
v_h_less_0[1] = (hh[1] < v_setall_s32(0));
v_h_less_0[2] = (hh[2] < v_setall_s32(0));
v_h_less_0[3] = (hh[3] < v_setall_s32(0));
v_h_less_0[0] = (v_lt(hh[0], v_setall_s32(0)));
v_h_less_0[1] = (v_lt(hh[1], v_setall_s32(0)));
v_h_less_0[2] = (v_lt(hh[2], v_setall_s32(0)));
v_h_less_0[3] = (v_lt(hh[3], v_setall_s32(0)));

v_int32x4 v_h_180[4];
v_h_180[0] = hh[0] + v_setall_s32(180);
v_h_180[1] = hh[1] + v_setall_s32(180);
v_h_180[2] = hh[2] + v_setall_s32(180);
v_h_180[3] = hh[3] + v_setall_s32(180);
v_h_180[0] = v_add(hh[0], v_setall_s32(180));
v_h_180[1] = v_add(hh[1], v_setall_s32(180));
v_h_180[2] = v_add(hh[2], v_setall_s32(180));
v_h_180[3] = v_add(hh[3], v_setall_s32(180));

hh[0] = v_select(v_h_less_0[0], v_h_180[0], hh[0]);
hh[1] = v_select(v_h_less_0[1], v_h_180[1], hh[1]);
@ -64,7 +64,7 @@ CV_ALWAYS_INLINE void calcRowLinear32FC1Impl(float *dst[],
bool xRatioEq1 = inSz.width == outSz.width;
bool yRatioEq1 = inSz.height == outSz.height;

constexpr int nlanes = v_float32x8::nlanes;
const int nlanes = VTraits<v_float32x8>::vlanes();

if (!xRatioEq1 && !yRatioEq1)
{
@ -140,9 +140,9 @@ public:
#if CV_SIMD128
v_uint32x4 rval = v_setall_u32(sptr[j]);
v_uint32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]);
v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
v_float32x4 w = v_mul(kweight4, v_lut(this->color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
wsum[j] += v_reduce_sum(w);
sum[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(val)) * w);
sum[j] += v_reduce_sum(v_mul(v_cvt_f32(v_reinterpret_as_s32(val)), w));
#else
int rval = sptr[j];

@ -407,11 +407,11 @@ public:
v_uint32x4 b(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]);
v_uint32x4 g(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]);
v_uint32x4 r(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]);
v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(b, rb) + v_absdiff(g, rg) + v_absdiff(r, rr)));
v_float32x4 w = v_mul(kweight4, v_lut(this->color_weight, v_reinterpret_as_s32(v_add(v_add(v_absdiff(b, rb), v_absdiff(g, rg)), v_absdiff(r, rr)))));
wsum[j] += v_reduce_sum(w);
sum_b[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(b)) * w);
sum_g[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(g)) * w);
sum_r[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(r)) * w);
sum_b[j] += v_reduce_sum(v_mul(v_cvt_f32(v_reinterpret_as_s32(b)), w));
sum_g[j] += v_reduce_sum(v_mul(v_cvt_f32(v_reinterpret_as_s32(g)), w));
sum_r[j] += v_reduce_sum(v_mul(v_cvt_f32(v_reinterpret_as_s32(r)), w));
#else
int rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];

@ -661,12 +661,12 @@ public:
v_float32x4 rval = v_setall_f32(sptr[j]);
v_float32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]);
v_float32x4 knan = v_not_nan(val);
v_float32x4 alpha = (v_absdiff(val, rval) * sindex4) & v_not_nan(rval) & knan;
v_float32x4 alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex4), v_not_nan(rval)), knan);
v_int32x4 idx = v_trunc(alpha);
alpha -= v_cvt_f32(idx);
v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan;
alpha = v_sub(alpha, v_cvt_f32(idx));
v_float32x4 w = v_and(v_mul(kweight4, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one4, alpha)))), knan);
wsum[j] += v_reduce_sum(w);
sum[j] += v_reduce_sum((val & knan) * w);
sum[j] += v_reduce_sum(v_mul(v_and(val, knan), w));
#else
float rval = sptr[j];

@ -862,15 +862,15 @@ public:
v_float32x4 kb(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]);
v_float32x4 kg(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]);
v_float32x4 kr(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]);
v_float32x4 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
v_float32x4 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex4) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
v_float32x4 knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr));
v_float32x4 alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex4), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan);
v_int32x4 idx = v_trunc(alpha);
alpha -= v_cvt_f32(idx);
v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan;
alpha = v_sub(alpha, v_cvt_f32(idx));
v_float32x4 w = v_and(v_mul(kweight4, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one4, alpha)))), knan);
wsum[j] += v_reduce_sum(w);
sum_b[j] += v_reduce_sum((kb & knan) * w);
sum_g[j] += v_reduce_sum((kg & knan) * w);
sum_r[j] += v_reduce_sum((kr & knan) * w);
sum_b[j] += v_reduce_sum(v_mul(v_and(kb, knan), w));
sum_g[j] += v_reduce_sum(v_mul(v_and(kg, knan), w));
sum_r[j] += v_reduce_sum(v_mul(v_and(kr, knan), w));
#else
float rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
bool r_NAN = cvIsNaN(rb) || cvIsNaN(rg) || cvIsNaN(rr);
@ -315,7 +315,7 @@ struct ColumnSum<int, uchar> :
v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
for (; i <= width - VTraits<v_int32x4>::vlanes(); i += VTraits<v_int32x4>::vlanes())
{
v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
}
@ -357,10 +357,10 @@ struct ColumnSum<int, uchar> :
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
v_float32x4 v_scale = v_setall_f32((float)_scale);
for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
for( ; i <= width-VTraits<v_uint16x8>::vlanes(); i+=VTraits<v_uint16x8>::vlanes() )
{
v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits<v_int32x4>::vlanes()), v_load(Sp + i + VTraits<v_int32x4>::vlanes()));

v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), v_scale)));
v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), v_scale)));
@ -369,7 +369,7 @@ struct ColumnSum<int, uchar> :
v_pack_store(D + i, v_dst);

v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
v_store(SUM + i + VTraits<v_int32x4>::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits<v_int32x4>::vlanes())));
}
#endif
#endif
@ -396,16 +396,16 @@ struct ColumnSum<int, uchar> :
v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
for( ; i <= width-VTraits<v_uint16x8>::vlanes(); i+=VTraits<v_uint16x8>::vlanes() )
{
v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits<v_int32x4>::vlanes()), v_load(Sp + i + VTraits<v_int32x4>::vlanes()));

v_uint16x8 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01));
v_pack_store(D + i, v_dst);

v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
v_store(SUM + i + VTraits<v_int32x4>::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits<v_int32x4>::vlanes())));
}
#endif
#endif
@ -486,7 +486,7 @@ public BaseColumnFilter
v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for( ; i <= width - v_uint16x8::nlanes; i += v_uint16x8::nlanes )
for( ; i <= width - VTraits<v_uint16x8>::vlanes(); i += VTraits<v_uint16x8>::vlanes() )
{
v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
}
@ -546,13 +546,13 @@ public BaseColumnFilter
v_uint32x4 ds4 = v_setall_u32((unsigned)ds);
v_uint16x8 dd8 = v_setall_u16((ushort)dd);

for( ; i <= width-v_uint8x16::nlanes; i+=v_uint8x16::nlanes )
for( ; i <= width-VTraits<v_uint8x16>::vlanes(); i+=VTraits<v_uint8x16>::vlanes() )
{
v_uint16x8 _sm0 = v_load(Sm + i);
v_uint16x8 _sm1 = v_load(Sm + i + v_uint16x8::nlanes);
v_uint16x8 _sm1 = v_load(Sm + i + VTraits<v_uint16x8>::vlanes());

v_uint16x8 _s0 = v_add_wrap(v_load(SUM + i), v_load(Sp + i));
v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + v_uint16x8::nlanes), v_load(Sp + i + v_uint16x8::nlanes));
v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + VTraits<v_uint16x8>::vlanes()), v_load(Sp + i + VTraits<v_uint16x8>::vlanes()));

v_uint32x4 _s00, _s01, _s10, _s11;

@ -572,7 +572,7 @@ public BaseColumnFilter

v_store(D + i, v_pack_u(r0, r1));
v_store(SUM + i, _s0);
v_store(SUM + i + v_uint16x8::nlanes, _s1);
v_store(SUM + i + VTraits<v_uint16x8>::vlanes(), _s1);
}
#endif
#endif
@ -649,7 +649,7 @@ struct ColumnSum<int, short> :
v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
for( ; i <= width - VTraits<v_int32x4>::vlanes(); i+=VTraits<v_int32x4>::vlanes() )
{
v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
}
@ -689,17 +689,17 @@ struct ColumnSum<int, short> :
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
v_float32x4 v_scale = v_setall_f32((float)_scale);
for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes )
for( ; i <= width-VTraits<v_int16x8>::vlanes(); i+=VTraits<v_int16x8>::vlanes() )
{
v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits<v_int32x4>::vlanes()), v_load(Sp + i + VTraits<v_int32x4>::vlanes()));

v_int32x4 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), v_scale));
v_int32x4 v_s01d = v_round(v_mul(v_cvt_f32(v_s01), v_scale));
v_store(D + i, v_pack(v_s0d, v_s01d));

v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
v_store(SUM + i + VTraits<v_int32x4>::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits<v_int32x4>::vlanes())));
}
#endif
#endif
@ -725,15 +725,15 @@ struct ColumnSum<int, short> :
v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes )
for( ; i <= width-VTraits<v_int16x8>::vlanes(); i+=VTraits<v_int16x8>::vlanes() )
{
v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits<v_int32x4>::vlanes()), v_load(Sp + i + VTraits<v_int32x4>::vlanes()));

v_store(D + i, v_pack(v_s0, v_s01));

v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
v_store(SUM + i + VTraits<v_int32x4>::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits<v_int32x4>::vlanes())));
}
#endif
#endif
@ -798,7 +798,7 @@ struct ColumnSum<int, ushort> :
v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
for (; i <= width - VTraits<v_int32x4>::vlanes(); i += VTraits<v_int32x4>::vlanes())
{
v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
}
@ -838,17 +838,17 @@ struct ColumnSum<int, ushort> :
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
v_float32x4 v_scale = v_setall_f32((float)_scale);
for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
for( ; i <= width-VTraits<v_uint16x8>::vlanes(); i+=VTraits<v_uint16x8>::vlanes() )
{
v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits<v_int32x4>::vlanes()), v_load(Sp + i + VTraits<v_int32x4>::vlanes()));

v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), v_scale)));
v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), v_scale)));
v_store(D + i, v_pack(v_s0d, v_s01d));

v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
v_store(SUM + i + VTraits<v_int32x4>::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits<v_int32x4>::vlanes())));
}
#endif
#endif
@ -874,15 +874,15 @@ struct ColumnSum<int, ushort> :
v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
for( ; i <= width-VTraits<v_uint16x8>::vlanes(); i+=VTraits<v_uint16x8>::vlanes() )
{
v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits<v_int32x4>::vlanes()), v_load(Sp + i + VTraits<v_int32x4>::vlanes()));

v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)));

v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
v_store(SUM + i + VTraits<v_int32x4>::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits<v_int32x4>::vlanes())));
}
#endif
#endif
@ -945,7 +945,7 @@ struct ColumnSum<int, int> :
v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
for( ; i <= width - VTraits<v_int32x4>::vlanes(); i+=VTraits<v_int32x4>::vlanes() )
{
v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
}
@ -981,7 +981,7 @@ struct ColumnSum<int, int> :
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
v_float32x4 v_scale = v_setall_f32((float)_scale);
for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
for( ; i <= width-VTraits<v_int32x4>::vlanes(); i+=VTraits<v_int32x4>::vlanes() )
{
v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
v_int32x4 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), v_scale));
@ -1010,7 +1010,7 @@ struct ColumnSum<int, int> :
v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
for( ; i <= width-VTraits<v_int32x4>::vlanes(); i+=VTraits<v_int32x4>::vlanes() )
{
v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));

@ -1079,7 +1079,7 @@ struct ColumnSum<int, float> :
v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
for( ; i <= width - VTraits<v_int32x4>::vlanes(); i+=VTraits<v_int32x4>::vlanes() )
{
v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
}
@ -1115,7 +1115,7 @@ struct ColumnSum<int, float> :
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
v_float32x4 v_scale = v_setall_f32((float)_scale);
for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
for (; i <= width - VTraits<v_int32x4>::vlanes(); i += VTraits<v_int32x4>::vlanes())
{
v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
v_store(D + i, v_mul(v_cvt_f32(v_s0), v_scale));
@ -1142,7 +1142,7 @@ struct ColumnSum<int, float> :
v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
}
#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
for( ; i <= width-VTraits<v_int32x4>::vlanes(); i+=VTraits<v_int32x4>::vlanes() )
{
v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
v_store(D + i, v_cvt_f32(v_s0));
@ -66,7 +66,7 @@ template<typename _Tp> static inline cv::v_float32 splineInterpolate(const cv::v
|
||||
ix = v_shl<2>(ix);
|
||||
|
||||
v_float32 t0, t1, t2, t3;
|
||||
// assume that v_float32::nlanes == v_int32::nlanes
|
||||
// assume that VTraits<v_float32>::vlanes() == VTraits<v_int32>::vlanes()
|
||||
if(VTraits<v_float32>::vlanes() == 4)
|
||||
{
|
||||
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx[4];
|
||||
@ -1388,16 +1388,16 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin
|
||||
v_uint16x8& outA, v_uint16x8& outB, v_uint16x8& outC)
|
||||
{
|
||||
//LUT idx of origin pt of cube
|
||||
v_uint16x8 idxsX = inX >> (lab_base_shift - lab_lut_shift);
|
||||
v_uint16x8 idxsY = inY >> (lab_base_shift - lab_lut_shift);
|
||||
v_uint16x8 idxsZ = inZ >> (lab_base_shift - lab_lut_shift);
|
||||
v_uint16x8 idxsX = v_shr<lab_base_shift - lab_lut_shift>(inX);
|
||||
v_uint16x8 idxsY = v_shr<lab_base_shift - lab_lut_shift>(inY);
|
||||
v_uint16x8 idxsZ = v_shr<lab_base_shift - lab_lut_shift>(inZ);
|
||||
|
||||
//x, y, z are [0; TRILINEAR_BASE)
|
||||
const uint16_t bitMask = (1 << trilinear_shift) - 1;
|
||||
v_uint16x8 bitMaskReg = v_setall_u16(bitMask);
|
||||
v_uint16x8 fracX = (inX >> (lab_base_shift - 8 - 1)) & bitMaskReg;
|
||||
v_uint16x8 fracY = (inY >> (lab_base_shift - 8 - 1)) & bitMaskReg;
|
||||
v_uint16x8 fracZ = (inZ >> (lab_base_shift - 8 - 1)) & bitMaskReg;
|
||||
v_uint16x8 fracX = v_and(v_shr<lab_base_shift - 8 - 1>(inX), bitMaskReg);
|
||||
v_uint16x8 fracY = v_and(v_shr<lab_base_shift - 8 - 1>(inY), bitMaskReg);
|
||||
v_uint16x8 fracZ = v_and(v_shr<lab_base_shift - 8 - 1>(inZ), bitMaskReg);
|
||||
|
||||
//load values to interpolate for pix0, pix1, .., pix7
|
||||
v_int16x8 a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
@ -1407,9 +1407,9 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin
|
||||
v_uint32x4 addrDw0, addrDw1, addrDw10, addrDw11;
|
||||
v_mul_expand(v_setall_u16(3*8), idxsX, addrDw0, addrDw1);
|
||||
v_mul_expand(v_setall_u16(3*8*LAB_LUT_DIM), idxsY, addrDw10, addrDw11);
|
||||
addrDw0 += addrDw10; addrDw1 += addrDw11;
|
||||
addrDw0 = v_add(addrDw0, addrDw10); addrDw1 = v_add(addrDw1, addrDw11);
|
||||
v_mul_expand(v_setall_u16(3*8*LAB_LUT_DIM*LAB_LUT_DIM), idxsZ, addrDw10, addrDw11);
|
||||
addrDw0 += addrDw10; addrDw1 += addrDw11;
|
||||
addrDw0 = v_add(addrDw0, addrDw10); addrDw1 = v_add(addrDw1, addrDw11);
|
||||
|
||||
uint32_t CV_DECL_ALIGNED(16) addrofs[8];
|
||||
v_store_aligned(addrofs, addrDw0);
|
||||
@ -1431,9 +1431,9 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin
|
||||
v_int16x8 w0, w1, w2, w3, w4, w5, w6, w7;
|
||||
v_mul_expand(v_setall_u16(8), fracX, addrDw0, addrDw1);
|
||||
v_mul_expand(v_setall_u16(8*TRILINEAR_BASE), fracY, addrDw10, addrDw11);
|
||||
addrDw0 += addrDw10; addrDw1 += addrDw11;
|
||||
addrDw0 = v_add(addrDw0, addrDw10); addrDw1 = v_add(addrDw1, addrDw11);
|
||||
v_mul_expand(v_setall_u16(8*TRILINEAR_BASE*TRILINEAR_BASE), fracZ, addrDw10, addrDw11);
|
||||
addrDw0 += addrDw10; addrDw1 += addrDw11;
|
||||
addrDw0 = v_add(addrDw0, addrDw10); addrDw1 = v_add(addrDw1, addrDw11);
|
||||
|
||||
v_store_aligned(addrofs, addrDw0);
|
||||
v_store_aligned(addrofs + 4, addrDw1);
|
||||
@ -1476,7 +1476,8 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1
|
||||
const int16_t* LUT,
|
||||
v_uint16& outA, v_uint16& outB, v_uint16& outC)
|
||||
{
|
||||
const int vsize = VTraits<v_uint16>::max_nlanes;
|
||||
const int vsize = VTraits<v_uint16>::vlanes();
|
||||
const int vsize_max = VTraits<v_uint16>::max_nlanes;
|
||||
|
||||
// LUT idx of origin pt of cube
|
||||
v_uint16 tx = v_shr<lab_base_shift - lab_lut_shift>(inX);
|
||||
@ -1492,7 +1493,7 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1
|
||||
baseIdx0 = v_add(v_add(btmp00, btmp10), btmp20);
|
||||
baseIdx1 = v_add(v_add(btmp01, btmp11), btmp21);
|
||||
|
||||
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vbaseIdx[vsize];
|
||||
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vbaseIdx[vsize_max];
|
||||
v_store_aligned(vbaseIdx + 0*vsize/2, baseIdx0);
|
||||
v_store_aligned(vbaseIdx + 1*vsize/2, baseIdx1);
|
||||
|
||||
@ -1513,13 +1514,13 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1
|
||||
trilinearIdx0 = v_add(v_add(v_shl<3>(fracX0), v_shl<3 + trilinear_shift>(fracY0)), v_shl<3 + trilinear_shift * 2>(fracZ0));
|
||||
trilinearIdx1 = v_add(v_add(v_shl<3>(fracX1), v_shl<3 + trilinear_shift>(fracY1)), v_shl<3 + trilinear_shift * 2>(fracZ1));
|
||||
|
||||
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vtrilinearIdx[vsize];
|
||||
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vtrilinearIdx[vsize_max];
|
||||
v_store_aligned(vtrilinearIdx + 0*vsize/2, trilinearIdx0);
|
||||
v_store_aligned(vtrilinearIdx + 1*vsize/2, trilinearIdx1);
|
||||
|
||||
v_uint32 a0, a1, b0, b1, c0, c1;
|
||||
|
||||
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) va[vsize], vb[vsize], vc[vsize];
|
||||
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) va[vsize_max], vb[vsize_max], vc[vsize_max];
|
||||
for(int j = 0; j < vsize; j++)
|
||||
{
|
||||
const int16_t* baseLUT = LUT + vbaseIdx[j];
|
||||
@ -1649,11 +1650,11 @@ struct RGB2Lab_b
|
||||
vL = v_shr<lab_shift2>(vL);
|
||||
|
||||
/* int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );*/
|
||||
va = v_fma(vfX - vfY, v_setall_s32(500), v_setall_s32(abShift+labDescaleShift));
|
||||
va = v_fma(v_sub(vfX, vfY), v_setall_s32(500), v_setall_s32(abShift+labDescaleShift));
|
||||
va = v_shr<lab_shift2>(va);
|
||||
|
||||
/* int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );*/
|
||||
vb = v_fma(vfY - vfZ, v_setall_s32(200), v_setall_s32(abShift+labDescaleShift));
|
||||
vb = v_fma(v_sub(vfY, vfZ), v_setall_s32(200), v_setall_s32(abShift+labDescaleShift));
|
||||
vb = v_shr<lab_shift2>(vb);
|
||||
}
|
||||
#endif // CV_NEON
|
||||
@ -1675,8 +1676,8 @@ struct RGB2Lab_b
|
||||
#if CV_NEON
|
||||
// On each loop, we load nlanes of RGB/A v_uint8s and store nlanes of
|
||||
// Lab v_uint8s
|
||||
for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes,
|
||||
src += scn*v_uint8::nlanes, dst += 3*v_uint8::nlanes )
|
||||
for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes(),
|
||||
src += scn*VTraits<v_uint8>::vlanes(), dst += 3*VTraits<v_uint8>::vlanes() )
|
||||
{
|
||||
// Load 4 batches of 4 src
|
||||
v_uint8 vRi, vGi, vBi;
|
||||
@ -1712,7 +1713,7 @@ struct RGB2Lab_b
|
||||
#endif // CV_NEON
|
||||
|
||||
#if CV_SIMD
|
||||
const int vsize = v_uint8::nlanes;
|
||||
const int vsize = VTraits<v_uint8>::vlanes();
|
||||
const int xyzDescaleShift = 1 << (lab_shift - 1);
|
||||
v_int16 vXYZdescale = vx_setall_s16(xyzDescaleShift);
|
||||
v_int16 cxrg, cxb1, cyrg, cyb1, czrg, czb1;
|
||||
@ -1752,7 +1753,7 @@ struct RGB2Lab_b
|
||||
v_expand(drgb[k], qrgb[k*2+0], qrgb[k*2+1]);
|
||||
}
|
||||
|
||||
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vdrgb[vsize*3];
|
||||
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vdrgb[VTraits<v_uint8>::max_nlanes*3];
|
||||
for(int k = 0; k < 12; k++)
|
||||
{
|
||||
v_store_aligned(vdrgb + k*vsize/4, qrgb[k]);
|
||||
@ -1784,14 +1785,14 @@ struct RGB2Lab_b
|
||||
v_uint32 x[4], y[4], z[4];
|
||||
for(int j = 0; j < 4; j++)
|
||||
{
|
||||
x[j] = v_reinterpret_as_u32(v_dotprod(rg[j], cxrg) + v_dotprod(bd[j], cxb1)) >> lab_shift;
|
||||
y[j] = v_reinterpret_as_u32(v_dotprod(rg[j], cyrg) + v_dotprod(bd[j], cyb1)) >> lab_shift;
|
||||
z[j] = v_reinterpret_as_u32(v_dotprod(rg[j], czrg) + v_dotprod(bd[j], czb1)) >> lab_shift;
|
||||
x[j] = v_shr<xyz_shift>(v_reinterpret_as_u32(v_add(v_dotprod(rg[j], cxrg), v_dotprod(bd[j], cxb1))));
|
||||
y[j] = v_shr<xyz_shift>(v_reinterpret_as_u32(v_add(v_dotprod(rg[j], cyrg), v_dotprod(bd[j], cyb1))));
|
||||
z[j] = v_shr<xyz_shift>(v_reinterpret_as_u32(v_add(v_dotprod(rg[j], czrg), v_dotprod(bd[j], czb1))));
|
||||
}
|
||||
|
||||
// [fX, fY, fZ] = LabCbrtTab_b[vx, vy, vz]
|
||||
// [4 per X, 4 per Y, 4 per Z]
|
||||
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vxyz[vsize*3];
|
||||
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vxyz[VTraits<v_uint8>::max_nlanes*3];
|
||||
for(int j = 0; j < 4; j++)
|
||||
{
|
||||
v_store_aligned(vxyz + (0*4+j)*vsize/4, x[j]);
|
||||
@ -1822,7 +1823,7 @@ struct RGB2Lab_b
|
||||
v_uint32 vLshift = vx_setall_u32((uint32_t)(Lshift + labDescaleShift));
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
vL[k] = (vL[k] + vLshift) >> lab_shift2;
|
||||
vL[k] = v_shr<lab_shift2>(v_add(vL[k], vLshift));
|
||||
}
|
||||
v_uint16 L0, L1;
|
||||
L0 = v_pack(vL[0], vL[1]);
|
||||
@ -1846,7 +1847,7 @@ struct RGB2Lab_b
|
||||
v_int32 abShift = vx_setall_s32(128*(1 << lab_shift2) + labDescaleShift);
|
||||
for(int k = 0; k < 8; k++)
|
||||
{
|
||||
ab[k] = (ab[k] + abShift) >> lab_shift2;
|
||||
ab[k] = v_shr<lab_shift2>(v_add(ab[k], abShift));
|
||||
}
|
||||
v_int16 a0, a1, b0, b1;
|
||||
a0 = v_pack(ab[0], ab[1]); a1 = v_pack(ab[2], ab[3]);
|
||||
@ -1941,7 +1942,7 @@ struct RGB2Lab_f
|
||||
#if CV_SIMD
|
||||
if(enablePackedLab)
|
||||
{
|
||||
const int vsize = v_float32::nlanes;
|
||||
const int vsize = VTraits<v_float32>::vlanes();
|
||||
static const int nPixels = vsize*2;
|
||||
for(; i < n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
|
||||
{
|
||||
@ -1973,8 +1974,8 @@ struct RGB2Lab_f
|
||||
#undef clipv
|
||||
/* int iR = R*LAB_BASE, iG = G*LAB_BASE, iB = B*LAB_BASE, iL, ia, ib; */
|
||||
v_float32 basef = vx_setall_f32(LAB_BASE);
|
||||
rvec0 *= basef, gvec0 *= basef, bvec0 *= basef;
|
||||
rvec1 *= basef, gvec1 *= basef, bvec1 *= basef;
|
||||
rvec0 = v_mul(rvec0, basef), gvec0 = v_mul(gvec0, basef), bvec0 = v_mul(bvec0, basef);
|
||||
rvec1 = v_mul(rvec1, basef), gvec1 = v_mul(gvec1, basef), bvec1 = v_mul(bvec1, basef);
|
||||
|
||||
v_int32 irvec0, igvec0, ibvec0, irvec1, igvec1, ibvec1;
|
||||
irvec0 = v_round(rvec0); irvec1 = v_round(rvec1);
|
||||
@ -2004,8 +2005,8 @@ struct RGB2Lab_f
|
||||
|
||||
/* dst[i] = L*100.0f */
|
||||
v_float32 v100dBase = vx_setall_f32(100.0f/LAB_BASE);
|
||||
l_vec0 = l_vec0*v100dBase;
|
||||
l_vec1 = l_vec1*v100dBase;
|
||||
l_vec0 = v_mul(l_vec0, v100dBase);
|
||||
l_vec1 = v_mul(l_vec1, v100dBase);
|
||||
/*
|
||||
dst[i + 1] = a*256.0f - 128.0f;
|
||||
dst[i + 2] = b*256.0f - 128.0f;
|
||||
@ -2043,8 +2044,8 @@ struct RGB2Lab_f
|
||||
static const float _a = (softfloat(16) / softfloat(116));
|
||||
int i = 0;
|
||||
#if CV_SIMD
|
||||
const int vsize = v_float32::nlanes;
|
||||
const int nrepeats = vsize == 4 ? 2 : 1;
|
||||
const int vsize = VTraits<v_float32>::vlanes();
|
||||
const int nrepeats = VTraits<v_float32>::nlanes == 4 ? 2 : 1;
|
||||
v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2);
|
||||
v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5);
|
||||
v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8);
|
||||
@ -2080,9 +2081,9 @@ struct RGB2Lab_f
|
||||
v_float32 vgscale = vx_setall_f32(gscale);
|
||||
for (int k = 0; k < nrepeats; k++)
|
||||
{
|
||||
R[k] = splineInterpolate(R[k]*vgscale, gammaTab, GAMMA_TAB_SIZE);
|
||||
G[k] = splineInterpolate(G[k]*vgscale, gammaTab, GAMMA_TAB_SIZE);
|
||||
B[k] = splineInterpolate(B[k]*vgscale, gammaTab, GAMMA_TAB_SIZE);
|
||||
R[k] = splineInterpolate(v_mul(R[k], vgscale), gammaTab, GAMMA_TAB_SIZE);
|
||||
G[k] = splineInterpolate(v_mul(G[k], vgscale), gammaTab, GAMMA_TAB_SIZE);
|
||||
B[k] = splineInterpolate(v_mul(B[k], vgscale), gammaTab, GAMMA_TAB_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
@ -2090,26 +2091,26 @@ struct RGB2Lab_f
|
||||
v_float32 FX[nrepeats], FY[nrepeats], FZ[nrepeats];
|
||||
for (int k = 0; k < nrepeats; k++)
|
||||
{
|
||||
X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, B[k]*vc2));
|
||||
Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, B[k]*vc5));
|
||||
Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, B[k]*vc8));
|
||||
X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, v_mul(B[k], vc2)));
|
||||
Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, v_mul(B[k], vc5)));
|
||||
Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, v_mul(B[k], vc8)));
|
||||
|
||||
// use spline interpolation instead of direct calculation
|
||||
v_float32 vTabScale = vx_setall_f32(LabCbrtTabScale);
|
||||
FX[k] = splineInterpolate(X[k]*vTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
|
||||
FY[k] = splineInterpolate(Y[k]*vTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
|
||||
FZ[k] = splineInterpolate(Z[k]*vTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
|
||||
FX[k] = splineInterpolate(v_mul(X[k], vTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE);
|
||||
FY[k] = splineInterpolate(v_mul(Y[k], vTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE);
|
||||
FZ[k] = splineInterpolate(v_mul(Z[k], vTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE);
|
||||
}
|
||||
|
||||
v_float32 L[nrepeats], a[nrepeats], b[nrepeats];
|
||||
for (int k = 0; k < nrepeats; k++)
|
||||
{
|
||||
// 7.787f = (29/3)^3/(29*4), 0.008856f = (6/29)^3, 903.3 = (29/3)^3
|
||||
v_float32 mask = Y[k] > (vx_setall_f32(0.008856f));
|
||||
v_float32 mask = v_gt(Y[k], (vx_setall_f32(0.008856f)));
|
||||
v_float32 v116 = vx_setall_f32(116.f), vm16 = vx_setall_f32(-16.f);
|
||||
L[k] = v_select(mask, v_fma(v116, FY[k], vm16), vx_setall_f32(903.3f)*Y[k]);
|
||||
a[k] = vx_setall_f32(500.f) * (FX[k] - FY[k]);
|
||||
b[k] = vx_setall_f32(200.f) * (FY[k] - FZ[k]);
|
||||
L[k] = v_select(mask, v_fma(v116, FY[k], vm16), v_mul(vx_setall_f32(903.3f),Y[k]));
|
||||
a[k] = v_mul(vx_setall_f32(500.F), v_sub(FX[k], FY[k]));
|
||||
b[k] = v_mul(vx_setall_f32(200.F), v_sub(FY[k], FZ[k]));
|
||||
|
||||
v_store_interleave(dst + k*3*vsize, L[k], a[k], b[k]);
|
||||
}
|
||||
@ -2204,7 +2205,7 @@ struct Lab2RGBfloat
|
||||
float alpha = ColorChannel<float>::max();
|
||||
|
||||
#if CV_SIMD
|
||||
const int vsize = v_float32::nlanes;
|
||||
const int vsize = VTraits<v_float32>::vlanes();
|
||||
const int nrepeats = 2;
|
||||
v_float32 v16_116 = vx_setall_f32(16.0f / 116.0f);
|
||||
for( ; i <= n-vsize*nrepeats;
|
||||
@ -2221,14 +2222,14 @@ struct Lab2RGBfloat
|
||||
v_float32 vlThresh = vx_setall_f32(lThresh);
|
||||
for(int k = 0; k < nrepeats; k++)
|
||||
{
|
||||
limask[k] = li[k] <= vlThresh;
|
||||
limask[k] = v_le(li[k], vlThresh);
|
||||
}
|
||||
v_float32 ylo[nrepeats], yhi[nrepeats], fylo[nrepeats], fyhi[nrepeats];
|
||||
// 903.3 = (29/3)^3, 7.787 = (29/3)^3/(29*4)
|
||||
v_float32 vinv903 = vx_setall_f32(1.f/903.3f);
|
||||
for(int k = 0; k < nrepeats; k++)
|
||||
{
|
||||
ylo[k] = li[k] * vinv903;
|
||||
ylo[k] = v_mul(li[k], vinv903);
|
||||
}
|
||||
v_float32 v7787 = vx_setall_f32(7.787f);
|
||||
for(int k = 0; k < nrepeats; k++)
|
||||
@ -2238,11 +2239,11 @@ struct Lab2RGBfloat
|
||||
v_float32 v16 = vx_setall_f32(16.0f), vinv116 = vx_setall_f32(1.f/116.0f);
|
||||
for(int k = 0; k < nrepeats; k++)
|
||||
{
|
||||
fyhi[k] = (li[k] + v16) * vinv116;
|
||||
fyhi[k] = v_mul(v_add(li[k], v16), vinv116);
|
||||
}
|
||||
for(int k = 0; k < nrepeats; k++)
|
||||
{
|
||||
yhi[k] = fyhi[k] * fyhi[k] * fyhi[k];
|
||||
yhi[k] = v_mul(fyhi[k], fyhi[k], fyhi[k]);
|
||||
}
|
||||
for(int k = 0; k < nrepeats; k++)
|
||||
{
|
||||
@ -2265,9 +2266,9 @@ struct Lab2RGBfloat
|
||||
for (int j = 0; j < 2; j++)
|
||||
{
|
||||
v_float32 f = fxz[k*2+j];
|
||||
v_float32 fmask = f <= vfTresh;
|
||||
v_float32 flo = (f - v16_116) * vinv7787;
|
||||
v_float32 fhi = f*f*f;
|
||||
v_float32 fmask = v_le(f, vfTresh);
|
||||
v_float32 flo = v_mul(v_sub(f, v16_116), vinv7787);
|
||||
v_float32 fhi = v_mul(v_mul(f, f), f);
|
||||
fxz[k*2+j] = v_select(fmask, flo, fhi);
|
||||
}
|
||||
}
|
||||
@ -2281,9 +2282,9 @@ struct Lab2RGBfloat
|
||||
v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8);
|
||||
for(int k = 0; k < nrepeats; k++)
|
||||
{
|
||||
ro[k] = v_fma(vc0, x[k], v_fma(vc1, y[k], vc2 * z[k]));
|
||||
go[k] = v_fma(vc3, x[k], v_fma(vc4, y[k], vc5 * z[k]));
|
||||
bo[k] = v_fma(vc6, x[k], v_fma(vc7, y[k], vc8 * z[k]));
|
||||
ro[k] = v_fma(vc0, x[k], v_fma(vc1, y[k], v_mul(vc2, z[k])));
|
||||
go[k] = v_fma(vc3, x[k], v_fma(vc4, y[k], v_mul(vc5, z[k])));
|
||||
bo[k] = v_fma(vc6, x[k], v_fma(vc7, y[k], v_mul(vc8, z[k])));
|
||||
}
|
||||
v_float32 one = vx_setall_f32(1.f), zero = vx_setzero_f32();
|
||||
for(int k = 0; k < nrepeats; k++)
|
||||
@ -2298,9 +2299,9 @@ struct Lab2RGBfloat
|
||||
v_float32 vgscale = vx_setall_f32(gscale);
|
||||
for(int k = 0; k < nrepeats; k++)
|
||||
{
|
||||
ro[k] *= vgscale;
|
||||
go[k] *= vgscale;
|
||||
bo[k] *= vgscale;
|
||||
ro[k] = v_mul(ro[k], vgscale);
|
||||
go[k] = v_mul(go[k], vgscale);
|
||||
bo[k] = v_mul(bo[k], vgscale);
|
||||
}
|
||||
|
||||
for(int k = 0; k < nrepeats; k++)
|
||||
@ -2500,8 +2501,8 @@ struct Lab2RGBinteger
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
yf[k] = v_lut((const int*)LabToYF_b, lq[k]);
|
||||
y[k] = yf[k] & mask16;
|
||||
ify[k] = v_reinterpret_as_s32(v_reinterpret_as_u32(yf[k]) >> 16);
|
||||
y[k] = v_and(yf[k], mask16);
|
||||
ify[k] = v_reinterpret_as_s32(v_shr(v_reinterpret_as_u32(yf[k]), 16));
|
||||
}
|
||||
|
||||
v_int16 ify0, ify1;
|
||||
@ -2516,18 +2517,18 @@ struct Lab2RGBinteger
|
||||
v_uint16 mulA = vx_setall_u16(53687);
|
||||
v_uint32 ma[4];
|
||||
v_uint32 addA = vx_setall_u32(1 << 7);
|
||||
v_mul_expand((a0 + (a0 << 2)), mulA, ma[0], ma[1]);
|
||||
v_mul_expand((a1 + (a1 << 2)), mulA, ma[2], ma[3]);
|
||||
adiv0 = v_reinterpret_as_s16(v_pack(((ma[0] + addA) >> 13), ((ma[1] + addA) >> 13)));
|
||||
adiv1 = v_reinterpret_as_s16(v_pack(((ma[2] + addA) >> 13), ((ma[3] + addA) >> 13)));
|
||||
v_mul_expand((v_add(a0, v_shl<2>(a0))), mulA, ma[0], ma[1]);
|
||||
v_mul_expand((v_add(a1, v_shl<2>(a1))), mulA, ma[2], ma[3]);
|
||||
adiv0 = v_reinterpret_as_s16(v_pack((v_shr<13>(v_add(ma[0], addA))), (v_shr<13>(v_add(ma[1], addA)))));
|
||||
adiv1 = v_reinterpret_as_s16(v_pack((v_shr<13>(v_add(ma[2], addA))), (v_shr<13>(v_add(ma[3], addA)))));
|
||||
|
||||
v_uint16 mulB = vx_setall_u16(41943);
|
||||
v_uint32 mb[4];
|
||||
v_uint32 addB = vx_setall_u32(1 << 4);
|
||||
v_mul_expand(b0, mulB, mb[0], mb[1]);
|
||||
v_mul_expand(b1, mulB, mb[2], mb[3]);
|
||||
bdiv0 = v_reinterpret_as_s16(v_pack((mb[0] + addB) >> 9, (mb[1] + addB) >> 9));
|
||||
bdiv1 = v_reinterpret_as_s16(v_pack((mb[2] + addB) >> 9, (mb[3] + addB) >> 9));
|
||||
bdiv0 = v_reinterpret_as_s16(v_pack(v_shr<9>(v_add(mb[0], addB)), v_shr<9>(v_add(mb[1], addB))));
|
||||
bdiv1 = v_reinterpret_as_s16(v_pack(v_shr<9>(v_add(mb[2], addB)), v_shr<9>(v_add(mb[3], addB))));
|
||||
|
||||
// 0 <= adiv <= 8356, 0 <= bdiv <= 20890
|
||||
/* x = ifxz[0]; y = y; z = ifxz[1]; */
|
||||
@ -2570,7 +2571,7 @@ struct Lab2RGBinteger
|
||||
{
|
||||
bool srgb = issRGB;
|
||||
ushort* tab = sRGBInvGammaTab_b;
|
||||
const int vsize = v_uint8::nlanes;
|
||||
const int vsize = VTraits<v_uint8>::vlanes();
|
||||
v_uint8 valpha = vx_setall_u8(alpha);
|
||||
v_int32 vc[9];
|
||||
for(int k = 0; k < 9; k++)
|
||||
@ -2592,9 +2593,9 @@ struct Lab2RGBinteger
|
||||
v_int32 rq[4], gq[4], bq[4];
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
rq[k] = (vc[0] * xq[k] + vc[1] * yq[k] + vc[2] * zq[k] + vdescale) >> shift;
|
||||
gq[k] = (vc[3] * xq[k] + vc[4] * yq[k] + vc[5] * zq[k] + vdescale) >> shift;
|
||||
bq[k] = (vc[6] * xq[k] + vc[7] * yq[k] + vc[8] * zq[k] + vdescale) >> shift;
|
||||
rq[k] = v_shr<shift>(v_add(v_add(v_add(v_mul(vc[0], xq[k]), v_mul(vc[1], yq[k])), v_mul(vc[2], zq[k])), vdescale));
|
||||
gq[k] = v_shr<shift>(v_add(v_add(v_add(v_mul(vc[3], xq[k]), v_mul(vc[4], yq[k])), v_mul(vc[5], zq[k])), vdescale));
|
||||
bq[k] = v_shr<shift>(v_add(v_add(v_add(v_mul(vc[6], xq[k]), v_mul(vc[7], yq[k])), v_mul(vc[8], zq[k])), vdescale));
|
||||
}
|
||||
|
||||
//limit indices in table and then substitute
|
||||
@ -2611,7 +2612,7 @@ struct Lab2RGBinteger
|
||||
if(srgb)
|
||||
{
|
||||
// [RRR... , GGG... , BBB...]
|
||||
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vidx[vsize*3];
|
||||
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vidx[VTraits<v_uint8>::max_nlanes*3];
|
||||
for (int k = 0; k < 4; k++)
|
||||
v_store_aligned(vidx + 0*vsize + k*vsize/4, rq[k]);
|
||||
for (int k = 0; k < 4; k++)
|
||||
@ -2631,9 +2632,9 @@ struct Lab2RGBinteger
|
||||
// rgb = (rgb*255) >> inv_gamma_shift
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
rq[k] = ((rq[k] << 8) - rq[k]) >> inv_gamma_shift;
|
||||
gq[k] = ((gq[k] << 8) - gq[k]) >> inv_gamma_shift;
|
||||
bq[k] = ((bq[k] << 8) - bq[k]) >> inv_gamma_shift;
|
||||
rq[k] = v_shr((v_sub(v_shl(rq[k], 8), rq[k])), inv_gamma_shift);
|
||||
gq[k] = v_shr((v_sub(v_shl(gq[k], 8), gq[k])), inv_gamma_shift);
|
||||
bq[k] = v_shr((v_sub(v_shl(bq[k], 8), bq[k])), inv_gamma_shift);
|
||||
}
|
||||
rgb[0] = v_reinterpret_as_u16(v_pack(rq[0], rq[1]));
|
||||
rgb[1] = v_reinterpret_as_u16(v_pack(rq[2], rq[3]));
|
||||
@ -2730,13 +2731,13 @@ struct Lab2RGB_b
|
||||
static const softfloat fl = softfloat(100)/f255;
|
||||
|
||||
#if CV_SIMD
|
||||
const int fsize = v_float32::nlanes;
|
||||
const int fsize = VTraits<v_float32>::vlanes();
|
||||
v_float32 vl = vx_setall_f32((float)fl);
|
||||
v_float32 va = vx_setall_f32(1.f);
|
||||
v_float32 vb = vx_setall_f32(1.f);
|
||||
v_float32 vaLow = vx_setall_f32(-128.f), vbLow = vx_setall_f32(-128.f);
|
||||
//TODO: fix that when v_interleave is available
|
||||
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[fsize*3], interTmpA[fsize*3];
|
||||
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits<v_float32>::max_nlanes*3], interTmpA[VTraits<v_float32>::max_nlanes*3];
|
||||
v_store_interleave(interTmpM, vl, va, vb);
|
||||
v_store_interleave(interTmpA, vx_setzero_f32(), vaLow, vbLow);
|
||||
v_float32 mluv[3], aluv[3];
|
||||
@ -2754,7 +2755,7 @@ struct Lab2RGB_b
|
||||
j = 0;
|
||||
|
||||
#if CV_SIMD
|
||||
const int vsize = v_uint8::nlanes;
|
||||
const int vsize = VTraits<v_uint8>::vlanes();
|
||||
for( ; j <= (dn - vsize)*3; j += 3*vsize )
|
||||
{
|
||||
v_uint8 s0, s1, s2;
|
||||
@ -2808,7 +2809,7 @@ struct Lab2RGB_b
|
||||
v_int32 vi[4*3];
|
||||
for(int k = 0; k < 4*3; k++)
|
||||
{
|
||||
vi[k] = v_round(vf[k]*v255);
|
||||
vi[k] = v_round(v_mul(vf[k], v255));
|
||||
}
|
||||
|
||||
v_uint8 rgb[3];
|
||||
@ -2830,7 +2831,7 @@ struct Lab2RGB_b
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
vf[k] = vx_load_aligned(buf + j + k*fsize);
|
||||
vi[k] = v_round(vf[k]*v255);
|
||||
vi[k] = v_round(v_mul(vf[k], v255));
|
||||
}
|
||||
v_store(dst, v_pack_u(v_pack(vi[0], vi[1]), v_pack(vi[2], vi[3])));
|
||||
}
|
||||
@ -2910,8 +2911,8 @@ struct RGB2Luvfloat
|
||||
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
|
||||
|
||||
#if CV_SIMD
|
||||
const int vsize = v_float32::nlanes;
|
||||
const int nrepeats = vsize == 4 ? 2 : 1;
|
||||
const int vsize = VTraits<v_float32>::vlanes();
|
||||
const int nrepeats = VTraits<v_float32>::nlanes == 4 ? 2 : 1;
|
||||
for( ; i <= n-vsize*nrepeats;
|
||||
i+= vsize*nrepeats, src += scn*vsize*nrepeats, dst += 3*vsize*nrepeats)
|
||||
{
|
||||
@ -2944,9 +2945,9 @@ struct RGB2Luvfloat
|
||||
v_float32 vgscale = vx_setall_f32(gscale);
|
||||
for (int k = 0; k < nrepeats; k++)
|
||||
{
|
||||
R[k] *= vgscale;
|
||||
G[k] *= vgscale;
|
||||
B[k] *= vgscale;
|
||||
R[k] = v_mul(R[k], vgscale);
|
||||
G[k] = v_mul(G[k], vgscale);
|
||||
B[k] = v_mul(B[k], vgscale);
|
||||
}
|
||||
|
||||
for (int k = 0; k < nrepeats; k++)
|
||||
@ -2963,27 +2964,27 @@ struct RGB2Luvfloat
|
||||
v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8);
|
||||
for (int k = 0; k < nrepeats; k++)
|
||||
{
|
||||
X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, B[k]*vc2));
|
||||
Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, B[k]*vc5));
|
||||
Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, B[k]*vc8));
|
||||
X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, v_mul(B[k], vc2)));
|
||||
Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, v_mul(B[k], vc5)));
|
||||
Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, v_mul(B[k], vc8)));
|
||||
}
|
||||
|
||||
v_float32 L[nrepeats], u[nrepeats], v[nrepeats];
|
||||
v_float32 vmun = vx_setall_f32(-un), vmvn = vx_setall_f32(-vn);
|
||||
for (int k = 0; k < nrepeats; k++)
|
||||
{
|
||||
L[k] = splineInterpolate(Y[k]*vx_setall_f32(LabCbrtTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE);
|
||||
L[k] = splineInterpolate(v_mul(Y[k], vx_setall_f32(LabCbrtTabScale)), LabCbrtTab, LAB_CBRT_TAB_SIZE);
|
||||
// L = 116.f*L - 16.f;
|
||||
L[k] = v_fma(L[k], vx_setall_f32(116.f), vx_setall_f32(-16.f));
|
||||
|
||||
v_float32 d;
|
||||
// d = (4*13) / max(X + 15 * Y + 3 * Z, FLT_EPSILON)
|
||||
d = v_fma(Y[k], vx_setall_f32(15.f), v_fma(Z[k], vx_setall_f32(3.f), X[k]));
|
||||
d = vx_setall_f32(4.f*13.f) / v_max(d, vx_setall_f32(FLT_EPSILON));
|
||||
d = v_div(vx_setall_f32(4.F * 13.F), v_max(d, vx_setall_f32(FLT_EPSILON)));
|
||||
// u = L*(X*d - un)
|
||||
u[k] = L[k]*v_fma(X[k], d, vmun);
|
||||
u[k] = v_mul(L[k], v_fma(X[k], d, vmun));
|
||||
// v = L*((9*0.25f)*Y*d - vn);
|
||||
v[k] = L[k]*v_fma(vx_setall_f32(9.f*0.25f)*Y[k], d, vmvn);
|
||||
v[k] = v_mul(L[k], v_fma(v_mul(vx_setall_f32(9.F * 0.25F), Y[k]), d, vmvn));
|
||||
}
|
||||
|
||||
for (int k = 0; k < nrepeats; k++)
|
||||
@ -3099,8 +3100,8 @@ struct Luv2RGBfloat
|
||||
float _un = un, _vn = vn;
|
||||
|
||||
#if CV_SIMD
|
||||
const int vsize = v_float32::nlanes;
|
||||
const int nrepeats = vsize == 4 ? 2 : 1;
|
||||
const int vsize = VTraits<v_float32>::vlanes();
|
||||
const int nrepeats = VTraits<v_float32>::nlanes == 4 ? 2 : 1;
|
||||
for( ; i <= n - vsize*nrepeats;
|
||||
i += vsize*nrepeats, src += vsize*3*nrepeats, dst += dcn*vsize*nrepeats)
|
||||
{
|
||||
@ -3120,13 +3121,13 @@ struct Luv2RGBfloat
|
||||
v_float32 Ylo, Yhi;
|
||||
|
||||
// ((L + 16)/116)^3
|
||||
Ylo = (L[k] + v16) * v116inv;
|
||||
Ylo = Ylo*Ylo*Ylo;
|
||||
Ylo = v_mul(v_add(L[k], v16), v116inv);
|
||||
Ylo = v_mul(v_mul(Ylo, Ylo), Ylo);
|
||||
// L*(3./29.)^3
|
||||
Yhi = L[k] * v903inv;
|
||||
Yhi = v_mul(L[k], v903inv);
|
||||
|
||||
// Y = (L <= 8) ? Y0 : Y1;
|
||||
Y[k] = v_select(L[k] >= vx_setall_f32(8.f), Ylo, Yhi);
|
||||
Y[k] = v_select(v_ge(L[k], vx_setall_f32(8.f)), Ylo, Yhi);
|
||||
}
|
||||
|
||||
v_float32 v4inv = vx_setall_f32(0.25f), v3 = vx_setall_f32(3.f);
|
||||
@ -3135,18 +3136,18 @@ struct Luv2RGBfloat
|
||||
v_float32 up, vp;
|
||||
|
||||
// up = 3*(u + L*_un);
|
||||
up = v3*(v_fma(L[k], vx_setall_f32(_un), u[k]));
|
||||
up = v_mul(v3, v_fma(L[k], vx_setall_f32(_un), u[k]));
|
||||
// vp = 0.25/(v + L*_vn);
|
||||
vp = v4inv/(v_fma(L[k], vx_setall_f32(_vn), v[k]));
|
||||
vp = v_div(v4inv, v_fma(L[k], vx_setall_f32(_vn), v[k]));
|
||||
|
||||
// vp = max(-0.25, min(0.25, vp));
|
||||
vp = v_max(vx_setall_f32(-0.25f), v_min(v4inv, vp));
|
||||
|
||||
//X = 3*up*vp; // (*Y) is done later
|
||||
X[k] = v3*up*vp;
|
||||
X[k] = v_mul(v_mul(v3, up), vp);
|
||||
//Z = ((12*13*L - up)*vp - 5); // (*Y) is done later
|
||||
// xor flips the sign, works like unary minus
|
||||
Z[k] = v_fma(v_fma(L[k], vx_setall_f32(12.f*13.f), (vx_setall_f32(-0.f) ^ up)), vp, vx_setall_f32(-5.f));
|
||||
Z[k] = v_fma(v_fma(L[k], vx_setall_f32(12.f*13.f), (v_xor(vx_setall_f32(-0.F), up))), vp, vx_setall_f32(-5.f));
|
||||
}
|
||||
|
||||
v_float32 R[nrepeats], G[nrepeats], B[nrepeats];
|
||||
@ -3156,9 +3157,9 @@ struct Luv2RGBfloat
|
||||
for(int k = 0; k < nrepeats; k++)
|
||||
{
|
||||
// R = (X*C0 + C1 + Z*C2)*Y; // here (*Y) is done
|
||||
R[k] = v_fma(Z[k], vc2, v_fma(X[k], vc0, vc1))*Y[k];
|
||||
G[k] = v_fma(Z[k], vc5, v_fma(X[k], vc3, vc4))*Y[k];
|
||||
B[k] = v_fma(Z[k], vc8, v_fma(X[k], vc6, vc7))*Y[k];
|
||||
R[k] = v_mul(v_fma(Z[k], vc2, v_fma(X[k], vc0, vc1)), Y[k]);
|
||||
G[k] = v_mul(v_fma(Z[k], vc5, v_fma(X[k], vc3, vc4)), Y[k]);
|
||||
B[k] = v_mul(v_fma(Z[k], vc8, v_fma(X[k], vc6, vc7)), Y[k]);
|
||||
}
|
||||
|
||||
v_float32 vzero = vx_setzero_f32(), v1 = vx_setall_f32(1.f);
|
||||
@ -3174,9 +3175,9 @@ struct Luv2RGBfloat
|
||||
v_float32 vgscale = vx_setall_f32(gscale);
|
||||
for(int k = 0; k < nrepeats; k++)
|
||||
{
|
||||
R[k] *= vgscale;
|
||||
G[k] *= vgscale;
|
||||
B[k] *= vgscale;
|
||||
R[k] = v_mul(R[k], vgscale);
|
||||
G[k] = v_mul(G[k], vgscale);
|
||||
B[k] = v_mul(B[k], vgscale);
|
||||
}
|
||||
for(int k = 0; k < nrepeats; k++)
|
||||
{
|
||||
@ -3285,7 +3286,7 @@ struct RGB2Luvinterpolate
|
||||
#if CV_SIMD
|
||||
if(enablePackedRGB2Luv)
|
||||
{
|
||||
const int vsize = v_uint16::nlanes;
|
||||
const int vsize = VTraits<v_uint16>::vlanes();
|
||||
static const int nPixels = vsize*2;
|
||||
for(; i < n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
|
||||
{
|
||||
@ -3315,9 +3316,9 @@ struct RGB2Luvinterpolate
|
||||
v_expand(r, r0, r1);
|
||||
v_expand(g, g0, g1);
|
||||
v_expand(b, b0, b1);
|
||||
r0 = r0 << (lab_base_shift - 8); r1 = r1 << (lab_base_shift - 8);
|
||||
g0 = g0 << (lab_base_shift - 8); g1 = g1 << (lab_base_shift - 8);
|
||||
b0 = b0 << (lab_base_shift - 8); b1 = b1 << (lab_base_shift - 8);
|
||||
r0 = v_shl<lab_base_shift - 8>(r0); r1 = v_shl<lab_base_shift - 8>(r1);
|
||||
g0 = v_shl<lab_base_shift - 8>(g0); g1 = v_shl<lab_base_shift - 8>(g1);
|
||||
b0 = v_shl<lab_base_shift - 8>(b0); b1 = v_shl<lab_base_shift - 8>(b1);
|
||||
|
||||
/*
|
||||
int L, u, v;
|
||||
@ -3332,9 +3333,9 @@ struct RGB2Luvinterpolate
|
||||
dst[i+1] = saturate_cast<uchar>(u/baseDiv);
|
||||
dst[i+2] = saturate_cast<uchar>(v/baseDiv);
|
||||
*/
|
||||
l0 = l0 >> (lab_base_shift - 8); l1 = l1 >> (lab_base_shift - 8);
|
||||
u0 = u0 >> (lab_base_shift - 8); u1 = u1 >> (lab_base_shift - 8);
|
||||
v0 = v0 >> (lab_base_shift - 8); v1 = v1 >> (lab_base_shift - 8);
|
||||
l0 = v_shr<lab_base_shift - 8>(l0); l1 = v_shr<lab_base_shift - 8>(l1);
|
||||
u0 = v_shr<lab_base_shift - 8>(u0); u1 = v_shr<lab_base_shift - 8>(u1);
|
||||
v0 = v_shr<lab_base_shift - 8>(v0); v1 = v_shr<lab_base_shift - 8>(v1);
|
||||
v_uint8 l = v_pack(l0, l1);
|
||||
v_uint8 u = v_pack(u0, u1);
|
||||
v_uint8 v = v_pack(v0, v1);
|
||||
@ -3405,12 +3406,12 @@ struct RGB2Luv_b
|
||||
static const softfloat su = -uLow*f255/uRange;
|
||||
static const softfloat sv = -vLow*f255/vRange;
|
||||
#if CV_SIMD
|
||||
const int fsize = v_float32::nlanes;
|
||||
const int fsize = VTraits<v_float32>::vlanes();
|
||||
v_float32 ml = vx_setall_f32((float)fL), al = vx_setzero_f32();
|
||||
v_float32 mu = vx_setall_f32((float)fu), au = vx_setall_f32((float)su);
|
||||
v_float32 mv = vx_setall_f32((float)fv), av = vx_setall_f32((float)sv);
|
||||
//TODO: fix that when v_interleave is available
|
||||
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[fsize*3], interTmpA[fsize*3];
|
||||
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits<v_float32>::max_nlanes*3], interTmpA[VTraits<v_float32>::max_nlanes*3];
|
||||
v_store_interleave(interTmpM, ml, mu, mv);
|
||||
v_store_interleave(interTmpA, al, au, av);
|
||||
v_float32 mluv[3], aluv[3];
|
||||
@ -3452,7 +3453,7 @@ struct RGB2Luv_b
|
||||
v_float32 f[3*4];
|
||||
for(int k = 0; k < 3*4; k++)
|
||||
{
|
||||
f[k] = v_cvt_f32(q[k])*v255inv;
|
||||
f[k] = v_mul(v_cvt_f32(q[k]), v255inv);
|
||||
}
|
||||
|
||||
for(int k = 0; k < 4; k++)
|
||||
@ -3478,8 +3479,8 @@ struct RGB2Luv_b
|
||||
v_int32 q0, q1;
|
||||
v_expand(v_reinterpret_as_s16(d), q0, q1);
|
||||
|
||||
v_store_aligned(buf + j + 0*fsize, v_cvt_f32(q0)*v255inv);
|
||||
v_store_aligned(buf + j + 1*fsize, v_cvt_f32(q1)*v255inv);
|
||||
v_store_aligned(buf + j + 0*fsize, v_mul(v_cvt_f32(q0), v255inv));
|
||||
v_store_aligned(buf + j + 1*fsize, v_mul(v_cvt_f32(q1), v255inv));
|
||||
}
|
||||
for( ; j < dn*bufChannels; j++, src++ )
|
||||
{
|
||||
@ -3633,7 +3634,8 @@ struct Luv2RGBinteger
|
||||
inline void processLuvToXYZ(const v_uint8& lv, const v_uint8& uv, const v_uint8& vv,
|
||||
v_int32 (&x)[4], v_int32 (&y)[4], v_int32 (&z)[4]) const
|
||||
{
|
||||
const int vsize = v_uint8::nlanes;
|
||||
const int vsize = VTraits<v_uint8>::vlanes();
|
||||
const int vsize_max = VTraits<v_uint8>::max_nlanes;
|
||||
|
||||
v_uint16 lv0, lv1;
|
||||
v_expand(lv, lv0, lv1);
|
||||
@ -3646,7 +3648,7 @@ struct Luv2RGBinteger
|
||||
v_int32 mask16 = vx_setall_s32(0xFFFF);
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
y[k] = v_lut((const int*)LabToYF_b, v_reinterpret_as_s32(lq[k])) & mask16;
|
||||
y[k] = v_and(v_lut((const int *)LabToYF_b, v_reinterpret_as_s32(lq[k])), mask16);
|
||||
}
|
||||
|
||||
v_int32 up[4], vp[4];
|
||||
@ -3657,10 +3659,10 @@ struct Luv2RGBinteger
|
||||
v_expand(vv, vv0, vv1);
|
||||
// LL*256
|
||||
v_uint16 ll0, ll1;
|
||||
ll0 = lv0 << 8; ll1 = lv1 << 8;
|
||||
ll0 = v_shl<8>(lv0); ll1 = v_shl<8>(lv1);
|
||||
v_uint16 upidx0, upidx1, vpidx0, vpidx1;
|
||||
upidx0 = ll0 + uv0; upidx1 = ll1 + uv1;
|
||||
vpidx0 = ll0 + vv0; vpidx1 = ll1 + vv1;
|
||||
upidx0 = v_add(ll0, uv0); upidx1 = v_add(ll1, uv1);
|
||||
vpidx0 = v_add(ll0, vv0); vpidx1 = v_add(ll1, vv1);
|
||||
v_uint32 upidx[4], vpidx[4];
|
||||
v_expand(upidx0, upidx[0], upidx[1]); v_expand(upidx1, upidx[2], upidx[3]);
|
||||
v_expand(vpidx0, vpidx[0], vpidx[1]); v_expand(vpidx1, vpidx[2], vpidx[3]);
|
||||
@ -3672,7 +3674,7 @@ struct Luv2RGBinteger
|
||||
|
||||
// long long int vpl = LUVLUT.LvToVpl_b[LL*256+v];
|
||||
v_int64 vpl[8];
|
||||
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vpidxstore[vsize];
|
||||
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vpidxstore[vsize_max];
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
v_store_aligned(vpidxstore + k*vsize/4, v_reinterpret_as_s32(vpidx[k]));
|
||||
@ -3684,12 +3686,13 @@ struct Luv2RGBinteger
|
||||
|
||||
// not all 64-bit arithmetic is available in univ. intrinsics
|
||||
// need to handle it with scalar code
|
||||
int64_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vvpl[vsize];
|
||||
int64_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vvpl[vsize_max];
|
||||
for(int k = 0; k < 8; k++)
|
||||
{
|
||||
v_store_aligned(vvpl + k*vsize/8, vpl[k]);
|
||||
}
|
||||
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vup[vsize], vvp[vsize], vx[vsize], vy[vsize], vzm[vsize];
|
||||
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vup[vsize_max], vvp[vsize_max],
|
||||
vx[vsize_max], vy[vsize_max], vzm[vsize_max];
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
v_store_aligned(vup + k*vsize/4, up[k]);
|
||||
@ -3724,7 +3727,7 @@ struct Luv2RGBinteger
// z = zm/256 + zm/65536;
for (int k = 0; k < 4; k++)
{
z[k] = (zm[k] >> 8) + (zm[k] >> 16);
z[k] = v_add(v_shr<8>(zm[k]), v_shr<16>(zm[k]));
}

// (x, z) = clip((x, z), min=0, max=2*BASE)
@ -3751,7 +3754,7 @@ struct Luv2RGBinteger
{
ushort* tab = sRGBInvGammaTab_b;
bool srgb = issRGB;
static const int vsize = v_uint8::nlanes;
static const int vsize = VTraits<v_uint8>::vlanes();
const int descaleShift = 1 << (shift-1);
v_int16 vdescale = vx_setall_s16(descaleShift);
v_int16 vc[9];
@ -3771,12 +3774,12 @@ struct Luv2RGBinteger
// fixing 16bit signed multiplication
// by subtracting 2^(base_shift-1) and then adding result back
v_int32 dummy32, fm[3];
v_expand(vc[0]+vc[1]+vc[2], fm[0], dummy32);
v_expand(vc[3]+vc[4]+vc[5], fm[1], dummy32);
v_expand(vc[6]+vc[7]+vc[8], fm[2], dummy32);
fm[0] = fm[0] << (base_shift-1);
fm[1] = fm[1] << (base_shift-1);
fm[2] = fm[2] << (base_shift-1);
v_expand(v_add(vc[0],vc[1],vc[2]), fm[0], dummy32);
v_expand(v_add(vc[3],vc[4],vc[5]), fm[1], dummy32);
v_expand(v_add(vc[6],vc[7],vc[8]), fm[2], dummy32);
fm[0] = v_shl(fm[0], (base_shift-1));
fm[1] = v_shl(fm[1], (base_shift-1));
fm[2] = v_shl(fm[2], (base_shift-1));

for (; i <= n-vsize; i += vsize, src += 3*vsize, dst += dcn*vsize)
{
@ -3816,15 +3819,15 @@ struct Luv2RGBinteger
// a bit faster than one loop for all
for(int k = 0; k < 4; k++)
{
i_rgb[k+4*0] = (v_dotprod(xy[k], crxy) + v_dotprod(zd[k], crz1) + fm[0]) >> shift;
i_rgb[k+4*0] = v_shr<shift>(v_add(v_add(v_dotprod(xy[k], crxy), v_dotprod(zd[k], crz1)), fm[0]));
}
for(int k = 0; k < 4; k++)
{
i_rgb[k+4*1] = (v_dotprod(xy[k], cgxy) + v_dotprod(zd[k], cgz1) + fm[1]) >> shift;
i_rgb[k+4*1] = v_shr<shift>(v_add(v_add(v_dotprod(xy[k], cgxy), v_dotprod(zd[k], cgz1)), fm[1]));
}
for(int k = 0; k < 4; k++)
{
i_rgb[k+4*2] = (v_dotprod(xy[k], cbxy) + v_dotprod(zd[k], cbz1) + fm[2]) >> shift;
i_rgb[k+4*2] = v_shr<shift>(v_add(v_add(v_dotprod(xy[k], cbxy), v_dotprod(zd[k], cbz1)), fm[2]));
}

// [rrggbb]
@ -3842,7 +3845,7 @@ struct Luv2RGBinteger
if(srgb)
{
// [rr.., gg.., bb..]
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) rgbshifts[3*vsize];
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) rgbshifts[3*VTraits<v_uint8>::max_nlanes];
for(int k = 0; k < 12; k++)
{
v_store_aligned(rgbshifts + k*vsize/4, i_rgb[k]);
@ -3857,7 +3860,7 @@ struct Luv2RGBinteger
// rgb = (rgb*255) >> inv_gamma_shift
for(int k = 0; k < 12; k++)
{
i_rgb[k] = ((i_rgb[k] << 8) - i_rgb[k]) >> inv_gamma_shift;
i_rgb[k] = v_shr((v_sub((v_shl(i_rgb[k], 8)), i_rgb[k])), inv_gamma_shift);
}

for(int k = 0; k < 6; k++)
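
Throughout these hunks the rewrite is mechanical: arithmetic and shift operators on the vector types give way to the named wrappers (v_add, v_sub, v_mul, v_shl/v_shr), and the nlanes member gives way to VTraits<T>::vlanes(). A hedged before/after sketch of the shift rewrite used above; the function and variable names are invented, not taken from the patch:

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

static v_int32 scale255_then_descale(const v_int32& v, int descale_shift)
{
    // old operator form:   ((v << 8) - v) >> descale_shift
    // new wrapper form (note (v << 8) - v == v*255):
    return v_shr(v_sub(v_shl(v, 8), v), descale_shift);
}
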
@ -3940,13 +3943,13 @@ struct Luv2RGB_b
static const softfloat fv = vRange/f255;

#if CV_SIMD
const int fsize = v_float32::nlanes;
const int fsize = VTraits<v_float32>::vlanes();
v_float32 vl = vx_setall_f32((float)fl);
v_float32 vu = vx_setall_f32((float)fu);
v_float32 vv = vx_setall_f32((float)fv);
v_float32 vuLow = vx_setall_f32((float)uLow), vvLow = vx_setall_f32((float)vLow);
//TODO: fix that when v_interleave is available
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[fsize*3], interTmpA[fsize*3];
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits<v_float32>::max_nlanes*3], interTmpA[VTraits<v_float32>::max_nlanes*3];
v_store_interleave(interTmpM, vl, vu, vv);
v_store_interleave(interTmpA, vx_setzero_f32(), vuLow, vvLow);
v_float32 mluv[3], aluv[3];
@ -3964,7 +3967,7 @@ struct Luv2RGB_b
j = 0;

#if CV_SIMD
const int vsize = v_uint8::nlanes;
const int vsize = VTraits<v_uint8>::vlanes();
for( ; j <= (dn - vsize)*3; j += 3*vsize )
{
v_uint8 s0, s1, s2;
@ -4017,7 +4020,7 @@ struct Luv2RGB_b
v_int32 vi[4*3];
for(int k = 0; k < 4*3; k++)
{
vi[k] = v_round(vf[k]*v255);
vi[k] = v_round(v_mul(vf[k], v255));
}

v_uint8 rgb[3];
@ -4039,7 +4042,7 @@ struct Luv2RGB_b
for(int k = 0; k < 4; k++)
{
vf[k] = vx_load_aligned(buf + j + k*fsize);
vi[k] = v_round(vf[k]*v255);
vi[k] = v_round(v_mul(vf[k], v255));
}
v_store(dst, v_pack_u(v_pack(vi[0], vi[1]), v_pack(vi[2], vi[3])));
}
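
The fsize/vsize changes above apply the same idea to loop strides: with the nlanes enum removed, the stride comes from VTraits<T>::vlanes(). A small self-contained sketch of such a loop under the cleaned-up API; the function name and the scaling operation are assumptions, not taken from the patch:

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

static void scale_row(const float* src, float* dst, int n, float s)
{
    const v_float32 vs = vx_setall_f32(s);
    int i = 0;
    for (; i <= n - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes())
        v_store(dst + i, v_mul(vx_load(src + i), vs));   // vector body
    for (; i < n; i++)
        dst[i] = src[i] * s;                             // scalar tail
}
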
@ -882,7 +882,7 @@ struct RGBA2mRGBA<uchar>

int i = 0;
#if CV_SIMD
const int vsize = v_uint8::nlanes;
const int vsize = VTraits<v_uint8>::vlanes();
v_uint8 amask = v_reinterpret_as_u8(vx_setall_u32(0xFF000000));
v_uint16 vh = vx_setall_u16(half_val+1);

@ -901,27 +901,27 @@ struct RGBA2mRGBA<uchar>

v_uint16 a16[4];
for(int j = 0; j < 4; j++)
a16[j] = v_reinterpret_as_u16(v[j] & amask);
a16[j] = v_reinterpret_as_u16(v_and(v[j], amask));

v_uint32 a32[4];
for(int j = 0; j < 4; j++)
a32[j] = v_reinterpret_as_u32(a16[j] | (a16[j] >> 8));
a32[j] = v_reinterpret_as_u32(v_or(a16[j], (v_shr(a16[j], 8))));

v_uint8 a[4];
for(int j = 0; j < 4; j++)
a[j] = v_reinterpret_as_u8(a32[j] | (a32[j] >> 16));
a[j] = v_reinterpret_as_u8(v_or(a32[j], (v_shr(a32[j], 16))));

v_uint16 m[8];
for(int j = 0; j < 4; j++)
v_mul_expand(v[j], a[j], m[j], m[j+4]);

for(int j = 0; j < 8; j++)
m[j] += vh;
m[j] = v_add(m[j], vh);

// div 255: (v+1+(v>>8))>8
// +1 is in vh, has no effect on (v>>8)
for(int j = 0; j < 8; j++)
m[j] = (m[j] + (m[j] >> 8)) >> 8;
m[j] = v_shr((v_add(m[j], (v_shr(m[j], 8)))), 8);

v_uint8 d[4];
for(int j = 0; j < 4; j++)
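
The RGBA2mRGBA hunk above keeps the integer divide-by-255 trick described in its comment (compute (v + 1 + (v >> 8)) >> 8, with the +1 folded into vh beforehand), only respelled with the named wrappers. A standalone sketch of that single step for a 16-bit product, assuming the +1 has already been added; the function name is invented for illustration:

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// p1 holds v*a + 1 per lane; the result approximates (v*a)/255 for 8-bit v and a
static v_uint16 div255(const v_uint16& p1)
{
    return v_shr(v_add(p1, v_shr(p1, 8)), 8);
}
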
@ -188,21 +188,21 @@ public:
|
||||
v_uint16x8 r1 = v_reinterpret_as_u16(v_load(bayer+bayer_step));
|
||||
v_uint16x8 r2 = v_reinterpret_as_u16(v_load(bayer+bayer_step*2));
|
||||
|
||||
v_uint16x8 b1 = ((r0 << 8) >> 7) + ((r2 << 8) >> 7);
|
||||
v_uint16x8 b0 = v_rotate_right<1>(b1) + b1;
|
||||
b1 = v_rotate_right<1>(b1) << 1;
|
||||
v_uint16x8 b1 = v_add(v_shr<7>(v_shl<8>(r0)), v_shr<7>(v_shl<8>(r2)));
|
||||
v_uint16x8 b0 = v_add(v_rotate_right<1>(b1), b1);
|
||||
b1 = v_shl<1>(v_rotate_right<1>(b1));
|
||||
|
||||
v_uint16x8 g0 = (r0 >> 7) + (r2 >> 7);
|
||||
v_uint16x8 g1 = (r1 << 8) >> 7;
|
||||
g0 += v_rotate_right<1>(g1) + g1;
|
||||
g1 = v_rotate_right<1>(g1) << 2;
|
||||
v_uint16x8 g0 = v_add(v_shr<7>(r0), v_shr<7>(r2));
|
||||
v_uint16x8 g1 = v_shr<7>(v_shl<8>(r1));
|
||||
g0 = v_add(g0, v_add(v_rotate_right<1>(g1), g1));
|
||||
g1 = v_shl<2>(v_rotate_right<1>(g1));
|
||||
|
||||
r0 = r1 >> 8;
|
||||
r1 = (v_rotate_right<1>(r0) + r0) << 2;
|
||||
r0 = r0 << 3;
|
||||
r0 = v_shr<8>(r1);
|
||||
r1 = v_shl<2>(v_add(v_rotate_right<1>(r0), r0));
|
||||
r0 = v_shl<3>(r0);
|
||||
|
||||
g0 = (v_mul_hi(b0, _b2y) + v_mul_hi(g0, _g2y) + v_mul_hi(r0, _r2y)) >> 2;
|
||||
g1 = (v_mul_hi(b1, _b2y) + v_mul_hi(g1, _g2y) + v_mul_hi(r1, _r2y)) >> 2;
|
||||
g0 = v_shr<2>(v_add(v_add(v_mul_hi(b0, _b2y), v_mul_hi(g0, _g2y)), v_mul_hi(r0, _r2y)));
|
||||
g1 = v_shr<2>(v_add(v_add(v_mul_hi(b1, _b2y), v_mul_hi(g1, _g2y)), v_mul_hi(r1, _r2y)));
|
||||
v_uint8x16 pack_lo, pack_hi;
|
||||
v_zip(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g0)),
|
||||
v_pack_u(v_reinterpret_as_s16(g1), v_reinterpret_as_s16(g1)),
|
||||
@ -269,31 +269,31 @@ public:
|
||||
v_uint16x8 r1 = v_reinterpret_as_u16(v_load(bayer+bayer_step));
|
||||
v_uint16x8 r2 = v_reinterpret_as_u16(v_load(bayer+bayer_step*2));
|
||||
|
||||
v_uint16x8 b1 = (r0 & masklo) + (r2 & masklo);
|
||||
v_uint16x8 b1 = v_add(v_and(r0, masklo), v_and(r2, masklo));
|
||||
v_uint16x8 nextb1 = v_rotate_right<1>(b1);
|
||||
v_uint16x8 b0 = b1 + nextb1;
|
||||
b1 = (nextb1 + delta1) >> 1;
|
||||
b0 = (b0 + delta2) >> 2;
|
||||
v_uint16x8 b0 = v_add(b1, nextb1);
|
||||
b1 = v_shr<1>(v_add(nextb1, delta1));
|
||||
b0 = v_shr<2>(v_add(b0, delta2));
|
||||
// b0 b2 ... b14 b1 b3 ... b15
|
||||
b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1)));
|
||||
|
||||
v_uint16x8 g0 = (r0 >> 8) + (r2 >> 8);
|
||||
v_uint16x8 g1 = r1 & masklo;
|
||||
g0 += v_rotate_right<1>(g1) + g1;
|
||||
v_uint16x8 g0 = v_add(v_shr<8>(r0), v_shr<8>(r2));
|
||||
v_uint16x8 g1 = v_and(r1, masklo);
|
||||
g0 = v_add(g0, v_add(v_rotate_right<1>(g1), g1));
|
||||
g1 = v_rotate_right<1>(g1);
|
||||
g0 = (g0 + delta2) >> 2;
|
||||
g0 = v_shr<2>(v_add(g0, delta2));
|
||||
// g0 g2 ... g14 g1 g3 ... g15
|
||||
g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g1)));
|
||||
|
||||
r0 = r1 >> 8;
|
||||
r1 = v_rotate_right<1>(r0) + r0;
|
||||
r1 = (r1 + delta1) >> 1;
|
||||
r0 = v_shr<8>(r1);
|
||||
r1 = v_add(v_rotate_right<1>(r0), r0);
|
||||
r1 = v_shr<1>(v_add(r1, delta1));
|
||||
// r0 r2 ... r14 r1 r3 ... r15
|
||||
r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1)));
|
||||
|
||||
b1 = (b0 ^ r0) & mask;
|
||||
b0 = b0 ^ b1;
|
||||
r0 = r0 ^ b1;
|
||||
b1 = v_and(v_xor(b0, r0), mask);
|
||||
b0 = v_xor(b0, b1);
|
||||
r0 = v_xor(r0, b1);
|
||||
|
||||
// b1 g1 b3 g3 b5 g5...
|
||||
v_uint8x16 pack_lo, pack_hi;
|
||||
@ -402,31 +402,31 @@ public:
|
||||
v_uint16x8 r1 = v_reinterpret_as_u16(v_load(bayer+bayer_step));
|
||||
v_uint16x8 r2 = v_reinterpret_as_u16(v_load(bayer+bayer_step*2));
|
||||
|
||||
v_uint16x8 b1 = (r0 & masklo) + (r2 & masklo);
|
||||
v_uint16x8 b1 = v_add(v_and(r0, masklo), v_and(r2, masklo));
|
||||
v_uint16x8 nextb1 = v_rotate_right<1>(b1);
|
||||
v_uint16x8 b0 = b1 + nextb1;
|
||||
b1 = (nextb1 + delta1) >> 1;
|
||||
b0 = (b0 + delta2) >> 2;
|
||||
v_uint16x8 b0 = v_add(b1, nextb1);
|
||||
b1 = v_shr<1>(v_add(nextb1, delta1));
|
||||
b0 = v_shr<2>(v_add(b0, delta2));
|
||||
// b0 b2 ... b14 b1 b3 ... b15
|
||||
b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1)));
|
||||
|
||||
v_uint16x8 g0 = (r0 >> 8) + (r2 >> 8);
|
||||
v_uint16x8 g1 = r1 & masklo;
|
||||
g0 += v_rotate_right<1>(g1) + g1;
|
||||
v_uint16x8 g0 = v_add(v_shr<8>(r0), v_shr<8>(r2));
|
||||
v_uint16x8 g1 = v_and(r1, masklo);
|
||||
g0 = v_add(g0, v_add(v_rotate_right<1>(g1), g1));
|
||||
g1 = v_rotate_right<1>(g1);
|
||||
g0 = (g0 + delta2) >> 2;
|
||||
g0 = v_shr<2>(v_add(g0, delta2));
|
||||
// g0 g2 ... g14 g1 g3 ... g15
|
||||
g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g1)));
|
||||
|
||||
r0 = r1 >> 8;
|
||||
r1 = v_rotate_right<1>(r0) + r0;
|
||||
r1 = (r1 + delta1) >> 1;
|
||||
r0 = v_shr<8>(r1);
|
||||
r1 = v_add(v_rotate_right<1>(r0), r0);
|
||||
r1 = v_shr<1>(v_add(r1, delta1));
|
||||
// r0 r2 ... r14 r1 r3 ... r15
|
||||
r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1)));
|
||||
|
||||
b1 = (b0 ^ r0) & mask;
|
||||
b0 = b0 ^ b1;
|
||||
r0 = r0 ^ b1;
|
||||
b1 = v_and(v_xor(b0, r0), mask);
|
||||
b0 = v_xor(b0, b1);
|
||||
r0 = v_xor(r0, b1);
|
||||
|
||||
// b1 g1 b3 g3 b5 g5...
|
||||
v_uint8x16 pack_lo, pack_hi;
|
||||
@ -498,40 +498,40 @@ public:
|
||||
v_uint16x8 r1 = v_reinterpret_as_u16(v_load(bayer+bayer_step));
|
||||
v_uint16x8 r2 = v_reinterpret_as_u16(v_load(bayer+bayer_step*2));
|
||||
|
||||
v_uint16x8 b1 = (r0 & masklow) + (r2 & masklow);
|
||||
v_uint16x8 b1 = v_add(v_and(r0, masklow), v_and(r2, masklow));
|
||||
v_uint16x8 nextb1 = v_rotate_right<1>(b1);
|
||||
v_uint16x8 b0 = b1 + nextb1;
|
||||
b1 = (nextb1 + delta1) >> 1;
|
||||
b0 = (b0 + delta2) >> 2;
|
||||
v_uint16x8 b0 = v_add(b1, nextb1);
|
||||
b1 = v_shr<1>(v_add(nextb1, delta1));
|
||||
b0 = v_shr<2>(v_add(b0, delta2));
|
||||
// b0 b2 ... b14 b1 b3 ... b15
|
||||
b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1)));
|
||||
|
||||
// vertical sum
|
||||
v_uint16x8 r0g = r0 >> 8;
|
||||
v_uint16x8 r2g = r2 >> 8;
|
||||
v_uint16x8 sumv = ((r0g + r2g) + delta1) >> 1;
|
||||
v_uint16x8 r0g = v_shr<8>(r0);
|
||||
v_uint16x8 r2g = v_shr<8>(r2);
|
||||
v_uint16x8 sumv = v_shr<1>(v_add(v_add(r0g, r2g), delta1));
|
||||
// horizontal sum
|
||||
v_uint16x8 g1 = r1 & masklow;
|
||||
v_uint16x8 g1 = v_and(r1, masklow);
|
||||
v_uint16x8 nextg1 = v_rotate_right<1>(g1);
|
||||
v_uint16x8 sumg = (g1 + nextg1 + delta1) >> 1;
|
||||
v_uint16x8 sumg = v_shr<1>(v_add(v_add(g1, nextg1), delta1));
|
||||
|
||||
// gradients
|
||||
v_uint16x8 gradv = (r0g - r2g) + (r2g - r0g);
|
||||
v_uint16x8 gradg = (nextg1 - g1) + (g1 - nextg1);
|
||||
v_uint16x8 gmask = gradg > gradv;
|
||||
v_uint16x8 g0 = (gmask & sumv) + (sumg & (gmask ^ full));
|
||||
v_uint16x8 gradv = v_add(v_sub(r0g, r2g), v_sub(r2g, r0g));
|
||||
v_uint16x8 gradg = v_add(v_sub(nextg1, g1), v_sub(g1, nextg1));
|
||||
v_uint16x8 gmask = v_gt(gradg, gradv);
|
||||
v_uint16x8 g0 = v_add(v_and(gmask, sumv), v_and(sumg, v_xor(gmask, full)));
|
||||
// g0 g2 ... g14 g1 g3 ...
|
||||
g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(nextg1)));
|
||||
|
||||
r0 = r1 >> 8;
|
||||
r1 = v_rotate_right<1>(r0) + r0;
|
||||
r1 = (r1 + delta1) >> 1;
|
||||
r0 = v_shr<8>(r1);
|
||||
r1 = v_add(v_rotate_right<1>(r0), r0);
|
||||
r1 = v_shr<1>(v_add(r1, delta1));
|
||||
// r0 r2 ... r14 r1 r3 ... r15
|
||||
r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1)));
|
||||
|
||||
b1 = (b0 ^ r0) & mask;
|
||||
b0 = b0 ^ b1;
|
||||
r0 = r0 ^ b1;
|
||||
b1 = v_and(v_xor(b0, r0), mask);
|
||||
b0 = v_xor(b0, b1);
|
||||
r0 = v_xor(r0, b1);
|
||||
|
||||
// b1 g1 b3 g3 b5 g5...
|
||||
v_uint8x16 pack_lo, pack_hi;
|
||||
@ -1060,19 +1060,19 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
|
||||
|
||||
v_uint16x8 b0, b1, b2, b3, b4, b5, b6;
|
||||
|
||||
b0 = (v_absdiff(s2, s8)<<1) + v_absdiff(s1, s7) + v_absdiff(s3, s9);
|
||||
b1 = (v_absdiff(s4, s6)<<1) + v_absdiff(s1, s3) + v_absdiff(s7, s9);
|
||||
b2 = v_absdiff(s3, s7)<<1;
|
||||
b3 = v_absdiff(s1, s9)<<1;
|
||||
b0 = v_add(v_add(v_shl<1>(v_absdiff(s2, s8)), v_absdiff(s1, s7)), v_absdiff(s3, s9));
|
||||
b1 = v_add(v_add(v_shl<1>(v_absdiff(s4, s6)), v_absdiff(s1, s3)), v_absdiff(s7, s9));
|
||||
b2 = v_shl<1>(v_absdiff(s3, s7));
|
||||
b3 = v_shl<1>(v_absdiff(s1, s9));
|
||||
|
||||
v_store(brow, b0);
|
||||
v_store(brow + N, b1);
|
||||
v_store(brow + N2, b2);
|
||||
v_store(brow + N3, b3);
|
||||
|
||||
b4 = b2 + v_absdiff(s2, s4) + v_absdiff(s6, s8);
|
||||
b5 = b3 + v_absdiff(s2, s6) + v_absdiff(s4, s8);
|
||||
b6 = (s2 + s4 + s6 + s8)>>1;
|
||||
b4 = v_add(v_add(b2, v_absdiff(s2, s4)), v_absdiff(s6, s8));
|
||||
b5 = v_add(v_add(b3, v_absdiff(s2, s6)), v_absdiff(s4, s8));
|
||||
b6 = v_shr<1>(v_add(v_add(v_add(s2, s4), s6), s8));
|
||||
|
||||
v_store(brow + N4, b4);
|
||||
v_store(brow + N5, b5);
|
||||
@ -1279,7 +1279,7 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
|
||||
v_uint16x8 one = v_setall_u16(1), z = v_setzero_u16();
|
||||
v_float32x4 _0_5 = v_setall_f32(0.5f);
|
||||
|
||||
#define v_merge_u16(a, b) (((a) & v_reinterpret_as_u16(emask)) | ((b) & v_reinterpret_as_u16(omask))) //(aA_aA_aA_aA) * (bB_bB_bB_bB) => (bA_bA_bA_bA)
|
||||
#define v_merge_u16(a, b) (v_or((v_and((a), v_reinterpret_as_u16(emask))), (v_and((b), v_reinterpret_as_u16(omask))))) //(aA_aA_aA_aA) * (bB_bB_bB_bB) => (bA_bA_bA_bA)
|
||||
#define v_cvt_s16f32_lo(a) v_cvt_f32(v_expand_low(v_reinterpret_as_s16(a))) //(1,2,3,4,5,6,7,8) => (1f,2f,3f,4f)
|
||||
#define v_cvt_s16f32_hi(a) v_cvt_f32(v_expand_high(v_reinterpret_as_s16(a))) //(1,2,3,4,5,6,7,8) => (5f,6f,7f,8f)
|
||||
|
||||
@ -1287,16 +1287,16 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
|
||||
for( ; i <= N - 10; i += 8, srow += 8, brow0 += 8, brow1 += 8, brow2 += 8 )
|
||||
{
|
||||
//int gradN = brow0[0] + brow1[0];
|
||||
v_uint16x8 gradN = v_load(brow0) + v_load(brow1);
|
||||
v_uint16x8 gradN = v_add(v_load(brow0), v_load(brow1));
|
||||
|
||||
//int gradS = brow1[0] + brow2[0];
|
||||
v_uint16x8 gradS = v_load(brow1) + v_load(brow2);
|
||||
v_uint16x8 gradS = v_add(v_load(brow1), v_load(brow2));
|
||||
|
||||
//int gradW = brow1[N-1] + brow1[N];
|
||||
v_uint16x8 gradW = v_load(brow1+N-1) + v_load(brow1+N);
|
||||
v_uint16x8 gradW = v_add(v_load(brow1 + N - 1), v_load(brow1 + N));
|
||||
|
||||
//int gradE = brow1[N+1] + brow1[N];
|
||||
v_uint16x8 gradE = v_load(brow1+N+1) + v_load(brow1+N);
|
||||
v_uint16x8 gradE = v_add(v_load(brow1 + N + 1), v_load(brow1 + N));
|
||||
|
||||
//int minGrad = std::min(std::min(std::min(gradN, gradS), gradW), gradE);
|
||||
//int maxGrad = std::max(std::max(std::max(gradN, gradS), gradW), gradE);
|
||||
@ -1307,14 +1307,14 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
|
||||
|
||||
//int gradNE = brow0[N4+1] + brow1[N4];
|
||||
//int gradNE = brow0[N2] + brow0[N2+1] + brow1[N2] + brow1[N2+1];
|
||||
grad0 = v_load(brow0+N4+1) + v_load(brow1+N4);
|
||||
grad1 = v_load(brow0+N2) + v_load(brow0+N2+1) + v_load(brow1+N2) + v_load(brow1+N2+1);
|
||||
grad0 = v_add(v_load(brow0 + N4 + 1), v_load(brow1 + N4));
|
||||
grad1 = v_add(v_add(v_add(v_load(brow0 + N2), v_load(brow0 + N2 + 1)), v_load(brow1 + N2)), v_load(brow1 + N2 + 1));
|
||||
v_uint16x8 gradNE = v_merge_u16(grad0, grad1);
|
||||
|
||||
//int gradSW = brow1[N4] + brow2[N4-1];
|
||||
//int gradSW = brow1[N2] + brow1[N2-1] + brow2[N2] + brow2[N2-1];
|
||||
grad0 = v_load(brow2+N4-1) + v_load(brow1+N4);
|
||||
grad1 = v_load(brow2+N2) + v_load(brow2+N2-1) + v_load(brow1+N2) + v_load(brow1+N2-1);
|
||||
grad0 = v_add(v_load(brow2 + N4 - 1), v_load(brow1 + N4));
|
||||
grad1 = v_add(v_add(v_add(v_load(brow2 + N2), v_load(brow2 + N2 - 1)), v_load(brow1 + N2)), v_load(brow1 + N2 - 1));
|
||||
v_uint16x8 gradSW = v_merge_u16(grad0, grad1);
|
||||
|
||||
minGrad = v_min(v_min(minGrad, gradNE), gradSW);
|
||||
@ -1322,21 +1322,21 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
|
||||
|
||||
//int gradNW = brow0[N5-1] + brow1[N5];
|
||||
//int gradNW = brow0[N3] + brow0[N3-1] + brow1[N3] + brow1[N3-1];
|
||||
grad0 = v_load(brow0+N5-1) + v_load(brow1+N5);
|
||||
grad1 = v_load(brow0+N3) + v_load(brow0+N3-1) + v_load(brow1+N3) + v_load(brow1+N3-1);
|
||||
grad0 = v_add(v_load(brow0 + N5 - 1), v_load(brow1 + N5));
|
||||
grad1 = v_add(v_add(v_add(v_load(brow0 + N3), v_load(brow0 + N3 - 1)), v_load(brow1 + N3)), v_load(brow1 + N3 - 1));
|
||||
v_uint16x8 gradNW = v_merge_u16(grad0, grad1);
|
||||
|
||||
//int gradSE = brow1[N5] + brow2[N5+1];
|
||||
//int gradSE = brow1[N3] + brow1[N3+1] + brow2[N3] + brow2[N3+1];
|
||||
grad0 = v_load(brow2+N5+1) + v_load(brow1+N5);
|
||||
grad1 = v_load(brow2+N3) + v_load(brow2+N3+1) + v_load(brow1+N3) + v_load(brow1+N3+1);
|
||||
grad0 = v_add(v_load(brow2 + N5 + 1), v_load(brow1 + N5));
|
||||
grad1 = v_add(v_add(v_add(v_load(brow2 + N3), v_load(brow2 + N3 + 1)), v_load(brow1 + N3)), v_load(brow1 + N3 + 1));
|
||||
v_uint16x8 gradSE = v_merge_u16(grad0, grad1);
|
||||
|
||||
minGrad = v_min(v_min(minGrad, gradNW), gradSE);
|
||||
maxGrad = v_max(v_max(maxGrad, gradNW), gradSE);
|
||||
|
||||
//int T = minGrad + maxGrad/2;
|
||||
v_uint16x8 T = v_max((maxGrad >> 1), one) + minGrad;
|
||||
v_uint16x8 T = v_add(v_max((v_shr<1>(maxGrad)), one), minGrad);
|
||||
|
||||
v_uint16x8 RGs = z, GRs = z, Bs = z, ng = z;
|
||||
|
||||
@ -1361,133 +1361,135 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
|
||||
v_uint16x8 t0, t1, mask;
|
||||
|
||||
// gradN ***********************************************
|
||||
mask = (T > gradN); // mask = T>gradN
|
||||
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradN)
|
||||
mask = (v_gt(T, gradN)); // mask = T>gradN
|
||||
ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradN)
|
||||
|
||||
t0 = (x3 << 1); // srow[-bstep]*2
|
||||
t1 = v_load_expand(srow - bstep*2) + x0; // srow[-bstep*2] + srow[0]
|
||||
t0 = (v_shl<1>(x3)); // srow[-bstep]*2
|
||||
t1 = v_add(v_load_expand(srow - bstep * 2), x0); // srow[-bstep*2] + srow[0]
|
||||
|
||||
// RGs += (srow[-bstep*2] + srow[0]) * (T>gradN)
|
||||
RGs += (t1 & mask);
|
||||
RGs = v_add(RGs, v_and(t1, mask));
|
||||
// GRs += {srow[-bstep]*2; (srow[-bstep*2-1] + srow[-bstep*2+1])} * (T>gradN)
|
||||
GRs += (v_merge_u16(t0, x2 + x4) & mask);
|
||||
GRs = v_add(GRs, (v_and(v_merge_u16(t0, v_add(x2, x4)), mask)));
|
||||
// Bs += {(srow[-bstep-1]+srow[-bstep+1]); srow[-bstep]*2 } * (T>gradN)
|
||||
Bs += (v_merge_u16(x1 + x5, t0) & mask);
|
||||
Bs = v_add(Bs, v_and(v_merge_u16(v_add(x1, x5), t0), mask));
|
||||
|
||||
// gradNE **********************************************
|
||||
mask = (T > gradNE); // mask = T>gradNE
|
||||
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradNE)
|
||||
mask = (v_gt(T, gradNE)); // mask = T>gradNE
|
||||
ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradNE)
|
||||
|
||||
t0 = (x5 << 1); // srow[-bstep+1]*2
|
||||
t1 = v_load_expand(srow - bstep*2+2) + x0; // srow[-bstep*2+2] + srow[0]
|
||||
t0 = (v_shl<1>(x5)); // srow[-bstep+1]*2
|
||||
t1 = v_add(v_load_expand(srow - bstep * 2 + 2), x0); // srow[-bstep*2+2] + srow[0]
|
||||
|
||||
// RGs += {(srow[-bstep*2+2] + srow[0]); srow[-bstep+1]*2} * (T>gradNE)
|
||||
RGs += (v_merge_u16(t1, t0) & mask);
|
||||
RGs = v_add(RGs, v_and(v_merge_u16(t1, t0), mask));
|
||||
// GRs += {brow0[N6+1]; (srow[-bstep*2+1] + srow[1])} * (T>gradNE)
|
||||
GRs += (v_merge_u16(v_load(brow0+N6+1), x4 + x7) & mask);
|
||||
GRs = v_add(GRs, v_and(v_merge_u16(v_load(brow0+N6+1), v_add(x4, x7)), mask));
|
||||
// Bs += {srow[-bstep+1]*2; (srow[-bstep] + srow[-bstep+2])} * (T>gradNE)
|
||||
Bs += (v_merge_u16(t0, x3 + x6) & mask);
|
||||
Bs = v_add(Bs, v_and(v_merge_u16(t0, v_add(x3, x6)), mask));
|
||||
|
||||
// gradE ***********************************************
|
||||
mask = (T > gradE); // mask = T>gradE
|
||||
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradE)
|
||||
mask = (v_gt(T, gradE)); // mask = T>gradE
|
||||
ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradE)
|
||||
|
||||
t0 = (x7 << 1); // srow[1]*2
|
||||
t1 = v_load_expand(srow +2) + x0; // srow[2] + srow[0]
|
||||
t0 = (v_shl<1>(x7)); // srow[1]*2
|
||||
t1 = v_add(v_load_expand(srow + 2), x0); // srow[2] + srow[0]
|
||||
|
||||
// RGs += (srow[2] + srow[0]) * (T>gradE)
|
||||
RGs += (t1 & mask);
|
||||
RGs = v_add(RGs, v_and(t1, mask));
|
||||
// GRs += (srow[1]*2) * (T>gradE)
|
||||
GRs += (t0 & mask);
|
||||
GRs = v_add(GRs, v_and(t0, mask));
|
||||
// Bs += {(srow[-bstep+1]+srow[bstep+1]); (srow[-bstep+2]+srow[bstep+2])} * (T>gradE)
|
||||
Bs += (v_merge_u16(x5 + x9, x6 + x8) & mask);
|
||||
Bs = v_add(Bs, v_and(v_merge_u16(v_add(x5, x9), v_add(x6, x8)), mask));
|
||||
|
||||
// gradSE **********************************************
|
||||
mask = (T > gradSE); // mask = T>gradSE
|
||||
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradSE)
|
||||
mask = (v_gt(T, gradSE)); // mask = T>gradSE
|
||||
ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradSE)
|
||||
|
||||
t0 = (x9 << 1); // srow[bstep+1]*2
|
||||
t1 = v_load_expand(srow + bstep*2+2) + x0; // srow[bstep*2+2] + srow[0]
|
||||
t0 = (v_shl<1>(x9)); // srow[bstep+1]*2
|
||||
t1 = v_add(v_load_expand(srow + bstep * 2 + 2), x0); // srow[bstep*2+2] + srow[0]
|
||||
|
||||
// RGs += {(srow[bstep*2+2] + srow[0]); srow[bstep+1]*2} * (T>gradSE)
|
||||
RGs += (v_merge_u16(t1, t0) & mask);
|
||||
RGs = v_add(RGs, v_and(v_merge_u16(t1, t0), mask));
|
||||
// GRs += {brow2[N6+1]; (srow[1]+srow[bstep*2+1])} * (T>gradSE)
|
||||
GRs += (v_merge_u16(v_load(brow2+N6+1), x7 + x10) & mask);
|
||||
GRs = v_add(GRs, v_and(v_merge_u16(v_load(brow2+N6+1), v_add(x7, x10)), mask));
|
||||
// Bs += {srow[bstep+1]*2; (srow[bstep+2]+srow[bstep])} * (T>gradSE)
|
||||
Bs += (v_merge_u16((x9 << 1), x8 + x11) & mask);
|
||||
Bs = v_add(Bs, v_and(v_merge_u16((v_shl<1>(x9)), v_add(x8, x11)), mask));
|
||||
|
||||
// gradS ***********************************************
|
||||
mask = (T > gradS); // mask = T>gradS
|
||||
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradS)
|
||||
mask = (v_gt(T, gradS)); // mask = T>gradS
|
||||
ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradS)
|
||||
|
||||
t0 = (x11 << 1); // srow[bstep]*2
|
||||
t1 = v_load_expand(srow + bstep*2) + x0; // srow[bstep*2]+srow[0]
|
||||
t0 = (v_shl<1>(x11)); // srow[bstep]*2
|
||||
t1 = v_add(v_load_expand(srow + bstep * 2), x0); // srow[bstep*2]+srow[0]
|
||||
|
||||
// RGs += (srow[bstep*2]+srow[0]) * (T>gradS)
|
||||
RGs += (t1 & mask);
|
||||
RGs = v_add(RGs, v_and(t1, mask));
|
||||
// GRs += {srow[bstep]*2; (srow[bstep*2+1]+srow[bstep*2-1])} * (T>gradS)
|
||||
GRs += (v_merge_u16(t0, x10 + x12) & mask);
|
||||
GRs = v_add(GRs, v_and(v_merge_u16(t0, v_add(x10, x12)), mask));
|
||||
// Bs += {(srow[bstep+1]+srow[bstep-1]); srow[bstep]*2} * (T>gradS)
|
||||
Bs += (v_merge_u16(x9 + x13, t0) & mask);
|
||||
Bs = v_add(Bs, v_and(v_merge_u16(v_add(x9, x13), t0), mask));
|
||||
|
||||
// gradSW **********************************************
|
||||
mask = (T > gradSW); // mask = T>gradSW
|
||||
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradSW)
|
||||
mask = (v_gt(T, gradSW)); // mask = T>gradSW
|
||||
ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradSW)
|
||||
|
||||
t0 = (x13 << 1); // srow[bstep-1]*2
|
||||
t1 = v_load_expand(srow + bstep*2-2) + x0; // srow[bstep*2-2]+srow[0]
|
||||
t0 = (v_shl<1>(x13)); // srow[bstep-1]*2
|
||||
t1 = v_add(v_load_expand(srow + bstep * 2 - 2), x0); // srow[bstep*2-2]+srow[0]
|
||||
|
||||
// RGs += {(srow[bstep*2-2]+srow[0]); srow[bstep-1]*2} * (T>gradSW)
|
||||
RGs += (v_merge_u16(t1, t0) & mask);
|
||||
RGs = v_add(RGs, v_and(v_merge_u16(t1, t0), mask));
|
||||
// GRs += {brow2[N6-1]; (srow[bstep*2-1]+srow[-1])} * (T>gradSW)
|
||||
GRs += (v_merge_u16(v_load(brow2+N6-1), x12 + x15) & mask);
|
||||
GRs = v_add(GRs, v_and(v_merge_u16(v_load(brow2+N6-1), v_add(x12, x15)), mask));
|
||||
// Bs += {srow[bstep-1]*2; (srow[bstep]+srow[bstep-2])} * (T>gradSW)
|
||||
Bs += (v_merge_u16(t0, x11 + x14) & mask);
|
||||
Bs = v_add(Bs, v_and(v_merge_u16(t0, v_add(x11, x14)), mask));
|
||||
|
||||
// gradW ***********************************************
|
||||
mask = (T > gradW); // mask = T>gradW
|
||||
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradW)
|
||||
mask = (v_gt(T, gradW)); // mask = T>gradW
|
||||
ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradW)
|
||||
|
||||
t0 = (x15 << 1); // srow[-1]*2
|
||||
t1 = v_load_expand(srow -2) + x0; // srow[-2]+srow[0]
|
||||
t0 = (v_shl<1>(x15)); // srow[-1]*2
|
||||
t1 = v_add(v_load_expand(srow - 2), x0); // srow[-2]+srow[0]
|
||||
|
||||
// RGs += (srow[-2]+srow[0]) * (T>gradW)
|
||||
RGs += (t1 & mask);
|
||||
RGs = v_add(RGs, v_and(t1, mask));
|
||||
// GRs += (srow[-1]*2) * (T>gradW)
|
||||
GRs += (t0 & mask);
|
||||
GRs = v_add(GRs, v_and(t0, mask));
|
||||
// Bs += {(srow[-bstep-1]+srow[bstep-1]); (srow[bstep-2]+srow[-bstep-2])} * (T>gradW)
|
||||
Bs += (v_merge_u16(x1 + x13, x14 + x16) & mask);
|
||||
Bs = v_add(Bs, v_and(v_merge_u16(v_add(x1, x13), v_add(x14, x16)), mask));
|
||||
|
||||
// gradNW **********************************************
|
||||
mask = (T > gradNW); // mask = T>gradNW
|
||||
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradNW)
|
||||
mask = (v_gt(T, gradNW)); // mask = T>gradNW
|
||||
ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradNW)
|
||||
|
||||
t0 = (x1 << 1); // srow[-bstep-1]*2
|
||||
t1 = v_load_expand(srow -bstep*2-2) + x0; // srow[-bstep*2-2]+srow[0]
|
||||
t0 = (v_shl<1>(x1)); // srow[-bstep-1]*2
|
||||
t1 = v_add(v_load_expand(srow - bstep * 2 - 2), x0); // srow[-bstep*2-2]+srow[0]
|
||||
|
||||
// RGs += {(srow[-bstep*2-2]+srow[0]); srow[-bstep-1]*2} * (T>gradNW)
|
||||
RGs += (v_merge_u16(t1, t0) & mask);
|
||||
RGs = v_add(RGs, v_and(v_merge_u16(t1, t0), mask));
|
||||
// GRs += {brow0[N6-1]; (srow[-bstep*2-1]+srow[-1])} * (T>gradNW)
|
||||
GRs += (v_merge_u16(v_load(brow0+N6-1), x2 + x15) & mask);
|
||||
GRs = v_add(GRs, v_and(v_merge_u16(v_load(brow0+N6-1), v_add(x2, x15)), mask));
|
||||
// Bs += {srow[-bstep-1]*2; (srow[-bstep]+srow[-bstep-2])} * (T>gradNW)
|
||||
Bs += (v_merge_u16((x1 << 1), x3 + x16) & mask);
|
||||
Bs = v_add(Bs, v_and(v_merge_u16(v_shl<1>(x1), v_add(x3, x16)), mask));
|
||||
|
||||
v_float32x4 ngf0 = _0_5 / v_cvt_s16f32_lo(ng);
|
||||
v_float32x4 ngf1 = _0_5 / v_cvt_s16f32_hi(ng);
|
||||
v_float32x4 ngf0 = v_div(_0_5, v_cvt_s16f32_lo(ng));
|
||||
v_float32x4 ngf1 = v_div(_0_5, v_cvt_s16f32_hi(ng));
|
||||
|
||||
// now interpolate r, g & b
|
||||
t0 = v_reinterpret_as_u16(v_reinterpret_as_s16(GRs) - v_reinterpret_as_s16(RGs));
|
||||
t1 = v_reinterpret_as_u16(v_reinterpret_as_s16(Bs) - v_reinterpret_as_s16(RGs));
|
||||
t0 = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(GRs), v_reinterpret_as_s16(RGs)));
|
||||
t1 = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(Bs), v_reinterpret_as_s16(RGs)));
|
||||
|
||||
t0 = v_reinterpret_as_u16(v_reinterpret_as_s16(x0) +
|
||||
t0 = v_reinterpret_as_u16(
|
||||
v_add(v_reinterpret_as_s16(x0),
|
||||
v_pack(
|
||||
v_round(v_cvt_s16f32_lo(t0) * ngf0),
|
||||
v_round(v_cvt_s16f32_hi(t0) * ngf1)));
|
||||
v_round(v_mul(v_cvt_s16f32_lo(t0), ngf0)),
|
||||
v_round(v_mul(v_cvt_s16f32_hi(t0), ngf1)))));
|
||||
|
||||
t1 = v_reinterpret_as_u16(v_reinterpret_as_s16(x0) +
|
||||
t1 = v_reinterpret_as_u16(
|
||||
v_add(v_reinterpret_as_s16(x0),
|
||||
v_pack(
|
||||
v_round(v_cvt_s16f32_lo(t1) * ngf0),
|
||||
v_round(v_cvt_s16f32_hi(t1) * ngf1)));
|
||||
v_round(v_mul(v_cvt_s16f32_lo(t1), ngf0)),
|
||||
v_round(v_mul(v_cvt_s16f32_hi(t1), ngf1)))));
|
||||
|
||||
x1 = v_merge_u16(x0, t0);
|
||||
x2 = v_merge_u16(t0, x0);
|
||||
|
@ -1084,9 +1084,9 @@ struct SymmColumnVec_32s8u
|
||||
i += VTraits<v_uint16>::vlanes();
|
||||
}
|
||||
#if CV_SIMD_WIDTH > 16
|
||||
while( i <= width - 4 /*v_int32x4::nlanes*/ )
|
||||
while( i <= width - 4 /*VTraits<v_int32x4>::vlanes()*/ )
|
||||
#else
|
||||
if( i <= width - v_int32::nlanes )
|
||||
if( i <= width - VTraits<v_int32>::vlanes() )
|
||||
#endif
|
||||
{
|
||||
v_float32 s0 = v_muladd(v_cvt_f32(vx_load(src[0] + i)), vx_setall_f32(ky[0]), vx_setall_f32(delta));
|
||||
@ -1140,9 +1140,9 @@ struct SymmColumnVec_32s8u
|
||||
i += VTraits<v_uint16>::vlanes();
|
||||
}
|
||||
#if CV_SIMD_WIDTH > 16
|
||||
while( i <= width - 4 /*v_int32x4::nlanes*/ )
|
||||
while( i <= width - 4 /*VTraits<v_int32x4>::vlanes()*/ )
|
||||
#else
|
||||
if( i <= width - v_int32::nlanes )
|
||||
if( i <= width - VTraits<v_int32>::vlanes() )
|
||||
#endif
|
||||
{
|
||||
v_float32 s0 = v_muladd(v_cvt_f32(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i))), vx_setall_f32(ky[1]), vx_setall_f32(delta));
|
||||
@ -1321,23 +1321,23 @@ struct SymmColumnSmallVec_32s16s
|
||||
{
|
||||
v_int32 k0 = vx_setall_s32((int)ky[0]), k1 = vx_setall_s32((int)ky[1]);
|
||||
v_int32 d4 = vx_setall_s32(d);
|
||||
for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
|
||||
for( ; i <= width - 2*VTraits<v_int16>::vlanes(); i += 2*VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)),
|
||||
v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4))));
|
||||
v_store(dst + i + v_int16::nlanes, v_pack(v_muladd(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 2*v_int32::nlanes), k0, d4)),
|
||||
v_muladd(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 3*v_int32::nlanes), k0, d4))));
|
||||
v_store(dst + i, v_pack(v_muladd(v_add(vx_load(S0 + i), vx_load(S2 + i)), k1, v_muladd(vx_load(S1 + i), k0, d4)),
|
||||
v_muladd(v_add(vx_load(S0 + i + VTraits<v_int32>::vlanes()), vx_load(S2 + i + VTraits<v_int32>::vlanes())), k1, v_muladd(vx_load(S1 + i + VTraits<v_int32>::vlanes()), k0, d4))));
|
||||
v_store(dst + i + VTraits<v_int16>::vlanes(), v_pack(v_muladd(v_add(vx_load(S0 + i + 2 * VTraits<v_int32>::vlanes()), vx_load(S2 + i + 2 * VTraits<v_int32>::vlanes())), k1, v_muladd(vx_load(S1 + i + 2*VTraits<v_int32>::vlanes()), k0, d4)),
|
||||
v_muladd(v_add(vx_load(S0 + i + 3 * VTraits<v_int32>::vlanes()), vx_load(S2 + i + 3 * VTraits<v_int32>::vlanes())), k1, v_muladd(vx_load(S1 + i + 3*VTraits<v_int32>::vlanes()), k0, d4))));
|
||||
}
|
||||
if( i <= width - v_int16::nlanes )
|
||||
if( i <= width - VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)),
|
||||
v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4))));
|
||||
i += v_int16::nlanes;
|
||||
v_store(dst + i, v_pack(v_muladd(v_add(vx_load(S0 + i), vx_load(S2 + i)), k1, v_muladd(vx_load(S1 + i), k0, d4)),
|
||||
v_muladd(v_add(vx_load(S0 + i + VTraits<v_int32>::vlanes()), vx_load(S2 + i + VTraits<v_int32>::vlanes())), k1, v_muladd(vx_load(S1 + i + VTraits<v_int32>::vlanes()), k0, d4))));
|
||||
i += VTraits<v_int16>::vlanes();
|
||||
}
|
||||
if( i <= width - v_int32::nlanes )
|
||||
if( i <= width - VTraits<v_int32>::vlanes() )
|
||||
{
|
||||
v_pack_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)));
|
||||
i += v_int32::nlanes;
|
||||
v_pack_store(dst + i, v_muladd(v_add(vx_load(S0 + i), vx_load(S2 + i)), k1, v_muladd(vx_load(S1 + i), k0, d4)));
|
||||
i += VTraits<v_int32>::vlanes();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -2237,9 +2237,9 @@ struct FilterVec_8u
|
||||
i += VTraits<v_uint16>::vlanes();
|
||||
}
|
||||
#if CV_SIMD_WIDTH > 16
|
||||
while( i <= width - 4 /*v_int32x4::nlanes*/ )
|
||||
while( i <= width - 4 /*VTraits<v_int32x4>::vlanes()*/ )
|
||||
#else
|
||||
if( i <= width - v_int32::nlanes )
|
||||
if( i <= width - VTraits<v_int32>::vlanes() )
|
||||
#endif
|
||||
{
|
||||
v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[0] + i))), vx_setall_f32(kf[0]), vx_setall_f32(delta));
|
||||
@ -2248,7 +2248,7 @@ struct FilterVec_8u
|
||||
v_int32 s32 = v_round(s0);
|
||||
v_int16 s16 = v_pack(s32, s32);
|
||||
*(unaligned_int*)(dst + i) = v_get0(v_reinterpret_as_s32(v_pack_u(s16, s16)));
|
||||
i += 4 /*v_int32x4::nlanes*/ ;
|
||||
i += 4 /*VTraits<v_int32x4>::vlanes()*/ ;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
@ -2093,7 +2093,7 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
v_float32 v_s11 = vx_setzero_f32();
v_float32 v_s12 = vx_setzero_f32();
v_float32 v_s22 = vx_setzero_f32();
for (; j <= len - v_float32::nlanes; j += v_float32::nlanes)
for (; j <= len - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
{
v_float32 v_a = vx_load(h1 + j);
v_float32 v_b = vx_load(h2 + j);
@ -2134,10 +2134,10 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
result += v_reduce_sum(v_result);
#elif CV_SIMD
v_float32 v_result = vx_setzero_f32();
for (; j <= len - v_float32::nlanes; j += v_float32::nlanes)
for (; j <= len - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
{
v_float32 v_src = v_min(vx_load(h1 + j), vx_load(h2 + j));
v_result += v_src;
v_result = v_add(v_result, v_src);
}
result += v_reduce_sum(v_result);
#endif
@ -2174,7 +2174,7 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
v_float32 v_s1 = vx_setzero_f32();
v_float32 v_s2 = vx_setzero_f32();
v_float32 v_result = vx_setzero_f32();
for (; j <= len - v_float32::nlanes; j += v_float32::nlanes)
for (; j <= len - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
{
v_float32 v_a = vx_load(h1 + j);
v_float32 v_b = vx_load(h2 + j);
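
The compareHist hunks above all follow the same accumulate-then-reduce shape, now written as v_result = v_add(v_result, ...) instead of += and finished with v_reduce_sum. A minimal sketch of that shape as a plain dot product; the function is illustrative only, not OpenCV API:

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

static double dot(const float* a, const float* b, int len)
{
    v_float32 acc = vx_setzero_f32();
    int j = 0;
    for (; j <= len - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
        acc = v_muladd(vx_load(a + j), vx_load(b + j), acc);  // acc += a*b per lane
    double s = v_reduce_sum(acc);                             // horizontal sum of the lanes
    for (; j < len; j++)
        s += (double)a[j] * b[j];                             // scalar tail
    return s;
}
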
@ -455,7 +455,7 @@ struct RemapVec_8u
|
||||
v_int32x4 delta = v_setall_s32(INTER_REMAP_COEF_SCALE / 2);
|
||||
v_int16x8 xy2ofs = v_reinterpret_as_s16(v_setall_s32(cn + (sstep << 16)));
|
||||
int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4];
|
||||
const uchar* src_limit_8bytes = _src.datalimit - v_int16x8::nlanes;
|
||||
const uchar* src_limit_8bytes = _src.datalimit - VTraits<v_int16x8>::vlanes();
|
||||
#define CV_PICK_AND_PACK_RGB(ptr, offset, result) \
|
||||
{ \
|
||||
const uchar* const p = ((const uchar*)ptr) + (offset); \
|
||||
@ -483,7 +483,7 @@ struct RemapVec_8u
|
||||
v_uint8x16 rrggbbaa, dummy; \
|
||||
v_uint16x8 rrggbbaa8, dummy8; \
|
||||
v_uint8x16 rgba0 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p), 0, 0, 0)); \
|
||||
v_uint8x16 rgba1 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p + v_int32x4::nlanes), 0, 0, 0)); \
|
||||
v_uint8x16 rgba1 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p + VTraits<v_int32x4>::vlanes()), 0, 0, 0)); \
|
||||
v_zip(rgba0, rgba1, rrggbbaa, dummy); \
|
||||
v_expand(rrggbbaa, rrggbbaa8, dummy8); \
|
||||
result = v_reinterpret_as_s16(rrggbbaa8); \
|
||||
@ -534,8 +534,8 @@ struct RemapVec_8u
|
||||
v3 = v_dotprod(v_reinterpret_as_s16(v3), v_reinterpret_as_s16(d2), delta);
|
||||
v2 = v_dotprod(v_reinterpret_as_s16(v2), v_reinterpret_as_s16(c2), v3);
|
||||
|
||||
v0 = v0 >> INTER_REMAP_COEF_BITS;
|
||||
v2 = v2 >> INTER_REMAP_COEF_BITS;
|
||||
v0 = v_shr<INTER_REMAP_COEF_BITS>(v0);
|
||||
v2 = v_shr<INTER_REMAP_COEF_BITS>(v2);
|
||||
v_pack_u_store(D + x, v_pack(v0, v2));
|
||||
}
|
||||
}
|
||||
@ -563,8 +563,8 @@ struct RemapVec_8u
|
||||
CV_PICK_AND_PACK_RGB(S0, iofs0[1], u1);
|
||||
CV_PICK_AND_PACK_RGB(S1, iofs0[1], v1);
|
||||
|
||||
v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
|
||||
v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
|
||||
v_int32x4 result0 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u0, w00, v_dotprod(v0, w01, delta)));
|
||||
v_int32x4 result1 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u1, w10, v_dotprod(v1, w11, delta)));
|
||||
|
||||
result0 = v_rotate_left<1>(result0);
|
||||
v_int16x8 result8 = v_pack(result0, result1);
|
||||
@ -581,8 +581,8 @@ struct RemapVec_8u
|
||||
CV_PICK_AND_PACK_RGB(S0, iofs0[3], u1);
|
||||
CV_PICK_AND_PACK_RGB(S1, iofs0[3], v1);
|
||||
|
||||
result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
|
||||
result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
|
||||
result0 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u0, w00, v_dotprod(v0, w01, delta)));
|
||||
result1 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u1, w10, v_dotprod(v1, w11, delta)));
|
||||
|
||||
result0 = v_rotate_left<1>(result0);
|
||||
result8 = v_pack(result0, result1);
|
||||
@ -613,8 +613,8 @@ struct RemapVec_8u
|
||||
CV_PICK_AND_PACK_RGBA(S0, iofs0[1], u1);
|
||||
CV_PICK_AND_PACK_RGBA(S1, iofs0[1], v1);
|
||||
|
||||
v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
|
||||
v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
|
||||
v_int32x4 result0 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u0, w00, v_dotprod(v0, w01, delta)));
|
||||
v_int32x4 result1 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u1, w10, v_dotprod(v1, w11, delta)));
|
||||
v_int16x8 result8 = v_pack(result0, result1);
|
||||
v_pack_u_store(D, result8);
|
||||
|
||||
@ -627,8 +627,8 @@ struct RemapVec_8u
|
||||
CV_PICK_AND_PACK_RGBA(S0, iofs0[3], u1);
|
||||
CV_PICK_AND_PACK_RGBA(S1, iofs0[3], v1);
|
||||
|
||||
result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
|
||||
result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
|
||||
result0 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u0, w00, v_dotprod(v0, w01, delta)));
|
||||
result1 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u1, w10, v_dotprod(v1, w11, delta)));
|
||||
result8 = v_pack(result0, result1);
|
||||
v_pack_u_store(D + 8, result8);
|
||||
}
|
||||
@ -1164,7 +1164,7 @@ public:
|
||||
|
||||
#if CV_SIMD128
|
||||
{
|
||||
int span = v_float32x4::nlanes;
|
||||
int span = VTraits<v_float32x4>::vlanes();
|
||||
for( ; x1 <= bcols - span * 2; x1 += span * 2 )
|
||||
{
|
||||
v_int32x4 ix0 = v_round(v_load(sX + x1));
|
||||
@ -1206,9 +1206,9 @@ public:
|
||||
#if CV_SIMD128
|
||||
{
|
||||
v_uint16x8 v_scale = v_setall_u16(INTER_TAB_SIZE2 - 1);
|
||||
int span = v_uint16x8::nlanes;
|
||||
int span = VTraits<v_uint16x8>::vlanes();
|
||||
for( ; x1 <= bcols - span; x1 += span )
|
||||
v_store((unsigned short*)(A + x1), v_load(sA + x1) & v_scale);
|
||||
v_store((unsigned short*)(A + x1), v_and(v_load(sA + x1), v_scale));
|
||||
}
|
||||
#endif
|
||||
for( ; x1 < bcols; x1++ )
|
||||
@ -1224,16 +1224,16 @@ public:
|
||||
{
|
||||
v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
|
||||
v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1);
|
||||
int span = v_float32x4::nlanes;
|
||||
int span = VTraits<v_float32x4>::vlanes();
|
||||
for( ; x1 <= bcols - span * 2; x1 += span * 2 )
|
||||
{
|
||||
v_int32x4 v_sx0 = v_round(v_scale * v_load(sX + x1));
|
||||
v_int32x4 v_sy0 = v_round(v_scale * v_load(sY + x1));
|
||||
v_int32x4 v_sx1 = v_round(v_scale * v_load(sX + x1 + span));
|
||||
v_int32x4 v_sy1 = v_round(v_scale * v_load(sY + x1 + span));
|
||||
v_uint16x8 v_sx8 = v_reinterpret_as_u16(v_pack(v_sx0 & v_scale2, v_sx1 & v_scale2));
|
||||
v_uint16x8 v_sy8 = v_reinterpret_as_u16(v_pack(v_sy0 & v_scale2, v_sy1 & v_scale2));
|
||||
v_uint16x8 v_v = v_shl<INTER_BITS>(v_sy8) | (v_sx8);
|
||||
v_int32x4 v_sx0 = v_round(v_mul(v_scale, v_load(sX + x1)));
|
||||
v_int32x4 v_sy0 = v_round(v_mul(v_scale, v_load(sY + x1)));
|
||||
v_int32x4 v_sx1 = v_round(v_mul(v_scale, v_load(sX + x1 + span)));
|
||||
v_int32x4 v_sy1 = v_round(v_mul(v_scale, v_load(sY + x1 + span)));
|
||||
v_uint16x8 v_sx8 = v_reinterpret_as_u16(v_pack(v_and(v_sx0, v_scale2), v_and(v_sx1, v_scale2)));
|
||||
v_uint16x8 v_sy8 = v_reinterpret_as_u16(v_pack(v_and(v_sy0, v_scale2), v_and(v_sy1, v_scale2)));
|
||||
v_uint16x8 v_v = v_or(v_shl<INTER_BITS>(v_sy8), v_sx8);
|
||||
v_store(A + x1, v_v);
|
||||
|
||||
v_int16x8 v_d0 = v_pack(v_shr<INTER_BITS>(v_sx0), v_shr<INTER_BITS>(v_sx1));
|
||||
@ -1261,18 +1261,18 @@ public:
|
||||
{
|
||||
v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
|
||||
v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1), v_scale3 = v_setall_s32(INTER_TAB_SIZE);
|
||||
int span = v_float32x4::nlanes;
|
||||
int span = VTraits<v_float32x4>::vlanes();
|
||||
for( ; x1 <= bcols - span * 2; x1 += span * 2 )
|
||||
{
|
||||
v_float32x4 v_fx, v_fy;
|
||||
v_load_deinterleave(sXY + (x1 << 1), v_fx, v_fy);
|
||||
v_int32x4 v_sx0 = v_round(v_fx * v_scale);
|
||||
v_int32x4 v_sy0 = v_round(v_fy * v_scale);
|
||||
v_int32x4 v_sx0 = v_round(v_mul(v_fx, v_scale));
|
||||
v_int32x4 v_sy0 = v_round(v_mul(v_fy, v_scale));
|
||||
v_load_deinterleave(sXY + ((x1 + span) << 1), v_fx, v_fy);
|
||||
v_int32x4 v_sx1 = v_round(v_fx * v_scale);
|
||||
v_int32x4 v_sy1 = v_round(v_fy * v_scale);
|
||||
v_int32x4 v_v0 = v_muladd(v_scale3, (v_sy0 & v_scale2), (v_sx0 & v_scale2));
|
||||
v_int32x4 v_v1 = v_muladd(v_scale3, (v_sy1 & v_scale2), (v_sx1 & v_scale2));
|
||||
v_int32x4 v_sx1 = v_round(v_mul(v_fx, v_scale));
|
||||
v_int32x4 v_sy1 = v_round(v_mul(v_fy, v_scale));
|
||||
v_int32x4 v_v0 = v_muladd(v_scale3, (v_and(v_sy0, v_scale2)), (v_and(v_sx0, v_scale2)));
|
||||
v_int32x4 v_v1 = v_muladd(v_scale3, (v_and(v_sy1, v_scale2)), (v_and(v_sx1, v_scale2)));
|
||||
v_uint16x8 v_v8 = v_reinterpret_as_u16(v_pack(v_v0, v_v1));
|
||||
v_store(A + x1, v_v8);
|
||||
v_int16x8 v_dx = v_pack(v_shr<INTER_BITS>(v_sx0), v_shr<INTER_BITS>(v_sx1));
|
||||
@ -1941,7 +1941,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
{
|
||||
#if CV_SIMD128
|
||||
{
|
||||
int span = v_int16x8::nlanes;
|
||||
int span = VTraits<v_int16x8>::vlanes();
|
||||
for( ; x <= size.width - span; x += span )
|
||||
{
|
||||
v_int16x8 v_dst[2];
|
||||
@ -1973,21 +1973,21 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
|
||||
v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
|
||||
v_int32x4 v_scale3 = v_setall_s32(INTER_TAB_SIZE);
|
||||
int span = v_float32x4::nlanes;
|
||||
int span = VTraits<v_float32x4>::vlanes();
|
||||
for( ; x <= size.width - span * 2; x += span * 2 )
|
||||
{
|
||||
v_int32x4 v_ix0 = v_round(v_scale * (v_load(src1f + x)));
|
||||
v_int32x4 v_ix1 = v_round(v_scale * (v_load(src1f + x + span)));
|
||||
v_int32x4 v_iy0 = v_round(v_scale * (v_load(src2f + x)));
|
||||
v_int32x4 v_iy1 = v_round(v_scale * (v_load(src2f + x + span)));
|
||||
v_int32x4 v_ix0 = v_round(v_mul(v_scale, v_load(src1f + x)));
|
||||
v_int32x4 v_ix1 = v_round(v_mul(v_scale, v_load(src1f + x + span)));
|
||||
v_int32x4 v_iy0 = v_round(v_mul(v_scale, v_load(src2f + x)));
|
||||
v_int32x4 v_iy1 = v_round(v_mul(v_scale, v_load(src2f + x + span)));
|
||||
|
||||
v_int16x8 v_dst[2];
|
||||
v_dst[0] = v_pack(v_shr<INTER_BITS>(v_ix0), v_shr<INTER_BITS>(v_ix1));
|
||||
v_dst[1] = v_pack(v_shr<INTER_BITS>(v_iy0), v_shr<INTER_BITS>(v_iy1));
|
||||
v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]);
|
||||
|
||||
v_int32x4 v_dst0 = v_muladd(v_scale3, (v_iy0 & v_mask), (v_ix0 & v_mask));
|
||||
v_int32x4 v_dst1 = v_muladd(v_scale3, (v_iy1 & v_mask), (v_ix1 & v_mask));
|
||||
v_int32x4 v_dst0 = v_muladd(v_scale3, (v_and(v_iy0, v_mask)), (v_and(v_ix0, v_mask)));
|
||||
v_int32x4 v_dst1 = v_muladd(v_scale3, (v_and(v_iy1, v_mask)), (v_and(v_ix1, v_mask)));
|
||||
v_store(dst2 + x, v_pack_u(v_dst0, v_dst1));
|
||||
}
|
||||
}
|
||||
@ -2008,7 +2008,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
if( nninterpolate )
|
||||
{
|
||||
#if CV_SIMD128
|
||||
int span = v_float32x4::nlanes;
|
||||
int span = VTraits<v_float32x4>::vlanes();
|
||||
{
|
||||
for( ; x <= (size.width << 1) - span * 2; x += span * 2 )
|
||||
v_store(dst1 + x, v_pack(v_round(v_load(src1f + x)),
|
||||
@ -2034,16 +2034,16 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
|
||||
v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
|
||||
v_int32x4 v_scale3 = v_setall_s32(INTER_TAB_SIZE);
|
||||
int span = v_uint16x8::nlanes;
|
||||
int span = VTraits<v_uint16x8>::vlanes();
|
||||
for (; x <= size.width - span; x += span )
|
||||
{
|
||||
v_float32x4 v_src0[2], v_src1[2];
|
||||
v_load_deinterleave(src1f + (x << 1), v_src0[0], v_src0[1]);
|
||||
v_load_deinterleave(src1f + (x << 1) + span, v_src1[0], v_src1[1]);
|
||||
v_int32x4 v_ix0 = v_round(v_src0[0] * v_scale);
|
||||
v_int32x4 v_ix1 = v_round(v_src1[0] * v_scale);
|
||||
v_int32x4 v_iy0 = v_round(v_src0[1] * v_scale);
|
||||
v_int32x4 v_iy1 = v_round(v_src1[1] * v_scale);
|
||||
v_int32x4 v_ix0 = v_round(v_mul(v_src0[0], v_scale));
|
||||
v_int32x4 v_ix1 = v_round(v_mul(v_src1[0], v_scale));
|
||||
v_int32x4 v_iy0 = v_round(v_mul(v_src0[1], v_scale));
|
||||
v_int32x4 v_iy1 = v_round(v_mul(v_src1[1], v_scale));
|
||||
|
||||
v_int16x8 v_dst[2];
|
||||
v_dst[0] = v_pack(v_shr<INTER_BITS>(v_ix0), v_shr<INTER_BITS>(v_ix1));
|
||||
@ -2051,8 +2051,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]);
|
||||
|
||||
v_store(dst2 + x, v_pack_u(
|
||||
v_muladd(v_scale3, (v_iy0 & v_mask), (v_ix0 & v_mask)),
|
||||
v_muladd(v_scale3, (v_iy1 & v_mask), (v_ix1 & v_mask))));
|
||||
v_muladd(v_scale3, (v_and(v_iy0, v_mask)), (v_and(v_ix0, v_mask))),
|
||||
v_muladd(v_scale3, (v_and(v_iy1, v_mask)), (v_and(v_ix1, v_mask)))));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -2074,13 +2074,13 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
v_uint16x8 v_mask2 = v_setall_u16(INTER_TAB_SIZE2-1);
|
||||
v_uint32x4 v_zero = v_setzero_u32(), v_mask = v_setall_u32(INTER_TAB_SIZE-1);
|
||||
v_float32x4 v_scale = v_setall_f32(scale);
|
||||
int span = v_float32x4::nlanes;
|
||||
int span = VTraits<v_float32x4>::vlanes();
|
||||
for( ; x <= size.width - span * 2; x += span * 2 )
|
||||
{
|
||||
v_uint32x4 v_fxy1, v_fxy2;
|
||||
if ( src2 )
|
||||
{
|
||||
v_uint16x8 v_src2 = v_load(src2 + x) & v_mask2;
|
||||
v_uint16x8 v_src2 = v_and(v_load(src2 + x), v_mask2);
|
||||
v_expand(v_src2, v_fxy1, v_fxy2);
|
||||
}
|
||||
else
|
||||
@ -2091,9 +2091,9 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
v_load_deinterleave(src1 + (x << 1), v_src[0], v_src[1]);
|
||||
v_expand(v_src[0], v_src0[0], v_src0[1]);
|
||||
v_expand(v_src[1], v_src1[0], v_src1[1]);
|
||||
#define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32((FXY) & v_mask)),\
|
||||
#define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32(v_and((FXY), v_mask))),\
|
||||
v_cvt_f32(v_reinterpret_as_s32(X)))
|
||||
#define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32((FXY) >> INTER_BITS)),\
|
||||
#define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32(v_shr<INTER_BITS>((FXY)))),\
|
||||
v_cvt_f32(v_reinterpret_as_s32(Y)))
|
||||
v_float32x4 v_dst1 = CV_COMPUTE_MAP_X(v_src0[0], v_fxy1);
|
||||
v_float32x4 v_dst2 = CV_COMPUTE_MAP_Y(v_src1[0], v_fxy1);
|
||||
@ -2123,13 +2123,13 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
v_int16x8 v_mask2 = v_setall_s16(INTER_TAB_SIZE2-1);
|
||||
v_int32x4 v_zero = v_setzero_s32(), v_mask = v_setall_s32(INTER_TAB_SIZE-1);
|
||||
v_float32x4 v_scale = v_setall_f32(scale);
|
||||
int span = v_int16x8::nlanes;
|
||||
int span = VTraits<v_int16x8>::vlanes();
|
||||
for( ; x <= size.width - span; x += span )
|
||||
{
|
||||
v_int32x4 v_fxy1, v_fxy2;
|
||||
if (src2)
|
||||
{
|
||||
v_int16x8 v_src2 = v_load((short *)src2 + x) & v_mask2;
|
||||
v_int16x8 v_src2 = v_and(v_load((short *)src2 + x), v_mask2);
|
||||
v_expand(v_src2, v_fxy1, v_fxy2);
|
||||
}
|
||||
else
|
||||
@ -2142,8 +2142,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
v_expand(v_src[0], v_src0[0], v_src0[1]);
|
||||
v_expand(v_src[1], v_src1[0], v_src1[1]);
|
||||
|
||||
#define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32((FXY) & v_mask), v_cvt_f32(X))
|
||||
#define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32((FXY) >> INTER_BITS), v_cvt_f32(Y))
|
||||
#define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32(v_and((FXY), v_mask)), v_cvt_f32(X))
|
||||
#define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32(v_shr<INTER_BITS>((FXY))), v_cvt_f32(Y))
|
||||
v_dst[0] = CV_COMPUTE_MAP_X(v_src0[0], v_fxy1);
|
||||
v_dst[1] = CV_COMPUTE_MAP_Y(v_src1[0], v_fxy1);
|
||||
v_store_interleave(dst1f + (x << 1), v_dst[0], v_dst[1]);
|
||||
@ -2234,12 +2234,12 @@ public:
|
||||
#if CV_SIMD128
|
||||
{
|
||||
v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
|
||||
int span = v_uint16x8::nlanes;
|
||||
int span = VTraits<v_uint16x8>::vlanes();
|
||||
for( ; x1 <= bw - span; x1 += span )
|
||||
{
|
||||
v_int16x8 v_dst[2];
|
||||
#define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(shift+v_load(ptr + offset)),\
|
||||
v_shr<AB_BITS>(shift+v_load(ptr + offset + 4)))
|
||||
#define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset))),\
|
||||
v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset + 4))))
|
||||
v_dst[0] = CV_CONVERT_MAP(adelta, x+x1, v_X0);
|
||||
v_dst[1] = CV_CONVERT_MAP(bdelta, x+x1, v_Y0);
|
||||
#undef CV_CONVERT_MAP
|
||||
@ -2272,21 +2272,21 @@ public:
|
||||
{
|
||||
v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0);
|
||||
v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
|
||||
int span = v_float32x4::nlanes;
|
||||
int span = VTraits<v_float32x4>::vlanes();
|
||||
for( ; x1 <= bw - span * 2; x1 += span * 2 )
|
||||
{
|
||||
v_int32x4 v_X0 = v_shr<AB_BITS - INTER_BITS>(v__X0 + v_load(adelta + x + x1));
|
||||
v_int32x4 v_Y0 = v_shr<AB_BITS - INTER_BITS>(v__Y0 + v_load(bdelta + x + x1));
|
||||
v_int32x4 v_X1 = v_shr<AB_BITS - INTER_BITS>(v__X0 + v_load(adelta + x + x1 + span));
|
||||
v_int32x4 v_Y1 = v_shr<AB_BITS - INTER_BITS>(v__Y0 + v_load(bdelta + x + x1 + span));
|
||||
v_int32x4 v_X0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(this->adelta + x + x1)));
|
||||
v_int32x4 v_Y0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(this->bdelta + x + x1)));
|
||||
v_int32x4 v_X1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(this->adelta + x + x1 + span)));
|
||||
v_int32x4 v_Y1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(this->bdelta + x + x1 + span)));
|
||||
|
||||
v_int16x8 v_xy[2];
|
||||
v_xy[0] = v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1));
|
||||
v_xy[1] = v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1));
|
||||
v_store_interleave(xy + (x1 << 1), v_xy[0], v_xy[1]);
|
||||
|
||||
v_int32x4 v_alpha0 = v_shl<INTER_BITS>(v_Y0 & v_mask) | (v_X0 & v_mask);
|
||||
v_int32x4 v_alpha1 = v_shl<INTER_BITS>(v_Y1 & v_mask) | (v_X1 & v_mask);
|
||||
v_int32x4 v_alpha0 = v_or(v_shl<INTER_BITS>(v_and(v_Y0, v_mask)), v_and(v_X0, v_mask));
|
||||
v_int32x4 v_alpha1 = v_or(v_shl<INTER_BITS>(v_and(v_Y1, v_mask)), v_and(v_X1, v_mask));
|
||||
v_store(alpha + x1, v_pack(v_alpha0, v_alpha1));
|
||||
}
|
||||
}
|
||||
@ -2866,16 +2866,16 @@ void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0
|
||||
v_int32x4 v_X0, v_Y0;
|
||||
{
|
||||
v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
|
||||
v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
|
||||
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
|
||||
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
|
||||
v_x1 += v_2;
|
||||
v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
|
||||
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
|
||||
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
|
||||
v_x1 = v_add(v_x1, v_2);
|
||||
|
||||
v_W = v_muladd(v_M6, v_x1, v_W0);
|
||||
v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
|
||||
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
|
||||
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
|
||||
v_x1 += v_2;
|
||||
v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
|
||||
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
|
||||
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
|
||||
v_x1 = v_add(v_x1, v_2);
|
||||
|
||||
v_X0 = v_round(v_fX0, v_fX1);
|
||||
v_Y0 = v_round(v_fY0, v_fY1);
|
||||
@ -2885,16 +2885,16 @@ void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0
|
||||
v_int32x4 v_X1, v_Y1;
|
||||
{
|
||||
v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
|
||||
v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
|
||||
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
|
||||
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
|
||||
v_x1 += v_2;
|
||||
v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
|
||||
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
|
||||
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
|
||||
v_x1 = v_add(v_x1, v_2);
|
||||
|
||||
v_W = v_muladd(v_M6, v_x1, v_W0);
|
||||
v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
|
||||
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
|
||||
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
|
||||
v_x1 += v_2;
|
||||
v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
|
||||
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
|
||||
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
|
||||
v_x1 = v_add(v_x1, v_2);
|
||||
|
||||
v_X1 = v_round(v_fX0, v_fX1);
|
||||
v_Y1 = v_round(v_fY0, v_fY1);
|
||||
@ -2904,16 +2904,16 @@ void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0
|
||||
v_int32x4 v_X2, v_Y2;
|
||||
{
|
||||
v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
|
||||
v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
|
||||
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
|
||||
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
|
||||
v_x1 += v_2;
|
||||
v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
|
||||
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
|
||||
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
|
||||
v_x1 = v_add(v_x1, v_2);
|
||||
|
||||
v_W = v_muladd(v_M6, v_x1, v_W0);
|
||||
v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
|
||||
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
|
||||
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
|
||||
v_x1 += v_2;
|
||||
v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
|
||||
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
|
||||
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
|
||||
v_x1 = v_add(v_x1, v_2);
|
||||
|
||||
v_X2 = v_round(v_fX0, v_fX1);
|
||||
v_Y2 = v_round(v_fY0, v_fY1);
|
||||
@ -2923,16 +2923,16 @@ void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0
v_int32x4 v_X3, v_Y3;
{
v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);

v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);

v_X3 = v_round(v_fX0, v_fX1);
v_Y3 = v_round(v_fY0, v_fY1);
@ -2987,16 +2987,16 @@ void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alph
v_int32x4 v_X0, v_Y0;
{
v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);

v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);

v_X0 = v_round(v_fX0, v_fX1);
v_Y0 = v_round(v_fY0, v_fY1);
@ -3006,16 +3006,16 @@ void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alph
v_int32x4 v_X1, v_Y1;
{
v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);

v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);

v_X1 = v_round(v_fX0, v_fX1);
v_Y1 = v_round(v_fY0, v_fY1);
@ -3025,16 +3025,16 @@ void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alph
v_int32x4 v_X2, v_Y2;
{
v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);

v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);

v_X2 = v_round(v_fX0, v_fX1);
v_Y2 = v_round(v_fY0, v_fY1);
@ -3044,35 +3044,35 @@ void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alph
v_int32x4 v_X3, v_Y3;
{
v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);

v_W = v_muladd(v_M6, v_x1, v_W0);
v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
v_x1 += v_2;
v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
v_x1 = v_add(v_x1, v_2);

v_X3 = v_round(v_fX0, v_fX1);
v_Y3 = v_round(v_fY0, v_fY1);
}

// store alpha
v_int32x4 v_alpha0 = ((v_Y0 & v_itsi1) << INTER_BITS) + (v_X0 & v_itsi1);
v_int32x4 v_alpha1 = ((v_Y1 & v_itsi1) << INTER_BITS) + (v_X1 & v_itsi1);
v_int32x4 v_alpha0 = v_add(v_shl<INTER_BITS>(v_and(v_Y0, v_itsi1)), v_and(v_X0, v_itsi1));
v_int32x4 v_alpha1 = v_add(v_shl<INTER_BITS>(v_and(v_Y1, v_itsi1)), v_and(v_X1, v_itsi1));
v_store((alpha + x1), v_pack(v_alpha0, v_alpha1));

v_alpha0 = ((v_Y2 & v_itsi1) << INTER_BITS) + (v_X2 & v_itsi1);
v_alpha1 = ((v_Y3 & v_itsi1) << INTER_BITS) + (v_X3 & v_itsi1);
v_alpha0 = v_add(v_shl<INTER_BITS>(v_and(v_Y2, v_itsi1)), v_and(v_X2, v_itsi1));
v_alpha1 = v_add(v_shl<INTER_BITS>(v_and(v_Y3, v_itsi1)), v_and(v_X3, v_itsi1));
v_store((alpha + x1 + 8), v_pack(v_alpha0, v_alpha1));

// convert to 16s
v_X0 = v_reinterpret_as_s32(v_pack(v_X0 >> INTER_BITS, v_X1 >> INTER_BITS));
v_X1 = v_reinterpret_as_s32(v_pack(v_X2 >> INTER_BITS, v_X3 >> INTER_BITS));
v_Y0 = v_reinterpret_as_s32(v_pack(v_Y0 >> INTER_BITS, v_Y1 >> INTER_BITS));
v_Y1 = v_reinterpret_as_s32(v_pack(v_Y2 >> INTER_BITS, v_Y3 >> INTER_BITS));
v_X0 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1)));
v_X1 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_X2), v_shr<INTER_BITS>(v_X3)));
v_Y0 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1)));
v_Y1 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_Y2), v_shr<INTER_BITS>(v_Y3)));

v_store_interleave(xy + x1 * 2, (v_reinterpret_as_s16)(v_X0), (v_reinterpret_as_s16)(v_Y0));
v_store_interleave(xy + x1 * 2 + 16, (v_reinterpret_as_s16)(v_X1), (v_reinterpret_as_s16)(v_Y1));
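// Reviewer note (not part of the patch): the WarpPerspective hunks above follow one
// mechanical pattern -- overloaded vector operators (*, /, !=, +=, &, <<, >>) become
// the wrapper calls v_mul, v_div, v_ne, v_add, v_and, v_shl<>, v_shr<>, the spelling
// that also compiles for the scalable backends. A minimal sketch of that pattern,
// assuming only the public opencv2/core/hal/intrin.hpp header; the function name and
// template parameter below are illustrative, not part of OpenCV.
#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
template<int BITS>
inline cv::v_int32x4 encode_alpha_demo(const cv::v_int32x4& vY, const cv::v_int32x4& vX,
                                       const cv::v_int32x4& mask)
{
    using namespace cv;
    // old operator form: ((vY & mask) << BITS) + (vX & mask)
    return v_add(v_shl<BITS>(v_and(vY, mask)), v_and(vX, mask));
}
#endif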
@ -179,10 +179,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
for (k = 0; k < 16; ++k)
{
#if CV_SIMD256
v_store(H.fine[k], v_mul_wrap(v256_load(h_fine + 16 * n*(16 * c + k)), v256_setall_u16(2 * r + 1)) + v256_load(H.fine[k]));
v_store(H.fine[k], v_mul_wrap(v256_load(h_fine + 16 * n*(16 * c + k)), v_add(v256_setall_u16(2 * r + 1), v256_load(H.fine[k]))));
#elif CV_SIMD128
v_store(H.fine[k], v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k)), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k]));
v_store(H.fine[k] + 8, v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k] + 8));
v_store(H.fine[k], v_add(v_mul_wrap(v_load(h_fine + 16 * n * (16 * c + k)), v_setall_u16((ushort)(2 * r + 1))), v_load(H.fine[k])));
v_store(H.fine[k] + 8, v_add(v_mul_wrap(v_load(h_fine + 16 * n * (16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))), v_load(H.fine[k] + 8)));
#else
for (int ind = 0; ind < 16; ++ind)
H.fine[k][ind] = (HT)(H.fine[k][ind] + (2 * r + 1) * h_fine[16 * n*(16 * c + k) + ind]);
@ -199,10 +199,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
for( j = 0; j < 2*r; ++j, px += 16 )
{
#if CV_SIMD256
v_coarse += v256_load(px);
v_coarse = v_add(v_coarse, v256_load(px));
#elif CV_SIMD128
v_coarsel += v_load(px);
v_coarseh += v_load(px + 8);
v_coarsel = v_add(v_coarsel, v_load(px));
v_coarseh = v_add(v_coarseh, v_load(px + 8));
#else
for (int ind = 0; ind < 16; ++ind)
H.coarse[ind] += px[ind];
@ -216,11 +216,11 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )

px = h_coarse + 16 * (n*c + std::min(j + r, n - 1));
#if CV_SIMD256
v_coarse += v256_load(px);
v_coarse = v_add(v_coarse, v256_load(px));
v_store(H.coarse, v_coarse);
#elif CV_SIMD128
v_coarsel += v_load(px);
v_coarseh += v_load(px + 8);
v_coarsel = v_add(v_coarsel, v_load(px));
v_coarseh = v_add(v_coarseh, v_load(px + 8));
v_store(H.coarse, v_coarsel);
v_store(H.coarse + 8, v_coarseh);
#else
@ -261,10 +261,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
for (luc[k] = HT(j - r); luc[k] < MIN(j + r + 1, n); ++luc[k], px += 16)
{
#if CV_SIMD256
v_fine += v256_load(px);
v_fine = v_add(v_fine, v256_load(px));
#elif CV_SIMD128
v_finel += v_load(px);
v_fineh += v_load(px + 8);
v_finel = v_add(v_finel, v_load(px));
v_fineh = v_add(v_fineh, v_load(px + 8));
#else
for (int ind = 0; ind < 16; ++ind)
H.fine[k][ind] += px[ind];
@ -275,10 +275,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
{
px = h_fine + 16 * (n*(16 * c + k) + (n - 1));
#if CV_SIMD256
v_fine += v_mul_wrap(v256_load(px), v256_setall_u16(j + r + 1 - n));
v_fine = v_add(v_fine, v_mul_wrap(v256_load(px), v256_setall_u16(j + r + 1 - n)));
#elif CV_SIMD128
v_finel += v_mul_wrap(v_load(px), v_setall_u16((ushort)(j + r + 1 - n)));
v_fineh += v_mul_wrap(v_load(px + 8), v_setall_u16((ushort)(j + r + 1 - n)));
v_finel = v_add(v_finel, v_mul_wrap(v_load(px), v_setall_u16((ushort)(j + r + 1 - n))));
v_fineh = v_add(v_fineh, v_mul_wrap(v_load(px + 8), v_setall_u16((ushort)(j + r + 1 - n))));
#else
for (int ind = 0; ind < 16; ++ind)
H.fine[k][ind] = (HT)(H.fine[k][ind] + (j + r + 1 - n) * px[ind]);
@ -298,10 +298,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
for ( ; luc[k] < j+r+1; ++luc[k] )
{
#if CV_SIMD256
v_fine = v_fine + v256_load(px + 16 * MIN(luc[k], n - 1)) - v256_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0));
v_fine = v_sub(v_add(v_fine, v256_load(px + 16 * MIN(luc[k], n - 1))), v256_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0)));
#elif CV_SIMD128
v_finel = v_finel + v_load(px + 16 * MIN(luc[k], n - 1) ) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0));
v_fineh = v_fineh + v_load(px + 16 * MIN(luc[k], n - 1) + 8) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0) + 8);
v_finel = v_sub(v_add(v_finel, v_load(px + 16 * MIN(luc[k], n - 1) )), v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0)));
v_fineh = v_sub(v_add(v_fineh, v_load(px + 16 * MIN(luc[k], n - 1) + 8)), v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0) + 8));
#else
for (int ind = 0; ind < 16; ++ind)
H.fine[k][ind] += px[16 * MIN(luc[k], n - 1) + ind] - px[16 * MAX(luc[k] - 2 * r - 1, 0) + ind];
@ -312,12 +312,12 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
px = h_coarse + 16 * (n*c + MAX(j - r, 0));
#if CV_SIMD256
v_store(H.fine[k], v_fine);
v_coarse -= v256_load(px);
v_coarse = v_sub(v_coarse, v256_load(px));
#elif CV_SIMD128
v_store(H.fine[k], v_finel);
v_store(H.fine[k] + 8, v_fineh);
v_coarsel -= v_load(px);
v_coarseh -= v_load(px + 8);
v_coarsel = v_sub(v_coarsel, v_load(px));
v_coarseh = v_sub(v_coarseh, v_load(px + 8));
#else
for (int ind = 0; ind < 16; ++ind)
H.coarse[ind] -= px[ind];
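// Reviewer note (not part of the patch): the medianBlur_8u_O1 hunks above replace the
// compound assignments on the histogram vectors (+=, -=) with explicit v_add / v_sub,
// while keeping v_mul_wrap for the wrap-around uint16 multiply. A minimal sketch of
// one such 16-bin update, assuming only opencv2/core/hal/intrin.hpp; the function
// name and parameters are illustrative.
#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
inline void accumulate_bins_demo(unsigned short* H, const unsigned short* px, int weight)
{
    using namespace cv;
    // old operator form: v_store(H, v_mul_wrap(v_load(px), v_setall_u16(w)) + v_load(H));
    v_uint16x8 w = v_setall_u16((ushort)weight);
    v_store(H,     v_add(v_mul_wrap(v_load(px),     w), v_load(H)));
    v_store(H + 8, v_add(v_mul_wrap(v_load(px + 8), w), v_load(H + 8)));
}
#endif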
@ -236,12 +236,12 @@ struct MomentsInTile_SIMD<uchar, int, int>
|
||||
v_int16x8 p = v_reinterpret_as_s16(v_load_expand(ptr + x));
|
||||
v_int16x8 sx = v_mul_wrap(qx, qx);
|
||||
|
||||
qx0 += v_reinterpret_as_u32(p);
|
||||
qx0 = v_add(qx0, v_reinterpret_as_u32(p));
|
||||
qx1 = v_reinterpret_as_u32(v_dotprod(p, qx, v_reinterpret_as_s32(qx1)));
|
||||
qx2 = v_reinterpret_as_u32(v_dotprod(p, sx, v_reinterpret_as_s32(qx2)));
|
||||
qx3 = v_reinterpret_as_u32(v_dotprod(v_mul_wrap(p, qx), sx, v_reinterpret_as_s32(qx3)));
|
||||
|
||||
qx += dx;
|
||||
qx = v_add(qx, dx);
|
||||
}
|
||||
|
||||
x0 = v_reduce_sum(qx0);
|
||||
@ -276,19 +276,19 @@ struct MomentsInTile_SIMD<ushort, int, int64>
|
||||
{
|
||||
v_int32x4 v_src = v_reinterpret_as_s32(v_load_expand(ptr + x));
|
||||
|
||||
v_x0 += v_reinterpret_as_u32(v_src);
|
||||
v_x1 += v_reinterpret_as_u32(v_src * v_ix0);
|
||||
v_x0 = v_add(v_x0, v_reinterpret_as_u32(v_src));
|
||||
v_x1 = v_add(v_x1, v_reinterpret_as_u32(v_mul(v_src, v_ix0)));
|
||||
|
||||
v_int32x4 v_ix1 = v_ix0 * v_ix0;
|
||||
v_x2 += v_reinterpret_as_u32(v_src * v_ix1);
|
||||
v_int32x4 v_ix1 = v_mul(v_ix0, v_ix0);
|
||||
v_x2 = v_add(v_x2, v_reinterpret_as_u32(v_mul(v_src, v_ix1)));
|
||||
|
||||
v_ix1 = v_ix0 * v_ix1;
|
||||
v_src = v_src * v_ix1;
|
||||
v_ix1 = v_mul(v_ix0, v_ix1);
|
||||
v_src = v_mul(v_src, v_ix1);
|
||||
v_uint64x2 v_lo, v_hi;
|
||||
v_expand(v_reinterpret_as_u32(v_src), v_lo, v_hi);
|
||||
v_x3 += v_lo + v_hi;
|
||||
v_x3 = v_add(v_x3, v_add(v_lo, v_hi));
|
||||
|
||||
v_ix0 += v_delta;
|
||||
v_ix0 = v_add(v_ix0, v_delta);
|
||||
}
|
||||
|
||||
x0 = v_reduce_sum(v_x0);
|
||||
|
@ -463,7 +463,7 @@ template<> int PyrDownVecV<int, uchar>(int** src, uchar* dst, int width)
|
||||
}
|
||||
#if CV_SIMD128
|
||||
typedef int CV_DECL_ALIGNED(1) unaligned_int;
|
||||
for ( ; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
|
||||
for ( ; x <= width - VTraits<v_int32x4>::vlanes(); x += VTraits<v_int32x4>::vlanes())
|
||||
{
|
||||
v_int32x4 r0, r1, r2, r3, r4, t0;
|
||||
r0 = v_load(row0 + x);
|
||||
@ -473,7 +473,7 @@ template<> int PyrDownVecV<int, uchar>(int** src, uchar* dst, int width)
|
||||
r4 = v_load(row4 + x);
|
||||
t0 = v_add(v_add(v_add(r0, r4), v_add(r2, r2)), v_shl<2>(v_add(v_add(r1, r3), r2)));
|
||||
|
||||
*((unaligned_int*) (dst + x)) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0();
|
||||
*((unaligned_int*) (dst + x)) = v_get0(v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())));
|
||||
}
|
||||
#else
|
||||
for (; x <= width - 1; x += 1)
|
||||
@ -615,15 +615,15 @@ template <> int PyrUpVecV<int, uchar>(int** src, uchar** dst, int width)
|
||||
}
|
||||
#if CV_SIMD128
|
||||
typedef int CV_DECL_ALIGNED(1) unaligned_int;
|
||||
for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
|
||||
for (; x <= width - VTraits<v_int32x4>::vlanes(); x += VTraits<v_int32x4>::vlanes())
|
||||
{
|
||||
v_int32 v_r00 = vx_load(row0 + x),
|
||||
v_r10 = vx_load(row1 + x),
|
||||
v_r20 = vx_load(row2 + x);
|
||||
v_int32 v_2r10 = v_add(v_r10, v_r10);
|
||||
v_int16 d = v_pack(v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)), v_shl<2>(v_add(v_r10, v_r20)));
|
||||
*(unaligned_int*)(dst0 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0();
|
||||
*(unaligned_int*)(dst1 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16())).get0();
|
||||
*(unaligned_int*)(dst0 + x) = v_get0(v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())));
|
||||
*(unaligned_int*)(dst1 + x) = v_get0(v_reinterpret_as_s32(v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16())));
|
||||
}
|
||||
#else
|
||||
for (; x <= width - 1; x += 1)
|
||||
@ -754,14 +754,14 @@ template <> int PyrUpVecVOneRow<int, uchar>(int** src, uchar* dst, int width)
|
||||
}
|
||||
#if CV_SIMD128
|
||||
typedef int CV_DECL_ALIGNED(1) unaligned_int;
|
||||
for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
|
||||
for (; x <= width - VTraits<v_int32x4>::vlanes(); x += VTraits<v_int32x4>::vlanes())
|
||||
{
|
||||
v_int32 v_r00 = vx_load(row0 + x),
|
||||
v_r10 = vx_load(row1 + x),
|
||||
v_r20 = vx_load(row2 + x);
|
||||
v_int32 v_2r10 = v_add(v_r10, v_r10);
|
||||
v_int16 d = v_pack(v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)), v_shl<2>(v_add(v_r10, v_r20)));
|
||||
*(unaligned_int*)(dst + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0();
|
||||
*(unaligned_int*)(dst + x) = v_get0(v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())));
|
||||
}
|
||||
#else
|
||||
for (; x <= width - 1; x += 1)
|
||||
|
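// Reviewer note (not part of the patch): the pyramid hunks above also migrate the lane
// helpers -- v_int32x4::nlanes becomes VTraits<v_int32x4>::vlanes() and vec.get0()
// becomes v_get0(vec) -- so the loops stay valid for scalable backends where the lane
// count is not a compile-time constant. A minimal loop sketch under that assumption;
// the function name is illustrative.
#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD
inline void add_rows_demo(const int* a, const int* b, int* dst, int width)
{
    using namespace cv;
    int x = 0;
    // step by the run-time lane count instead of the compile-time nlanes
    for (; x <= width - VTraits<v_int32>::vlanes(); x += VTraits<v_int32>::vlanes())
        v_store(dst + x, v_add(vx_load(a + x), vx_load(b + x)));
    for (; x < width; ++x)   // scalar tail
        dst[x] = a[x] + b[x];
}
#endif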
@ -2473,7 +2473,7 @@ public:
|
||||
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
|
||||
v_uint16 bl, gl, rl;
|
||||
#if CV_SIMD_WIDTH == 16
|
||||
bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
|
||||
bl = v_add(t0, t3); gl = v_add(t1, t4); rl = v_add(t2, t5);
|
||||
#elif CV_SIMD_WIDTH == 32
|
||||
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
|
||||
bl = v_add(s0, s3); gl = v_add(s1, s4); rl = v_add(s2, s5);
|
||||
@ -2493,7 +2493,7 @@ public:
|
||||
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
|
||||
v_uint16 bh, gh, rh;
|
||||
#if CV_SIMD_WIDTH == 16
|
||||
bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
|
||||
bh = v_add(t0, t3); gh = v_add(t1, t4); rh = v_add(t2, t5);
|
||||
#elif CV_SIMD_WIDTH == 32
|
||||
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
|
||||
bh = v_add(s0, s3); gh = v_add(s1, s4); rh = v_add(s2, s5);
|
||||
@ -2566,7 +2566,7 @@ public:
|
||||
v_rshr_pack_store<2>(D, r0 + v_rotate_left<1>(r1, r0));
|
||||
}
|
||||
#else
|
||||
v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3));
|
||||
v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_load_expand(S0), v_load_expand(S0 + 3)), v_load_expand(S1)), v_load_expand(S1 + 3)));
|
||||
#endif
|
||||
#elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64
|
||||
for ( ; dx <= w - 3*VTraits<v_uint16>::vlanes(); dx += 3*VTraits<v_uint16>::vlanes(), S0 += 6*VTraits<v_uint16>::vlanes(), S1 += 6*VTraits<v_uint16>::vlanes(), D += 3*VTraits<v_uint16>::vlanes())
|
||||
@ -2609,7 +2609,7 @@ public:
|
||||
}
|
||||
#elif CV_SIMD_WIDTH >= 64
|
||||
v_uint32 masklow = vx_setall_u32(0x0000ffff);
|
||||
for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes)
|
||||
for ( ; dx <= w - 3*VTraits<v_uint16>::vlanes(); dx += 3*VTraits<v_uint16>::vlanes(), S0 += 6*VTraits<v_uint16>::vlanes(), S1 += 6*VTraits<v_uint16>::vlanes(), D += 3*VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint16 b0, g0, r0, b1, g1, r1;
|
||||
v_load_deinterleave(S0, b0, g0, r0);
|
||||
@ -2617,8 +2617,8 @@ public:
|
||||
v_uint32 bl = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow);
|
||||
v_uint32 gl = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow);
|
||||
v_uint32 rl = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow);
|
||||
v_load_deinterleave(S0 + 3*v_uint16::nlanes, b0, g0, r0);
|
||||
v_load_deinterleave(S1 + 3*v_uint16::nlanes, b1, g1, r1);
|
||||
v_load_deinterleave(S0 + 3*VTraits<v_uint16>::vlanes(), b0, g0, r0);
|
||||
v_load_deinterleave(S1 + 3*VTraits<v_uint16>::vlanes(), b1, g1, r1);
|
||||
v_uint32 bh = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow);
|
||||
v_uint32 gh = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow);
|
||||
v_uint32 rh = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow);
|
||||
@ -2630,7 +2630,7 @@ public:
|
||||
{
|
||||
CV_Assert(cn == 4);
|
||||
#if CV_SIMD_WIDTH >= 64
|
||||
for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += 2*v_uint16::nlanes, S1 += 2*v_uint16::nlanes, D += v_uint16::nlanes)
|
||||
for ( ; dx <= w - VTraits<v_uint16>::vlanes(); dx += VTraits<v_uint16>::vlanes(), S0 += 2*VTraits<v_uint16>::vlanes(), S1 += 2*VTraits<v_uint16>::vlanes(), D += VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint64 r00, r01, r10, r11;
|
||||
v_load_deinterleave((uint64_t*)S0, r00, r01);
|
||||
@ -2652,7 +2652,7 @@ public:
|
||||
r0 = v_add(r0, r2); r1 = v_add(r1, r3);
|
||||
v_uint32 v_d;
|
||||
#if CV_SIMD_WIDTH == 16
|
||||
v_d = r0 + r1;
|
||||
v_d = v_add(r0, r1);
|
||||
#elif CV_SIMD_WIDTH == 32
|
||||
v_uint32 t0, t1;
|
||||
v_recombine(r0, r1, t0, t1);
|
||||
@ -2697,7 +2697,7 @@ public:
|
||||
{
|
||||
#if CV_SIMD_WIDTH == 16
|
||||
for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
|
||||
v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3));
|
||||
v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_load_expand(S0), v_load_expand(S0 + 3)), v_load_expand(S1)), v_load_expand(S1 + 3)));
|
||||
#elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64
|
||||
for ( ; dx <= w - 3*VTraits<v_int16>::vlanes(); dx += 3*VTraits<v_int16>::vlanes(), S0 += 6*VTraits<v_int16>::vlanes(), S1 += 6*VTraits<v_int16>::vlanes(), D += 3*VTraits<v_int16>::vlanes())
|
||||
{
|
||||
@ -2738,7 +2738,7 @@ public:
|
||||
v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
|
||||
}
|
||||
#elif CV_SIMD_WIDTH >= 64
|
||||
for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes)
|
||||
for ( ; dx <= w - 3*VTraits<v_int16>::vlanes(); dx += 3*VTraits<v_int16>::vlanes(), S0 += 6*VTraits<v_int16>::vlanes(), S1 += 6*VTraits<v_int16>::vlanes(), D += 3*VTraits<v_int16>::vlanes())
|
||||
{
|
||||
v_int16 b0, g0, r0, b1, g1, r1;
|
||||
v_load_deinterleave(S0, b0, g0, r0);
|
||||
@ -2746,8 +2746,8 @@ public:
|
||||
v_int32 bl = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16);
|
||||
v_int32 gl = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16);
|
||||
v_int32 rl = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16);
|
||||
v_load_deinterleave(S0 + 3*v_int16::nlanes, b0, g0, r0);
|
||||
v_load_deinterleave(S1 + 3*v_int16::nlanes, b1, g1, r1);
|
||||
v_load_deinterleave(S0 + 3*VTraits<v_int16>::vlanes(), b0, g0, r0);
|
||||
v_load_deinterleave(S1 + 3*VTraits<v_int16>::vlanes(), b1, g1, r1);
|
||||
v_int32 bh = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16);
|
||||
v_int32 gh = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16);
|
||||
v_int32 rh = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16);
|
||||
@ -2779,7 +2779,7 @@ public:
|
||||
r3 = v_add(vx_load_expand(S0 + 3 * VTraits<v_int32>::vlanes()), vx_load_expand(S1 + 3 * VTraits<v_int32>::vlanes()));
|
||||
v_int32 dl, dh;
|
||||
#if CV_SIMD_WIDTH == 16
|
||||
dl = r0 + r1; dh = r2 + r3;
|
||||
dl = v_add(r0, r1); dh = v_add(r2, r3);
|
||||
#elif CV_SIMD_WIDTH == 32
|
||||
v_int32 t0, t1, t2, t3;
|
||||
v_recombine(r0, r1, t0, t1); v_recombine(r2, r3, t2, t3);
|
||||
@ -2829,14 +2829,14 @@ struct ResizeAreaFastVec_SIMD_32f
|
||||
{
|
||||
#if CV_SIMD_WIDTH == 16
|
||||
v_float32 v_025 = vx_setall_f32(0.25f);
|
||||
for (; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes)
|
||||
v_store(D, ((vx_load(S0) + vx_load(S0 + v_float32::nlanes)) + (vx_load(S1) + vx_load(S1 + v_float32::nlanes))) * v_025);
|
||||
for (; dx <= w - VTraits<v_float32>::vlanes(); dx += VTraits<v_float32>::vlanes(), S0 += 2*VTraits<v_float32>::vlanes(), S1 += 2*VTraits<v_float32>::vlanes(), D += VTraits<v_float32>::vlanes())
|
||||
v_store(D, v_mul(v_add(v_add(vx_load(S0), vx_load(S0 + VTraits<v_float32>::vlanes())), v_add(vx_load(S1), vx_load(S1 + VTraits<v_float32>::vlanes()))), v_025));
|
||||
#elif CV_SIMD256
|
||||
v_float32x8 v_025 = v256_setall_f32(0.25f);
|
||||
for (; dx <= w - v_float32x8::nlanes; dx += v_float32x8::nlanes, S0 += 2*v_float32x8::nlanes, S1 += 2*v_float32x8::nlanes, D += v_float32x8::nlanes)
|
||||
for (; dx <= w - VTraits<v_float32x8>::vlanes(); dx += VTraits<v_float32x8>::vlanes(), S0 += 2*VTraits<v_float32x8>::vlanes(), S1 += 2*VTraits<v_float32x8>::vlanes(), D += VTraits<v_float32x8>::vlanes())
|
||||
{
|
||||
v_float32x8 dst0, dst1;
|
||||
v_recombine(v_add(v256_load(S0), v256_load(S1)), v_add(v256_load(S0 + v_float32x8::nlanes), v256_load(S1 + v_float32x8::nlanes)), dst0, dst1);
|
||||
v_recombine(v_add(v256_load(S0), v256_load(S1)), v_add(v256_load(S0 + VTraits<v_float32x8>::vlanes()), v256_load(S1 + VTraits<v_float32x8>::vlanes())), dst0, dst1);
|
||||
v_store(D, v_mul(v_add(dst0, dst1), v_025));
|
||||
}
|
||||
#endif
|
||||
|
@ -114,7 +114,7 @@ struct Integral_SIMD<uchar, int, double>
|
||||
|
||||
v_int32 prev = vx_setzero_s32();
|
||||
int j = 0;
|
||||
for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
|
||||
for ( ; j + VTraits<v_uint16>::vlanes() <= width; j += VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
|
||||
v_int32 el4l, el4h;
|
||||
@ -127,8 +127,8 @@ struct Integral_SIMD<uchar, int, double>
|
||||
el4h.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum)), _mm256_permutevar8x32_epi32(el4l.val, shmask));
|
||||
prev.val = _mm256_permutevar8x32_epi32(el4h.val, shmask);
|
||||
#else
|
||||
el8 += v_rotate_left<1>(el8);
|
||||
el8 += v_rotate_left<2>(el8);
|
||||
el8 = v_add(el8, v_rotate_left<1>(el8));
|
||||
el8 = v_add(el8, v_rotate_left<2>(el8));
|
||||
#if CV_SIMD_WIDTH >= 32
|
||||
el8 += v_rotate_left<4>(el8);
|
||||
#if CV_SIMD_WIDTH == 64
|
||||
@ -136,12 +136,12 @@ struct Integral_SIMD<uchar, int, double>
|
||||
#endif
|
||||
#endif
|
||||
v_expand(el8, el4l, el4h);
|
||||
el4l += prev;
|
||||
el4h += el4l;
|
||||
prev = v_broadcast_element<v_int32::nlanes - 1>(el4h);
|
||||
el4l = v_add(el4l, prev);
|
||||
el4h = v_add(el4h, el4l);
|
||||
prev = v_broadcast_highest(el4h);
|
||||
#endif
|
||||
v_store(sum_row + j , el4l + vx_load(prev_sum_row + j ));
|
||||
v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes));
|
||||
v_store(sum_row + j , v_add(el4l, vx_load(prev_sum_row + j)));
|
||||
v_store(sum_row + j + VTraits<v_int32>::vlanes(), v_add(el4h, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes())));
|
||||
}
|
||||
|
||||
for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
|
||||
@ -162,11 +162,11 @@ struct Integral_SIMD<uchar, int, double>
|
||||
|
||||
v_int32 prev_1 = vx_setzero_s32(), prev_2 = vx_setzero_s32();
|
||||
int j = 0;
|
||||
for ( ; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn)
|
||||
for ( ; j + VTraits<v_uint16>::vlanes() * cn <= width; j += VTraits<v_uint16>::vlanes() * cn)
|
||||
{
|
||||
v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j));
|
||||
v_int16 el8_1 = v_src_row & mask;
|
||||
v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8);
|
||||
v_int16 el8_1 = v_and(v_src_row, mask);
|
||||
v_int16 el8_2 = v_reinterpret_as_s16(v_shr<8>(v_reinterpret_as_u16(v_src_row)));
|
||||
v_int32 el4l_1, el4h_1, el4l_2, el4h_2;
|
||||
#if CV_AVX2 && CV_SIMD_WIDTH == 32
|
||||
__m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2));
|
||||
@ -183,10 +183,10 @@ struct Integral_SIMD<uchar, int, double>
|
||||
prev_1.val = _mm256_permutevar8x32_epi32(el4h_1.val, shmask);
|
||||
prev_2.val = _mm256_permutevar8x32_epi32(el4h_2.val, shmask);
|
||||
#else
|
||||
el8_1 += v_rotate_left<1>(el8_1);
|
||||
el8_2 += v_rotate_left<1>(el8_2);
|
||||
el8_1 += v_rotate_left<2>(el8_1);
|
||||
el8_2 += v_rotate_left<2>(el8_2);
|
||||
el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1));
|
||||
el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2));
|
||||
el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1));
|
||||
el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2));
|
||||
#if CV_SIMD_WIDTH >= 32
|
||||
el8_1 += v_rotate_left<4>(el8_1);
|
||||
el8_2 += v_rotate_left<4>(el8_2);
|
||||
@ -197,20 +197,20 @@ struct Integral_SIMD<uchar, int, double>
|
||||
#endif
|
||||
v_expand(el8_1, el4l_1, el4h_1);
|
||||
v_expand(el8_2, el4l_2, el4h_2);
|
||||
el4l_1 += prev_1;
|
||||
el4l_2 += prev_2;
|
||||
el4h_1 += el4l_1;
|
||||
el4h_2 += el4l_2;
|
||||
prev_1 = v_broadcast_element<v_int32::nlanes - 1>(el4h_1);
|
||||
prev_2 = v_broadcast_element<v_int32::nlanes - 1>(el4h_2);
|
||||
el4l_1 = v_add(el4l_1, prev_1);
|
||||
el4l_2 = v_add(el4l_2, prev_2);
|
||||
el4h_1 = v_add(el4h_1, el4l_1);
|
||||
el4h_2 = v_add(el4h_2, el4l_2);
|
||||
prev_1 = v_broadcast_highest(el4h_1);
|
||||
prev_2 = v_broadcast_highest(el4h_2);
|
||||
#endif
|
||||
v_int32 el4_1, el4_2, el4_3, el4_4;
|
||||
v_zip(el4l_1, el4l_2, el4_1, el4_2);
|
||||
v_zip(el4h_1, el4h_2, el4_3, el4_4);
|
||||
v_store(sum_row + j , el4_1 + vx_load(prev_sum_row + j ));
|
||||
v_store(sum_row + j + v_int32::nlanes , el4_2 + vx_load(prev_sum_row + j + v_int32::nlanes ));
|
||||
v_store(sum_row + j + v_int32::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 2));
|
||||
v_store(sum_row + j + v_int32::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_int32::nlanes * 3));
|
||||
v_store(sum_row + j , v_add(el4_1, vx_load(prev_sum_row + j)));
|
||||
v_store(sum_row + j + VTraits<v_int32>::vlanes() , v_add(el4_2, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes())));
|
||||
v_store(sum_row + j + VTraits<v_int32>::vlanes() * 2, v_add(el4_3, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes() * 2)));
|
||||
v_store(sum_row + j + VTraits<v_int32>::vlanes() * 3, v_add(el4_4, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes() * 3)));
|
||||
}
|
||||
|
||||
for (int v2 = sum_row[j - 1] - prev_sum_row[j - 1],
|
||||
@ -230,7 +230,7 @@ struct Integral_SIMD<uchar, int, double>
|
||||
const uchar * src_row = src + _srcstep * i;
|
||||
int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + cn;
|
||||
int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + cn;
|
||||
int row_cache[v_int32::nlanes * 6];
|
||||
int row_cache[VTraits<v_int32>::max_nlanes * 6];
|
||||
|
||||
sum_row[-1] = sum_row[-2] = sum_row[-3] = 0;
|
||||
|
||||
@ -238,10 +238,10 @@ struct Integral_SIMD<uchar, int, double>
|
||||
prev_3 = vx_setzero_s32();
|
||||
int j = 0;
|
||||
const int j_max =
|
||||
((_srcstep * i + (width - v_uint16::nlanes * cn + v_uint8::nlanes * cn)) >= _srcstep * height)
|
||||
? width - v_uint8::nlanes * cn // uint8 in v_load_deinterleave()
|
||||
: width - v_uint16::nlanes * cn; // v_expand_low
|
||||
for ( ; j <= j_max; j += v_uint16::nlanes * cn)
|
||||
((_srcstep * i + (width - VTraits<v_uint16>::vlanes() * cn + VTraits<v_uint8>::vlanes() * cn)) >= _srcstep * height)
|
||||
? width - VTraits<v_uint8>::vlanes() * cn // uint8 in v_load_deinterleave()
|
||||
: width - VTraits<v_uint16>::vlanes() * cn; // v_expand_low
|
||||
for ( ; j <= j_max; j += VTraits<v_uint16>::vlanes() * cn)
|
||||
{
|
||||
v_uint8 v_src_row_1, v_src_row_2, v_src_row_3;
|
||||
v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3);
|
||||
@ -270,49 +270,49 @@ struct Integral_SIMD<uchar, int, double>
|
||||
prev_2.val = _mm256_permutevar8x32_epi32(el4h_2.val, shmask);
|
||||
prev_3.val = _mm256_permutevar8x32_epi32(el4h_3.val, shmask);
|
||||
#else
|
||||
el8_1 += v_rotate_left<1>(el8_1);
|
||||
el8_2 += v_rotate_left<1>(el8_2);
|
||||
el8_3 += v_rotate_left<1>(el8_3);
|
||||
el8_1 += v_rotate_left<2>(el8_1);
|
||||
el8_2 += v_rotate_left<2>(el8_2);
|
||||
el8_3 += v_rotate_left<2>(el8_3);
|
||||
el8_1 = v_add(el8_1,v_rotate_left<1>(el8_1));
|
||||
el8_2 = v_add(el8_2,v_rotate_left<1>(el8_2));
|
||||
el8_3 = v_add(el8_3,v_rotate_left<1>(el8_3));
|
||||
el8_1 = v_add(el8_1,v_rotate_left<2>(el8_1));
|
||||
el8_2 = v_add(el8_2,v_rotate_left<2>(el8_2));
|
||||
el8_3 = v_add(el8_3,v_rotate_left<2>(el8_3));
|
||||
#if CV_SIMD_WIDTH >= 32
|
||||
el8_1 += v_rotate_left<4>(el8_1);
|
||||
el8_2 += v_rotate_left<4>(el8_2);
|
||||
el8_3 += v_rotate_left<4>(el8_3);
|
||||
el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1));
|
||||
el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2));
|
||||
el8_3 = v_add(el8_3, v_rotate_left<4>(el8_3));
|
||||
#if CV_SIMD_WIDTH == 64
|
||||
el8_1 += v_rotate_left<8>(el8_1);
|
||||
el8_2 += v_rotate_left<8>(el8_2);
|
||||
el8_3 += v_rotate_left<8>(el8_3);
|
||||
el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1));
|
||||
el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2));
|
||||
el8_3 = v_add(el8_3, v_rotate_left<8>(el8_3));
|
||||
#endif
|
||||
#endif
|
||||
v_expand(el8_1, el4l_1, el4h_1);
|
||||
v_expand(el8_2, el4l_2, el4h_2);
|
||||
v_expand(el8_3, el4l_3, el4h_3);
|
||||
el4l_1 += prev_1;
|
||||
el4l_2 += prev_2;
|
||||
el4l_3 += prev_3;
|
||||
el4h_1 += el4l_1;
|
||||
el4h_2 += el4l_2;
|
||||
el4h_3 += el4l_3;
|
||||
prev_1 = v_broadcast_element<v_int32::nlanes - 1>(el4h_1);
|
||||
prev_2 = v_broadcast_element<v_int32::nlanes - 1>(el4h_2);
|
||||
prev_3 = v_broadcast_element<v_int32::nlanes - 1>(el4h_3);
|
||||
el4l_1 = v_add(el4l_1, prev_1);
|
||||
el4l_2 = v_add(el4l_2, prev_2);
|
||||
el4l_3 = v_add(el4l_3, prev_3);
|
||||
el4h_1 = v_add(el4h_1, el4l_1);
|
||||
el4h_2 = v_add(el4h_2, el4l_2);
|
||||
el4h_3 = v_add(el4h_3, el4l_3);
|
||||
prev_1 = v_broadcast_highest(el4h_1);
|
||||
prev_2 = v_broadcast_highest(el4h_2);
|
||||
prev_3 = v_broadcast_highest(el4h_3);
|
||||
#endif
|
||||
v_store_interleave(row_cache , el4l_1, el4l_2, el4l_3);
|
||||
v_store_interleave(row_cache + v_int32::nlanes * 3, el4h_1, el4h_2, el4h_3);
|
||||
v_store_interleave(row_cache + VTraits<v_int32>::vlanes() * 3, el4h_1, el4h_2, el4h_3);
|
||||
el4l_1 = vx_load(row_cache );
|
||||
el4l_2 = vx_load(row_cache + v_int32::nlanes );
|
||||
el4l_3 = vx_load(row_cache + v_int32::nlanes * 2);
|
||||
el4h_1 = vx_load(row_cache + v_int32::nlanes * 3);
|
||||
el4h_2 = vx_load(row_cache + v_int32::nlanes * 4);
|
||||
el4h_3 = vx_load(row_cache + v_int32::nlanes * 5);
|
||||
v_store(sum_row + j , el4l_1 + vx_load(prev_sum_row + j ));
|
||||
v_store(sum_row + j + v_int32::nlanes , el4l_2 + vx_load(prev_sum_row + j + v_int32::nlanes ));
|
||||
v_store(sum_row + j + v_int32::nlanes * 2, el4l_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 2));
|
||||
v_store(sum_row + j + v_int32::nlanes * 3, el4h_1 + vx_load(prev_sum_row + j + v_int32::nlanes * 3));
|
||||
v_store(sum_row + j + v_int32::nlanes * 4, el4h_2 + vx_load(prev_sum_row + j + v_int32::nlanes * 4));
|
||||
v_store(sum_row + j + v_int32::nlanes * 5, el4h_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 5));
|
||||
el4l_2 = vx_load(row_cache + VTraits<v_int32>::vlanes() );
|
||||
el4l_3 = vx_load(row_cache + VTraits<v_int32>::vlanes() * 2);
|
||||
el4h_1 = vx_load(row_cache + VTraits<v_int32>::vlanes() * 3);
|
||||
el4h_2 = vx_load(row_cache + VTraits<v_int32>::vlanes() * 4);
|
||||
el4h_3 = vx_load(row_cache + VTraits<v_int32>::vlanes() * 5);
|
||||
v_store(sum_row + j , v_add(el4l_1, vx_load(prev_sum_row + j )));
|
||||
v_store(sum_row + j + VTraits<v_int32>::vlanes() , v_add(el4l_2, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes() )));
|
||||
v_store(sum_row + j + VTraits<v_int32>::vlanes() * 2, v_add(el4l_3, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes() * 2)));
|
||||
v_store(sum_row + j + VTraits<v_int32>::vlanes() * 3, v_add(el4h_1, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes() * 3)));
|
||||
v_store(sum_row + j + VTraits<v_int32>::vlanes() * 4, v_add(el4h_2, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes() * 4)));
|
||||
v_store(sum_row + j + VTraits<v_int32>::vlanes() * 5, v_add(el4h_3, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes() * 5)));
|
||||
}
|
||||
|
||||
for (int v3 = sum_row[j - 1] - prev_sum_row[j - 1],
|
||||
@ -339,7 +339,7 @@ struct Integral_SIMD<uchar, int, double>
|
||||
|
||||
v_int32 prev = vx_setzero_s32();
|
||||
int j = 0;
|
||||
for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
|
||||
for ( ; j + VTraits<v_uint16>::vlanes() <= width; j += VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
|
||||
v_int32 el4l, el4h;
|
||||
@ -356,8 +356,8 @@ struct Integral_SIMD<uchar, int, double>
|
||||
#endif
|
||||
#endif
|
||||
v_expand(el8, el4l, el4h);
|
||||
el4l += prev;
|
||||
el4h += el4l;
|
||||
el4l = v_add(el4l, prev);
|
||||
el4h = v_add(el4h, el4l);
|
||||
#if CV_SIMD_WIDTH == 16
|
||||
prev = el4h;
|
||||
#elif CV_SIMD_WIDTH == 32
|
||||
@ -368,8 +368,8 @@ struct Integral_SIMD<uchar, int, double>
|
||||
prev = v_combine_low(t, t);
|
||||
#endif
|
||||
#endif
|
||||
v_store(sum_row + j , el4l + vx_load(prev_sum_row + j ));
|
||||
v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes));
|
||||
v_store(sum_row + j , v_add(el4l, vx_load(prev_sum_row + j)));
|
||||
v_store(sum_row + j + VTraits<v_int32>::vlanes(), v_add(el4h, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes())));
|
||||
}
|
||||
|
||||
for (int v4 = sum_row[j - 1] - prev_sum_row[j - 1],
|
||||
@ -426,7 +426,7 @@ struct Integral_SIMD<uchar, float, double>
|
||||
|
||||
v_float32 prev = vx_setzero_f32();
|
||||
int j = 0;
|
||||
for (; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
|
||||
for (; j + VTraits<v_uint16>::vlanes() <= width; j += VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
|
||||
v_float32 el4l, el4h;
|
||||
@ -439,8 +439,8 @@ struct Integral_SIMD<uchar, float, double>
|
||||
el4h.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum))), _mm256_permutevar8x32_ps(el4l.val, shmask));
|
||||
prev.val = _mm256_permutevar8x32_ps(el4h.val, shmask);
|
||||
#else
|
||||
el8 += v_rotate_left<1>(el8);
|
||||
el8 += v_rotate_left<2>(el8);
|
||||
el8 = v_add(el8, v_rotate_left<1>(el8));
|
||||
el8 = v_add(el8, v_rotate_left<2>(el8));
|
||||
#if CV_SIMD_WIDTH >= 32
|
||||
el8 += v_rotate_left<4>(el8);
|
||||
#if CV_SIMD_WIDTH == 64
|
||||
@ -449,12 +449,12 @@ struct Integral_SIMD<uchar, float, double>
|
||||
#endif
|
||||
v_int32 el4li, el4hi;
|
||||
v_expand(el8, el4li, el4hi);
|
||||
el4l = v_cvt_f32(el4li) + prev;
|
||||
el4h = v_cvt_f32(el4hi) + el4l;
|
||||
prev = v_broadcast_element<v_float32::nlanes - 1>(el4h);
|
||||
el4l = v_add(v_cvt_f32(el4li), prev);
|
||||
el4h = v_add(v_cvt_f32(el4hi), el4l);
|
||||
prev = v_broadcast_highest(el4h);
|
||||
#endif
|
||||
v_store(sum_row + j , el4l + vx_load(prev_sum_row + j ));
|
||||
v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes));
|
||||
v_store(sum_row + j , v_add(el4l, vx_load(prev_sum_row + j)));
|
||||
v_store(sum_row + j + VTraits<v_float32>::vlanes(), v_add(el4h, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes())));
|
||||
}
|
||||
|
||||
for (float v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
|
||||
@ -475,11 +475,11 @@ struct Integral_SIMD<uchar, float, double>
|
||||
|
||||
v_float32 prev_1 = vx_setzero_f32(), prev_2 = vx_setzero_f32();
|
||||
int j = 0;
|
||||
for (; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn)
|
||||
for (; j + VTraits<v_uint16>::vlanes() * cn <= width; j += VTraits<v_uint16>::vlanes() * cn)
|
||||
{
|
||||
v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j));
|
||||
v_int16 el8_1 = v_src_row & mask;
|
||||
v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8);
|
||||
v_int16 el8_1 = v_and(v_src_row, mask);
|
||||
v_int16 el8_2 = v_reinterpret_as_s16(v_shr<8>(v_reinterpret_as_u16(v_src_row)));
|
||||
v_float32 el4l_1, el4h_1, el4l_2, el4h_2;
|
||||
#if CV_AVX2 && CV_SIMD_WIDTH == 32
|
||||
__m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2));
|
||||
@ -496,10 +496,10 @@ struct Integral_SIMD<uchar, float, double>
|
||||
prev_1.val = _mm256_permutevar8x32_ps(el4h_1.val, shmask);
|
||||
prev_2.val = _mm256_permutevar8x32_ps(el4h_2.val, shmask);
|
||||
#else
|
||||
el8_1 += v_rotate_left<1>(el8_1);
|
||||
el8_2 += v_rotate_left<1>(el8_2);
|
||||
el8_1 += v_rotate_left<2>(el8_1);
|
||||
el8_2 += v_rotate_left<2>(el8_2);
|
||||
el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1));
|
||||
el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2));
|
||||
el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1));
|
||||
el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2));
|
||||
#if CV_SIMD_WIDTH >= 32
|
||||
el8_1 += v_rotate_left<4>(el8_1);
|
||||
el8_2 += v_rotate_left<4>(el8_2);
|
||||
@ -511,20 +511,20 @@ struct Integral_SIMD<uchar, float, double>
|
||||
v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2;
|
||||
v_expand(el8_1, el4li_1, el4hi_1);
|
||||
v_expand(el8_2, el4li_2, el4hi_2);
|
||||
el4l_1 = v_cvt_f32(el4li_1) + prev_1;
|
||||
el4l_2 = v_cvt_f32(el4li_2) + prev_2;
|
||||
el4h_1 = v_cvt_f32(el4hi_1) + el4l_1;
|
||||
el4h_2 = v_cvt_f32(el4hi_2) + el4l_2;
|
||||
prev_1 = v_broadcast_element<v_float32::nlanes - 1>(el4h_1);
|
||||
prev_2 = v_broadcast_element<v_float32::nlanes - 1>(el4h_2);
|
||||
el4l_1 = v_add(v_cvt_f32(el4li_1), prev_1);
|
||||
el4l_2 = v_add(v_cvt_f32(el4li_2), prev_2);
|
||||
el4h_1 = v_add(v_cvt_f32(el4hi_1), el4l_1);
|
||||
el4h_2 = v_add(v_cvt_f32(el4hi_2), el4l_2);
|
||||
prev_1 = v_broadcast_highest(el4h_1);
|
||||
prev_2 = v_broadcast_highest(el4h_2);
|
||||
#endif
|
||||
v_float32 el4_1, el4_2, el4_3, el4_4;
|
||||
v_zip(el4l_1, el4l_2, el4_1, el4_2);
|
||||
v_zip(el4h_1, el4h_2, el4_3, el4_4);
|
||||
v_store(sum_row + j , el4_1 + vx_load(prev_sum_row + j ));
|
||||
v_store(sum_row + j + v_float32::nlanes , el4_2 + vx_load(prev_sum_row + j + v_float32::nlanes ));
|
||||
v_store(sum_row + j + v_float32::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 2));
|
||||
v_store(sum_row + j + v_float32::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_float32::nlanes * 3));
|
||||
v_store(sum_row + j , v_add(el4_1, vx_load(prev_sum_row + j)));
|
||||
v_store(sum_row + j + VTraits<v_float32>::vlanes() , v_add(el4_2, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes())));
|
||||
v_store(sum_row + j + VTraits<v_float32>::vlanes() * 2, v_add(el4_3, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes() * 2)));
|
||||
v_store(sum_row + j + VTraits<v_float32>::vlanes() * 3, v_add(el4_4, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes() * 3)));
|
||||
}
|
||||
|
||||
for (float v2 = sum_row[j - 1] - prev_sum_row[j - 1],
|
||||
@ -543,7 +543,7 @@ struct Integral_SIMD<uchar, float, double>
|
||||
const uchar * src_row = src + _srcstep * i;
|
||||
float * prev_sum_row = (float *)((uchar *)sum + _sumstep * i) + cn;
|
||||
float * sum_row = (float *)((uchar *)sum + _sumstep * (i + 1)) + cn;
|
||||
float row_cache[v_float32::nlanes * 6];
|
||||
float row_cache[VTraits<v_float32>::max_nlanes * 6];
|
||||
|
||||
sum_row[-1] = sum_row[-2] = sum_row[-3] = 0;
|
||||
|
||||
@ -551,10 +551,10 @@ struct Integral_SIMD<uchar, float, double>
|
||||
prev_3 = vx_setzero_f32();
|
||||
int j = 0;
|
||||
const int j_max =
|
||||
((_srcstep * i + (width - v_uint16::nlanes * cn + v_uint8::nlanes * cn)) >= _srcstep * height)
|
||||
? width - v_uint8::nlanes * cn // uint8 in v_load_deinterleave()
|
||||
: width - v_uint16::nlanes * cn; // v_expand_low
|
||||
for ( ; j <= j_max; j += v_uint16::nlanes * cn)
|
||||
((_srcstep * i + (width - VTraits<v_uint16>::vlanes() * cn + VTraits<v_uint8>::vlanes() * cn)) >= _srcstep * height)
|
||||
? width - VTraits<v_uint8>::vlanes() * cn // uint8 in v_load_deinterleave()
|
||||
: width - VTraits<v_uint16>::vlanes() * cn; // v_expand_low
|
||||
for ( ; j <= j_max; j += VTraits<v_uint16>::vlanes() * cn)
|
||||
{
|
||||
v_uint8 v_src_row_1, v_src_row_2, v_src_row_3;
|
||||
v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3);
|
||||
@ -583,12 +583,12 @@ struct Integral_SIMD<uchar, float, double>
|
||||
prev_2.val = _mm256_permutevar8x32_ps(el4h_2.val, shmask);
|
||||
prev_3.val = _mm256_permutevar8x32_ps(el4h_3.val, shmask);
|
||||
#else
|
||||
el8_1 += v_rotate_left<1>(el8_1);
|
||||
el8_2 += v_rotate_left<1>(el8_2);
|
||||
el8_3 += v_rotate_left<1>(el8_3);
|
||||
el8_1 += v_rotate_left<2>(el8_1);
|
||||
el8_2 += v_rotate_left<2>(el8_2);
|
||||
el8_3 += v_rotate_left<2>(el8_3);
|
||||
el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1));
|
||||
el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2));
|
||||
el8_3 = v_add(el8_3, v_rotate_left<1>(el8_3));
|
||||
el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1));
|
||||
el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2));
|
||||
el8_3 = v_add(el8_3, v_rotate_left<2>(el8_3));
|
||||
#if CV_SIMD_WIDTH >= 32
|
||||
el8_1 += v_rotate_left<4>(el8_1);
|
||||
el8_2 += v_rotate_left<4>(el8_2);
|
||||
@ -603,30 +603,30 @@ struct Integral_SIMD<uchar, float, double>
|
||||
v_expand(el8_1, el4li_1, el4hi_1);
|
||||
v_expand(el8_2, el4li_2, el4hi_2);
|
||||
v_expand(el8_3, el4li_3, el4hi_3);
|
||||
el4l_1 = v_cvt_f32(el4li_1) + prev_1;
|
||||
el4l_2 = v_cvt_f32(el4li_2) + prev_2;
|
||||
el4l_3 = v_cvt_f32(el4li_3) + prev_3;
|
||||
el4h_1 = v_cvt_f32(el4hi_1) + el4l_1;
|
||||
el4h_2 = v_cvt_f32(el4hi_2) + el4l_2;
|
||||
el4h_3 = v_cvt_f32(el4hi_3) + el4l_3;
|
||||
prev_1 = v_broadcast_element<v_float32::nlanes - 1>(el4h_1);
|
||||
prev_2 = v_broadcast_element<v_float32::nlanes - 1>(el4h_2);
|
||||
prev_3 = v_broadcast_element<v_float32::nlanes - 1>(el4h_3);
|
||||
el4l_1 = v_add(v_cvt_f32(el4li_1), prev_1);
|
||||
el4l_2 = v_add(v_cvt_f32(el4li_2), prev_2);
|
||||
el4l_3 = v_add(v_cvt_f32(el4li_3), prev_3);
|
||||
el4h_1 = v_add(v_cvt_f32(el4hi_1), el4l_1);
|
||||
el4h_2 = v_add(v_cvt_f32(el4hi_2), el4l_2);
|
||||
el4h_3 = v_add(v_cvt_f32(el4hi_3), el4l_3);
|
||||
prev_1 = v_broadcast_highest(el4h_1);
|
||||
prev_2 = v_broadcast_highest(el4h_2);
|
||||
prev_3 = v_broadcast_highest(el4h_3);
|
||||
#endif
|
||||
v_store_interleave(row_cache , el4l_1, el4l_2, el4l_3);
|
||||
v_store_interleave(row_cache + v_float32::nlanes * 3, el4h_1, el4h_2, el4h_3);
|
||||
v_store_interleave(row_cache + VTraits<v_float32>::vlanes() * 3, el4h_1, el4h_2, el4h_3);
|
||||
el4l_1 = vx_load(row_cache );
|
||||
el4l_2 = vx_load(row_cache + v_float32::nlanes );
|
||||
el4l_3 = vx_load(row_cache + v_float32::nlanes * 2);
|
||||
el4h_1 = vx_load(row_cache + v_float32::nlanes * 3);
|
||||
el4h_2 = vx_load(row_cache + v_float32::nlanes * 4);
|
||||
el4h_3 = vx_load(row_cache + v_float32::nlanes * 5);
|
||||
v_store(sum_row + j , el4l_1 + vx_load(prev_sum_row + j ));
|
||||
v_store(sum_row + j + v_float32::nlanes , el4l_2 + vx_load(prev_sum_row + j + v_float32::nlanes ));
|
||||
v_store(sum_row + j + v_float32::nlanes * 2, el4l_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 2));
|
||||
v_store(sum_row + j + v_float32::nlanes * 3, el4h_1 + vx_load(prev_sum_row + j + v_float32::nlanes * 3));
|
||||
v_store(sum_row + j + v_float32::nlanes * 4, el4h_2 + vx_load(prev_sum_row + j + v_float32::nlanes * 4));
|
||||
v_store(sum_row + j + v_float32::nlanes * 5, el4h_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 5));
|
||||
el4l_2 = vx_load(row_cache + VTraits<v_float32>::vlanes() );
|
||||
el4l_3 = vx_load(row_cache + VTraits<v_float32>::vlanes() * 2);
|
||||
el4h_1 = vx_load(row_cache + VTraits<v_float32>::vlanes() * 3);
|
||||
el4h_2 = vx_load(row_cache + VTraits<v_float32>::vlanes() * 4);
|
||||
el4h_3 = vx_load(row_cache + VTraits<v_float32>::vlanes() * 5);
|
||||
v_store(sum_row + j , v_add(el4l_1, vx_load(prev_sum_row + j)));
|
||||
v_store(sum_row + j + VTraits<v_float32>::vlanes() , v_add(el4l_2, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes())));
|
||||
v_store(sum_row + j + VTraits<v_float32>::vlanes() * 2, v_add(el4l_3, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes() * 2)));
|
||||
v_store(sum_row + j + VTraits<v_float32>::vlanes() * 3, v_add(el4h_1, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes() * 3)));
|
||||
v_store(sum_row + j + VTraits<v_float32>::vlanes() * 4, v_add(el4h_2, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes() * 4)));
|
||||
v_store(sum_row + j + VTraits<v_float32>::vlanes() * 5, v_add(el4h_3, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes() * 5)));
|
||||
}
|
||||
|
||||
for (float v3 = sum_row[j - 1] - prev_sum_row[j - 1],
|
||||
@ -652,7 +652,7 @@ struct Integral_SIMD<uchar, float, double>
|
||||
|
||||
v_float32 prev = vx_setzero_f32();
|
||||
int j = 0;
|
||||
for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
|
||||
for ( ; j + VTraits<v_uint16>::vlanes() <= width; j += VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
|
||||
v_float32 el4l, el4h;
|
||||
@ -670,8 +670,8 @@ struct Integral_SIMD<uchar, float, double>
|
||||
#endif
|
||||
v_int32 el4li, el4hi;
|
||||
v_expand(el8, el4li, el4hi);
|
||||
el4l = v_cvt_f32(el4li) + prev;
|
||||
el4h = v_cvt_f32(el4hi) + el4l;
|
||||
el4l = v_add(v_cvt_f32(el4li), prev);
|
||||
el4h = v_add(v_cvt_f32(el4hi), el4l);
|
||||
#if CV_SIMD_WIDTH == 16
|
||||
prev = el4h;
|
||||
#elif CV_SIMD_WIDTH == 32
|
||||
@ -682,8 +682,8 @@ struct Integral_SIMD<uchar, float, double>
|
||||
prev = v_combine_low(t, t);
|
||||
#endif
|
||||
#endif
|
||||
v_store(sum_row + j , el4l + vx_load(prev_sum_row + j ));
|
||||
v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes));
|
||||
v_store(sum_row + j , v_add(el4l, vx_load(prev_sum_row + j)));
|
||||
v_store(sum_row + j + VTraits<v_float32>::vlanes(), v_add(el4h, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes())));
|
||||
}
|
||||
|
||||
for (float v4 = sum_row[j - 1] - prev_sum_row[j - 1],
|
||||
@ -750,7 +750,7 @@ struct Integral_SIMD<uchar, double, double>
|
||||
|
||||
v_float64 prev = vx_setzero_f64();
|
||||
int j = 0;
|
||||
for (; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
|
||||
for (; j + VTraits<v_uint16>::vlanes() <= width; j += VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
|
||||
v_float64 el4ll, el4lh, el4hl, el4hh;
|
||||
@ -767,8 +767,8 @@ struct Integral_SIMD<uchar, double, double>
|
||||
el4hh.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h_32)), el4d);
|
||||
prev.val = _mm256_permute4x64_pd(el4hh.val, 0xff);
|
||||
#else
|
||||
el8 += v_rotate_left<1>(el8);
|
||||
el8 += v_rotate_left<2>(el8);
|
||||
el8 = v_add(el8, v_rotate_left<1>(el8));
|
||||
el8 = v_add(el8, v_rotate_left<2>(el8));
|
||||
#if CV_SIMD_WIDTH >= 32
|
||||
el8 += v_rotate_left<4>(el8);
|
||||
#if CV_SIMD_WIDTH == 64
|
||||
@ -777,17 +777,17 @@ struct Integral_SIMD<uchar, double, double>
|
||||
#endif
|
||||
v_int32 el4li, el4hi;
|
||||
v_expand(el8, el4li, el4hi);
|
||||
el4ll = v_cvt_f64(el4li) + prev;
|
||||
el4lh = v_cvt_f64_high(el4li) + prev;
|
||||
el4hl = v_cvt_f64(el4hi) + el4ll;
|
||||
el4hh = v_cvt_f64_high(el4hi) + el4lh;
|
||||
prev = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh));
|
||||
// prev = v_broadcast_element<v_float64::nlanes - 1>(el4hh);
|
||||
el4ll = v_add(v_cvt_f64(el4li), prev);
|
||||
el4lh = v_add(v_cvt_f64_high(el4li), prev);
|
||||
el4hl = v_add(v_cvt_f64(el4hi), el4ll);
|
||||
el4hh = v_add(v_cvt_f64_high(el4hi), el4lh);
|
||||
prev = vx_setall_f64(v_extract_highest(el4hh));
|
||||
// prev = v_broadcast_highest(el4hh);
|
||||
#endif
|
||||
v_store(sum_row + j , el4ll + vx_load(prev_sum_row + j ));
|
||||
v_store(sum_row + j + v_float64::nlanes , el4lh + vx_load(prev_sum_row + j + v_float64::nlanes ));
|
||||
v_store(sum_row + j + v_float64::nlanes * 2, el4hl + vx_load(prev_sum_row + j + v_float64::nlanes * 2));
|
||||
v_store(sum_row + j + v_float64::nlanes * 3, el4hh + vx_load(prev_sum_row + j + v_float64::nlanes * 3));
|
||||
v_store(sum_row + j , v_add(el4ll, vx_load(prev_sum_row + j)));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() , v_add(el4lh, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes())));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 2, v_add(el4hl, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 2)));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 3, v_add(el4hh, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 3)));
|
||||
}
|
||||
|
||||
for (double v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
|
||||
@ -808,11 +808,11 @@ struct Integral_SIMD<uchar, double, double>
|
||||
|
||||
v_float64 prev_1 = vx_setzero_f64(), prev_2 = vx_setzero_f64();
|
||||
int j = 0;
|
||||
for (; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn)
|
||||
for (; j + VTraits<v_uint16>::vlanes() * cn <= width; j += VTraits<v_uint16>::vlanes() * cn)
|
||||
{
|
||||
v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j));
|
||||
v_int16 el8_1 = v_src_row & mask;
|
||||
v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8);
|
||||
v_int16 el8_1 = v_and(v_src_row, mask);
|
||||
v_int16 el8_2 = v_reinterpret_as_s16(v_shr<8>(v_reinterpret_as_u16(v_src_row)));
|
||||
v_float64 el4ll_1, el4lh_1, el4hl_1, el4hh_1, el4ll_2, el4lh_2, el4hl_2, el4hh_2;
|
||||
#if CV_AVX2 && CV_SIMD_WIDTH == 32
|
||||
__m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2));
|
||||
@ -838,10 +838,10 @@ struct Integral_SIMD<uchar, double, double>
|
||||
prev_1.val = _mm256_permute4x64_pd(el4hh_1.val, 0xff);
|
||||
prev_2.val = _mm256_permute4x64_pd(el4hh_2.val, 0xff);
|
||||
#else
|
||||
el8_1 += v_rotate_left<1>(el8_1);
|
||||
el8_2 += v_rotate_left<1>(el8_2);
|
||||
el8_1 += v_rotate_left<2>(el8_1);
|
||||
el8_2 += v_rotate_left<2>(el8_2);
|
||||
el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1));
|
||||
el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2));
|
||||
el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1));
|
||||
el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2));
|
||||
#if CV_SIMD_WIDTH >= 32
|
||||
el8_1 += v_rotate_left<4>(el8_1);
|
||||
el8_2 += v_rotate_left<4>(el8_2);
|
||||
@ -853,32 +853,32 @@ struct Integral_SIMD<uchar, double, double>
|
||||
v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2;
|
||||
v_expand(el8_1, el4li_1, el4hi_1);
|
||||
v_expand(el8_2, el4li_2, el4hi_2);
|
||||
el4ll_1 = v_cvt_f64(el4li_1) + prev_1;
|
||||
el4ll_2 = v_cvt_f64(el4li_2) + prev_2;
|
||||
el4lh_1 = v_cvt_f64_high(el4li_1) + prev_1;
|
||||
el4lh_2 = v_cvt_f64_high(el4li_2) + prev_2;
|
||||
el4hl_1 = v_cvt_f64(el4hi_1) + el4ll_1;
|
||||
el4hl_2 = v_cvt_f64(el4hi_2) + el4ll_2;
|
||||
el4hh_1 = v_cvt_f64_high(el4hi_1) + el4lh_1;
|
||||
el4hh_2 = v_cvt_f64_high(el4hi_2) + el4lh_2;
|
||||
prev_1 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_1));
|
||||
prev_2 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_2));
|
||||
// prev_1 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_1);
|
||||
// prev_2 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_2);
|
||||
el4ll_1 = v_add(v_cvt_f64(el4li_1), prev_1);
|
||||
el4ll_2 = v_add(v_cvt_f64(el4li_2), prev_2);
|
||||
el4lh_1 = v_add(v_cvt_f64_high(el4li_1), prev_1);
|
||||
el4lh_2 = v_add(v_cvt_f64_high(el4li_2), prev_2);
|
||||
el4hl_1 = v_add(v_cvt_f64(el4hi_1), el4ll_1);
|
||||
el4hl_2 = v_add(v_cvt_f64(el4hi_2), el4ll_2);
|
||||
el4hh_1 = v_add(v_cvt_f64_high(el4hi_1), el4lh_1);
|
||||
el4hh_2 = v_add(v_cvt_f64_high(el4hi_2), el4lh_2);
|
||||
prev_1 = vx_setall_f64(v_extract_highest(el4hh_1));
|
||||
prev_2 = vx_setall_f64(v_extract_highest(el4hh_2));
|
||||
// prev_1 = v_broadcast_highest(el4hh_1);
|
||||
// prev_2 = v_broadcast_highest(el4hh_2);
|
||||
#endif
|
||||
v_float64 el4_1, el4_2, el4_3, el4_4, el4_5, el4_6, el4_7, el4_8;
|
||||
v_zip(el4ll_1, el4ll_2, el4_1, el4_2);
|
||||
v_zip(el4lh_1, el4lh_2, el4_3, el4_4);
|
||||
v_zip(el4hl_1, el4hl_2, el4_5, el4_6);
|
||||
v_zip(el4hh_1, el4hh_2, el4_7, el4_8);
|
||||
v_store(sum_row + j , el4_1 + vx_load(prev_sum_row + j ));
|
||||
v_store(sum_row + j + v_float64::nlanes , el4_2 + vx_load(prev_sum_row + j + v_float64::nlanes ));
|
||||
v_store(sum_row + j + v_float64::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 2));
|
||||
v_store(sum_row + j + v_float64::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_float64::nlanes * 3));
|
||||
v_store(sum_row + j + v_float64::nlanes * 4, el4_5 + vx_load(prev_sum_row + j + v_float64::nlanes * 4));
|
||||
v_store(sum_row + j + v_float64::nlanes * 5, el4_6 + vx_load(prev_sum_row + j + v_float64::nlanes * 5));
|
||||
v_store(sum_row + j + v_float64::nlanes * 6, el4_7 + vx_load(prev_sum_row + j + v_float64::nlanes * 6));
|
||||
v_store(sum_row + j + v_float64::nlanes * 7, el4_8 + vx_load(prev_sum_row + j + v_float64::nlanes * 7));
|
||||
v_store(sum_row + j , v_add(el4_1, vx_load(prev_sum_row + j)));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() , v_add(el4_2, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes())));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 2, v_add(el4_3, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 2)));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 3, v_add(el4_4, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 3)));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 4, v_add(el4_5, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 4)));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 5, v_add(el4_6, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 5)));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 6, v_add(el4_7, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 6)));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 7, v_add(el4_8, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 7)));
|
||||
}
|
||||
|
||||
for (double v2 = sum_row[j - 1] - prev_sum_row[j - 1],
|
||||
@ -897,7 +897,7 @@ struct Integral_SIMD<uchar, double, double>
|
||||
const uchar * src_row = src + _srcstep * i;
|
||||
double * prev_sum_row = (double *)((uchar *)sum + _sumstep * i) + cn;
|
||||
double * sum_row = (double *)((uchar *)sum + _sumstep * (i + 1)) + cn;
|
||||
double row_cache[v_float64::nlanes * 12];
|
||||
double row_cache[VTraits<v_float64>::max_nlanes * 12];
|
||||
|
||||
sum_row[-1] = sum_row[-2] = sum_row[-3] = 0;
|
||||
|
||||
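One detail in the hunk above: the stack buffer is now sized with VTraits<v_float64>::max_nlanes, a compile-time upper bound, because vlanes() is a run-time value on scalable backends and cannot size an array. A short sketch of that idiom follows; the helper name and the guard are assumptions.

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
// Interleave three v_float64 vectors into dst, staging through a stack buffer whose
// size uses the compile-time bound max_nlanes rather than the run-time vlanes().
static void interleave3_f64(cv::v_float64 a, cv::v_float64 b, cv::v_float64 c, double* dst)
{
    using namespace cv;
    double buf[VTraits<v_float64>::max_nlanes * 3];  // was: double buf[v_float64::nlanes * 3]
    v_store_interleave(buf, a, b, c);                // a0 b0 c0 a1 b1 c1 ...
    for (int i = 0; i < VTraits<v_float64>::vlanes() * 3; ++i)
        dst[i] = buf[i];
}
#endif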
@ -905,10 +905,10 @@ struct Integral_SIMD<uchar, double, double>
|
||||
prev_3 = vx_setzero_f64();
|
||||
int j = 0;
|
||||
const int j_max =
|
||||
((_srcstep * i + (width - v_uint16::nlanes * cn + v_uint8::nlanes * cn)) >= _srcstep * height)
|
||||
? width - v_uint8::nlanes * cn // uint8 in v_load_deinterleave()
|
||||
: width - v_uint16::nlanes * cn; // v_expand_low
|
||||
for ( ; j <= j_max; j += v_uint16::nlanes * cn)
|
||||
((_srcstep * i + (width - VTraits<v_uint16>::vlanes() * cn + VTraits<v_uint8>::vlanes() * cn)) >= _srcstep * height)
|
||||
? width - VTraits<v_uint8>::vlanes() * cn // uint8 in v_load_deinterleave()
|
||||
: width - VTraits<v_uint16>::vlanes() * cn; // v_expand_low
|
||||
for ( ; j <= j_max; j += VTraits<v_uint16>::vlanes() * cn)
|
||||
{
|
||||
v_uint8 v_src_row_1, v_src_row_2, v_src_row_3;
|
||||
v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3);
|
||||
@ -951,12 +951,12 @@ struct Integral_SIMD<uchar, double, double>
|
||||
prev_2.val = _mm256_permute4x64_pd(el4hh_2.val, 0xff);
|
||||
prev_3.val = _mm256_permute4x64_pd(el4hh_3.val, 0xff);
|
||||
#else
|
||||
el8_1 += v_rotate_left<1>(el8_1);
|
||||
el8_2 += v_rotate_left<1>(el8_2);
|
||||
el8_3 += v_rotate_left<1>(el8_3);
|
||||
el8_1 += v_rotate_left<2>(el8_1);
|
||||
el8_2 += v_rotate_left<2>(el8_2);
|
||||
el8_3 += v_rotate_left<2>(el8_3);
|
||||
el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1));
|
||||
el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2));
|
||||
el8_3 = v_add(el8_3, v_rotate_left<1>(el8_3));
|
||||
el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1));
|
||||
el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2));
|
||||
el8_3 = v_add(el8_3, v_rotate_left<2>(el8_3));
|
||||
#if CV_SIMD_WIDTH >= 32
|
||||
el8_1 += v_rotate_left<4>(el8_1);
|
||||
el8_2 += v_rotate_left<4>(el8_2);
|
||||
@ -971,53 +971,53 @@ struct Integral_SIMD<uchar, double, double>
|
||||
v_expand(el8_1, el4li_1, el4hi_1);
|
||||
v_expand(el8_2, el4li_2, el4hi_2);
|
||||
v_expand(el8_3, el4li_3, el4hi_3);
|
||||
el4ll_1 = v_cvt_f64(el4li_1) + prev_1;
|
||||
el4ll_2 = v_cvt_f64(el4li_2) + prev_2;
|
||||
el4ll_3 = v_cvt_f64(el4li_3) + prev_3;
|
||||
el4lh_1 = v_cvt_f64_high(el4li_1) + prev_1;
|
||||
el4lh_2 = v_cvt_f64_high(el4li_2) + prev_2;
|
||||
el4lh_3 = v_cvt_f64_high(el4li_3) + prev_3;
|
||||
el4hl_1 = v_cvt_f64(el4hi_1) + el4ll_1;
|
||||
el4hl_2 = v_cvt_f64(el4hi_2) + el4ll_2;
|
||||
el4hl_3 = v_cvt_f64(el4hi_3) + el4ll_3;
|
||||
el4hh_1 = v_cvt_f64_high(el4hi_1) + el4lh_1;
|
||||
el4hh_2 = v_cvt_f64_high(el4hi_2) + el4lh_2;
|
||||
el4hh_3 = v_cvt_f64_high(el4hi_3) + el4lh_3;
|
||||
prev_1 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_1));
|
||||
prev_2 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_2));
|
||||
prev_3 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_3));
|
||||
// prev_1 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_1);
|
||||
// prev_2 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_2);
|
||||
// prev_3 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_3);
|
||||
el4ll_1 = v_add(v_cvt_f64(el4li_1), prev_1);
|
||||
el4ll_2 = v_add(v_cvt_f64(el4li_2), prev_2);
|
||||
el4ll_3 = v_add(v_cvt_f64(el4li_3), prev_3);
|
||||
el4lh_1 = v_add(v_cvt_f64_high(el4li_1), prev_1);
|
||||
el4lh_2 = v_add(v_cvt_f64_high(el4li_2), prev_2);
|
||||
el4lh_3 = v_add(v_cvt_f64_high(el4li_3), prev_3);
|
||||
el4hl_1 = v_add(v_cvt_f64(el4hi_1), el4ll_1);
|
||||
el4hl_2 = v_add(v_cvt_f64(el4hi_2), el4ll_2);
|
||||
el4hl_3 = v_add(v_cvt_f64(el4hi_3), el4ll_3);
|
||||
el4hh_1 = v_add(v_cvt_f64_high(el4hi_1), el4lh_1);
|
||||
el4hh_2 = v_add(v_cvt_f64_high(el4hi_2), el4lh_2);
|
||||
el4hh_3 = v_add(v_cvt_f64_high(el4hi_3), el4lh_3);
|
||||
prev_1 = vx_setall_f64(v_extract_highest(el4hh_1));
|
||||
prev_2 = vx_setall_f64(v_extract_highest(el4hh_2));
|
||||
prev_3 = vx_setall_f64(v_extract_highest(el4hh_3));
|
||||
// prev_1 = v_broadcast_highest(el4hh_1);
|
||||
// prev_2 = v_broadcast_highest(el4hh_2);
|
||||
// prev_3 = v_broadcast_highest(el4hh_3);
|
||||
#endif
|
||||
v_store_interleave(row_cache , el4ll_1, el4ll_2, el4ll_3);
|
||||
v_store_interleave(row_cache + v_float64::nlanes * 3, el4lh_1, el4lh_2, el4lh_3);
|
||||
v_store_interleave(row_cache + v_float64::nlanes * 6, el4hl_1, el4hl_2, el4hl_3);
|
||||
v_store_interleave(row_cache + v_float64::nlanes * 9, el4hh_1, el4hh_2, el4hh_3);
|
||||
v_store_interleave(row_cache + VTraits<v_float64>::vlanes() * 3, el4lh_1, el4lh_2, el4lh_3);
|
||||
v_store_interleave(row_cache + VTraits<v_float64>::vlanes() * 6, el4hl_1, el4hl_2, el4hl_3);
|
||||
v_store_interleave(row_cache + VTraits<v_float64>::vlanes() * 9, el4hh_1, el4hh_2, el4hh_3);
|
||||
el4ll_1 = vx_load(row_cache );
|
||||
el4ll_2 = vx_load(row_cache + v_float64::nlanes );
|
||||
el4ll_3 = vx_load(row_cache + v_float64::nlanes * 2 );
|
||||
el4lh_1 = vx_load(row_cache + v_float64::nlanes * 3 );
|
||||
el4lh_2 = vx_load(row_cache + v_float64::nlanes * 4 );
|
||||
el4lh_3 = vx_load(row_cache + v_float64::nlanes * 5 );
|
||||
el4hl_1 = vx_load(row_cache + v_float64::nlanes * 6 );
|
||||
el4hl_2 = vx_load(row_cache + v_float64::nlanes * 7 );
|
||||
el4hl_3 = vx_load(row_cache + v_float64::nlanes * 8 );
|
||||
el4hh_1 = vx_load(row_cache + v_float64::nlanes * 9 );
|
||||
el4hh_2 = vx_load(row_cache + v_float64::nlanes * 10);
|
||||
el4hh_3 = vx_load(row_cache + v_float64::nlanes * 11);
|
||||
v_store(sum_row + j , el4ll_1 + vx_load(prev_sum_row + j ));
|
||||
v_store(sum_row + j + v_float64::nlanes , el4ll_2 + vx_load(prev_sum_row + j + v_float64::nlanes ));
|
||||
v_store(sum_row + j + v_float64::nlanes * 2 , el4ll_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 2 ));
|
||||
v_store(sum_row + j + v_float64::nlanes * 3 , el4lh_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 3 ));
|
||||
v_store(sum_row + j + v_float64::nlanes * 4 , el4lh_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 4 ));
|
||||
v_store(sum_row + j + v_float64::nlanes * 5 , el4lh_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 5 ));
|
||||
v_store(sum_row + j + v_float64::nlanes * 6 , el4hl_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 6 ));
|
||||
v_store(sum_row + j + v_float64::nlanes * 7 , el4hl_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 7 ));
|
||||
v_store(sum_row + j + v_float64::nlanes * 8 , el4hl_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 8 ));
|
||||
v_store(sum_row + j + v_float64::nlanes * 9 , el4hh_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 9 ));
|
||||
v_store(sum_row + j + v_float64::nlanes * 10, el4hh_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 10));
|
||||
v_store(sum_row + j + v_float64::nlanes * 11, el4hh_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 11));
|
||||
el4ll_2 = vx_load(row_cache + VTraits<v_float64>::vlanes() );
|
||||
el4ll_3 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 2 );
|
||||
el4lh_1 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 3 );
|
||||
el4lh_2 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 4 );
|
||||
el4lh_3 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 5 );
|
||||
el4hl_1 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 6 );
|
||||
el4hl_2 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 7 );
|
||||
el4hl_3 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 8 );
|
||||
el4hh_1 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 9 );
|
||||
el4hh_2 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 10);
|
||||
el4hh_3 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 11);
|
||||
v_store(sum_row + j , v_add(el4ll_1, vx_load(prev_sum_row + j)));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() , v_add(el4ll_2, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes())));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 2 , v_add(el4ll_3, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 2)));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 3 , v_add(el4lh_1, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 3)));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 4 , v_add(el4lh_2, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 4)));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 5 , v_add(el4lh_3, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 5)));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 6 , v_add(el4hl_1, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 6)));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 7 , v_add(el4hl_2, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 7)));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 8 , v_add(el4hl_3, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 8)));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 9 , v_add(el4hh_1, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 9)));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 10, v_add(el4hh_2, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 10)));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 11, v_add(el4hh_3, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 11)));
|
||||
}
|
||||
|
||||
for (double v3 = sum_row[j - 1] - prev_sum_row[j - 1],
|
||||
@ -1043,7 +1043,7 @@ struct Integral_SIMD<uchar, double, double>
|
||||
|
||||
v_float64 prev_1 = vx_setzero_f64(), prev_2 = vx_setzero_f64();
|
||||
int j = 0;
|
||||
for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
|
||||
for ( ; j + VTraits<v_uint16>::vlanes() <= width; j += VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
|
||||
v_float64 el4ll, el4lh, el4hl, el4hh;
|
||||
@ -1065,10 +1065,10 @@ struct Integral_SIMD<uchar, double, double>
|
||||
#endif
|
||||
v_int32 el4li, el4hi;
|
||||
v_expand(el8, el4li, el4hi);
|
||||
el4ll = v_cvt_f64(el4li) + prev_1;
|
||||
el4lh = v_cvt_f64_high(el4li) + prev_2;
|
||||
el4hl = v_cvt_f64(el4hi) + el4ll;
|
||||
el4hh = v_cvt_f64_high(el4hi) + el4lh;
|
||||
el4ll = v_add(v_cvt_f64(el4li), prev_1);
|
||||
el4lh = v_add(v_cvt_f64_high(el4li), prev_2);
|
||||
el4hl = v_add(v_cvt_f64(el4hi), el4ll);
|
||||
el4hh = v_add(v_cvt_f64_high(el4hi), el4lh);
|
||||
#if CV_SIMD_WIDTH == 16
|
||||
prev_1 = el4hl;
|
||||
prev_2 = el4hh;
|
||||
@ -1078,10 +1078,10 @@ struct Integral_SIMD<uchar, double, double>
|
||||
prev_1 = prev_2 = v_combine_high(el4hh, el4hh);
|
||||
#endif
|
||||
#endif
|
||||
v_store(sum_row + j , el4ll + vx_load(prev_sum_row + j ));
|
||||
v_store(sum_row + j + v_float64::nlanes , el4lh + vx_load(prev_sum_row + j + v_float64::nlanes ));
|
||||
v_store(sum_row + j + v_float64::nlanes * 2, el4hl + vx_load(prev_sum_row + j + v_float64::nlanes * 2));
|
||||
v_store(sum_row + j + v_float64::nlanes * 3, el4hh + vx_load(prev_sum_row + j + v_float64::nlanes * 3));
|
||||
v_store(sum_row + j , v_add(el4ll, vx_load(prev_sum_row + j)));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() , v_add(el4lh, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes())));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 2, v_add(el4hl, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 2)));
|
||||
v_store(sum_row + j + VTraits<v_float64>::vlanes() * 3, v_add(el4hh, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 3)));
|
||||
}
|
||||
|
||||
for (double v4 = sum_row[j - 1] - prev_sum_row[j - 1],
|
||||
|
@ -268,13 +268,13 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp
|
||||
for ( i = 0; i < 256; i += 4)
|
||||
{
|
||||
v_store(_data + i, v_sqrt(idx));
|
||||
idx += ifour;
|
||||
idx = v_add(idx, ifour);
|
||||
}
|
||||
else
|
||||
for ( i = 0; i < 256; i += 4)
|
||||
{
|
||||
v_store(_data + i, idx);
|
||||
idx += ifour;
|
||||
idx = v_add(idx, ifour);
|
||||
}
|
||||
#else
|
||||
if( gammaCorrection )
|
||||
@ -320,7 +320,7 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp
|
||||
for ( ; x <= end - 4; x += 4)
|
||||
{
|
||||
v_int32x4 mul_res = v_load(xmap + x);
|
||||
mul_res += mul_res + mul_res;
|
||||
mul_res = v_add(mul_res, v_add(mul_res, mul_res));
|
||||
v_store(xmap + x, mul_res);
|
||||
}
|
||||
#endif
|
||||
@ -444,34 +444,34 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp
|
||||
{
|
||||
int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3];
|
||||
|
||||
v_float32x4 _dx0 = v_load(lutCurr+x+widthP2*0+2) - v_load(lutCurr+x+widthP2*0);
|
||||
v_float32x4 _dx1 = v_load(lutCurr+x+widthP2*1+2) - v_load(lutCurr+x+widthP2*1);
|
||||
v_float32x4 _dx2 = v_load(lutCurr+x+widthP2*2+2) - v_load(lutCurr+x+widthP2*2);
|
||||
v_float32x4 _dx0 = v_sub(v_load(lutCurr + x + widthP2 * 0 + 2), v_load(lutCurr + x + widthP2 * 0));
|
||||
v_float32x4 _dx1 = v_sub(v_load(lutCurr + x + widthP2 * 1 + 2), v_load(lutCurr + x + widthP2 * 1));
|
||||
v_float32x4 _dx2 = v_sub(v_load(lutCurr + x + widthP2 * 2 + 2), v_load(lutCurr + x + widthP2 * 2));
|
||||
|
||||
v_float32x4 _dy00 = v_float32x4(lut[nextPtr[x0+0]], lut[nextPtr[x1+0]], lut[nextPtr[x2+0]], lut[nextPtr[x3+0]]);
|
||||
v_float32x4 _dy0 = _dy00 - v_load(lutPrev+x+widthP2*0+1);
|
||||
v_float32x4 _dy0 = v_sub(_dy00, v_load(lutPrev + x + widthP2 * 0 + 1));
|
||||
|
||||
v_store(lutNext+x+widthP2*0+1, _dy00);
|
||||
|
||||
v_float32x4 _dy10 = v_float32x4(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]);
|
||||
v_float32x4 _dy1 = _dy10 - v_load(lutPrev+x+widthP2*1+1);
|
||||
v_float32x4 _dy1 = v_sub(_dy10, v_load(lutPrev + x + widthP2 * 1 + 1));
|
||||
|
||||
v_store(lutNext+x+widthP2*1+1, _dy10);
|
||||
|
||||
v_float32x4 _dy20 = v_float32x4(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]);
|
||||
v_float32x4 _dy2 = _dy20 - v_load(lutPrev+x+widthP2*2+1);
|
||||
v_float32x4 _dy2 = v_sub(_dy20, v_load(lutPrev + x + widthP2 * 2 + 1));
|
||||
|
||||
v_store(lutNext+x+widthP2*2+1, _dy20);
|
||||
|
||||
v_float32x4 _mag0 = (_dx0 * _dx0) + (_dy0 * _dy0);
|
||||
v_float32x4 _mag1 = (_dx1 * _dx1) + (_dy1 * _dy1);
|
||||
v_float32x4 _mag2 = (_dx2 * _dx2) + (_dy2 * _dy2);
|
||||
v_float32x4 _mag0 = v_add(v_mul(_dx0, _dx0), v_mul(_dy0, _dy0));
|
||||
v_float32x4 _mag1 = v_add(v_mul(_dx1, _dx1), v_mul(_dy1, _dy1));
|
||||
v_float32x4 _mag2 = v_add(v_mul(_dx2, _dx2), v_mul(_dy2, _dy2));
|
||||
|
||||
v_float32x4 mask = v_reinterpret_as_f32(_mag2 > _mag1);
|
||||
v_float32x4 mask = v_reinterpret_as_f32(v_gt(_mag2, _mag1));
|
||||
_dx2 = v_select(mask, _dx2, _dx1);
|
||||
_dy2 = v_select(mask, _dy2, _dy1);
|
||||
|
||||
mask = v_reinterpret_as_f32(v_max(_mag2, _mag1) > _mag0);
|
||||
mask = v_reinterpret_as_f32(v_gt(v_max(_mag2, _mag1), _mag0));
|
||||
_dx2 = v_select(mask, _dx2, _dx0);
|
||||
_dy2 = v_select(mask, _dy2, _dy0);
|
||||
|
||||
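In the block above the comparison operators give way to the named forms: _mag2 > _mag1 becomes v_gt(_mag2, _mag1), and the result still feeds v_select() as a per-lane mask. A hedged two-channel sketch of that select-the-stronger-gradient step (helper name hypothetical):

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// Keep, per lane, the gradient of whichever of two channels has the larger magnitude.
static inline void strongest_gradient(const cv::v_float32x4& dx0, const cv::v_float32x4& dy0,
                                      const cv::v_float32x4& dx1, const cv::v_float32x4& dy1,
                                      cv::v_float32x4& dx, cv::v_float32x4& dy)
{
    using namespace cv;
    v_float32x4 mag0 = v_add(v_mul(dx0, dx0), v_mul(dy0, dy0));  // was: dx0*dx0 + dy0*dy0
    v_float32x4 mag1 = v_add(v_mul(dx1, dx1), v_mul(dy1, dy1));
    v_float32x4 mask = v_gt(mag1, mag0);                         // was: mag1 > mag0
    dx = v_select(mask, dx1, dx0);                               // per lane: mask ? dx1 : dx0
    dy = v_select(mask, dy1, dy0);
}
#endif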
@ -537,25 +537,25 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp
|
||||
int x2 = x << 1;
|
||||
v_float32x4 _mag = v_load(dbuf + x + (width << 1));
|
||||
v_float32x4 _angle = v_load(dbuf + x + width * 3);
|
||||
_angle = (_angleScale * _angle) - fhalf;
|
||||
_angle = v_sub(v_mul(_angleScale, _angle), fhalf);
|
||||
|
||||
v_int32x4 _hidx = v_floor(_angle);
|
||||
_angle -= v_cvt_f32(_hidx);
|
||||
_angle = v_sub(_angle, v_cvt_f32(_hidx));
|
||||
|
||||
v_float32x4 ft0 = _mag * (fone - _angle);
|
||||
v_float32x4 ft1 = _mag * _angle;
|
||||
v_float32x4 ft0 = v_mul(_mag, v_sub(fone, _angle));
|
||||
v_float32x4 ft1 = v_mul(_mag, _angle);
|
||||
|
||||
v_store_interleave(gradPtr + x2, ft0, ft1);
|
||||
|
||||
v_int32x4 mask0 = _hidx >> 31;
|
||||
v_int32x4 it0 = mask0 & _nbins;
|
||||
mask0 = (_hidx >= _nbins);
|
||||
v_int32x4 it1 = mask0 & _nbins;
|
||||
_hidx += (it0 - it1);
|
||||
v_int32x4 mask0 = v_shr<31>(_hidx);
|
||||
v_int32x4 it0 = v_and(mask0, _nbins);
|
||||
mask0 = (v_ge(_hidx, _nbins));
|
||||
v_int32x4 it1 = v_and(mask0, _nbins);
|
||||
_hidx = v_add(_hidx, v_sub(it0, it1));
|
||||
|
||||
it0 = v_reinterpret_as_s32(v_pack(v_pack(_hidx, izero), v_reinterpret_as_s16(izero)));
|
||||
_hidx += ione;
|
||||
_hidx &= (_hidx < _nbins);
|
||||
_hidx = v_add(_hidx, ione);
|
||||
_hidx = v_and(_hidx, v_lt(_hidx, _nbins));
|
||||
it1 = v_reinterpret_as_s32(v_pack(v_pack(_hidx, izero), v_reinterpret_as_s16(izero)));
|
||||
v_uint8x16 it2, it3;
|
||||
v_zip(v_reinterpret_as_u8(it0), v_reinterpret_as_u8(it1), it2, it3);
|
||||
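The histogram-index arithmetic above is branchless: the sign mask comes from v_shr<31>(), the wrap offsets from v_and() with the bin count, and the overflow test from v_ge(). A small self-contained sketch of that wrap, assuming the index is within one period of [0, nbins):

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// Branchless per-lane wrap of a bin index into [0, nbins); valid for hidx in [-nbins, 2*nbins).
static inline cv::v_int32x4 wrap_bin_index(const cv::v_int32x4& hidx, const cv::v_int32x4& nbins)
{
    using namespace cv;
    v_int32x4 neg  = v_shr<31>(hidx);   // all-ones where hidx < 0 (arithmetic shift of signed lanes)
    v_int32x4 addN = v_and(neg, nbins); // +nbins for negative lanes
    v_int32x4 ovf  = v_ge(hidx, nbins); // all-ones where hidx >= nbins
    v_int32x4 subN = v_and(ovf, nbins); // -nbins for overflowing lanes
    return v_add(hidx, v_sub(addN, subN));
}
#endif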
@ -707,9 +707,9 @@ void HOGCache::init(const HOGDescriptor* _descriptor,
|
||||
|
||||
for (; i <= blockSize.height - 4; i += 4)
|
||||
{
|
||||
v_float32x4 t = idx - _bh;
|
||||
t *= t;
|
||||
idx += ifour;
|
||||
v_float32x4 t = v_sub(idx, _bh);
|
||||
t = v_mul(t, t);
|
||||
idx = v_add(idx, ifour);
|
||||
v_store(_di + i, t);
|
||||
}
|
||||
#endif
|
||||
@ -725,9 +725,9 @@ void HOGCache::init(const HOGDescriptor* _descriptor,
|
||||
|
||||
for (; j <= blockSize.height - 4; j += 4)
|
||||
{
|
||||
v_float32x4 t = idx - _bw;
|
||||
t *= t;
|
||||
idx += ifour;
|
||||
v_float32x4 t = v_sub(idx, _bw);
|
||||
t = v_mul(t, t);
|
||||
idx = v_add(idx, ifour);
|
||||
v_store(_dj + j, t);
|
||||
}
|
||||
#endif
|
||||
@ -936,8 +936,8 @@ const float* HOGCache::getBlock(Point pt, float* buf)
|
||||
int h0 = h[0], h1 = h[1];
|
||||
|
||||
v_float32x4 _a0 = v_setall_f32(a[0]), _a1 = v_setall_f32(a[1]);
|
||||
v_float32x4 w = v_setall_f32(pk.gradWeight) * v_load(pk.histWeights);
|
||||
v_float32x4 _t0 = _a0 * w, _t1 = _a1 * w;
|
||||
v_float32x4 w = v_mul(v_setall_f32(pk.gradWeight), v_load(pk.histWeights));
|
||||
v_float32x4 _t0 = v_mul(_a0, w), _t1 = v_mul(_a1, w);
|
||||
|
||||
v_store(hist0, _t0);
|
||||
v_store(hist1, _t1);
|
||||
@ -984,8 +984,8 @@ const float* HOGCache::getBlock(Point pt, float* buf)
|
||||
int h0 = h[0], h1 = h[1];
|
||||
|
||||
v_float32x4 _a0 = v_setall_f32(a[0]), _a1 = v_setall_f32(a[1]);
|
||||
v_float32x4 w = v_setall_f32(pk.gradWeight) * v_load(pk.histWeights);
|
||||
v_float32x4 _t0 = _a0 * w, _t1 = _a1 * w;
|
||||
v_float32x4 w = v_mul(v_setall_f32(pk.gradWeight), v_load(pk.histWeights));
|
||||
v_float32x4 _t0 = v_mul(_a0, w), _t1 = v_mul(_a1, w);
|
||||
|
||||
v_store(hist0, _t0);
|
||||
v_store(hist1, _t1);
|
||||
@ -1057,12 +1057,12 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
|
||||
|
||||
#if CV_SIMD128
|
||||
v_float32x4 p0 = v_load(hist);
|
||||
v_float32x4 s = p0 * p0;
|
||||
v_float32x4 s = v_mul(p0, p0);
|
||||
|
||||
for (i = 4; i <= sz - 4; i += 4)
|
||||
{
|
||||
p0 = v_load(hist + i);
|
||||
s += p0 * p0;
|
||||
s = v_add(s, v_mul(p0, p0));
|
||||
}
|
||||
v_store(partSum, s);
|
||||
#else
|
||||
@ -1091,17 +1091,17 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
|
||||
v_float32x4 _scale = v_setall_f32(scale);
|
||||
static v_float32x4 _threshold = v_setall_f32(thresh);
|
||||
|
||||
v_float32x4 p = _scale * v_load(hist);
|
||||
v_float32x4 p = v_mul(_scale, v_load(hist));
|
||||
p = v_min(p, _threshold);
|
||||
s = p * p;
|
||||
s = v_mul(p, p);
|
||||
v_store(hist, p);
|
||||
|
||||
for(i = 4 ; i <= sz - 4; i += 4)
|
||||
{
|
||||
p = v_load(hist + i);
|
||||
p *= _scale;
|
||||
p = v_mul(p, _scale);
|
||||
p = v_min(p, _threshold);
|
||||
s += p * p;
|
||||
s = v_add(s, v_mul(p, p));
|
||||
v_store(hist + i, p);
|
||||
}
|
||||
|
||||
@ -1137,7 +1137,7 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
|
||||
v_float32x4 _scale2 = v_setall_f32(scale);
|
||||
for ( ; i <= sz - 4; i += 4)
|
||||
{
|
||||
v_float32x4 t = _scale2 * v_load(hist + i);
|
||||
v_float32x4 t = v_mul(_scale2, v_load(hist + i));
|
||||
v_store(hist + i, t);
|
||||
}
|
||||
#endif
|
||||
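normalizeBlockHistogram() above is the usual scale, clip, sum-of-squares pass, now spelled with v_mul/v_min/v_add. A compact sketch of that pass; the helper name is hypothetical and, unlike the full function, it assumes sz is a multiple of 4.

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// Scale and clip hist in place, returning the sum of squares of the clipped values.
static inline float scale_clip_sqsum(float* hist, int sz, float scale, float thresh)
{
    using namespace cv;
    v_float32x4 vscale = v_setall_f32(scale), vthresh = v_setall_f32(thresh);
    v_float32x4 s = v_setzero_f32();
    for (int i = 0; i <= sz - 4; i += 4)
    {
        v_float32x4 p = v_min(v_mul(vscale, v_load(hist + i)), vthresh); // was: min(scale * p, thresh)
        s = v_add(s, v_mul(p, p));                                       // was: s += p * p
        v_store(hist + i, p);
    }
    return v_reduce_sum(s);
}
#endif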
@ -1593,14 +1593,14 @@ void HOGDescriptor::detect(InputArray _img,
|
||||
#if CV_SIMD128
|
||||
v_float32x4 _vec = v_load(vec);
|
||||
v_float32x4 _svmVec = v_load(svmVec);
|
||||
v_float32x4 sum = _svmVec * _vec;
|
||||
v_float32x4 sum = v_mul(_svmVec, _vec);
|
||||
|
||||
for( k = 4; k <= blockHistogramSize - 4; k += 4 )
|
||||
{
|
||||
_vec = v_load(vec + k);
|
||||
_svmVec = v_load(svmVec + k);
|
||||
|
||||
sum += _vec * _svmVec;
|
||||
sum = v_add(sum, v_mul(_vec, _svmVec));
|
||||
}
|
||||
|
||||
v_store(partSum, sum);
|
||||
@ -3392,14 +3392,14 @@ void HOGDescriptor::detectROI(InputArray _img, const std::vector<cv::Point> &loc
|
||||
#if CV_SIMD128
|
||||
v_float32x4 _vec = v_load(vec);
|
||||
v_float32x4 _svmVec = v_load(svmVec);
|
||||
v_float32x4 sum = _svmVec * _vec;
|
||||
v_float32x4 sum = v_mul(_svmVec, _vec);
|
||||
|
||||
for( k = 4; k <= blockHistogramSize - 4; k += 4 )
|
||||
{
|
||||
_vec = v_load(vec + k);
|
||||
_svmVec = v_load(svmVec + k);
|
||||
|
||||
sum += _vec * _svmVec;
|
||||
sum = v_add(sum, v_mul(_vec, _svmVec));
|
||||
}
|
||||
|
||||
v_store(partSum, sum);
|
||||
|
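The detect()/detectROI() loops above are a plain dot product between the descriptor and the SVM weights, with sum += _vec * _svmVec rewritten as v_add(sum, v_mul(...)); the same accumulation could also be written with v_muladd(_vec, _svmVec, sum). A stand-alone sketch with a scalar tail (helper name hypothetical):

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// Dot product of two float arrays of length n.
static inline float dot_f32(const float* a, const float* b, int n)
{
    using namespace cv;
    v_float32x4 sum = v_setzero_f32();
    int k = 0;
    for (; k <= n - 4; k += 4)
        sum = v_add(sum, v_mul(v_load(a + k), v_load(b + k)));  // was: sum += va * vb
    float s = v_reduce_sum(sum);
    for (; k < n; ++k)                                          // scalar tail
        s += a[k] * b[k];
    return s;
}
#endif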
@ -520,16 +520,16 @@ DISOpticalFlowImpl::PatchInverseSearch_ParBody::PatchInverseSearch_ParBody(DISOp
|
||||
v_expand(I0_row_8, I0_row_4_left, I0_row_4_right); \
|
||||
\
|
||||
/* Compute diffs between I0 and bilinearly interpolated I1: */ \
|
||||
I_diff_left = w00v * v_cvt_f32(v_reinterpret_as_s32(I1_row_4_left)) + \
|
||||
w01v * v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_left)) + \
|
||||
w10v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_left)) + \
|
||||
w11v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_left)) - \
|
||||
v_cvt_f32(v_reinterpret_as_s32(I0_row_4_left)); \
|
||||
I_diff_right = w00v * v_cvt_f32(v_reinterpret_as_s32(I1_row_4_right)) + \
|
||||
w01v * v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_right)) + \
|
||||
w10v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_right)) + \
|
||||
w11v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_right)) - \
|
||||
v_cvt_f32(v_reinterpret_as_s32(I0_row_4_right));
|
||||
I_diff_left = v_sub(v_add(v_mul(w00v, v_cvt_f32(v_reinterpret_as_s32(I1_row_4_left))), \
|
||||
v_mul(w01v, v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_left))), \
|
||||
v_mul(w10v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_left))), \
|
||||
v_mul(w11v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_left)))), \
|
||||
v_cvt_f32(v_reinterpret_as_s32(I0_row_4_left))); \
|
||||
I_diff_right = v_sub(v_add(v_mul(w00v, v_cvt_f32(v_reinterpret_as_s32(I1_row_4_right))), \
|
||||
v_mul(w01v, v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_right))), \
|
||||
v_mul(w10v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_right))), \
|
||||
v_mul(w11v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_right)))), \
|
||||
v_cvt_f32(v_reinterpret_as_s32(I0_row_4_right)));
|
||||
|
||||
#define HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW \
|
||||
I0_ptr += I0_stride; \
|
||||
@ -572,9 +572,9 @@ inline float processPatch(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *
|
||||
v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);
|
||||
|
||||
/* Update the sums: */
|
||||
Ux_vec += I_diff_left * v_cvt_f32(I0x_row_4_left) + I_diff_right * v_cvt_f32(I0x_row_4_right);
|
||||
Uy_vec += I_diff_left * v_cvt_f32(I0y_row_4_left) + I_diff_right * v_cvt_f32(I0y_row_4_right);
|
||||
SSD_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
|
||||
Ux_vec = v_add(Ux_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0x_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0x_row_4_right))));
|
||||
Uy_vec = v_add(Uy_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0y_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0y_row_4_right))));
|
||||
SSD_vec = v_add(SSD_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
|
||||
|
||||
I0x_ptr += I0_stride;
|
||||
I0y_ptr += I0_stride;
|
||||
@ -640,10 +640,10 @@ inline float processPatchMeanNorm(float &dst_dUx, float &dst_dUy, uchar *I0_ptr,
|
||||
v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);
|
||||
|
||||
/* Update the sums: */
|
||||
sum_I0x_mul_vec += I_diff_left * v_cvt_f32(I0x_row_4_left) + I_diff_right * v_cvt_f32(I0x_row_4_right);
|
||||
sum_I0y_mul_vec += I_diff_left * v_cvt_f32(I0y_row_4_left) + I_diff_right * v_cvt_f32(I0y_row_4_right);
|
||||
sum_diff_sq_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
|
||||
sum_diff_vec += I_diff_left + I_diff_right;
|
||||
sum_I0x_mul_vec = v_add(sum_I0x_mul_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0x_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0x_row_4_right))));
|
||||
sum_I0y_mul_vec = v_add(sum_I0y_mul_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0y_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0y_row_4_right))));
|
||||
sum_diff_sq_vec = v_add(sum_diff_sq_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
|
||||
sum_diff_vec = v_add(sum_diff_vec, v_add(I_diff_left, I_diff_right));
|
||||
|
||||
I0x_ptr += I0_stride;
|
||||
I0y_ptr += I0_stride;
|
||||
@ -692,7 +692,7 @@ inline float computeSSD(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stri
|
||||
for (int row = 0; row < 8; row++)
|
||||
{
|
||||
HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
|
||||
SSD_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
|
||||
SSD_vec = v_add(SSD_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
|
||||
HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
|
||||
}
|
||||
SSD = v_reduce_sum(SSD_vec);
|
||||
@ -728,8 +728,8 @@ inline float computeSSDMeanNorm(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int
|
||||
for (int row = 0; row < 8; row++)
|
||||
{
|
||||
HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
|
||||
sum_diff_sq_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
|
||||
sum_diff_vec += I_diff_left + I_diff_right;
|
||||
sum_diff_sq_vec = v_add(sum_diff_sq_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
|
||||
sum_diff_vec = v_add(sum_diff_vec, v_add(I_diff_left, I_diff_right));
|
||||
HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
|
||||
}
|
||||
sum_diff = v_reduce_sum(sum_diff_vec);
|
||||
|
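The DIS macro above folds four weighted, bilinearly interpolated terms into one expression; note that it relies on the variadic v_add() overload, so the four products do not have to be nested pairwise. A sketch of just that weighting step (hypothetical helper):

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// w00*p00 + w01*p01 + w10*p10 + w11*p11, written with the variadic v_add().
static inline cv::v_float32x4 bilinear4(const cv::v_float32x4& w00, const cv::v_float32x4& p00,
                                        const cv::v_float32x4& w01, const cv::v_float32x4& p01,
                                        const cv::v_float32x4& w10, const cv::v_float32x4& p10,
                                        const cv::v_float32x4& w11, const cv::v_float32x4& p11)
{
    using namespace cv;
    return v_add(v_mul(w00, p00), v_mul(w01, p01),
                 v_mul(w10, p10), v_mul(w11, p11));
}
#endif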
@ -97,8 +97,8 @@ void cv::detail::ScharrDerivInvoker::operator()(const Range& range) const
|
||||
v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x));
|
||||
v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x));
|
||||
|
||||
v_int16x8 t1 = s2 - s0;
|
||||
v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10);
|
||||
v_int16x8 t1 = v_sub(s2, s0);
|
||||
v_int16x8 t0 = v_add(v_mul_wrap(v_add(s0, s2), c3), v_mul_wrap(s1, c10));
|
||||
|
||||
v_store(trow0 + x, t0);
|
||||
v_store(trow1 + x, t1);
|
||||
@ -134,8 +134,8 @@ void cv::detail::ScharrDerivInvoker::operator()(const Range& range) const
|
||||
v_int16x8 s3 = v_load(trow1 + x);
|
||||
v_int16x8 s4 = v_load(trow1 + x + cn);
|
||||
|
||||
v_int16x8 t0 = s1 - s0;
|
||||
v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10);
|
||||
v_int16x8 t0 = v_sub(s1, s0);
|
||||
v_int16x8 t1 = v_add(v_mul_wrap(v_add(s2, s4), c3), v_mul_wrap(s3, c10));
|
||||
|
||||
v_store_interleave((drow + x*2), t0, t1);
|
||||
}
|
||||
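ScharrDeriv above keeps its 16-bit arithmetic but switches to v_add/v_sub/v_mul_wrap; the smoothing tap is still (s0 + s2)*3 + s1*10. A one-line helper version of that tap (hypothetical name):

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// Scharr smoothing of three widened 16-bit rows: (s0 + s2) * 3 + s1 * 10.
static inline cv::v_int16x8 scharr_smooth(const cv::v_int16x8& s0, const cv::v_int16x8& s1,
                                          const cv::v_int16x8& s2)
{
    using namespace cv;
    const v_int16x8 c3 = v_setall_s16(3), c10 = v_setall_s16(10);
    return v_add(v_mul_wrap(v_add(s0, s2), c3), v_mul_wrap(s1, c10)); // was: (s0+s2)*c3 + s1*c10
}
#endif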
@ -293,10 +293,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
|
||||
v_zip(v00, v01, t00, t01);
|
||||
v_zip(v10, v11, t10, t11);
|
||||
|
||||
t0 = v_dotprod(t00, qw0, qdelta) + v_dotprod(t10, qw1);
|
||||
t1 = v_dotprod(t01, qw0, qdelta) + v_dotprod(t11, qw1);
|
||||
t0 = t0 >> (W_BITS1-5);
|
||||
t1 = t1 >> (W_BITS1-5);
|
||||
t0 = v_add(v_dotprod(t00, qw0, qdelta), v_dotprod(t10, qw1));
|
||||
t1 = v_add(v_dotprod(t01, qw0, qdelta), v_dotprod(t11, qw1));
|
||||
t0 = v_shr<W_BITS1 - 5>(t0);
|
||||
t1 = v_shr<W_BITS1 - 5>(t1);
|
||||
v_store(Iptr + x, v_pack(t0, t1));
|
||||
|
||||
v00 = v_reinterpret_as_s16(v_load(dsrc));
|
||||
@ -307,10 +307,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
|
||||
v_zip(v00, v01, t00, t01);
|
||||
v_zip(v10, v11, t10, t11);
|
||||
|
||||
t0 = v_dotprod(t00, qw0, qdelta_d) + v_dotprod(t10, qw1);
|
||||
t1 = v_dotprod(t01, qw0, qdelta_d) + v_dotprod(t11, qw1);
|
||||
t0 = t0 >> W_BITS1;
|
||||
t1 = t1 >> W_BITS1;
|
||||
t0 = v_add(v_dotprod(t00, qw0, qdelta_d), v_dotprod(t10, qw1));
|
||||
t1 = v_add(v_dotprod(t01, qw0, qdelta_d), v_dotprod(t11, qw1));
|
||||
t0 = v_shr<W_BITS1>(t0);
|
||||
t1 = v_shr<W_BITS1>(t1);
|
||||
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
|
||||
v_store(dIptr, v00);
|
||||
|
||||
@ -332,10 +332,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
|
||||
v_zip(v00, v01, t00, t01);
|
||||
v_zip(v10, v11, t10, t11);
|
||||
|
||||
t0 = v_dotprod(t00, qw0, qdelta_d) + v_dotprod(t10, qw1);
|
||||
t1 = v_dotprod(t01, qw0, qdelta_d) + v_dotprod(t11, qw1);
|
||||
t0 = t0 >> W_BITS1;
|
||||
t1 = t1 >> W_BITS1;
|
||||
t0 = v_add(v_dotprod(t00, qw0, qdelta_d), v_dotprod(t10, qw1));
|
||||
t1 = v_add(v_dotprod(t01, qw0, qdelta_d), v_dotprod(t11, qw1));
|
||||
t0 = v_shr<W_BITS1>(t0);
|
||||
t1 = v_shr<W_BITS1>(t1);
|
||||
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
|
||||
v_store(dIptr + 4*2, v00);
|
||||
|
||||
@ -548,18 +548,18 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
|
||||
v_zip(v00, v01, t00, t01);
|
||||
v_zip(v10, v11, t10, t11);
|
||||
|
||||
t0 = v_dotprod(t00, qw0, qdelta) + v_dotprod(t10, qw1);
|
||||
t1 = v_dotprod(t01, qw0, qdelta) + v_dotprod(t11, qw1);
|
||||
t0 = t0 >> (W_BITS1-5);
|
||||
t1 = t1 >> (W_BITS1-5);
|
||||
diff0 = v_pack(t0, t1) - diff0;
|
||||
t0 = v_add(v_dotprod(t00, qw0, qdelta), v_dotprod(t10, qw1));
|
||||
t1 = v_add(v_dotprod(t01, qw0, qdelta), v_dotprod(t11, qw1));
|
||||
t0 = v_shr<W_BITS1 - 5>(t0);
|
||||
t1 = v_shr<W_BITS1 - 5>(t1);
|
||||
diff0 = v_sub(v_pack(t0, t1), diff0);
|
||||
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
|
||||
v00 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
|
||||
v01 = v_reinterpret_as_s16(v_load(dIptr + 8));
|
||||
v_zip(v00, v01, v10, v11);
|
||||
v_zip(diff2, diff1, v00, v01);
|
||||
qb0 += v_cvt_f32(v_dotprod(v00, v10));
|
||||
qb1 += v_cvt_f32(v_dotprod(v01, v11));
|
||||
qb0 = v_add(qb0, v_cvt_f32(v_dotprod(v00, v10)));
|
||||
qb1 = v_add(qb1, v_cvt_f32(v_dotprod(v01, v11)));
|
||||
}
|
||||
#endif
|
||||
|
||||
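In the LK tracker loops above the fixed-point descale moves from operator>> to the template form v_shr<N>(), which keeps the shift amount a compile-time constant across backends. A sketch of the dot-product-and-descale step; the helper name is hypothetical and W_BITS stands in for the descale precision used above.

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// Accumulate two 16-bit dot products (plus rounding delta), descale by W_BITS, repack to s16.
template<int W_BITS>
static inline cv::v_int16x8 dotprod_descale(const cv::v_int16x8& t00, const cv::v_int16x8& t01,
                                            const cv::v_int16x8& t10, const cv::v_int16x8& t11,
                                            const cv::v_int16x8& qw0, const cv::v_int16x8& qw1,
                                            const cv::v_int32x4& qdelta)
{
    using namespace cv;
    v_int32x4 t0 = v_add(v_dotprod(t00, qw0, qdelta), v_dotprod(t10, qw1)); // was: dotprod + dotprod
    v_int32x4 t1 = v_add(v_dotprod(t01, qw0, qdelta), v_dotprod(t11, qw1));
    return v_pack(v_shr<W_BITS>(t0), v_shr<W_BITS>(t1));                    // was: t0 >> W_BITS
}
#endif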
@ -647,7 +647,7 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
|
||||
|
||||
#if CV_SIMD128 && !CV_NEON
|
||||
v_float32x4 qf0, qf1;
|
||||
v_recombine(v_interleave_pairs(qb0 + qb1), v_setzero_f32(), qf0, qf1);
|
||||
v_recombine(v_interleave_pairs(v_add(qb0, qb1)), v_setzero_f32(), qf0, qf1);
|
||||
ib1 += v_reduce_sum(qf0);
|
||||
ib2 += v_reduce_sum(qf1);
|
||||
#endif
|
||||
|
@ -463,22 +463,22 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1,
|
||||
const float *sptr0 = srow[m], *sptr1;
|
||||
v_float32x4 g4 = v_load(simd_kernel);
|
||||
v_float32x4 s0, s1, s2, s3;
|
||||
s0 = v_load(sptr0 + x) * g4;
|
||||
s1 = v_load(sptr0 + x + 4) * g4;
|
||||
s2 = v_load(sptr0 + x + 8) * g4;
|
||||
s3 = v_load(sptr0 + x + 12) * g4;
|
||||
s0 = v_mul(v_load(sptr0 + x), g4);
|
||||
s1 = v_mul(v_load(sptr0 + x + 4), g4);
|
||||
s2 = v_mul(v_load(sptr0 + x + 8), g4);
|
||||
s3 = v_mul(v_load(sptr0 + x + 12), g4);
|
||||
|
||||
for( i = 1; i <= m; i++ )
|
||||
{
|
||||
v_float32x4 x0, x1;
|
||||
sptr0 = srow[m+i], sptr1 = srow[m-i];
|
||||
g4 = v_load(simd_kernel + i*4);
|
||||
x0 = v_load(sptr0 + x) + v_load(sptr1 + x);
|
||||
x1 = v_load(sptr0 + x + 4) + v_load(sptr1 + x + 4);
|
||||
x0 = v_add(v_load(sptr0 + x), v_load(sptr1 + x));
|
||||
x1 = v_add(v_load(sptr0 + x + 4), v_load(sptr1 + x + 4));
|
||||
s0 = v_muladd(x0, g4, s0);
|
||||
s1 = v_muladd(x1, g4, s1);
|
||||
x0 = v_load(sptr0 + x + 8) + v_load(sptr1 + x + 8);
|
||||
x1 = v_load(sptr0 + x + 12) + v_load(sptr1 + x + 12);
|
||||
x0 = v_add(v_load(sptr0 + x + 8), v_load(sptr1 + x + 8));
|
||||
x1 = v_add(v_load(sptr0 + x + 12), v_load(sptr1 + x + 12));
|
||||
s2 = v_muladd(x0, g4, s2);
|
||||
s3 = v_muladd(x1, g4, s3);
|
||||
}
|
||||
@ -493,13 +493,13 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1,
|
||||
{
|
||||
const float *sptr0 = srow[m], *sptr1;
|
||||
v_float32x4 g4 = v_load(simd_kernel);
|
||||
v_float32x4 s0 = v_load(sptr0 + x) * g4;
|
||||
v_float32x4 s0 = v_mul(v_load(sptr0 + x), g4);
|
||||
|
||||
for( i = 1; i <= m; i++ )
|
||||
{
|
||||
sptr0 = srow[m+i], sptr1 = srow[m-i];
|
||||
g4 = v_load(simd_kernel + i*4);
|
||||
v_float32x4 x0 = v_load(sptr0 + x) + v_load(sptr1 + x);
|
||||
v_float32x4 x0 = v_add(v_load(sptr0 + x), v_load(sptr1 + x));
|
||||
s0 = v_muladd(x0, g4, s0);
|
||||
}
|
||||
v_store(vsum + x, s0);
|
||||
@ -528,14 +528,14 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1,
|
||||
for( ; x <= width*5 - 8; x += 8 )
|
||||
{
|
||||
v_float32x4 g4 = v_load(simd_kernel);
|
||||
v_float32x4 s0 = v_load(vsum + x) * g4;
|
||||
v_float32x4 s1 = v_load(vsum + x + 4) * g4;
|
||||
v_float32x4 s0 = v_mul(v_load(vsum + x), g4);
|
||||
v_float32x4 s1 = v_mul(v_load(vsum + x + 4), g4);
|
||||
|
||||
for( i = 1; i <= m; i++ )
|
||||
{
|
||||
g4 = v_load(simd_kernel + i*4);
|
||||
v_float32x4 x0 = v_load(vsum + x - i*5) + v_load(vsum + x+ i*5);
|
||||
v_float32x4 x1 = v_load(vsum + x - i*5 + 4) + v_load(vsum + x+ i*5 + 4);
|
||||
v_float32x4 x0 = v_add(v_load(vsum + x - i * 5), v_load(vsum + x + i * 5));
|
||||
v_float32x4 x1 = v_add(v_load(vsum + x - i * 5 + 4), v_load(vsum + x + i * 5 + 4));
|
||||
s0 = v_muladd(x0, g4, s0);
|
||||
s1 = v_muladd(x1, g4, s1);
|
||||
}
|
||||
|
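The Farneback blur above multiplies the centre tap with v_mul() and folds each mirrored tap pair in with v_muladd(), which already expresses the multiply-add without operators. A horizontal variant of the same pattern as a sketch; the helper name is hypothetical and the caller must keep x +/- m in range.

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// Symmetric 1-D filter: kernel[0]*src[x+l] + sum_i kernel[i]*(src[x+l-i] + src[x+l+i]), 4 lanes at a time.
static inline cv::v_float32x4 sym_filter4(const float* src, int x, const float* kernel, int m)
{
    using namespace cv;
    v_float32x4 s = v_mul(v_load(src + x), v_setall_f32(kernel[0]));        // centre tap
    for (int i = 1; i <= m; i++)
    {
        v_float32x4 pair = v_add(v_load(src + x - i), v_load(src + x + i)); // mirrored taps
        s = v_muladd(pair, v_setall_f32(kernel[i]), s);                     // s += pair * kernel[i]
    }
    return s;
}
#endif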
@ -651,15 +651,15 @@ void VariationalRefinementImpl::ComputeDataTerm_ParBody::operator()(const Range
|
||||
pdU_vec = v_load(pdU + j);
|
||||
pdV_vec = v_load(pdV + j);
|
||||
|
||||
derivNorm_vec = pIx_vec * pIx_vec + pIy_vec * pIy_vec + zeta_vec;
|
||||
Ik1z_vec = pIz_vec + pIx_vec * pdU_vec + pIy_vec * pdV_vec;
|
||||
weight_vec = (delta_vec / v_sqrt(Ik1z_vec * Ik1z_vec / derivNorm_vec + eps_vec)) / derivNorm_vec;
|
||||
derivNorm_vec = v_add(v_add(v_mul(pIx_vec, pIx_vec), v_mul(pIy_vec, pIy_vec)), zeta_vec);
|
||||
Ik1z_vec = v_add(v_add(pIz_vec, v_mul(pIx_vec, pdU_vec)), v_mul(pIy_vec, pdV_vec));
|
||||
weight_vec = v_div(v_div(delta_vec, v_sqrt(v_add(v_div(v_mul(Ik1z_vec, Ik1z_vec), derivNorm_vec), eps_vec))), derivNorm_vec);
|
||||
|
||||
pa11_vec = weight_vec * (pIx_vec * pIx_vec) + zeta_vec;
|
||||
pa12_vec = weight_vec * (pIx_vec * pIy_vec);
|
||||
pa22_vec = weight_vec * (pIy_vec * pIy_vec) + zeta_vec;
|
||||
pb1_vec = zero_vec - weight_vec * (pIz_vec * pIx_vec);
|
||||
pb2_vec = zero_vec - weight_vec * (pIz_vec * pIy_vec);
|
||||
pa11_vec = v_add(v_mul(weight_vec, v_mul(pIx_vec, pIx_vec)), zeta_vec);
|
||||
pa12_vec = v_mul(weight_vec, v_mul(pIx_vec, pIy_vec));
|
||||
pa22_vec = v_add(v_mul(weight_vec, v_mul(pIy_vec, pIy_vec)), zeta_vec);
|
||||
pb1_vec = v_sub(zero_vec, v_mul(weight_vec, v_mul(pIz_vec, pIx_vec)));
|
||||
pb2_vec = v_sub(zero_vec, v_mul(weight_vec, v_mul(pIz_vec, pIy_vec)));
|
||||
|
||||
pIxx_vec = v_load(pIxx + j);
|
||||
pIxy_vec = v_load(pIxy + j);
|
||||
@ -667,18 +667,17 @@ void VariationalRefinementImpl::ComputeDataTerm_ParBody::operator()(const Range
|
||||
pIxz_vec = v_load(pIxz + j);
|
||||
pIyz_vec = v_load(pIyz + j);
|
||||
|
||||
derivNorm_vec = pIxx_vec * pIxx_vec + pIxy_vec * pIxy_vec + zeta_vec;
|
||||
derivNorm2_vec = pIyy_vec * pIyy_vec + pIxy_vec * pIxy_vec + zeta_vec;
|
||||
Ik1zx_vec = pIxz_vec + pIxx_vec * pdU_vec + pIxy_vec * pdV_vec;
|
||||
Ik1zy_vec = pIyz_vec + pIxy_vec * pdU_vec + pIyy_vec * pdV_vec;
|
||||
weight_vec = gamma_vec / v_sqrt(Ik1zx_vec * Ik1zx_vec / derivNorm_vec +
|
||||
Ik1zy_vec * Ik1zy_vec / derivNorm2_vec + eps_vec);
|
||||
derivNorm_vec = v_add(v_add(v_mul(pIxx_vec, pIxx_vec), v_mul(pIxy_vec, pIxy_vec)), zeta_vec);
|
||||
derivNorm2_vec = v_add(v_add(v_mul(pIyy_vec, pIyy_vec), v_mul(pIxy_vec, pIxy_vec)), zeta_vec);
|
||||
Ik1zx_vec = v_add(v_add(pIxz_vec, v_mul(pIxx_vec, pdU_vec)), v_mul(pIxy_vec, pdV_vec));
|
||||
Ik1zy_vec = v_add(v_add(pIyz_vec, v_mul(pIxy_vec, pdU_vec)), v_mul(pIyy_vec, pdV_vec));
|
||||
weight_vec = v_div(gamma_vec, v_sqrt(v_add(v_add(v_div(v_mul(Ik1zx_vec, Ik1zx_vec), derivNorm_vec), v_div(v_mul(Ik1zy_vec, Ik1zy_vec), derivNorm2_vec)), eps_vec)));
|
||||
|
||||
pa11_vec += weight_vec * (pIxx_vec * pIxx_vec / derivNorm_vec + pIxy_vec * pIxy_vec / derivNorm2_vec);
|
||||
pa12_vec += weight_vec * (pIxx_vec * pIxy_vec / derivNorm_vec + pIxy_vec * pIyy_vec / derivNorm2_vec);
|
||||
pa22_vec += weight_vec * (pIxy_vec * pIxy_vec / derivNorm_vec + pIyy_vec * pIyy_vec / derivNorm2_vec);
|
||||
pb1_vec -= weight_vec * (pIxx_vec * pIxz_vec / derivNorm_vec + pIxy_vec * pIyz_vec / derivNorm2_vec);
|
||||
pb2_vec -= weight_vec * (pIxy_vec * pIxz_vec / derivNorm_vec + pIyy_vec * pIyz_vec / derivNorm2_vec);
|
||||
pa11_vec = v_add(pa11_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxx_vec, pIxx_vec), derivNorm_vec), v_div(v_mul(pIxy_vec, pIxy_vec), derivNorm2_vec))));
|
||||
pa12_vec = v_add(pa12_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxx_vec, pIxy_vec), derivNorm_vec), v_div(v_mul(pIxy_vec, pIyy_vec), derivNorm2_vec))));
|
||||
pa22_vec = v_add(pa22_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxy_vec, pIxy_vec), derivNorm_vec), v_div(v_mul(pIyy_vec, pIyy_vec), derivNorm2_vec))));
|
||||
pb1_vec = v_sub(pb1_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxx_vec, pIxz_vec), derivNorm_vec), v_div(v_mul(pIxy_vec, pIyz_vec), derivNorm2_vec))));
|
||||
pb2_vec = v_sub(pb2_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxy_vec, pIxz_vec), derivNorm_vec), v_div(v_mul(pIyy_vec, pIyz_vec), derivNorm2_vec))));
|
||||
|
||||
v_store(pa11 + j, pa11_vec);
|
||||
v_store(pa12 + j, pa12_vec);
|
||||
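The data-term expressions above become deeply nested once every operator turns into a call; splitting them into named temporaries keeps the same arithmetic readable. A sketch of the first weight expression from this hunk, decomposed that way (hypothetical helper, not a change to the patch itself):

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// weight = delta / sqrt(Ikz^2 / (Ix^2 + Iy^2 + zeta) + eps) / (Ix^2 + Iy^2 + zeta)
static inline cv::v_float32x4 data_term_weight(const cv::v_float32x4& Ix, const cv::v_float32x4& Iy,
                                               const cv::v_float32x4& Ikz, const cv::v_float32x4& delta,
                                               const cv::v_float32x4& zeta, const cv::v_float32x4& eps)
{
    using namespace cv;
    v_float32x4 derivNorm = v_add(v_add(v_mul(Ix, Ix), v_mul(Iy, Iy)), zeta);
    v_float32x4 ratio     = v_div(v_mul(Ikz, Ikz), derivNorm);
    v_float32x4 denom     = v_sqrt(v_add(ratio, eps));
    return v_div(v_div(delta, denom), derivNorm);
}
#endif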
@ -850,26 +849,26 @@ void VariationalRefinementImpl::ComputeSmoothnessTermHorPass_ParBody::operator()
|
||||
cW_u_vec = v_load(cW_u + j);
|
||||
cW_v_vec = v_load(cW_v + j);
|
||||
|
||||
ux_vec = v_load(cW_u_next + j) - cW_u_vec;
|
||||
vx_vec = v_load(cW_v_next + j) - cW_v_vec;
|
||||
uy_vec = v_load(cW_u_next_row + j) - cW_u_vec;
|
||||
vy_vec = v_load(cW_v_next_row + j) - cW_v_vec;
|
||||
ux_vec = v_sub(v_load(cW_u_next + j), cW_u_vec);
|
||||
vx_vec = v_sub(v_load(cW_v_next + j), cW_v_vec);
|
||||
uy_vec = v_sub(v_load(cW_u_next_row + j), cW_u_vec);
|
||||
vy_vec = v_sub(v_load(cW_v_next_row + j), cW_v_vec);
|
||||
pWeight_vec =
|
||||
alpha2_vec / v_sqrt(ux_vec * ux_vec + vx_vec * vx_vec + uy_vec * uy_vec + vy_vec * vy_vec + eps_vec);
|
||||
v_div(alpha2_vec, v_sqrt(v_add(v_add(v_add(v_add(v_mul(ux_vec, ux_vec), v_mul(vx_vec, vx_vec)), v_mul(uy_vec, uy_vec)), v_mul(vy_vec, vy_vec)), eps_vec)));
|
||||
v_store(pWeight + j, pWeight_vec);
|
||||
|
||||
ux_vec = pWeight_vec * (v_load(pW_u_next + j) - v_load(pW_u + j));
|
||||
vx_vec = pWeight_vec * (v_load(pW_v_next + j) - v_load(pW_v + j));
|
||||
ux_vec = v_mul(pWeight_vec, v_sub(v_load(pW_u_next + j), v_load(pW_u + j)));
|
||||
vx_vec = v_mul(pWeight_vec, v_sub(v_load(pW_v_next + j), v_load(pW_v + j)));
|
||||
|
||||
v_store(pA_u + j, v_load(pA_u + j) + pWeight_vec);
|
||||
v_store(pA_v + j, v_load(pA_v + j) + pWeight_vec);
|
||||
v_store(pB_u + j, v_load(pB_u + j) + ux_vec);
|
||||
v_store(pB_v + j, v_load(pB_v + j) + vx_vec);
|
||||
v_store(pA_u + j, v_add(v_load(pA_u + j), pWeight_vec));
|
||||
v_store(pA_v + j, v_add(v_load(pA_v + j), pWeight_vec));
|
||||
v_store(pB_u + j, v_add(v_load(pB_u + j), ux_vec));
|
||||
v_store(pB_v + j, v_add(v_load(pB_v + j), vx_vec));
|
||||
|
||||
v_store(pA_u_next + j, v_load(pA_u_next + j) + pWeight_vec);
|
||||
v_store(pA_v_next + j, v_load(pA_v_next + j) + pWeight_vec);
|
||||
v_store(pB_u_next + j, v_load(pB_u_next + j) - ux_vec);
|
||||
v_store(pB_v_next + j, v_load(pB_v_next + j) - vx_vec);
|
||||
v_store(pA_u_next + j, v_add(v_load(pA_u_next + j), pWeight_vec));
|
||||
v_store(pA_v_next + j, v_add(v_load(pA_v_next + j), pWeight_vec));
|
||||
v_store(pB_u_next + j, v_sub(v_load(pB_u_next + j), ux_vec));
|
||||
v_store(pB_v_next + j, v_sub(v_load(pB_v_next + j), vx_vec));
|
||||
}
|
||||
#endif
|
||||
for (; j < len - 1; j++)
|
||||
@ -956,18 +955,18 @@ void VariationalRefinementImpl::ComputeSmoothnessTermVertPass_ParBody::operator(
|
||||
for (; j < len - 3; j += 4)
|
||||
{
|
||||
pWeight_vec = v_load(pWeight + j);
|
||||
uy_vec = pWeight_vec * (v_load(pW_u_next_row + j) - v_load(pW_u + j));
|
||||
vy_vec = pWeight_vec * (v_load(pW_v_next_row + j) - v_load(pW_v + j));
|
||||
uy_vec = v_mul(pWeight_vec, v_sub(v_load(pW_u_next_row + j), v_load(pW_u + j)));
|
||||
vy_vec = v_mul(pWeight_vec, v_sub(v_load(pW_v_next_row + j), v_load(pW_v + j)));
|
||||
|
||||
v_store(pA_u + j, v_load(pA_u + j) + pWeight_vec);
|
||||
v_store(pA_v + j, v_load(pA_v + j) + pWeight_vec);
|
||||
v_store(pB_u + j, v_load(pB_u + j) + uy_vec);
|
||||
v_store(pB_v + j, v_load(pB_v + j) + vy_vec);
|
||||
v_store(pA_u + j, v_add(v_load(pA_u + j), pWeight_vec));
|
||||
v_store(pA_v + j, v_add(v_load(pA_v + j), pWeight_vec));
|
||||
v_store(pB_u + j, v_add(v_load(pB_u + j), uy_vec));
|
||||
v_store(pB_v + j, v_add(v_load(pB_v + j), vy_vec));
|
||||
|
||||
v_store(pA_u_next_row + j, v_load(pA_u_next_row + j) + pWeight_vec);
|
||||
v_store(pA_v_next_row + j, v_load(pA_v_next_row + j) + pWeight_vec);
|
||||
v_store(pB_u_next_row + j, v_load(pB_u_next_row + j) - uy_vec);
|
||||
v_store(pB_v_next_row + j, v_load(pB_v_next_row + j) - vy_vec);
|
||||
v_store(pA_u_next_row + j, v_add(v_load(pA_u_next_row + j), pWeight_vec));
|
||||
v_store(pA_v_next_row + j, v_add(v_load(pA_v_next_row + j), pWeight_vec));
|
||||
v_store(pB_u_next_row + j, v_sub(v_load(pB_u_next_row + j), uy_vec));
|
||||
v_store(pB_v_next_row + j, v_sub(v_load(pB_v_next_row + j), vy_vec));
|
||||
}
|
||||
#endif
|
||||
for (; j < len; j++)
|
||||
@ -1084,15 +1083,13 @@ void VariationalRefinementImpl::RedBlackSOR_ParBody::operator()(const Range &ran
|
||||
pdv_shifted_vec = v_reinterpret_as_f32(
|
||||
v_extract<3>(v_reinterpret_as_s32(pdv_prev_vec), v_reinterpret_as_s32(pdv_next_vec)));
|
||||
|
||||
sigmaU_vec = pW_shifted_vec * pdu_shifted_vec + pW_vec * pdu_next_vec + pW_prev_row_vec * pdu_prev_row_vec +
|
||||
pW_vec * pdu_next_row_vec;
|
||||
sigmaV_vec = pW_shifted_vec * pdv_shifted_vec + pW_vec * pdv_next_vec + pW_prev_row_vec * pdv_prev_row_vec +
|
||||
pW_vec * pdv_next_row_vec;
|
||||
sigmaU_vec = v_add(v_add(v_add(v_mul(pW_shifted_vec, pdu_shifted_vec), v_mul(pW_vec, pdu_next_vec)), v_mul(pW_prev_row_vec, pdu_prev_row_vec)), v_mul(pW_vec, pdu_next_row_vec));
|
||||
sigmaV_vec = v_add(v_add(v_add(v_mul(pW_shifted_vec, pdv_shifted_vec), v_mul(pW_vec, pdv_next_vec)), v_mul(pW_prev_row_vec, pdv_prev_row_vec)), v_mul(pW_vec, pdv_next_row_vec));
|
||||
|
||||
pdu_vec = v_load(pdu + j);
|
||||
pdv_vec = v_load(pdv + j);
|
||||
pdu_vec += omega_vec * ((sigmaU_vec + v_load(pb1 + j) - pdv_vec * pa12_vec) / v_load(pa11 + j) - pdu_vec);
|
||||
pdv_vec += omega_vec * ((sigmaV_vec + v_load(pb2 + j) - pdu_vec * pa12_vec) / v_load(pa22 + j) - pdv_vec);
|
||||
pdu_vec = v_add(pdu_vec, v_mul(omega_vec, v_sub(v_div(v_sub(v_add(sigmaU_vec, v_load(pb1 + j)), v_mul(pdv_vec, pa12_vec)), v_load(pa11 + j)), pdu_vec)));
|
||||
pdv_vec = v_add(pdv_vec, v_mul(omega_vec, v_sub(v_div(v_sub(v_add(sigmaV_vec, v_load(pb2 + j)), v_mul(pdu_vec, pa12_vec)), v_load(pa22 + j)), pdv_vec)));
|
||||
v_store(pdu + j, pdu_vec);
|
||||
v_store(pdv + j, pdv_vec);
|
||||
|
||||
|
@ -38,8 +38,8 @@ int main(int /*argc*/, char** /*argv*/)
|
||||
|
||||
printf("================== arithm check =================\n");
|
||||
v_uint8 a = vx_setall_u8(10);
|
||||
v_uint8 c = a + vx_setall_u8(45);
|
||||
printf("(vx_setall_u8(10) + vx_setall_u8(45)).get0() => %d\n", (int)c.get0());
|
||||
v_uint8 c = v_add(a, vx_setall_u8(45));
|
||||
printf("v_get0(vx_setall_u8(10) + vx_setall_u8(45)) => %d\n", (int)v_get0(c));
|
||||
#else
|
||||
printf("\nSIMD intrinsics are not available. Check compilation target and passed build options.\n");
|
||||
#endif
|
||||
|
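For the sample above, the whole check fits in a few lines once the new names (v_add, v_get0, vx_setall_u8) are in place; a stand-alone version might look like the sketch below. The guard and file layout here are assumptions, not the sample's actual ones.

#include <cstdio>
#include <opencv2/core/hal/intrin.hpp>

int main()
{
#if CV_SIMD || CV_SIMD_SCALABLE
    using namespace cv;
    v_uint8 a = vx_setall_u8(10);
    v_uint8 c = v_add(a, vx_setall_u8(45));           // was: a + vx_setall_u8(45)
    std::printf("first lane = %d\n", (int)v_get0(c)); // was: c.get0()
#else
    std::printf("SIMD intrinsics are not available.\n");
#endif
    return 0;
}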
@ -85,7 +85,7 @@ void conv1dsimd(Mat src, Mat kernel, float *ans, int row = 0, int rowk = 0, int
|
||||
|
||||
//! [convolution-1D-main]
|
||||
//! [convolution-1D-main-h1]
|
||||
int step = v_float32().nlanes;
|
||||
int step = VTraits<v_float32x4>::vlanes();
|
||||
float *sptr = src_32.ptr<float>(row), *kptr = kernel.ptr<float>(rowk);
|
||||
for (int k = 0; k < ksize; k++)
|
||||
{
|
||||
@ -96,7 +96,7 @@ void conv1dsimd(Mat src, Mat kernel, float *ans, int row = 0, int rowk = 0, int
|
||||
for (i = 0; i + step < len; i += step)
|
||||
{
|
||||
v_float32 window = vx_load(sptr + i + k);
|
||||
v_float32 sum = vx_load(ans + i) + kernel_wide * window;
|
||||
v_float32 sum = v_add(vx_load(ans + i), v_mul(kernel_wide, window));
|
||||
v_store(ans + i, sum);
|
||||
}
|
||||
//! [convolution-1D-main-h2]
|
||||
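The tutorial loop above accumulates with v_add(vx_load(ans + i), v_mul(kernel_wide, window)); the same step can also be expressed with v_muladd() so the multiply and the add stay one call. A small sketch of that single tap with hypothetical names:

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD || CV_SIMD_SCALABLE
// ans[i .. i+vlanes) += k * src[i .. i+vlanes)
static inline void accumulate_tap(float* ans, const float* src, int i, float k)
{
    using namespace cv;
    v_float32 window = vx_load(src + i);
    v_store(ans + i, v_muladd(vx_setall_f32(k), window, vx_load(ans + i)));
}
#endif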
@ -122,7 +122,7 @@ void convolute_simd(Mat src, Mat &dst, Mat kernel)
|
||||
|
||||
copyMakeBorder(src, src, sz, sz, 0, 0, BORDER_REPLICATE);
|
||||
|
||||
int step = v_float32().nlanes;
|
||||
int step = VTraits<v_float32x4>::vlanes();
|
||||
//! [convolution-2D-init]
|
||||
|
||||
//! [convolution-2D-main]
|
||||
@ -135,7 +135,7 @@ void convolute_simd(Mat src, Mat &dst, Mat kernel)
|
||||
int j;
|
||||
for (j = 0; j + step < cols; j += step)
|
||||
{
|
||||
v_float32 sum = vx_load(&dst.ptr<float>(i)[j]) + vx_load(&ans[j]);
|
||||
v_float32 sum = v_add(vx_load(&dst.ptr<float>(i)[j]), vx_load(&ans[j]));
|
||||
v_store(&dst.ptr<float>(i)[j], sum);
|
||||
}
|
||||
|
||||
|