Mirror of https://github.com/opencv/opencv.git (synced 2025-06-07 09:25:45 +08:00)
Wide univ intrinsics (#11953)
* core:OE-27 prepare universal intrinsics to expand (#11022)
* core: Add universal intrinsics for AVX2
* updated implementation of wide universal intrinsics; converted several OpenCV HAL functions (sqrt, invsqrt, magnitude, phase, exp) to the wide universal intrinsics
* converted log to universal intrinsics; cleaned up the code a bit; added v_lut_deinterleave intrinsics
* core: Add universal intrinsics for AVX2
* fixed multiple compile errors
* fixed many more compile errors and hopefully some test failures
* fixed some more compile errors
* temporarily disabled IPP to debug exp & log; hopefully fixed Doxygen complaints
* fixed some more compile errors
* fixed v_store(short*, v_float16&) signatures
* trying to fix the test failures on Linux
* fixed some issues found by alalek
* restored IPP optimization after the patch with AVX wide intrinsics has been properly tested
This commit is contained in:
parent
481829a81b
commit
f058b5fb1e
@@ -60,255 +60,72 @@
// access from within opencv code more accessible
namespace cv {

template<typename _Tp> struct V_TypeTraits
{
};

#define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_, nlanes128_) \
    template<> struct V_TypeTraits<type> \
    { \
        typedef type value_type; \
        typedef int_type_ int_type; \
        typedef abs_type_ abs_type; \
        typedef uint_type_ uint_type; \
        typedef w_type_ w_type; \
        typedef q_type_ q_type; \
        typedef sum_type_ sum_type; \
        enum { nlanes128 = nlanes128_ }; \
        \
        static inline int_type reinterpret_int(type x) \
        { \
            union { type l; int_type i; } v; \
            v.l = x; \
            return v.i; \
        } \
        \
        static inline type reinterpret_from_int(int_type x) \
        { \
            union { type l; int_type i; } v; \
            v.i = x; \
            return v.l; \
        } \
    }

CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned, 16);
CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int, 16);
CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned, 8);
CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int, 8);
CV_INTRIN_DEF_TYPE_TRAITS(unsigned, int, unsigned, unsigned, uint64, void, unsigned, 4);
CV_INTRIN_DEF_TYPE_TRAITS(int, int, unsigned, unsigned, int64, void, int, 4);
CV_INTRIN_DEF_TYPE_TRAITS(float, int, unsigned, float, double, void, float, 4);
CV_INTRIN_DEF_TYPE_TRAITS(uint64, int64, uint64, uint64, void, void, uint64, 2);
CV_INTRIN_DEF_TYPE_TRAITS(int64, int64, uint64, uint64, void, void, int64, 2);
CV_INTRIN_DEF_TYPE_TRAITS(double, int64, uint64, double, void, void, double, 2);
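
A minimal sketch (not part of this patch) of how code typically consumes these traits: picking the widened accumulator type for a lane type, and doing a bit-exact float/int round trip through the union-based helpers the macro generates:

    // widened accumulator type for a lane type:
    typedef cv::V_TypeTraits<uchar>::w_type acc_t;                   // ushort
    // bit-exact float <-> int reinterpretation (no conversion):
    int bits = cv::V_TypeTraits<float>::reinterpret_int(1.0f);       // 0x3f800000
    float f = cv::V_TypeTraits<float>::reinterpret_from_int(bits);   // 1.0f again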

#ifndef CV_DOXYGEN

#ifdef CV_CPU_DISPATCH_MODE
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE)
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#else
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#endif

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#endif

//! @addtogroup core_hal_intrin
//! @{

//! @cond IGNORED
template<typename _Tp> struct V_TypeTraits
{
    typedef _Tp int_type;
    typedef _Tp uint_type;
    typedef _Tp abs_type;
    typedef _Tp sum_type;

    enum { delta = 0, shift = 0 };

    static int_type reinterpret_int(_Tp x) { return x; }
    static uint_type reinterpret_uint(_Tp x) { return x; }
    static _Tp reinterpret_from_int(int_type x) { return (_Tp)x; }
};

template<> struct V_TypeTraits<uchar>
{
    typedef uchar value_type;
    typedef schar int_type;
    typedef uchar uint_type;
    typedef uchar abs_type;
    typedef int sum_type;

    typedef ushort w_type;
    typedef unsigned q_type;

    enum { delta = 128, shift = 8 };

    static int_type reinterpret_int(value_type x) { return (int_type)x; }
    static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
    static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};

template<> struct V_TypeTraits<schar>
{
    typedef schar value_type;
    typedef schar int_type;
    typedef uchar uint_type;
    typedef uchar abs_type;
    typedef int sum_type;

    typedef short w_type;
    typedef int q_type;

    enum { delta = 128, shift = 8 };

    static int_type reinterpret_int(value_type x) { return (int_type)x; }
    static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
    static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};

template<> struct V_TypeTraits<ushort>
{
    typedef ushort value_type;
    typedef short int_type;
    typedef ushort uint_type;
    typedef ushort abs_type;
    typedef int sum_type;

    typedef unsigned w_type;
    typedef uchar nu_type;

    enum { delta = 32768, shift = 16 };

    static int_type reinterpret_int(value_type x) { return (int_type)x; }
    static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
    static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};

template<> struct V_TypeTraits<short>
{
    typedef short value_type;
    typedef short int_type;
    typedef ushort uint_type;
    typedef ushort abs_type;
    typedef int sum_type;

    typedef int w_type;
    typedef uchar nu_type;
    typedef schar n_type;

    enum { delta = 128, shift = 8 };

    static int_type reinterpret_int(value_type x) { return (int_type)x; }
    static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
    static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};

template<> struct V_TypeTraits<unsigned>
{
    typedef unsigned value_type;
    typedef int int_type;
    typedef unsigned uint_type;
    typedef unsigned abs_type;
    typedef unsigned sum_type;

    typedef uint64 w_type;
    typedef ushort nu_type;

    static int_type reinterpret_int(value_type x) { return (int_type)x; }
    static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
    static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};

template<> struct V_TypeTraits<int>
{
    typedef int value_type;
    typedef int int_type;
    typedef unsigned uint_type;
    typedef unsigned abs_type;
    typedef int sum_type;

    typedef int64 w_type;
    typedef short n_type;
    typedef ushort nu_type;

    static int_type reinterpret_int(value_type x) { return (int_type)x; }
    static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
    static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};

template<> struct V_TypeTraits<uint64>
{
    typedef uint64 value_type;
    typedef int64 int_type;
    typedef uint64 uint_type;
    typedef uint64 abs_type;
    typedef uint64 sum_type;

    typedef unsigned nu_type;

    static int_type reinterpret_int(value_type x) { return (int_type)x; }
    static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
    static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};

template<> struct V_TypeTraits<int64>
{
    typedef int64 value_type;
    typedef int64 int_type;
    typedef uint64 uint_type;
    typedef uint64 abs_type;
    typedef int64 sum_type;

    typedef int nu_type;

    static int_type reinterpret_int(value_type x) { return (int_type)x; }
    static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
    static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};

template<> struct V_TypeTraits<float>
{
    typedef float value_type;
    typedef int int_type;
    typedef unsigned uint_type;
    typedef float abs_type;
    typedef float sum_type;

    typedef double w_type;

    static int_type reinterpret_int(value_type x)
    {
        Cv32suf u;
        u.f = x;
        return u.i;
    }
    static uint_type reinterpret_uint(value_type x)
    {
        Cv32suf u;
        u.f = x;
        return u.u;
    }
    static value_type reinterpret_from_int(int_type x)
    {
        Cv32suf u;
        u.i = x;
        return u.f;
    }
};

template<> struct V_TypeTraits<double>
{
    typedef double value_type;
    typedef int64 int_type;
    typedef uint64 uint_type;
    typedef double abs_type;
    typedef double sum_type;
    static int_type reinterpret_int(value_type x)
    {
        Cv64suf u;
        u.f = x;
        return u.i;
    }
    static uint_type reinterpret_uint(value_type x)
    {
        Cv64suf u;
        u.f = x;
        return u.u;
    }
    static value_type reinterpret_from_int(int_type x)
    {
        Cv64suf u;
        u.i = x;
        return u.f;
    }
};

template <typename T> struct V_SIMD128Traits
{
    enum { nlanes = 16 / sizeof(T) };
};

//! @endcond

//! @}

#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
#endif
}

#ifdef CV_DOXYGEN
#   undef CV_AVX2
#   undef CV_SSE2
#   undef CV_NEON
#   undef CV_VSX
#   undef CV_FP16
#endif

#if CV_SSE2

@@ -325,27 +142,25 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

#else

#define CV_SIMD128_CPP 1
#include "opencv2/core/hal/intrin_cpp.hpp"

#endif

//! @addtogroup core_hal_intrin
//! @{
// AVX2 can be used together with SSE2, so
// we define those two sets of intrinsics at once.
// Most of the intrinsics do not conflict (the proper overloaded variant is
// resolved by the argument types, e.g. v_float32x4 ~ SSE2, v_float32x8 ~ AVX2),
// but some of the AVX2 intrinsics get a v256_ prefix instead of v_, e.g. v256_load() vs v_load().
// Correspondingly, the wide intrinsics (which are mapped to the "widest"
// available instruction set) get a vx_ prefix
// (and are mapped to the v256_ counterparts), e.g. vx_load() => v256_load().
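
To make the three naming levels concrete, a small sketch (not part of the diff) assuming an AVX2 build where CV_SIMD256 is set and ptr is a const float*:

    cv::v_float32x4 a = cv::v_load(ptr);       // fixed 128-bit register, SSE2 overload
    cv::v_float32x8 b = cv::v256_load(ptr);    // fixed 256-bit register, AVX2 v256_ name
    cv::v_float32   c = cv::vx_load(ptr);      // "wide" alias: v256_load() here,
                                               // v_load() on an SSE-only build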
#if CV_AVX2

#include "opencv2/core/hal/intrin_avx.hpp"

#ifndef CV_SIMD128
//! Set to 1 if current compiler supports vector extensions (NEON or SSE is enabled)
#define CV_SIMD128 0
#endif

#ifndef CV_SIMD128_64F
//! Set to 1 if current intrinsics implementation supports 64-bit float vectors
#define CV_SIMD128_64F 0
#endif

//! @}

//==================================================================================================

//! @cond IGNORED

namespace cv {
@@ -354,88 +169,175 @@ namespace cv {
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#endif

template <typename R> struct V_RegTrait128;
#ifndef CV_SIMD128
#define CV_SIMD128 0
#endif

template <> struct V_RegTrait128<uchar> {
    typedef v_uint8x16 reg;
    typedef v_uint16x8 w_reg;
    typedef v_uint32x4 q_reg;
    typedef v_uint8x16 u_reg;
    static v_uint8x16 zero() { return v_setzero_u8(); }
    static v_uint8x16 all(uchar val) { return v_setall_u8(val); }
};
#ifndef CV_SIMD128_64F
#define CV_SIMD128_64F 0
#endif

#ifndef CV_SIMD256
#define CV_SIMD256 0
#endif

#ifndef CV_SIMD256_64F
#define CV_SIMD256_64F 0
#endif

#ifndef CV_SIMD512
#define CV_SIMD512 0
#endif

#ifndef CV_SIMD512_64F
#define CV_SIMD512_64F 0
#endif

#if CV_SIMD512
#define CV_SIMD 1
#define CV_SIMD_64F CV_SIMD512_64F
#define CV_SIMD_WIDTH 64
#elif CV_SIMD256
#define CV_SIMD 1
#define CV_SIMD_64F CV_SIMD256_64F
#define CV_SIMD_WIDTH 32
#else
#define CV_SIMD CV_SIMD128
#define CV_SIMD_64F CV_SIMD128_64F
#define CV_SIMD_WIDTH 16
#endif
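
A small sketch (not from the patch) of how these width macros are typically consumed: stepping through a float buffer by the widest available register, with a scalar tail loop. v_float32::nlanes equals CV_SIMD_WIDTH / sizeof(float):

    #if CV_SIMD
    void scale_sketch(float* buf, int n, float k)
    {
        cv::v_float32 vk = cv::vx_setall_f32(k);
        int i = 0;
        for (; i <= n - cv::v_float32::nlanes; i += cv::v_float32::nlanes)
            cv::vx_store(buf + i, cv::vx_load(buf + i) * vk);  // widest registers
        for (; i < n; i++)
            buf[i] *= k;                                       // scalar tail
    }
    #endif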

//==================================================================================================

#define CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
    inline vtyp vx_setall_##short_typ(typ v) { return prefix##_setall_##short_typ(v); } \
    inline vtyp vx_setzero_##short_typ() { return prefix##_setzero_##short_typ(); } \
    inline vtyp vx_##loadsfx(const typ* ptr) { return prefix##_##loadsfx(ptr); } \
    inline vtyp vx_##loadsfx##_aligned(const typ* ptr) { return prefix##_##loadsfx##_aligned(ptr); } \
    inline void vx_store(typ* ptr, const vtyp& v) { return v_store(ptr, v); } \
    inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); }

#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \
    inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); }

#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) \
    inline qtyp vx_load_expand_q(const typ* ptr) { return prefix##_load_expand_q(ptr); }

#define CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(typ, vtyp, short_typ, wtyp, qtyp, prefix, loadsfx) \
    CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
    CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \
    CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix)

#define CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(prefix) \
    CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(uchar, v_uint8, u8, v_uint16, v_uint32, prefix, load) \
    CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(schar, v_int8, s8, v_int16, v_int32, prefix, load) \
    CV_INTRIN_DEFINE_WIDE_INTRIN(ushort, v_uint16, u16, prefix, load) \
    CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(ushort, v_uint32, prefix) \
    CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_int16, s16, prefix, load) \
    CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(short, v_int32, prefix) \
    CV_INTRIN_DEFINE_WIDE_INTRIN(int, v_int32, s32, prefix, load) \
    CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(int, v_int64, prefix) \
    CV_INTRIN_DEFINE_WIDE_INTRIN(unsigned, v_uint32, u32, prefix, load) \
    CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(unsigned, v_uint64, prefix) \
    CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, prefix, load) \
    CV_INTRIN_DEFINE_WIDE_INTRIN(int64, v_int64, s64, prefix, load) \
    CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load)
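
As a sketch of what this macro machinery generates, here is CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, v256, load) expanded by hand (for illustration only):

    inline v_float32 vx_setall_f32(float v) { return v256_setall_f32(v); }
    inline v_float32 vx_setzero_f32() { return v256_setzero_f32(); }
    inline v_float32 vx_load(const float* ptr) { return v256_load(ptr); }
    inline v_float32 vx_load_aligned(const float* ptr) { return v256_load_aligned(ptr); }
    inline void vx_store(float* ptr, const v_float32& v) { return v_store(ptr, v); }
    inline void vx_store_aligned(float* ptr, const v_float32& v) { return v_store_aligned(ptr, v); }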

template<typename _Tp> struct V_RegTraits
{
};

template <> struct V_RegTrait128<schar> {
    typedef v_int8x16 reg;
    typedef v_int16x8 w_reg;
    typedef v_int32x4 q_reg;
    typedef v_uint8x16 u_reg;
    static v_int8x16 zero() { return v_setzero_s8(); }
    static v_int8x16 all(schar val) { return v_setall_s8(val); }
};

template <> struct V_RegTrait128<ushort> {
    typedef v_uint16x8 reg;
    typedef v_uint32x4 w_reg;
    typedef v_int16x8 int_reg;
    typedef v_uint16x8 u_reg;
    static v_uint16x8 zero() { return v_setzero_u16(); }
    static v_uint16x8 all(ushort val) { return v_setall_u16(val); }
};

template <> struct V_RegTrait128<short> {
    typedef v_int16x8 reg;
    typedef v_int32x4 w_reg;
    typedef v_uint16x8 u_reg;
    static v_int16x8 zero() { return v_setzero_s16(); }
    static v_int16x8 all(short val) { return v_setall_s16(val); }
};

template <> struct V_RegTrait128<unsigned> {
    typedef v_uint32x4 reg;
    typedef v_uint64x2 w_reg;
    typedef v_int32x4 int_reg;
    typedef v_uint32x4 u_reg;
    static v_uint32x4 zero() { return v_setzero_u32(); }
    static v_uint32x4 all(unsigned val) { return v_setall_u32(val); }
};

template <> struct V_RegTrait128<int> {
    typedef v_int32x4 reg;
    typedef v_int64x2 w_reg;
    typedef v_uint32x4 u_reg;
    static v_int32x4 zero() { return v_setzero_s32(); }
    static v_int32x4 all(int val) { return v_setall_s32(val); }
};

template <> struct V_RegTrait128<uint64> {
    typedef v_uint64x2 reg;
    static v_uint64x2 zero() { return v_setzero_u64(); }
    static v_uint64x2 all(uint64 val) { return v_setall_u64(val); }
};

template <> struct V_RegTrait128<int64> {
    typedef v_int64x2 reg;
    static v_int64x2 zero() { return v_setzero_s64(); }
    static v_int64x2 all(int64 val) { return v_setall_s64(val); }
};

template <> struct V_RegTrait128<float> {
    typedef v_float32x4 reg;
    typedef v_int32x4 int_reg;
    typedef v_float32x4 u_reg;
    static v_float32x4 zero() { return v_setzero_f32(); }
    static v_float32x4 all(float val) { return v_setall_f32(val); }
};
#define CV_DEF_REG_TRAITS(prefix, _reg, lane_type, suffix, _u_reg, _w_reg, _q_reg, _int_reg, _round_reg) \
    template<> struct V_RegTraits<_reg> \
    { \
        typedef _reg reg; \
        typedef _u_reg u_reg; \
        typedef _w_reg w_reg; \
        typedef _q_reg q_reg; \
        typedef _int_reg int_reg; \
        typedef _round_reg round_reg; \
    }
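
A small usage sketch (assumed, not from the patch) of the new register-keyed traits. The key is now the register type rather than the lane type, which is what lets one template serve 128- and 256-bit registers alike:

    template<typename R> inline typename V_RegTraits<R>::w_reg
    widen_low_sketch(const R& a)
    {
        typename V_RegTraits<R>::w_reg lo, hi;
        v_expand(a, lo, hi);   // widen each lane to the next wider type
        return lo;
    }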

#if CV_SIMD128 || CV_SIMD128_CPP
CV_DEF_REG_TRAITS(v, v_uint8x16, uchar, u8, v_uint8x16, v_uint16x8, v_uint32x4, v_int8x16, void);
CV_DEF_REG_TRAITS(v, v_int8x16, schar, s8, v_uint8x16, v_int16x8, v_int32x4, v_int8x16, void);
CV_DEF_REG_TRAITS(v, v_uint16x8, ushort, u16, v_uint16x8, v_uint32x4, v_uint64x2, v_int16x8, void);
CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void);
CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void);
CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void);
#if CV_SIMD128_64F
template <> struct V_RegTrait128<double> {
    typedef v_float64x2 reg;
    typedef v_int32x4 int_reg;
    typedef v_float64x2 u_reg;
    static v_float64x2 zero() { return v_setzero_f64(); }
    static v_float64x2 all(double val) { return v_setall_f64(val); }
};
CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, v_float64x2, void, v_int32x4, v_int32x4);
#else
CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, void, void, v_int32x4, v_int32x4);
#endif
CV_DEF_REG_TRAITS(v, v_uint64x2, uint64, u64, v_uint64x2, void, void, v_int64x2, void);
CV_DEF_REG_TRAITS(v, v_int64x2, int64, s64, v_uint64x2, void, void, v_int64x2, void);
#if CV_SIMD128_64F
CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4);
#endif
#if CV_FP16
CV_DEF_REG_TRAITS(v, v_float16x8, short, f16, v_float32x4, void, void, v_int16x8, v_int16x8);
#endif
#endif

#if CV_SIMD256
CV_DEF_REG_TRAITS(v256, v_uint8x32, uchar, u8, v_uint8x32, v_uint16x16, v_uint32x8, v_int8x32, void);
CV_DEF_REG_TRAITS(v256, v_int8x32, schar, s8, v_uint8x32, v_int16x16, v_int32x8, v_int8x32, void);
CV_DEF_REG_TRAITS(v256, v_uint16x16, ushort, u16, v_uint16x16, v_uint32x8, v_uint64x4, v_int16x16, void);
CV_DEF_REG_TRAITS(v256, v_int16x16, short, s16, v_uint16x16, v_int32x8, v_int64x4, v_int16x16, void);
CV_DEF_REG_TRAITS(v256, v_uint32x8, unsigned, u32, v_uint32x8, v_uint64x4, void, v_int32x8, void);
CV_DEF_REG_TRAITS(v256, v_int32x8, int, s32, v_uint32x8, v_int64x4, void, v_int32x8, void);
CV_DEF_REG_TRAITS(v256, v_float32x8, float, f32, v_float32x8, v_float64x4, void, v_int32x8, v_int32x8);
CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void);
CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void);
CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8);
#if CV_FP16
CV_DEF_REG_TRAITS(v256, v_float16x16, short, f16, v_float32x8, void, void, v_int16x16, void);
#endif
#endif

#if CV_SIMD256
typedef v_uint8x32 v_uint8;
typedef v_int8x32 v_int8;
typedef v_uint16x16 v_uint16;
typedef v_int16x16 v_int16;
typedef v_uint32x8 v_uint32;
typedef v_int32x8 v_int32;
typedef v_uint64x4 v_uint64;
typedef v_int64x4 v_int64;
typedef v_float32x8 v_float32;
#if CV_SIMD256_64F
typedef v_float64x4 v_float64;
#endif
#if CV_FP16
typedef v_float16x16 v_float16;
CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v256, load_f16)
#endif
CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256)
CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load)
inline void vx_cleanup() { v256_cleanup(); }
#elif CV_SIMD128
typedef v_uint8x16 v_uint8;
typedef v_int8x16 v_int8;
typedef v_uint16x8 v_uint16;
typedef v_int16x8 v_int16;
typedef v_uint32x4 v_uint32;
typedef v_int32x4 v_int32;
typedef v_uint64x2 v_uint64;
typedef v_int64x2 v_int64;
typedef v_float32x4 v_float32;
#if CV_SIMD128_64F
typedef v_float64x2 v_float64;
#endif
#if CV_FP16
typedef v_float16x8 v_float16;
CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v, load_f16)
#endif
CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v)
#if CV_SIMD128_64F
CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load)
#endif
inline void vx_cleanup() { v_cleanup(); }
#endif

inline unsigned int trailingZeros32(unsigned int value) {
modules/core/include/opencv2/core/hal/intrin_avx.hpp (new file, 2016 lines)
File diff suppressed because it is too large
@@ -247,8 +247,6 @@ template<typename _Tp, int n> struct v_reg
{
//! @cond IGNORED
    typedef _Tp lane_type;
    typedef v_reg<typename V_TypeTraits<_Tp>::int_type, n> int_vec;
    typedef v_reg<typename V_TypeTraits<_Tp>::abs_type, n> abs_vec;
    enum { nlanes = n };
// !@endcond

@@ -797,11 +795,11 @@ inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>

/** @brief Multiply and add

Returns \f$ a*b + c \f$
For floating point types and signed 32bit int only. */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                              const v_reg<_Tp, n>& c)
inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                           const v_reg<_Tp, n>& c)
{
    v_reg<_Tp, n> d;
    for( int i = 0; i < n; i++ )
@@ -809,6 +807,14 @@ inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
    return d;
}

/** @brief A synonym for v_fma */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                              const v_reg<_Tp, n>& c)
{
    return v_fma(a, b, c);
}
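
An illustrative scalar sketch (not from the patch) of what v_fma/v_muladd compute per lane through the reference implementation above, d[i] = a[i]*b[i] + c[i]:

    float a[4] = {1, 2, 3, 4}, b[4] = {10, 10, 10, 10}, c[4] = {0.5f, 0.5f, 0.5f, 0.5f};
    cv::v_float32x4 va = cv::v_load(a), vb = cv::v_load(b), vc = cv::v_load(c);
    cv::v_float32x4 vd = cv::v_fma(va, vb, vc);   // {10.5, 20.5, 30.5, 40.5}
    float d[4];
    cv::v_store(d, vd);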

/** @brief Dot product of elements

Multiply values in two registers and sum adjacent result pairs.
@@ -1141,9 +1147,9 @@ template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const
@note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.
*/
template<typename _Tp>
inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr)
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load(const _Tp* ptr)
{
    return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
    return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
}

/** @brief Load register contents from memory (aligned)
@@ -1151,9 +1157,9 @@ inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr)
similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary)
*/
template<typename _Tp>
inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_aligned(const _Tp* ptr)
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr)
{
    return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
    return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
}

/** @brief Load 64-bits of data to lower part (high part is undefined).
@@ -1166,9 +1172,9 @@ v_int32x4 r = v_load_low(lo);
@endcode
*/
template<typename _Tp>
inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_low(const _Tp* ptr)
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr)
{
    v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c;
    v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
    for( int i = 0; i < c.nlanes/2; i++ )
    {
        c.s[i] = ptr[i];
@@ -1187,9 +1193,9 @@ v_int32x4 r = v_load_halves(lo, hi);
@endcode
*/
template<typename _Tp>
inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
{
    v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c;
    v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
    for( int i = 0; i < c.nlanes/2; i++ )
    {
        c.s[i] = loptr[i];
@@ -1208,11 +1214,11 @@ v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32
@endcode
For 8-, 16-, 32-bit integer source types. */
template<typename _Tp>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_SIMD128Traits<_Tp>::nlanes / 2>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_TypeTraits<_Tp>::nlanes128 / 2>
v_load_expand(const _Tp* ptr)
{
    typedef typename V_TypeTraits<_Tp>::w_type w_type;
    v_reg<w_type, V_SIMD128Traits<w_type>::nlanes> c;
    v_reg<w_type, V_TypeTraits<w_type>::nlanes128> c;
    for( int i = 0; i < c.nlanes; i++ )
    {
        c.s[i] = ptr[i];
@@ -1229,11 +1235,11 @@ v_int32x4 r = v_load_q(buf); // r = {1, 2, 3, 4} - type is int32
@endcode
For 8-bit integer source types. */
template<typename _Tp>
inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_SIMD128Traits<_Tp>::nlanes / 4>
inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_TypeTraits<_Tp>::nlanes128 / 4>
v_load_expand_q(const _Tp* ptr)
{
    typedef typename V_TypeTraits<_Tp>::q_type q_type;
    v_reg<q_type, V_SIMD128Traits<q_type>::nlanes> c;
    v_reg<q_type, V_TypeTraits<q_type>::nlanes128> c;
    for( int i = 0; i < c.nlanes; i++ )
    {
        c.s[i] = ptr[i];
@@ -1622,6 +1628,17 @@ template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
    return c;
}

template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, const v_reg<double, n>& b)
{
    v_reg<float, n*2> c;
    for( int i = 0; i < n; i++ )
    {
        c.s[i] = (float)a.s[i];
        c.s[i+n] = (float)b.s[i];
    }
    return c;
}

/** @brief Convert to double

Supported input type is cv::v_int32x4. */
@@ -1644,6 +1661,52 @@ template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
    return c;
}

template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>& idx)
{
    v_reg<int, n> c;
    for( int i = 0; i < n; i++ )
        c.s[i] = tab[idx.s[i]];
    return c;
}

template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int, n>& idx)
{
    v_reg<float, n> c;
    for( int i = 0; i < n; i++ )
        c.s[i] = tab[idx.s[i]];
    return c;
}

template<int n> inline v_reg<double, n> v_lut(const double* tab, const v_reg<int, n*2>& idx)
{
    v_reg<double, n> c;
    for( int i = 0; i < n; i++ )
        c.s[i] = tab[idx.s[i]];
    return c;
}

template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx,
                                               v_reg<float, n>& x, v_reg<float, n>& y)
{
    for( int i = 0; i < n; i++ )
    {
        int j = idx.s[i];
        x.s[i] = tab[j];
        y.s[i] = tab[j+1];
    }
}

template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg<int, n*2>& idx,
                                               v_reg<double, n>& x, v_reg<double, n>& y)
{
    for( int i = 0; i < n; i++ )
    {
        int j = idx.s[i];
        x.s[i] = tab[j];
        y.s[i] = tab[j+1];
    }
}
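
A usage sketch (illustrative values, not from the patch) of v_lut_deinterleave: gathering interleaved (x, y) pairs, e.g. complex values or 2D points, by element index:

    // tab holds interleaved pairs {x0, y0, x1, y1, ...};
    // idx holds the indices of the x elements (even offsets into tab)
    float tab[8] = {1, 10, 2, 20, 3, 30, 4, 40};
    cv::v_int32x4 idx(0, 2, 4, 6);
    cv::v_float32x4 x, y;
    cv::v_lut_deinterleave(tab, idx, x, y);  // x = {1,2,3,4}, y = {10,20,30,40}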

/** @brief Transpose 4x4 matrix

Scheme:
@@ -1968,6 +2031,8 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                       v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]);
}

inline void v_cleanup() {}

//! @}

//! @name Check SIMD support

@@ -280,11 +280,29 @@ struct v_float64x2

#if CV_FP16
// Workaround for old compilers
template <typename T> static inline int16x4_t vreinterpret_s16_f16(T a)
{ return (int16x4_t)a; }
template <typename T> static inline float16x4_t vreinterpret_f16_s16(T a)
{ return (float16x4_t)a; }
template <typename T> static inline float16x4_t cv_vld1_f16(const T* ptr)
static inline int16x8_t vreinterpretq_s16_f16(float16x8_t a) { return (int16x8_t)a; }
static inline float16x8_t vreinterpretq_f16_s16(int16x8_t a) { return (float16x8_t)a; }
static inline int16x4_t vreinterpret_s16_f16(float16x4_t a) { return (int16x4_t)a; }
static inline float16x4_t vreinterpret_f16_s16(int16x4_t a) { return (float16x4_t)a; }

static inline float16x8_t cv_vld1q_f16(const void* ptr)
{
#ifndef vld1q_f16 // APPLE compiler defines vld1_f16 as macro
    return vreinterpretq_f16_s16(vld1q_s16((const short*)ptr));
#else
    return vld1q_f16((const __fp16*)ptr);
#endif
}
static inline void cv_vst1q_f16(void* ptr, float16x8_t a)
{
#ifndef vst1q_f16 // APPLE compiler defines vst1_f16 as macro
    vst1q_s16((short*)ptr, vreinterpretq_s16_f16(a));
#else
    vst1q_f16((__fp16*)ptr, a);
#endif
}

static inline float16x4_t cv_vld1_f16(const void* ptr)
{
#ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
    return vreinterpret_f16_s16(vld1_s16((const short*)ptr));
@@ -292,7 +310,7 @@ template <typename T> static inline float16x4_t cv_vld1_f16(const T* ptr)
    return vld1_f16((const __fp16*)ptr);
#endif
}
template <typename T> static inline void cv_vst1_f16(T* ptr, float16x4_t a)
static inline void cv_vst1_f16(void* ptr, float16x4_t a)
{
#ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
    vst1_s16((short*)ptr, vreinterpret_s16_f16(a));
@@ -301,24 +319,28 @@ template <typename T> static inline void cv_vst1_f16(T* ptr, float16x4_t a)
#endif
}
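
The #ifndef trick above works because on Apple's toolchain the NEON FP16 load/store names are object-like macros, so they are visible to the preprocessor; elsewhere the code falls back to 16-bit integer loads plus a bit-cast. A rough round-trip sketch (assumes a NEON build with CV_FP16):

    short buf[8];                        // raw FP16 storage declared as 16-bit ints
    float16x8_t v = cv_vld1q_f16(buf);   // load 8 half floats through the wrapper
    cv_vst1q_f16(buf, v);                // and store them back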

struct v_float16x4

struct v_float16x8
{
    typedef short lane_type;
    enum { nlanes = 4 };
    enum { nlanes = 8 };

    v_float16x4() {}
    explicit v_float16x4(float16x4_t v) : val(v) {}
    v_float16x4(short v0, short v1, short v2, short v3)
    v_float16x8() {}
    explicit v_float16x8(float16x8_t v) : val(v) {}
    v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3};
        val = cv_vld1_f16(v);
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = cv_vld1q_f16(v);
    }
    short get0() const
    {
        return vget_lane_s16(vreinterpret_s16_f16(val), 0);
        return vgetq_lane_s16(vreinterpretq_s16_f16(val), 0);
    }
    float16x4_t val;
    float16x8_t val;
};

inline v_float16x8 v_setzero_f16() { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16((short)0))); }
inline v_float16x8 v_setall_f16(short v) { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16(v))); }
#endif

#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
@@ -731,14 +753,30 @@ inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
    return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
}

inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
#if CV_SIMD128_64F
    // ARMv8, which adds support for 64-bit floating-point (so CV_SIMD128_64F is defined),
    // also adds FMA support both for single- and double-precision floating-point vectors
    return v_float32x4(vfmaq_f32(c.val, a.val, b.val));
#else
    return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
#endif
}

inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
}

inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
    return v_fma(a, b, c);
}

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
    return v_fma(a, b, c);
}

#if CV_SIMD128_64F
@@ -753,9 +791,14 @@ inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
    return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
}

inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_float64x2(vfmaq_f64(c.val, a.val, b.val));
}

inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_float64x2(vaddq_f64(c.val, vmulq_f64(a.val, b.val)));
    return v_fma(a, b, c);
}
#endif

@@ -841,10 +884,15 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)

#if CV_FP16
// Workaround for old compilers
inline v_float16x4 v_load_f16(const short* ptr)
{ return v_float16x4(cv_vld1_f16(ptr)); }
inline void v_store_f16(short* ptr, v_float16x4& a)
{ cv_vst1_f16(ptr, a.val); }
inline v_float16x8 v_load_f16(const short* ptr)
{ return v_float16x8(cv_vld1q_f16(ptr)); }
inline v_float16x8 v_load_f16_aligned(const short* ptr)
{ return v_float16x8(cv_vld1q_f16(ptr)); }

inline void v_store(short* ptr, const v_float16x8& a)
{ cv_vst1q_f16(ptr, a.val); }
inline void v_store_aligned(short* ptr, const v_float16x8& a)
{ cv_vst1q_f16(ptr, a.val); }
#endif

#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
@@ -1293,6 +1341,11 @@ inline v_float32x4 v_cvt_f32(const v_float64x2& a)
    return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val)));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val))));
@@ -1315,17 +1368,88 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
#endif

#if CV_FP16
inline v_float32x4 v_cvt_f32(const v_float16x4& a)
inline v_float32x4 v_cvt_f32(const v_float16x8& a)
{
    return v_float32x4(vcvt_f32_f16(a.val));
    return v_float32x4(vcvt_f32_f16(vget_low_f16(a.val)));
}
inline v_float32x4 v_cvt_f32_high(const v_float16x8& a)
{
    return v_float32x4(vcvt_f32_f16(vget_high_f16(a.val)));
}

inline v_float16x4 v_cvt_f16(const v_float32x4& a)
inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b)
{
    return v_float16x4(vcvt_f16_f32(a.val));
    return v_float16x8(vcombine_f16(vcvt_f16_f32(a.val), vcvt_f16_f32(b.val)));
}
#endif

////////////// Lookup table access ////////////////////

inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[vgetq_lane_s32(idxvec.val, 0)],
        tab[vgetq_lane_s32(idxvec.val, 1)],
        tab[vgetq_lane_s32(idxvec.val, 2)],
        tab[vgetq_lane_s32(idxvec.val, 3)]
    };
    return v_int32x4(vld1q_s32(elems));
}

inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    float CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[vgetq_lane_s32(idxvec.val, 0)],
        tab[vgetq_lane_s32(idxvec.val, 1)],
        tab[vgetq_lane_s32(idxvec.val, 2)],
        tab[vgetq_lane_s32(idxvec.val, 3)]
    };
    return v_float32x4(vld1q_f32(elems));
}

inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    /*int CV_DECL_ALIGNED(32) idx[4];
    v_store(idx, idxvec);

    float32x4_t xy02 = vcombine_f32(vld1_f32(tab + idx[0]), vld1_f32(tab + idx[2]));
    float32x4_t xy13 = vcombine_f32(vld1_f32(tab + idx[1]), vld1_f32(tab + idx[3]));

    float32x4x2_t xxyy = vuzpq_f32(xy02, xy13);
    x = v_float32x4(xxyy.val[0]);
    y = v_float32x4(xxyy.val[1]);*/
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);

    x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
    y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
}

#if CV_SIMD128_64F
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    double CV_DECL_ALIGNED(32) elems[2] =
    {
        tab[vgetq_lane_s32(idxvec.val, 0)],
        tab[vgetq_lane_s32(idxvec.val, 1)],
    };
    return v_float64x2(vld1q_f64(elems));
}

inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);

    x = v_float64x2(tab[idx[0]], tab[idx[1]]);
    y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
}
#endif

inline void v_cleanup() {}

//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation

@@ -58,6 +58,17 @@ namespace cv

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

struct v_uint8x16;
struct v_int8x16;
struct v_uint16x8;
struct v_int16x8;
struct v_uint32x4;
struct v_int32x4;
struct v_float32x4;
struct v_uint64x2;
struct v_int64x2;
struct v_float64x2;

struct v_uint8x16
{
    typedef uchar lane_type;
@@ -144,6 +155,7 @@ struct v_int16x8
    {
        return (short)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};

@@ -163,6 +175,7 @@ struct v_uint32x4
    {
        return (unsigned)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};

@@ -182,6 +195,7 @@ struct v_int32x4
    {
        return _mm_cvtsi128_si32(val);
    }

    __m128i val;
};

@@ -201,6 +215,7 @@ struct v_float32x4
    {
        return _mm_cvtss_f32(val);
    }

    __m128 val;
};

@@ -222,6 +237,7 @@ struct v_uint64x2
        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
        return (unsigned)a | ((uint64)(unsigned)b << 32);
    }

    __m128i val;
};

@@ -243,6 +259,7 @@ struct v_int64x2
        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
        return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
    }

    __m128i val;
};

@@ -262,29 +279,31 @@ struct v_float64x2
    {
        return _mm_cvtsd_f64(val);
    }

    __m128d val;
};

#if CV_FP16
struct v_float16x4
struct v_float16x8
{
    typedef short lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 4 };
    enum { nlanes = 8 };

    v_float16x4() : val(_mm_setzero_si128()) {}
    explicit v_float16x4(__m128i v) : val(v) {}
    v_float16x4(short v0, short v1, short v2, short v3)
    v_float16x8() : val(_mm_setzero_si128()) {}
    explicit v_float16x8(__m128i v) : val(v) {}
    v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        val = _mm_setr_epi16(v0, v1, v2, v3, 0, 0, 0, 0);
        val = _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7);
    }
    short get0() const
    {
        return (short)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};
#endif
inline v_float16x8 v_setzero_f16() { return v_float16x8(_mm_setzero_si128()); }
inline v_float16x8 v_setall_f16(short val) { return v_float16x8(_mm_set1_epi16(val)); }

namespace hal_sse_internal
{
@@ -697,11 +716,15 @@ inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b)
}
inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b)
{
#if CV_SSE4_1
    return v_int32x4(_mm_mullo_epi32(a.val, b.val));
#else
    __m128i c0 = _mm_mul_epu32(a.val, b.val);
    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    __m128i d0 = _mm_unpacklo_epi32(c0, c1);
    __m128i d1 = _mm_unpackhi_epi32(c0, c1);
    return v_int32x4(_mm_unpacklo_epi64(d0, d1));
#endif
}
inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b)
{
@@ -1027,11 +1050,35 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
    __m128i m = _mm_cmpgt_epi32(b.val, a.val);
    return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
}
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)

inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return a * b + c;
}

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_fma(a, b, c);
}

inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
#if CV_FMA3
    return v_float32x4(_mm_fmadd_ps(a.val, b.val, c.val));
#else
    return v_float32x4(_mm_add_ps(_mm_mul_ps(a.val, b.val), c.val));
#endif
}

inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
#if CV_FMA3
    return v_float64x2(_mm_fmadd_pd(a.val, b.val, c.val));
#else
    return v_float64x2(_mm_add_pd(_mm_mul_pd(a.val, b.val), c.val));
#endif
}
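
One behavioral note worth illustrating: a true FMA (the CV_FMA3 path) rounds once, while the mul-then-add fallback rounds twice, so results can differ in the last bit. A scalar sketch of the distinction (illustrative, not from the patch):

    #include <cmath>
    double x = 1.0 + std::ldexp(1.0, -27);   // 1 + 2^-27, exactly representable
    double y = 1.0 + std::ldexp(1.0, -26);   // 1 + 2^-26
    double fused   = std::fma(x, x, -y);     // 2^-54: the low product bits survive
    double twostep = x * x - y;              // 0.0: x*x already rounded to y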

#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
{ \
@@ -1040,17 +1087,16 @@ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
} \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
    return _Tpvec(_mm_sqrt_##suffix(res)); \
    _Tpvec res = v_fma(a, a, b*b); \
    return _Tpvec(_mm_sqrt_##suffix(res.val)); \
} \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
    return _Tpvec(res); \
    return v_fma(a, a, b*b); \
} \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ \
    return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \
    return v_fma(a, b, c); \
}

OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
@@ -1268,12 +1314,15 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)

#if CV_FP16
inline v_float16x4 v_load_f16(const short* ptr)
{ return v_float16x4(_mm_loadl_epi64((const __m128i*)ptr)); }
inline void v_store_f16(short* ptr, v_float16x4& a)
{ _mm_storel_epi64((__m128i*)ptr, a.val); }
#endif
inline v_float16x8 v_load_f16(const short* ptr)
{ return v_float16x8(_mm_loadu_si128((const __m128i*)ptr)); }
inline v_float16x8 v_load_f16_aligned(const short* ptr)
{ return v_float16x8(_mm_load_si128((const __m128i*)ptr)); }

inline void v_store(short* ptr, const v_float16x8& a)
{ _mm_storeu_si128((__m128i*)ptr, a.val); }
inline void v_store_aligned(short* ptr, const v_float16x8& a)
{ _mm_store_si128((__m128i*)ptr, a.val); }

#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
@@ -2183,6 +2232,11 @@ inline v_float32x4 v_cvt_f32(const v_float64x2& a)
    return v_float32x4(_mm_cvtpd_ps(a.val));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    return v_float32x4(_mm_movelh_ps(_mm_cvtpd_ps(a.val), _mm_cvtpd_ps(b.val)));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    return v_float64x2(_mm_cvtepi32_pd(a.val));
@@ -2200,21 +2254,82 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a)

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    return v_float64x2(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(a.val),8))));
    return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val)));
}

#if CV_FP16
inline v_float32x4 v_cvt_f32(const v_float16x4& a)
inline v_float32x4 v_cvt_f32(const v_float16x8& a)
{
    return v_float32x4(_mm_cvtph_ps(a.val));
}

inline v_float16x4 v_cvt_f16(const v_float32x4& a)
inline v_float32x4 v_cvt_f32_high(const v_float16x8& a)
{
    return v_float16x4(_mm_cvtps_ph(a.val, 0));
    return v_float32x4(_mm_cvtph_ps(_mm_unpackhi_epi64(a.val, a.val)));
}

inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b)
{
    return v_float16x8(_mm_unpacklo_epi64(_mm_cvtps_ph(a.val, 0), _mm_cvtps_ph(b.val, 0)));
}
#endif

////////////// Lookup table access ////////////////////

inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
}

inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
}

inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    int idx[2];
    v_store_low(idx, idxvec);
    return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
}

// loads pairs from the table and deinterleaves them, e.g. returns:
//   x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
//   y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
// note that the indices are float's indices, not the float-pair indices.
// in theory, this function can be used to implement bilinear interpolation,
// when idxvec are the offsets within the image.
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    __m128 z = _mm_setzero_ps();
    __m128 xy01 = _mm_loadl_pi(z, (__m64*)(tab + idx[0]));
    __m128 xy23 = _mm_loadl_pi(z, (__m64*)(tab + idx[2]));
    xy01 = _mm_loadh_pi(xy01, (__m64*)(tab + idx[1]));
    xy23 = _mm_loadh_pi(xy23, (__m64*)(tab + idx[3]));
    __m128 xxyy02 = _mm_unpacklo_ps(xy01, xy23);
    __m128 xxyy13 = _mm_unpackhi_ps(xy01, xy23);
    x = v_float32x4(_mm_unpacklo_ps(xxyy02, xxyy13));
    y = v_float32x4(_mm_unpackhi_ps(xxyy02, xxyy13));
}
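
Following the comment above, a rough sketch (assumed code, not from the patch) of how v_lut_deinterleave supports bilinear interpolation along one row: ofs (a v_int32x4) holds per-pixel offsets of the left neighbor in img (a const float*), wx (a v_float32x4) the fractional weights:

    cv::v_float32x4 x0, x1;
    cv::v_lut_deinterleave(img, ofs, x0, x1);          // x0 = img[ofs], x1 = img[ofs+1]
    cv::v_float32x4 res = cv::v_fma(x1 - x0, wx, x0);  // x0 + (x1 - x0)*wx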

inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    int idx[2];
    v_store_low(idx, idxvec);
    __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
    __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
    x = v_float64x2(_mm_unpacklo_pd(xy0, xy1));
    y = v_float64x2(_mm_unpackhi_pd(xy0, xy1));
}

inline void v_cleanup() {}

//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation

@@ -764,6 +764,8 @@ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_sqrt(vec_madd(a.val, a.val, vec_mul(b.val, b.val)))); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_madd(a.val, a.val, vec_mul(b.val, b.val))); } \
inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ return _Tpvec(vec_madd(a.val, b.val, c.val)); } \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ return _Tpvec(vec_madd(a.val, b.val, c.val)); }

@@ -836,6 +838,9 @@ inline v_float32x4 v_cvt_f32(const v_int32x4& a)
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_float4_z)); }

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_cvfo(b.val))); }

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{ return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); }

@@ -848,6 +853,48 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a)
inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }

////////////// Lookup table access ////////////////////

inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}

inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}

inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    return v_float64x2(tab[idx[0]], tab[idx[1]]);
}

inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
    y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
}

inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    x = v_float64x2(tab[idx[0]], tab[idx[1]]);
    y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
}

inline void v_cleanup() {}

/** Reinterpret **/
/** it's up there with load and store operations **/

@@ -81,10 +81,9 @@ void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t
for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth)
{
    float32x4_t v_src = vld1q_f32(src + x);

    float16x4_t v_dst = vcvt_f16_f32(v_src);

    cv_vst1_f16((__fp16*)dst + x, v_dst);
    cv_vst1_f16(dst + x, v_dst);
}

for ( ; x < size.width; x++ )

File diff suppressed because it is too large
@@ -241,9 +241,9 @@ TEST(hal_intrin, float64x2) {
}
#endif

TEST(hal_intrin,float16x4)
TEST(hal_intrin,float16)
{
    CV_CPU_CALL_FP16_(test_hal_intrin_float16x4, ());
    CV_CPU_CALL_FP16_(test_hal_intrin_float16, ());
    throw SkipTestException("Unsupported hardware: FP16 is not available");
}

@@ -7,9 +7,9 @@
namespace opencv_test { namespace hal {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

void test_hal_intrin_float16x4()
void test_hal_intrin_float16()
{
    TheTest<v_float16x4>()
    TheTest<v_float16x8>()
        .test_loadstore_fp16()
        .test_float_cvt_fp16()
        ;

@@ -6,7 +6,7 @@
namespace opencv_test { namespace hal {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

void test_hal_intrin_float16x4();
void test_hal_intrin_float16();

#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

@@ -50,6 +50,8 @@ template <> struct initializer<2>
template <typename R> struct Data
{
    typedef typename R::lane_type LaneType;
    typedef typename V_TypeTraits<LaneType>::int_type int_type;

    Data()
    {
        for (int i = 0; i < R::nlanes; ++i)
@@ -104,6 +106,17 @@ template <typename R> struct Data
        CV_Assert(i >= 0 && i < R::nlanes);
        return d[i];
    }
    int_type as_int(int i) const
    {
        CV_Assert(i >= 0 && i < R::nlanes);
        union
        {
            LaneType l;
            int_type i;
        } v;
        v.l = d[i];
        return v.i;
    }
|
||||
const LaneType * mid() const
|
||||
{
|
||||
return d + R::nlanes / 2;
|
||||
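
as_int returns the lane's bit pattern in the matching integer type, which lets the tests compare float lanes bit-exactly without running into strict-aliasing rules. The same idiom in standalone form (bits_of is a made-up helper, assuming 32-bit int):

    // Union-based bit reinterpretation, as used by Data<R>::as_int above.
    static inline int bits_of(float x)
    {
        union { float f; int i; } v;
        v.f = x;
        return v.i;   // e.g. bits_of(1.0f) == 0x3f800000
    }
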
@ -247,8 +260,9 @@ template<typename R> struct TheTest
         EXPECT_EQ(d, res);

         // zero, all
-        Data<R> resZ = V_RegTrait128<LaneType>::zero();
-        Data<R> resV = V_RegTrait128<LaneType>::all(8);
+        Data<R> resZ, resV;
+        resZ.fill((LaneType)0);
+        resV.fill((LaneType)8);
         for (int i = 0; i < R::nlanes; ++i)
         {
             EXPECT_EQ((LaneType)0, resZ[i]);
@ -339,7 +353,7 @@ template<typename R> struct TheTest
     // v_expand and v_load_expand
     TheTest & test_expand()
     {
-        typedef typename V_RegTrait128<LaneType>::w_reg Rx2;
+        typedef typename V_RegTraits<R>::w_reg Rx2;
         Data<R> dataA;
         R a = dataA;
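
This rename is the heart of the test refactoring, repeated in the hunks that follow: V_RegTrait128<LaneType> could only name 128-bit registers, while V_RegTraits<R> is keyed on the register type itself, so the widened/quad/unsigned companion registers follow whatever width R has. An illustrative shape of such a traits map (MyRegTraits is hypothetical, not the real definition; v_int16x16/v_int32x8 are the AVX2 wide types this patch adds):

    // Hypothetical sketch: traits keyed by register type scale with its width.
    template<typename R> struct MyRegTraits;
    template<> struct MyRegTraits<v_int16x8>  { typedef v_int32x4 w_reg; };  // 128-bit
    template<> struct MyRegTraits<v_int16x16> { typedef v_int32x8 w_reg; };  // 256-bit
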
@ -362,7 +376,7 @@ template<typename R> struct TheTest

     TheTest & test_expand_q()
     {
-        typedef typename V_RegTrait128<LaneType>::q_reg Rx4;
+        typedef typename V_RegTraits<R>::q_reg Rx4;
         Data<R> data;
         Data<Rx4> out = v_load_expand_q(data.d);
         const int n = Rx4::nlanes;
@ -436,7 +450,7 @@ template<typename R> struct TheTest

     TheTest & test_mul_expand()
     {
-        typedef typename V_RegTrait128<LaneType>::w_reg Rx2;
+        typedef typename V_RegTraits<R>::w_reg Rx2;
         Data<R> dataA, dataB(2);
         R a = dataA, b = dataB;
         Rx2 c, d;
@ -456,7 +470,7 @@ template<typename R> struct TheTest

     TheTest & test_abs()
     {
-        typedef typename V_RegTrait128<LaneType>::u_reg Ru;
+        typedef typename V_RegTraits<R>::u_reg Ru;
         typedef typename Ru::lane_type u_type;
         Data<R> dataA, dataB(10);
         R a = dataA, b = dataB;
@ -520,7 +534,7 @@ template<typename R> struct TheTest

     TheTest & test_dot_prod()
     {
-        typedef typename V_RegTrait128<LaneType>::w_reg Rx2;
+        typedef typename V_RegTraits<R>::w_reg Rx2;
         typedef typename Rx2::lane_type w_type;

         Data<R> dataA, dataB(2);
@ -608,7 +622,7 @@ template<typename R> struct TheTest

     TheTest & test_absdiff()
     {
-        typedef typename V_RegTrait128<LaneType>::u_reg Ru;
+        typedef typename V_RegTraits<R>::u_reg Ru;
         typedef typename Ru::lane_type u_type;
         Data<R> dataA(std::numeric_limits<LaneType>::max()),
                 dataB(std::numeric_limits<LaneType>::min());
@ -657,12 +671,21 @@ template<typename R> struct TheTest

     TheTest & test_mask()
     {
-        typedef V_TypeTraits<LaneType> Traits;
-        typedef typename Traits::int_type int_type;
+        typedef typename V_RegTraits<R>::int_reg int_reg;
+        typedef typename V_RegTraits<int_reg>::u_reg uint_reg;
+        typedef typename int_reg::lane_type int_type;
+        typedef typename uint_reg::lane_type uint_type;

         Data<R> dataA, dataB(0), dataC, dataD(1), dataE(2);
         dataA[1] *= (LaneType)-1;
-        const LaneType mask_one = Traits::reinterpret_from_int(~(typename Traits::uint_type)(0));
+        union
+        {
+            LaneType l;
+            uint_type ui;
+        }
+        all1s;
+        all1s.ui = (uint_type)-1;
+        LaneType mask_one = all1s.l;
         dataB[1] = mask_one;
         dataB[R::nlanes / 2] = mask_one;
         dataB[R::nlanes - 1] = mask_one;
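
The union replaces reinterpret_from_int because storing ~0 of the unsigned lane type and reading it back as LaneType produces the all-ones bit pattern for any lane type, floats included. Reduced to a concrete standalone case (float lanes):

    // All-ones mask lane for float, built the same way as all1s above.
    union { float l; unsigned ui; } all1s;
    all1s.ui = (unsigned)-1;   // 0xFFFFFFFF
    float mask_one = all1s.l;  // a NaN bit pattern; only the bits matter here
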
@ -684,10 +707,8 @@ template<typename R> struct TheTest
         Data<R> resF = f;
         for (int i = 0; i < R::nlanes; ++i)
         {
-            int_type m2 = Traits::reinterpret_int(dataB[i]);
-            EXPECT_EQ((Traits::reinterpret_int(dataD[i]) & m2)
-                      | (Traits::reinterpret_int(dataE[i]) & ~m2),
-                      Traits::reinterpret_int(resF[i]));
+            int_type m2 = dataB.as_int(i);
+            EXPECT_EQ((dataD.as_int(i) & m2) | (dataE.as_int(i) & ~m2), resF.as_int(i));
         }

         return *this;
@ -697,7 +718,7 @@ template<typename R> struct TheTest
     TheTest & test_pack()
     {
         SCOPED_TRACE(s);
-        typedef typename V_RegTrait128<LaneType>::w_reg Rx2;
+        typedef typename V_RegTraits<R>::w_reg Rx2;
         typedef typename Rx2::lane_type w_type;
         Data<Rx2> dataA, dataB;
         dataA += std::numeric_limits<LaneType>::is_signed ? -10 : 10;
@ -734,8 +755,9 @@ template<typename R> struct TheTest
     TheTest & test_pack_u()
     {
         SCOPED_TRACE(s);
-        typedef typename V_TypeTraits<LaneType>::w_type LaneType_w;
-        typedef typename V_RegTrait128<LaneType_w>::int_reg Ri2;
+        //typedef typename V_RegTraits<LaneType>::w_type LaneType_w;
+        typedef typename V_RegTraits<R>::w_reg R2;
+        typedef typename V_RegTraits<R2>::int_reg Ri2;
         typedef typename Ri2::lane_type w_type;

         Data<Ri2> dataA, dataB;
@ -864,7 +886,7 @@ template<typename R> struct TheTest

     TheTest & test_float_math()
     {
-        typedef typename V_RegTrait128<LaneType>::int_reg Ri;
+        typedef typename V_RegTraits<R>::round_reg Ri;
         Data<R> data1, data2, data3;
         data1 *= 1.1;
         data2 += 10;
@ -1005,31 +1027,28 @@ template<typename R> struct TheTest

     TheTest & test_loadstore_fp16()
     {
-#if CV_FP16 && CV_SIMD128
+#if CV_FP16 && CV_SIMD
         AlignedData<R> data;
         AlignedData<R> out;

-        if(1 /* checkHardwareSupport(CV_CPU_FP16) */ )
-        {
-            // check if addresses are aligned and unaligned respectively
-            EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
-            EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
-            EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
-            EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);
+        // check if addresses are aligned and unaligned respectively
+        EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
+        EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
+        EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
+        EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);

-            // check some initialization methods
-            R r1 = data.u;
-            R r2 = v_load_f16(data.a.d);
-            R r3(r2);
-            EXPECT_EQ(data.u[0], r1.get0());
-            EXPECT_EQ(data.a[0], r2.get0());
-            EXPECT_EQ(data.a[0], r3.get0());
+        // check some initialization methods
+        R r1 = data.u;
+        R r2 = v_load_f16(data.a.d);
+        R r3(r2);
+        EXPECT_EQ(data.u[0], r1.get0());
+        EXPECT_EQ(data.a[0], r2.get0());
+        EXPECT_EQ(data.a[0], r3.get0());

-            // check some store methods
-            out.a.clear();
-            v_store_f16(out.a.d, r1);
-            EXPECT_EQ(data.a, out.a);
-        }
+        // check some store methods
+        out.a.clear();
+        v_store(out.a.d, r1);
+        EXPECT_EQ(data.a, out.a);

         return *this;
 #endif
@ -1037,18 +1056,15 @@ template<typename R> struct TheTest

     TheTest & test_float_cvt_fp16()
     {
-#if CV_FP16 && CV_SIMD128
-        AlignedData<v_float32x4> data;
+#if CV_FP16 && CV_SIMD
+        AlignedData<v_float32> data;

-        if(1 /* checkHardwareSupport(CV_CPU_FP16) */)
-        {
-            // check conversion
-            v_float32x4 r1 = v_load(data.a.d);
-            v_float16x4 r2 = v_cvt_f16(r1);
-            v_float32x4 r3 = v_cvt_f32(r2);
-            EXPECT_EQ(0x3c00, r2.get0());
-            EXPECT_EQ(r3.get0(), r1.get0());
-        }
+        // check conversion
+        v_float32 r1 = vx_load(data.a.d);
+        v_float16 r2 = v_cvt_f16(r1, vx_setzero_f32());
+        v_float32 r3 = v_cvt_f32(r2);
+        EXPECT_EQ(0x3c00, r2.get0());
+        EXPECT_EQ(r3.get0(), r1.get0());

         return *this;
 #endif
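
The magic constant 0x3c00 is the IEEE 754 binary16 encoding of 1.0: sign bit 0, biased exponent 15 (0b01111), zero mantissa. Assembled by hand:

    // Where 0x3c00 comes from (binary16 has a 5-bit exponent with bias 15).
    unsigned short one_f16 = (0u << 15) | (15u << 10) | 0u;  // == 0x3c00
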
@ -134,7 +134,9 @@ double Core_PowTest::get_success_error_level( int test_case_idx, int i, int j )
     if( depth < CV_32F )
         return power == cvRound(power) && power >= 0 ? 0 : 1;
     else
-        return Base::get_success_error_level( test_case_idx, i, j );
+    {
+        return depth != CV_64F ? Base::get_success_error_level( test_case_idx, i, j ) : DBL_EPSILON*1024*1.1;
+    }
 }
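
For scale: DBL_EPSILON*1024*1.1 is about 2.5e-13, i.e. roughly 1024 ulp at 1.0 plus a 10% margin, presumably loosened to accommodate the new wide-intrinsic exp/log path behind pow on CV_64F. As a quick check:

    #include <cfloat>
    #include <cstdio>
    int main()
    {
        // New CV_64F tolerance from the hunk above: prints ~2.50111e-13.
        printf("%g\n", DBL_EPSILON * 1024 * 1.1);
        return 0;
    }
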
@ -2129,7 +2129,7 @@ int cmpEps2( TS* ts, const Mat& a, const Mat& b, double success_err_level,
     switch( code )
     {
     case CMP_EPS_BIG_DIFF:
-        sprintf( msg, "%s: Too big difference (=%g)", desc, diff );
+        sprintf( msg, "%s: Too big difference (=%g > %g)", desc, diff, success_err_level );
         code = TS::FAIL_BAD_ACCURACY;
         break;
     case CMP_EPS_INVALID_TEST_DATA: