mirror of
https://github.com/opencv/opencv.git
synced 2024-11-27 20:50:25 +08:00
Merge pull request #16236 from alalek:fix_core_simd_emulator
* core: fix intrin_cpp, allow to build modules with SIMD emulator * core(arithm): fix v_zero initialization * core(simd): 'strict' types for binary/bitwise operations * features2d: avoid aligned load issue in GCC 5.4 with emulated SIMD * core(simd): alignment checks in SIMD emulator
This commit is contained in:
parent
c75d93337e
commit
e180cc050b
@ -99,6 +99,7 @@ enum StoreMode
|
||||
|
||||
}
|
||||
|
||||
// TODO FIXIT: Don't use "God" traits. Split on separate cases.
|
||||
template<typename _Tp> struct V_TypeTraits
|
||||
{
|
||||
};
|
||||
@ -130,21 +131,51 @@ template<typename _Tp> struct V_TypeTraits
|
||||
} \
|
||||
}
|
||||
|
||||
/* Specializes V_TypeTraits<type> for one lane type.
 *
 * The seven macro arguments are re-exported as the member typedefs
 * value_type, int_type, uint_type, abs_type, w_type, sum_type and as the
 * enumerator nlanes128. This variant defines no q_type member.
 *
 * Two helpers copy a value between `type` and `int_type` through a union
 * (bit reinterpretation, not numeric conversion).
 * NOTE(review): presumably `type` and `int_type` have the same size for every
 * instantiation - confirm when adding new lane types.
 *
 * The expansion ends without ';' - the invocation site supplies it.
 */
#define CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(type, int_type_, uint_type_, abs_type_, w_type_, sum_type_, nlanes128_) \
    template<> struct V_TypeTraits<type> \
    { \
        typedef type value_type; \
        typedef int_type_ int_type; \
        typedef abs_type_ abs_type; \
        typedef uint_type_ uint_type; \
        typedef w_type_ w_type; \
        typedef sum_type_ sum_type; \
        enum { nlanes128 = nlanes128_ }; \
        \
        /* Read the storage of a lane value back as int_type. */ \
        static inline int_type reinterpret_int(type x) \
        { \
            union { type lane; int_type bits; } pun; \
            pun.lane = x; \
            return pun.bits; \
        } \
        \
        /* Inverse of reinterpret_int: read int_type storage back as a lane value. */ \
        static inline type reinterpret_from_int(int_type x) \
        { \
            union { type lane; int_type bits; } pun; \
            pun.bits = x; \
            return pun.lane; \
        } \
    }
|
||||
|
||||
CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned, 16);
|
||||
CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int, 16);
|
||||
CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned, 8);
|
||||
CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int, 8);
|
||||
CV_INTRIN_DEF_TYPE_TRAITS(unsigned, int, unsigned, unsigned, uint64, void, unsigned, 4);
|
||||
CV_INTRIN_DEF_TYPE_TRAITS(int, int, unsigned, unsigned, int64, void, int, 4);
|
||||
CV_INTRIN_DEF_TYPE_TRAITS(float, int, unsigned, float, double, void, float, 4);
|
||||
CV_INTRIN_DEF_TYPE_TRAITS(uint64, int64, uint64, uint64, void, void, uint64, 2);
|
||||
CV_INTRIN_DEF_TYPE_TRAITS(int64, int64, uint64, uint64, void, void, int64, 2);
|
||||
CV_INTRIN_DEF_TYPE_TRAITS(double, int64, uint64, double, void, void, double, 2);
|
||||
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(unsigned, int, unsigned, unsigned, uint64, unsigned, 4);
|
||||
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int, int, unsigned, unsigned, int64, int, 4);
|
||||
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(float, int, unsigned, float, double, float, 4);
|
||||
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(uint64, int64, uint64, uint64, void, uint64, 2);
|
||||
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int64, int64, uint64, uint64, void, int64, 2);
|
||||
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double, 2);
|
||||
|
||||
#ifndef CV_DOXYGEN
|
||||
|
||||
#ifndef CV_CPU_OPTIMIZATION_HAL_NAMESPACE
|
||||
#ifdef CV_CPU_DISPATCH_MODE
|
||||
#ifdef CV_FORCE_SIMD128_CPP
|
||||
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_EMULATOR_CPP
|
||||
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_EMULATOR_CPP {
|
||||
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
|
||||
#elif defined(CV_CPU_DISPATCH_MODE)
|
||||
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE)
|
||||
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) {
|
||||
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
|
||||
@ -197,7 +228,6 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
|
||||
|
||||
#else
|
||||
|
||||
#define CV_SIMD128_CPP 1
|
||||
#include "opencv2/core/hal/intrin_cpp.hpp"
|
||||
|
||||
#endif
|
||||
@ -242,6 +272,10 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
|
||||
#define CV_SIMD128 0
|
||||
#endif
|
||||
|
||||
#ifndef CV_SIMD128_CPP
|
||||
#define CV_SIMD128_CPP 0
|
||||
#endif
|
||||
|
||||
#ifndef CV_SIMD128_64F
|
||||
#define CV_SIMD128_64F 0
|
||||
#endif
|
||||
@ -346,7 +380,7 @@ template<typename _Tp> struct V_RegTraits
|
||||
CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void);
|
||||
CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void);
|
||||
CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void);
|
||||
#if CV_SIMD128_64F
|
||||
#if CV_SIMD128_64F || CV_SIMD128_CPP
|
||||
CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, v_float64x2, void, v_int32x4, v_int32x4);
|
||||
#else
|
||||
CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, void, void, v_int32x4, v_int32x4);
|
||||
@ -433,7 +467,11 @@ namespace CV__SIMD_NAMESPACE {
|
||||
} // namespace
|
||||
using namespace CV__SIMD_NAMESPACE;
|
||||
#elif (CV_SIMD128 || CV_SIMD128_CPP) && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128)
|
||||
#if defined CV_SIMD128_CPP
|
||||
#define CV__SIMD_NAMESPACE simd128_cpp
|
||||
#else
|
||||
#define CV__SIMD_NAMESPACE simd128
|
||||
#endif
|
||||
namespace CV__SIMD_NAMESPACE {
|
||||
#define CV_SIMD CV_SIMD128
|
||||
#define CV_SIMD_64F CV_SIMD128_64F
|
||||
|
@ -50,6 +50,14 @@
|
||||
#include <algorithm>
|
||||
#include "opencv2/core/saturate.hpp"
|
||||
|
||||
//! @cond IGNORED
|
||||
#define CV_SIMD128_CPP 1
|
||||
#if defined(CV_FORCE_SIMD128_CPP) || defined(CV_DOXYGEN)
|
||||
#define CV_SIMD128 1
|
||||
#define CV_SIMD128_64F 1
|
||||
#endif
|
||||
//! @endcond
|
||||
|
||||
namespace cv
|
||||
{
|
||||
|
||||
@ -135,7 +143,7 @@ Element-wise binary and unary operations.
|
||||
@ref v_shl, @ref v_shr
|
||||
|
||||
- Bitwise logic:
|
||||
@ref operator&(const v_reg &a, const v_reg &b) "&",
|
||||
@ref operator &(const v_reg &a, const v_reg &b) "&",
|
||||
@ref operator |(const v_reg &a, const v_reg &b) "|",
|
||||
@ref operator ^(const v_reg &a, const v_reg &b) "^",
|
||||
@ref operator ~(const v_reg &a) "~"
|
||||
@ -402,50 +410,102 @@ typedef v_reg<uint64, 2> v_uint64x2;
|
||||
/** @brief Two 64-bit signed integer values */
|
||||
typedef v_reg<int64, 2> v_int64x2;
|
||||
|
||||
//! @brief Helper macro
|
||||
//! @ingroup core_hal_intrin_impl
|
||||
#define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
|
||||
template<typename _Tp, int n> inline v_reg<_Tp, n> \
|
||||
operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
|
||||
/** @brief Add values
|
||||
|
||||
For all types. */
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator+=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
|
||||
/** @brief Subtract values
|
||||
|
||||
For all types. */
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator-=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
|
||||
/** @brief Multiply values
|
||||
|
||||
For 16- and 32-bit integer types and floating types. */
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator*=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
|
||||
/** @brief Divide values
|
||||
|
||||
For floating types only. */
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator/=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
|
||||
|
||||
/** @brief Bitwise AND
|
||||
|
||||
Only for integer types. */
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator&=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
|
||||
/** @brief Bitwise OR
|
||||
|
||||
Only for integer types. */
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
|
||||
/** @brief Bitwise XOR
|
||||
|
||||
Only for integer types.*/
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator^=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
|
||||
/** @brief Bitwise NOT
|
||||
|
||||
Only for integer types.*/
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator~(const v_reg<_Tp, n>& a);
|
||||
|
||||
|
||||
#ifndef CV_DOXYGEN
|
||||
|
||||
// Expands macro_name(T, <extra args>) once per integer lane type, in this order:
// uchar, schar, ushort, short, unsigned, int, uint64, int64.
#define CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, ...) \
    __CV_EXPAND(macro_name(uchar, __VA_ARGS__)) \
    __CV_EXPAND(macro_name(schar, __VA_ARGS__)) \
    __CV_EXPAND(macro_name(ushort, __VA_ARGS__)) \
    __CV_EXPAND(macro_name(short, __VA_ARGS__)) \
    __CV_EXPAND(macro_name(unsigned, __VA_ARGS__)) \
    __CV_EXPAND(macro_name(int, __VA_ARGS__)) \
    __CV_EXPAND(macro_name(uint64, __VA_ARGS__)) \
    __CV_EXPAND(macro_name(int64, __VA_ARGS__))
|
||||
|
||||
// Expands macro_name(T, <extra args>) for the floating-point lane types:
// float, then double.
#define CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, ...) \
    __CV_EXPAND(macro_name(float, __VA_ARGS__)) \
    __CV_EXPAND(macro_name(double, __VA_ARGS__))
|
||||
|
||||
// Expands macro_name(T, <extra args>) for every lane type: the integer list
// first, followed by the floating-point list.
#define CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(macro_name, ...) \
    CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \
    CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__)
|
||||
|
||||
/* Generates the element-wise binary operator `bin_op` and its compound
 * assignment form `bin_op=` for v_reg<_Tp, n>.
 *
 * _Tp is a macro argument (one concrete lane type), so the generated
 * overloads are templates on the lane count only: both operands must have
 * exactly the same lane type.
 *
 * Each lane is computed as `a.s[i] bin_op b.s[i]` and passed through
 * saturate_cast<_Tp> before being stored.
 *
 * The compound-assignment operator has a single `template<int n>` header.
 * A second, fully generic `template<typename _Tp, int n>` header in front of
 * it would make the expansion ill-formed, because _Tp is already substituted
 * by the macro argument.
 */
#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) \
template<int n> inline \
v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
    return c; \
} \
template<int n> inline \
v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    for( int i = 0; i < n; i++ ) \
        a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
    return a; \
}
|
||||
|
||||
/** @brief Add values
|
||||
#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op)
|
||||
|
||||
For all types. */
|
||||
OPENCV_HAL_IMPL_BIN_OP(+)
|
||||
CV__HAL_INTRIN_IMPL_BIN_OP(+)
|
||||
CV__HAL_INTRIN_IMPL_BIN_OP(-)
|
||||
CV__HAL_INTRIN_IMPL_BIN_OP(*)
|
||||
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /)
|
||||
|
||||
/** @brief Subtract values
|
||||
|
||||
For all types. */
|
||||
OPENCV_HAL_IMPL_BIN_OP(-)
|
||||
|
||||
/** @brief Multiply values
|
||||
|
||||
For 16- and 32-bit integer types and floating types. */
|
||||
OPENCV_HAL_IMPL_BIN_OP(*)
|
||||
|
||||
/** @brief Divide values
|
||||
|
||||
For floating types only. */
|
||||
OPENCV_HAL_IMPL_BIN_OP(/)
|
||||
|
||||
//! @brief Helper macro
|
||||
//! @ingroup core_hal_intrin_impl
|
||||
#define OPENCV_HAL_IMPL_BIT_OP(bit_op) \
|
||||
template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
|
||||
(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
|
||||
#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) \
|
||||
template<int n> CV_INLINE \
|
||||
v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
|
||||
{ \
|
||||
v_reg<_Tp, n> c; \
|
||||
typedef typename V_TypeTraits<_Tp>::int_type itype; \
|
||||
@ -454,8 +514,8 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
|
||||
V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
|
||||
return c; \
|
||||
} \
|
||||
template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \
|
||||
bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
|
||||
template<int n> CV_INLINE \
|
||||
v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
|
||||
{ \
|
||||
typedef typename V_TypeTraits<_Tp>::int_type itype; \
|
||||
for( int i = 0; i < n; i++ ) \
|
||||
@ -464,33 +524,29 @@ template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \
|
||||
return a; \
|
||||
}
|
||||
|
||||
/** @brief Bitwise AND
|
||||
#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) \
|
||||
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \
|
||||
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) /* TODO: FIXIT remove this after masks refactoring */
|
||||
|
||||
Only for integer types. */
|
||||
OPENCV_HAL_IMPL_BIT_OP(&)
|
||||
|
||||
/** @brief Bitwise OR
|
||||
CV__HAL_INTRIN_IMPL_BIT_OP(&)
|
||||
CV__HAL_INTRIN_IMPL_BIT_OP(|)
|
||||
CV__HAL_INTRIN_IMPL_BIT_OP(^)
|
||||
|
||||
Only for integer types. */
|
||||
OPENCV_HAL_IMPL_BIT_OP(|)
|
||||
/* Generates unary operator ~ for v_reg<_Tp, n>.
 *
 * Every lane is turned into its integer representation with
 * V_TypeTraits<_Tp>::reinterpret_int, complemented, and turned back with
 * reinterpret_from_int.
 *
 * The second macro parameter is not used by the body; it only lets this
 * macro be driven by the two-argument CV__HAL_INTRIN_EXPAND_* helpers.
 */
#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) \
template<int n> CV_INLINE \
v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \
{ \
    typedef V_TypeTraits<_Tp> traits; \
    v_reg<_Tp, n> inverted; \
    for( int lane = 0; lane < n; ++lane ) \
        inverted.s[lane] = traits::reinterpret_from_int(~traits::reinterpret_int(a.s[lane])); \
    return inverted; \
}
|
||||
|
||||
/** @brief Bitwise XOR
|
||||
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~)
|
||||
|
||||
Only for integer types.*/
|
||||
OPENCV_HAL_IMPL_BIT_OP(^)
|
||||
#endif // !CV_DOXYGEN
|
||||
|
||||
/** @brief Bitwise NOT
|
||||
|
||||
Only for integer types.*/
|
||||
template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a)
|
||||
{
|
||||
v_reg<_Tp, n> c;
|
||||
for( int i = 0; i < n; i++ )
|
||||
{
|
||||
c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i]));
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
//! @brief Helper macro
|
||||
//! @ingroup core_hal_intrin_impl
|
||||
@ -503,6 +559,27 @@ template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a)
|
||||
return c; \
|
||||
}
|
||||
|
||||
//! @brief Helper macro: defines two overloads of func() that apply cfunc to each
//! lane of a floating-point register and return the results as v_reg<int, 4>.
//! The float overload converts all four lanes; the double overload converts its
//! two lanes into result lanes 0 and 1 and writes zero to result lanes 2 and 3.
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(func, cfunc) \
inline v_reg<int, 4> func(const v_reg<float, 4>& a) \
{ \
    v_reg<int, 4> res; \
    for( int lane = 0; lane < 4; ++lane ) \
        res.s[lane] = cfunc(a.s[lane]); \
    return res; \
} \
inline v_reg<int, 4> func(const v_reg<double, 2>& a) \
{ \
    v_reg<int, 4> res; \
    /* lower half: converted input lanes */ \
    for( int lane = 0; lane < 2; ++lane ) \
        res.s[lane] = cfunc(a.s[lane]); \
    /* upper half: no source data, filled with zeros */ \
    for( int lane = 2; lane < 4; ++lane ) \
        res.s[lane] = 0; \
    return res; \
}
|
||||
|
||||
/** @brief Square root of elements
|
||||
|
||||
Only for floating point types.*/
|
||||
@ -524,22 +601,22 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
|
||||
/** @brief Round elements
|
||||
|
||||
Only for floating point types.*/
|
||||
OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int)
|
||||
OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_round, cvRound)
|
||||
|
||||
/** @brief Floor elements
|
||||
|
||||
Only for floating point types.*/
|
||||
OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int)
|
||||
OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_floor, cvFloor)
|
||||
|
||||
/** @brief Ceil elements
|
||||
|
||||
Only for floating point types.*/
|
||||
OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int)
|
||||
OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_ceil, cvCeil)
|
||||
|
||||
/** @brief Truncate elements
|
||||
|
||||
Only for floating point types.*/
|
||||
OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int)
|
||||
OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_trunc, int)
|
||||
|
||||
//! @brief Helper macro
|
||||
//! @ingroup core_hal_intrin_impl
|
||||
@ -1083,9 +1160,8 @@ OPENCV_HAL_IMPL_SHIFT_OP(<< )
|
||||
For 16-, 32- and 64-bit integer values. */
|
||||
OPENCV_HAL_IMPL_SHIFT_OP(>> )
|
||||
|
||||
/** @brief Element shift left among vector
|
||||
|
||||
For all type */
|
||||
//! @brief Helper macro
|
||||
//! @ingroup core_hal_intrin_impl
|
||||
#define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \
|
||||
template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \
|
||||
{ \
|
||||
@ -1127,7 +1203,14 @@ template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(co
|
||||
return c; \
|
||||
}
|
||||
|
||||
/** @brief Element shift left among vector
|
||||
|
||||
For all types. */
|
||||
OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(left, -, +)
|
||||
|
||||
/** @brief Element shift right among vector
|
||||
|
||||
For all types. */
|
||||
OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(right, +, -)
|
||||
|
||||
/** @brief Sum packed values
|
||||
@ -1389,6 +1472,7 @@ similar to cv::v_load, but source memory block should be aligned (to 16-byte bou
|
||||
template<typename _Tp>
|
||||
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr)
|
||||
{
|
||||
CV_Assert(isAligned<sizeof(v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>)>(ptr));
|
||||
return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
|
||||
}
|
||||
|
||||
@ -1620,6 +1704,12 @@ inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
|
||||
ptr[i] = a.s[i];
|
||||
}
|
||||
|
||||
template<typename _Tp, int n>
|
||||
inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
|
||||
{
|
||||
v_store(ptr, a);
|
||||
}
|
||||
|
||||
/** @brief Store data to memory (lower half)
|
||||
|
||||
Store lower half of register contents to memory.
|
||||
@ -1659,22 +1749,22 @@ Pointer __should__ be aligned by 16-byte boundary. */
|
||||
template<typename _Tp, int n>
|
||||
inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
|
||||
{
|
||||
for( int i = 0; i < n; i++ )
|
||||
ptr[i] = a.s[i];
|
||||
CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
|
||||
v_store(ptr, a);
|
||||
}
|
||||
|
||||
template<typename _Tp, int n>
|
||||
inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a)
|
||||
{
|
||||
for( int i = 0; i < n; i++ )
|
||||
ptr[i] = a.s[i];
|
||||
CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
|
||||
v_store(ptr, a);
|
||||
}
|
||||
|
||||
template<typename _Tp, int n>
|
||||
inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
|
||||
{
|
||||
for( int i = 0; i < n; i++ )
|
||||
ptr[i] = a.s[i];
|
||||
CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
|
||||
v_store(ptr, a);
|
||||
}
|
||||
|
||||
/** @brief Combine vector from first elements of two vectors
|
||||
@ -1940,6 +2030,17 @@ template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
|
||||
return c;
|
||||
}
|
||||
|
||||
// Converts the n double lanes of `a` to float. The result has 2*n lanes:
// lanes [0, n) hold the converted values, lanes [n, 2*n) are set to zero.
template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a)
{
    v_reg<float, n*2> narrowed;
    for( int lane = 0; lane < n; ++lane )
    {
        narrowed.s[lane] = static_cast<float>(a.s[lane]);
        narrowed.s[n + lane] = 0;
    }
    return narrowed;
}
|
||||
|
||||
template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, const v_reg<double, n>& b)
|
||||
{
|
||||
v_reg<float, n*2> c;
|
||||
@ -1954,36 +2055,76 @@ template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, co
|
||||
/** @brief Convert to double
|
||||
|
||||
Supported input type is cv::v_int32x4. */
|
||||
template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n*2>& a)
|
||||
CV_INLINE v_reg<double, 2> v_cvt_f64(const v_reg<int, 4>& a)
|
||||
{
|
||||
enum { n = 2 };
|
||||
v_reg<double, n> c;
|
||||
for( int i = 0; i < n; i++ )
|
||||
c.s[i] = (double)a.s[i];
|
||||
return c;
|
||||
}
|
||||
|
||||
/** @brief Convert to double high part of vector
|
||||
|
||||
Supported input type is cv::v_int32x4. */
|
||||
CV_INLINE v_reg<double, 2> v_cvt_f64_high(const v_reg<int, 4>& a)
|
||||
{
|
||||
enum { n = 2 };
|
||||
v_reg<double, n> c;
|
||||
for( int i = 0; i < n; i++ )
|
||||
c.s[i] = (double)a.s[i + 2];
|
||||
return c;
|
||||
}
|
||||
|
||||
/** @brief Convert to double
|
||||
|
||||
Supported input type is cv::v_float32x4. */
|
||||
template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
|
||||
CV_INLINE v_reg<double, 2> v_cvt_f64(const v_reg<float, 4>& a)
|
||||
{
|
||||
enum { n = 2 };
|
||||
v_reg<double, n> c;
|
||||
for( int i = 0; i < n; i++ )
|
||||
c.s[i] = (double)a.s[i];
|
||||
return c;
|
||||
}
|
||||
|
||||
/** @brief Convert to double high part of vector
|
||||
|
||||
Supported input type is cv::v_float32x4. */
|
||||
CV_INLINE v_reg<double, 2> v_cvt_f64_high(const v_reg<float, 4>& a)
|
||||
{
|
||||
enum { n = 2 };
|
||||
v_reg<double, n> c;
|
||||
for( int i = 0; i < n; i++ )
|
||||
c.s[i] = (double)a.s[i + 2];
|
||||
return c;
|
||||
}
|
||||
|
||||
/** @brief Convert to double
|
||||
|
||||
Supported input type is cv::v_int64x2. */
|
||||
template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int64, n>& a)
|
||||
CV_INLINE v_reg<double, 2> v_cvt_f64(const v_reg<int64, 2>& a)
|
||||
{
|
||||
enum { n = 2 };
|
||||
v_reg<double, n> c;
|
||||
for( int i = 0; i < n; i++ )
|
||||
c.s[i] = (double)a.s[i];
|
||||
return c;
|
||||
}
|
||||
|
||||
/** @brief Convert to double high part of vector
|
||||
|
||||
Supported input type is cv::v_int64x2. */
|
||||
CV_INLINE v_reg<double, 2> v_cvt_f64_high(const v_reg<int64, 2>& a)
|
||||
{
|
||||
enum { n = 2 };
|
||||
v_reg<double, n> c;
|
||||
for( int i = 0; i < n; i++ )
|
||||
c.s[i] = (double)a.s[i];
|
||||
return c;
|
||||
}
|
||||
|
||||
|
||||
template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx)
|
||||
{
|
||||
v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
|
||||
@ -2038,6 +2179,28 @@ template<int n> inline v_reg<double, n> v_lut(const double* tab, const v_reg<int
|
||||
return c;
|
||||
}
|
||||
|
||||
|
||||
// Table lookup with indices held in a register: passes the register's lane
// array to the pointer-index v_lut overload.
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    const int* const lane_indices = idxvec.s;
    return v_lut(tab, lane_indices);
}
|
||||
|
||||
// Table lookup with indices held in a register: passes the register's lane
// array to the pointer-index v_lut overload.
inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
    const int* const lane_indices = idxvec.s;
    return v_lut(tab, lane_indices);
}
|
||||
|
||||
// Table lookup with indices held in a register: passes the register's lane
// array to the pointer-index v_lut overload.
inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    const int* const lane_indices = idxvec.s;
    return v_lut(tab, lane_indices);
}
|
||||
|
||||
// Table lookup with indices held in a register: passes the register's lane
// array to the pointer-index v_lut overload.
// NOTE(review): the result has two lanes while idxvec has four; presumably only
// the first two indices are consumed by the callee - confirm against
// v_lut(const _Tp*, const int*).
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    const int* const lane_indices = idxvec.s;
    return v_lut(tab, lane_indices);
}
|
||||
|
||||
|
||||
template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx,
|
||||
v_reg<float, n>& x, v_reg<float, n>& y)
|
||||
{
|
||||
@ -2062,7 +2225,7 @@ template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg<in
|
||||
|
||||
template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_pairs(const v_reg<_Tp, n>& vec)
|
||||
{
|
||||
v_reg<float, n> c;
|
||||
v_reg<_Tp, n> c;
|
||||
for (int i = 0; i < n/4; i++)
|
||||
{
|
||||
c.s[4*i ] = vec.s[4*i ];
|
||||
@ -2075,7 +2238,7 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_pairs(const v_re
|
||||
|
||||
template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_quads(const v_reg<_Tp, n>& vec)
|
||||
{
|
||||
v_reg<float, n> c;
|
||||
v_reg<_Tp, n> c;
|
||||
for (int i = 0; i < n/8; i++)
|
||||
{
|
||||
c.s[8*i ] = vec.s[8*i ];
|
||||
@ -2092,7 +2255,7 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_quads(const v_re
|
||||
|
||||
template<typename _Tp, int n> inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec)
|
||||
{
|
||||
v_reg<float, n> c;
|
||||
v_reg<_Tp, n> c;
|
||||
for (int i = 0; i < n/4; i++)
|
||||
{
|
||||
c.s[3*i ] = vec.s[4*i ];
|
||||
@ -2523,6 +2686,17 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
|
||||
v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]);
|
||||
}
|
||||
|
||||
|
||||
// Dot product of 32-bit integer lanes, computed in double precision.
// The low halves (lanes 0,1) and the high halves (lanes 2,3) of a and b are
// converted to double; the product of the high halves is fed to v_fma as the
// addend for the low halves.
// NOTE(review): assuming v_fma(x, y, z) computes x*y + z, result lane i is
// a[i]*b[i] + a[i+2]*b[i+2] - confirm this lane pairing is the intended one.
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{
    const v_float64x2 high_products = v_cvt_f64_high(a) * v_cvt_f64_high(b);
    return v_fma(v_cvt_f64(a), v_cvt_f64(b), high_products);
}
|
||||
// Accumulating form: same as the two-argument v_dotprod_expand, with c chained
// in as the addend of the inner v_fma (high halves), whose result is the
// addend of the outer v_fma (low halves).
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{
    const v_float64x2 high_plus_c = v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c);
    return v_fma(v_cvt_f64(a), v_cvt_f64(b), high_plus_c);
}
|
||||
|
||||
// "Fast" variant: in this implementation it simply forwards to
// v_dotprod_expand(a, b).
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{
    return v_dotprod_expand(a, b);
}
|
||||
// "Fast" accumulating variant: in this implementation it simply forwards to
// v_dotprod_expand(a, b, c).
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{
    return v_dotprod_expand(a, b, c);
}
|
||||
|
||||
////// FP16 support ///////
|
||||
|
||||
inline v_reg<float, V_TypeTraits<float>::nlanes128>
|
||||
@ -2537,7 +2711,7 @@ v_load_expand(const float16_t* ptr)
|
||||
}
|
||||
|
||||
inline void
|
||||
v_pack_store(float16_t* ptr, v_reg<float, V_TypeTraits<float>::nlanes128>& v)
|
||||
v_pack_store(float16_t* ptr, const v_reg<float, V_TypeTraits<float>::nlanes128>& v)
|
||||
{
|
||||
for( int i = 0; i < v.nlanes; i++ )
|
||||
{
|
||||
|
@ -1522,7 +1522,8 @@ struct InRange_SIMD<float>
|
||||
v_float32 low2 = vx_load(src2 + x + v_float32::nlanes);
|
||||
v_float32 high2 = vx_load(src3 + x + v_float32::nlanes);
|
||||
|
||||
v_pack_store(dst + x, v_pack(v_reinterpret_as_u32((values1 >= low1) & (high1 >= values1)), v_reinterpret_as_u32((values2 >= low2) & (high2 >= values2))));
|
||||
v_pack_store(dst + x, v_pack(v_reinterpret_as_u32(values1 >= low1) & v_reinterpret_as_u32(high1 >= values1),
|
||||
v_reinterpret_as_u32(values2 >= low2) & v_reinterpret_as_u32(high2 >= values2)));
|
||||
}
|
||||
vx_cleanup();
|
||||
return x;
|
||||
|
@ -1593,7 +1593,7 @@ struct op_div_f
|
||||
{
|
||||
static inline Tvec r(const Tvec& a, const Tvec& b)
|
||||
{
|
||||
const Tvec v_zero = Tvec();
|
||||
const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
|
||||
return v_select(b == v_zero, v_zero, a / b);
|
||||
}
|
||||
static inline T1 r(T1 a, T1 b)
|
||||
@ -1620,7 +1620,7 @@ struct op_div_scale
|
||||
}
|
||||
static inline Tvec pre(const Tvec& denom, const Tvec& res)
|
||||
{
|
||||
const Tvec v_zero = Tvec();
|
||||
const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
|
||||
return v_select(denom == v_zero, v_zero, res);
|
||||
}
|
||||
static inline T1 r(T1 a, T1 denom, const T2* scalar)
|
||||
@ -1860,7 +1860,7 @@ struct op_recip
|
||||
}
|
||||
static inline Tvec pre(const Tvec& denom, const Tvec& res)
|
||||
{
|
||||
const Tvec v_zero = Tvec();
|
||||
const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
|
||||
return v_select(denom == v_zero, v_zero, res);
|
||||
}
|
||||
static inline T1 r(T1 denom, const T2* scalar)
|
||||
|
@ -916,8 +916,9 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
|
||||
result = true;
|
||||
d = 1./d;
|
||||
#if CV_SIMD128
|
||||
static const float CV_DECL_ALIGNED(16) inv[4] = { 0.f,-0.f,-0.f,0.f };
|
||||
v_float32x4 s0 = (v_load_halves((const float*)srcdata, (const float*)(srcdata + srcstep)) * v_setall_f32((float)d)) ^ v_load((const float *)inv);//0123//3120
|
||||
const float d_32f = (float)d;
|
||||
const v_float32x4 d_vec(d_32f, -d_32f, -d_32f, d_32f);
|
||||
v_float32x4 s0 = v_load_halves((const float*)srcdata, (const float*)(srcdata + srcstep)) * d_vec;//0123//3120
|
||||
s0 = v_extract<3>(s0, v_combine_low(v_rotate_right<1>(s0), s0));
|
||||
v_store_low((float*)dstdata, s0);
|
||||
v_store_high((float*)(dstdata + dststep), s0);
|
||||
@ -946,7 +947,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
|
||||
v_float64x2 s0 = v_load((const double*)srcdata) * det;
|
||||
v_float64x2 s1 = v_load((const double*)(srcdata+srcstep)) * det;
|
||||
v_float64x2 sm = v_extract<1>(s1, s0);//30
|
||||
v_float64x2 ss = v_extract<1>(s0, s1) ^ v_setall_f64(-0.);//12
|
||||
v_float64x2 ss = v_setall<double>(0) - v_extract<1>(s0, s1);//12
|
||||
v_store((double*)dstdata, v_combine_low(sm, ss));//31
|
||||
v_store((double*)(dstdata + dststep), v_combine_high(ss, sm));//20
|
||||
#else
|
||||
|
@ -725,7 +725,7 @@ void log32f( const float *_x, float *y, int n )
|
||||
|
||||
yf0 = v_fma(v_cvt_f32(yi0), vln2, yf0);
|
||||
|
||||
v_float32 delta = v_reinterpret_as_f32(h0 == vx_setall_s32(510)) & vshift;
|
||||
v_float32 delta = v_select(v_reinterpret_as_f32(h0 == vx_setall_s32(510)), vshift, vx_setall<float>(0));
|
||||
xf0 = v_fma((v_reinterpret_as_f32(xi0) - v1), xf0, delta);
|
||||
|
||||
v_float32 zf0 = v_fma(xf0, vA0, vA1);
|
||||
|
@ -3,22 +3,14 @@
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
#include "test_precomp.hpp"
|
||||
|
||||
// see "opencv2/core/hal/intrin.hpp"
|
||||
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_EMULATOR_CPP
|
||||
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_EMULATOR_CPP {
|
||||
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
|
||||
|
||||
// see "opencv2/core/private/cv_cpu_include_simd_declarations.hpp"
|
||||
//#define CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
|
||||
#define CV_FORCE_SIMD128_CPP
|
||||
#undef CV_FORCE_SIMD128_CPP
|
||||
#define CV_FORCE_SIMD128_CPP 1
|
||||
#undef CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
|
||||
#undef CV_CPU_OPTIMIZATION_NAMESPACE_END
|
||||
#define CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN namespace opt_EMULATOR_CPP {
|
||||
#define CV_CPU_OPTIMIZATION_NAMESPACE_END }
|
||||
#include "test_intrin128.simd.hpp"
|
||||
#undef CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
|
||||
#undef CV_CPU_OPTIMIZATION_NAMESPACE_END
|
||||
#undef CV_CPU_DISPATCH_MODE
|
||||
#undef CV_FORCE_SIMD128_CPP
|
||||
|
||||
// tests implementation is in test_intrin_utils.hpp
|
||||
|
@ -222,7 +222,10 @@ template <typename R> std::ostream & operator<<(std::ostream & out, const Data<R
|
||||
return out;
|
||||
}
|
||||
|
||||
template<typename T> static inline void EXPECT_COMPARE_EQ_(const T a, const T b);
|
||||
template<typename T> static inline void EXPECT_COMPARE_EQ_(const T a, const T b)
|
||||
{
|
||||
EXPECT_EQ(a, b);
|
||||
}
|
||||
template<> inline void EXPECT_COMPARE_EQ_<float>(const float a, const float b)
|
||||
{
|
||||
EXPECT_FLOAT_EQ( a, b );
|
||||
@ -742,12 +745,12 @@ template<typename R> struct TheTest
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
SCOPED_TRACE(cv::format("i=%d", i));
|
||||
EXPECT_EQ((double)dataA[i*2] * (double)dataA[i*2] +
|
||||
(double)dataA[i*2 + 1] * (double)dataA[i*2 + 1], resA[i]);
|
||||
EXPECT_EQ((double)dataB[i*2] * (double)dataB[i*2] +
|
||||
(double)dataB[i*2 + 1] * (double)dataB[i*2 + 1], resB[i]);
|
||||
EXPECT_EQ((double)dataA[i*2] * (double)dataB[i*2] +
|
||||
(double)dataA[i*2 + 1] * (double)dataB[i*2 + 1] + dataC[i], resC[i]);
|
||||
EXPECT_COMPARE_EQ((double)dataA[i*2] * (double)dataA[i*2] +
|
||||
(double)dataA[i*2 + 1] * (double)dataA[i*2 + 1], resA[i]);
|
||||
EXPECT_COMPARE_EQ((double)dataB[i*2] * (double)dataB[i*2] +
|
||||
(double)dataB[i*2 + 1] * (double)dataB[i*2 + 1], resB[i]);
|
||||
EXPECT_COMPARE_EQ((double)dataA[i*2] * (double)dataB[i*2] +
|
||||
(double)dataA[i*2 + 1] * (double)dataB[i*2 + 1] + dataC[i], resC[i]);
|
||||
}
|
||||
#endif
|
||||
return *this;
|
||||
|
@ -303,7 +303,8 @@ int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold)
|
||||
for (k = 0; k < N; k++)
|
||||
d[k] = (short)(v - ptr[pixel[k]]);
|
||||
|
||||
#if CV_SIMD128
|
||||
#if CV_SIMD128 \
|
||||
&& (!defined(CV_SIMD128_CPP) || (!defined(__GNUC__) || __GNUC__ != 5)) // "movdqa" bug on "v_load(d + 1)" line (Ubuntu 16.04 + GCC 5.4)
|
||||
if (true)
|
||||
{
|
||||
v_int16x8 v0 = v_load(d + 1);
|
||||
|
@ -42,6 +42,7 @@
|
||||
//M*/
|
||||
|
||||
#include "precomp.hpp"
|
||||
#undef CV_FORCE_SIMD128_CPP // expected AVX implementation only
|
||||
#include "opencv2/core/hal/intrin.hpp"
|
||||
#include "corner.hpp"
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user