Merge pull request #12516 from seiko2plus:changeUnvMultiply16

Alexander Alekhin 2018-10-15 12:07:40 +00:00
commit 1cc3f7abbb
15 changed files with 2404 additions and 2060 deletions

View File

@@ -139,8 +139,14 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
# undef CV_FP16
#endif
+#if CV_SSE2 || CV_NEON || CV_VSX
+#define CV__SIMD_FORWARD 128
+#include "opencv2/core/hal/intrin_forward.hpp"
+#endif
#if CV_SSE2
+#include "opencv2/core/hal/intrin_sse_em.hpp"
#include "opencv2/core/hal/intrin_sse.hpp"
#elif CV_NEON

@@ -168,6 +174,8 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load())
#if CV_AVX2
+#define CV__SIMD_FORWARD 256
+#include "opencv2/core/hal/intrin_forward.hpp"
#include "opencv2/core/hal/intrin_avx.hpp"
#endif

View File

@@ -82,6 +82,14 @@ inline __m128 _v256_extract_low(const __m256& v)
inline __m128d _v256_extract_low(const __m256d& v)
{ return _mm256_castpd256_pd128(v); }

+inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
+{
+    const __m256i m = _mm256_set1_epi32(65535);
+    __m256i am = _mm256_min_epu32(a, m);
+    __m256i bm = _mm256_min_epu32(b, m);
+    return _mm256_packus_epi32(am, bm);
+}
+
///////// Types ////////////

struct v_uint8x32
@@ -626,10 +634,8 @@ OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16)
-OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint16x16, _mm256_mullo_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16)
-OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int16x16, _mm256_mullo_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32)

@@ -650,13 +656,103 @@ OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd)
// saturating multiply 8-bit, 16-bit
inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
{
v_uint16x16 c, d;
v_mul_expand(a, b, c, d);
return v_pack_u(v_reinterpret_as_s16(c), v_reinterpret_as_s16(d));
}
inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
{
v_int16x16 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
{
__m256i pl = _mm256_mullo_epi16(a.val, b.val);
__m256i ph = _mm256_mulhi_epu16(a.val, b.val);
__m256i p0 = _mm256_unpacklo_epi16(pl, ph);
__m256i p1 = _mm256_unpackhi_epi16(pl, ph);
return v_uint16x16(_v256_packs_epu32(p0, p1));
}
inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
{
__m256i pl = _mm256_mullo_epi16(a.val, b.val);
__m256i ph = _mm256_mulhi_epi16(a.val, b.val);
__m256i p0 = _mm256_unpacklo_epi16(pl, ph);
__m256i p1 = _mm256_unpackhi_epi16(pl, ph);
return v_int16x16(_mm256_packs_epi32(p0, p1));
}
inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
{ a = a * b; return a; }
inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
{ a = a * b; return a; }
inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
{ a = a * b; return a; }
inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
{ a = a * b; return a; }
/** Non-saturating arithmetics **/
#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_uint16x16, _mm256_mullo_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_int16x16, _mm256_mullo_epi16)
inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
{
__m256i ad = _mm256_srai_epi16(a.val, 8);
__m256i bd = _mm256_srai_epi16(b.val, 8);
__m256i p0 = _mm256_mullo_epi16(a.val, b.val); // even
__m256i p1 = _mm256_slli_epi16(_mm256_mullo_epi16(ad, bd), 8); // odd
const __m256i b01 = _mm256_set1_epi32(0xFF00FF00);
return v_uint8x32(_mm256_blendv_epi8(p0, p1, b01));
}
inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b)
{
return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
}
// Multiply and expand
inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b,
v_uint16x16& c, v_uint16x16& d)
{
v_uint16x16 a0, a1, b0, b1;
v_expand(a, a0, a1);
v_expand(b, b0, b1);
c = v_mul_wrap(a0, b0);
d = v_mul_wrap(a1, b1);
}
inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b,
v_int16x16& c, v_int16x16& d)
{
v_int16x16 a0, a1, b0, b1;
v_expand(a, a0, a1);
v_expand(b, b0, b1);
c = v_mul_wrap(a0, b0);
d = v_mul_wrap(a1, b1);
}
inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
                         v_int32x8& c, v_int32x8& d)
{
    v_int16x16 vhi = v_int16x16(_mm256_mulhi_epi16(a.val, b.val));
    v_int16x16 v0, v1;
-    v_zip(a * b, vhi, v0, v1);
+    v_zip(v_mul_wrap(a, b), vhi, v0, v1);
    c = v_reinterpret_as_s32(v0);
    d = v_reinterpret_as_s32(v1);

@@ -668,7 +764,7 @@ inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
    v_uint16x16 vhi = v_uint16x16(_mm256_mulhi_epu16(a.val, b.val));
    v_uint16x16 v0, v1;
-    v_zip(a * b, vhi, v0, v1);
+    v_zip(v_mul_wrap(a, b), vhi, v0, v1);
    c = v_reinterpret_as_u32(v0);
    d = v_reinterpret_as_u32(v1);

@@ -685,20 +781,6 @@ inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b) { return v_int16x16(_mm256_mulhi_epi16(a.val, b.val)); }
inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); }

-/** Non-saturating arithmetics **/
-#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
-    inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
-    { return _Tpvec(intrin(a.val, b.val)); }
-
-OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8)
-OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8)
-OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16)
-OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16)
-OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8)
-OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8)
-OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16)
-OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16)
-
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
@@ -1385,6 +1467,10 @@ OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_ca
    b0.val = intrin(_v256_extract_low(a.val)); \
    b1.val = intrin(_v256_extract_high(a.val)); \
} \
+inline _Tpwvec v_expand_low(const _Tpvec& a) \
+{ return _Tpwvec(intrin(_v256_extract_low(a.val))); } \
+inline _Tpwvec v_expand_high(const _Tpvec& a) \
+{ return _Tpwvec(intrin(_v256_extract_high(a.val))); } \
inline _Tpwvec v256_load_expand(const _Tp* ptr) \
{ \
    __m128i a = _mm_loadu_si128((const __m128i*)ptr); \
@@ -1430,7 +1516,12 @@ inline void v_pack_store(schar* ptr, const v_int16x16& a)
{ v_store_low(ptr, v_pack(a, a)); }

inline void v_pack_store(uchar* ptr, const v_uint16x16& a)
-{ v_store_low(ptr, v_pack(a, a)); }
+{
+    const __m256i m = _mm256_set1_epi16(255);
+    __m256i am = _mm256_min_epu16(a.val, m);
+    am = _v256_shuffle_odd_64(_mm256_packus_epi16(am, am));
+    v_store_low(ptr, v_uint8x32(am));
+}

inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
{ v_store_low(ptr, v_pack_u(a, a)); }
@@ -1484,16 +1575,21 @@ inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
{ return v_int16x16(_v256_shuffle_odd_64(_mm256_packs_epi32(a.val, b.val))); }

inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
-{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); }
+{ return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); }

inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
-{ return v_pack(v_reinterpret_as_u32(a), v_reinterpret_as_u32(b)); }
+{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); }

inline void v_pack_store(short* ptr, const v_int32x8& a)
{ v_store_low(ptr, v_pack(a, a)); }

inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
-{ v_store_low(ptr, v_pack(a, a)); }
+{
+    const __m256i m = _mm256_set1_epi32(65535);
+    __m256i am = _mm256_min_epu32(a.val, m);
+    am = _v256_shuffle_odd_64(_mm256_packus_epi32(am, am));
+    v_store_low(ptr, v_uint16x16(am));
+}

inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
{ v_store_low(ptr, v_pack_u(a, a)); }
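For reference, the reason the unsigned 32-to-16 v_pack above now goes through _v256_packs_epu32 is that _mm256_packus_epi32 treats its inputs as signed, so an unsigned lane with the top bit set would pack to 0 instead of 65535; clamping with _mm256_min_epu32 first avoids that. A minimal scalar model of the intended per-lane behaviour (plain standalone C++, not part of the patch):

#include <cstdint>
#include <cassert>

// Scalar model of packing one unsigned 32-bit lane into an unsigned 16-bit
// lane with saturation, as _v256_packs_epu32 does for whole vectors.
static inline uint16_t pack_u32_to_u16(uint32_t x)
{
    return x > 65535u ? (uint16_t)65535u : (uint16_t)x;
}

int main()
{
    assert(pack_u32_to_u16(1000u)       == 1000u);
    assert(pack_u32_to_u16(70000u)      == 65535u); // clamped
    assert(pack_u32_to_u16(0x80000000u) == 65535u); // a signed-only pack would clamp this to 0
    return 0;
}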

View File

@@ -108,7 +108,7 @@ block and to save contents of the register to memory block.
These operations allow to reorder or recombine elements in one or multiple vectors.

- Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
-- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
+- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand, @ref v_expand_low, @ref v_expand_high
- Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
  @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
- Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
@@ -185,11 +185,14 @@ Regular integers:
|load, store        | x | x | x | x | x | x |
|interleave         | x | x | x | x | x | x |
|expand             | x | x | x | x | x | x |
+|expand_low         | x | x | x | x | x | x |
+|expand_high        | x | x | x | x | x | x |
|expand_q           | x | x |   |   |   |   |
|add, sub           | x | x | x | x | x | x |
|add_wrap, sub_wrap  | x | x | x | x |   |   |
-|mul                |   |   | x | x | x | x |
-|mul_expand         |   |   | x | x | x |   |
+|mul_wrap           | x | x | x | x |   |   |
+|mul                | x | x | x | x | x | x |
+|mul_expand         | x | x | x | x | x |   |
|compare            | x | x | x | x | x | x |
|shift              |   |   | x | x | x | x |
|dotprod            |   |   |   | x |   |   |
@@ -680,7 +683,7 @@ OPENCV_HAL_IMPL_CMP_OP(!=)
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \
+#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \
template<typename _Tp, int n> \
inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
@@ -694,12 +697,17 @@ inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
/** @brief Add values without saturation

For 8- and 16-bit integer values. */
-OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp)
+OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp)

/** @brief Subtract values without saturation

For 8- and 16-bit integer values. */
-OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp)
+OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp)
+
+/** @brief Multiply values without saturation
+
+For 8- and 16-bit integer values. */
+OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp)

//! @cond IGNORED
template<typename T> inline T _absdiff(T a, T b)
@@ -1106,6 +1114,44 @@ template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
    }
}
/** @brief Expand lower values to the wider pack type
Same as cv::v_expand, but return lower half of the vector.
Scheme:
@code
int32x4 int64x2
{A B C D} ==> {A B}
@endcode */
template<typename _Tp, int n>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_expand_low(const v_reg<_Tp, n>& a)
{
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
for( int i = 0; i < (n/2); i++ )
b.s[i] = a.s[i];
return b;
}
/** @brief Expand higher values to the wider pack type
Same as cv::v_expand_low, but expand higher half of the vector instead.
Scheme:
@code
int32x4 int64x2
{A B C D} ==> {C D}
@endcode */
template<typename _Tp, int n>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_expand_high(const v_reg<_Tp, n>& a)
{
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
for( int i = 0; i < (n/2); i++ )
b.s[i] = a.s[i+(n/2)];
return b;
}
//! @cond IGNORED
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
    v_reinterpret_as_int(const v_reg<_Tp, n>& a)
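A short usage sketch for the new v_expand_low / v_expand_high helpers documented above, assuming OpenCV's universal-intrinsics header is available and a 128-bit backend is enabled; the function and buffer names are illustrative only, not part of the patch:

#include <opencv2/core/hal/intrin.hpp>

// Widen the two halves of a uint8 vector without the two-output v_expand form.
static void widen_halves(const uchar* src, ushort* lo, ushort* hi)
{
    using namespace cv;
    v_uint8x16 a = v_load(src);        // 16 x uint8
    v_uint16x8 l = v_expand_low(a);    // lanes 0..7  widened to uint16
    v_uint16x8 h = v_expand_high(a);   // lanes 8..15 widened to uint16
    v_store(lo, l);
    v_store(hi, h);
}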

View File

@ -0,0 +1,158 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#ifndef CV__SIMD_FORWARD
#error "Need to pre-define forward width"
#endif
namespace cv
{
//! @cond IGNORED
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
/** Types **/
#if CV__SIMD_FORWARD == 512
// [todo] 512
#error "AVX512 Not implemented yet"
#elif CV__SIMD_FORWARD == 256
// 256
#define __CV_VX(fun) v256_##fun
#define __CV_V_UINT8 v_uint8x32
#define __CV_V_INT8 v_int8x32
#define __CV_V_UINT16 v_uint16x16
#define __CV_V_INT16 v_int16x16
#define __CV_V_UINT32 v_uint32x8
#define __CV_V_INT32 v_int32x8
#define __CV_V_UINT64 v_uint64x4
#define __CV_V_INT64 v_int64x4
#define __CV_V_FLOAT32 v_float32x8
#define __CV_V_FLOAT64 v_float64x4
struct v_uint8x32;
struct v_int8x32;
struct v_uint16x16;
struct v_int16x16;
struct v_uint32x8;
struct v_int32x8;
struct v_uint64x4;
struct v_int64x4;
struct v_float32x8;
struct v_float64x4;
#else
// 128
#define __CV_VX(fun) v_##fun
#define __CV_V_UINT8 v_uint8x16
#define __CV_V_INT8 v_int8x16
#define __CV_V_UINT16 v_uint16x8
#define __CV_V_INT16 v_int16x8
#define __CV_V_UINT32 v_uint32x4
#define __CV_V_INT32 v_int32x4
#define __CV_V_UINT64 v_uint64x2
#define __CV_V_INT64 v_int64x2
#define __CV_V_FLOAT32 v_float32x4
#define __CV_V_FLOAT64 v_float64x2
struct v_uint8x16;
struct v_int8x16;
struct v_uint16x8;
struct v_int16x8;
struct v_uint32x4;
struct v_int32x4;
struct v_uint64x2;
struct v_int64x2;
struct v_float32x4;
struct v_float64x2;
#endif
/** Value reordering **/
// Expansion
void v_expand(const __CV_V_UINT8&, __CV_V_UINT16&, __CV_V_UINT16&);
void v_expand(const __CV_V_INT8&, __CV_V_INT16&, __CV_V_INT16&);
void v_expand(const __CV_V_UINT16&, __CV_V_UINT32&, __CV_V_UINT32&);
void v_expand(const __CV_V_INT16&, __CV_V_INT32&, __CV_V_INT32&);
void v_expand(const __CV_V_UINT32&, __CV_V_UINT64&, __CV_V_UINT64&);
void v_expand(const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&);
// Low Expansion
__CV_V_UINT16 v_expand_low(const __CV_V_UINT8&);
__CV_V_INT16 v_expand_low(const __CV_V_INT8&);
__CV_V_UINT32 v_expand_low(const __CV_V_UINT16&);
__CV_V_INT32 v_expand_low(const __CV_V_INT16&);
__CV_V_UINT64 v_expand_low(const __CV_V_UINT32&);
__CV_V_INT64 v_expand_low(const __CV_V_INT32&);
// High Expansion
__CV_V_UINT16 v_expand_high(const __CV_V_UINT8&);
__CV_V_INT16 v_expand_high(const __CV_V_INT8&);
__CV_V_UINT32 v_expand_high(const __CV_V_UINT16&);
__CV_V_INT32 v_expand_high(const __CV_V_INT16&);
__CV_V_UINT64 v_expand_high(const __CV_V_UINT32&);
__CV_V_INT64 v_expand_high(const __CV_V_INT32&);
// Load & Low Expansion
__CV_V_UINT16 __CV_VX(load_expand)(const uchar*);
__CV_V_INT16 __CV_VX(load_expand)(const schar*);
__CV_V_UINT32 __CV_VX(load_expand)(const ushort*);
__CV_V_INT32 __CV_VX(load_expand)(const short*);
__CV_V_UINT64 __CV_VX(load_expand)(const uint*);
__CV_V_INT64 __CV_VX(load_expand)(const int*);
// Load lower 8-bit and expand into 32-bit
__CV_V_UINT32 __CV_VX(load_expand_q)(const uchar*);
__CV_V_INT32 __CV_VX(load_expand_q)(const schar*);
// Saturating Pack
__CV_V_UINT8 v_pack(const __CV_V_UINT16&, const __CV_V_UINT16&);
__CV_V_INT8 v_pack(const __CV_V_INT16&, const __CV_V_INT16&);
__CV_V_UINT16 v_pack(const __CV_V_UINT32&, const __CV_V_UINT32&);
__CV_V_INT16 v_pack(const __CV_V_INT32&, const __CV_V_INT32&);
// Non-saturating Pack
__CV_V_UINT32 v_pack(const __CV_V_UINT64&, const __CV_V_UINT64&);
__CV_V_INT32 v_pack(const __CV_V_INT64&, const __CV_V_INT64&);
// Pack signed integers with unsigned saturation
__CV_V_UINT8 v_pack_u(const __CV_V_INT16&, const __CV_V_INT16&);
__CV_V_UINT16 v_pack_u(const __CV_V_INT32&, const __CV_V_INT32&);
/** Arithmetic, bitwise and comparison operations **/
// Non-saturating multiply
#if CV_VSX
template<typename Tvec>
Tvec v_mul_wrap(const Tvec& a, const Tvec& b);
#else
__CV_V_UINT8 v_mul_wrap(const __CV_V_UINT8&, const __CV_V_UINT8&);
__CV_V_INT8 v_mul_wrap(const __CV_V_INT8&, const __CV_V_INT8&);
__CV_V_UINT16 v_mul_wrap(const __CV_V_UINT16&, const __CV_V_UINT16&);
__CV_V_INT16 v_mul_wrap(const __CV_V_INT16&, const __CV_V_INT16&);
#endif
// Multiply and expand
#if CV_VSX
template<typename Tvec, typename Twvec>
void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d);
#else
void v_mul_expand(const __CV_V_UINT8&, const __CV_V_UINT8&, __CV_V_UINT16&, __CV_V_UINT16&);
void v_mul_expand(const __CV_V_INT8&, const __CV_V_INT8&, __CV_V_INT16&, __CV_V_INT16&);
void v_mul_expand(const __CV_V_UINT16&, const __CV_V_UINT16&, __CV_V_UINT32&, __CV_V_UINT32&);
void v_mul_expand(const __CV_V_INT16&, const __CV_V_INT16&, __CV_V_INT32&, __CV_V_INT32&);
void v_mul_expand(const __CV_V_UINT32&, const __CV_V_UINT32&, __CV_V_UINT64&, __CV_V_UINT64&);
void v_mul_expand(const __CV_V_INT32&, const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&);
#endif
/** Cleanup **/
#undef CV__SIMD_FORWARD
#undef __CV_VX
#undef __CV_V_UINT8
#undef __CV_V_INT8
#undef __CV_V_UINT16
#undef __CV_V_INT16
#undef __CV_V_UINT32
#undef __CV_V_INT32
#undef __CV_V_UINT64
#undef __CV_V_INT64
#undef __CV_V_FLOAT32
#undef __CV_V_FLOAT64
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond
} // cv::
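For reference, the intended use of this forward-declaration header is the pattern already visible in the intrin.hpp hunk near the top of this diff: the includer sets the target vector width before including it, and the header undefines its own helper macros when done. A sketch of the two cases the patch uses:

// 128-bit forward declarations (done once for SSE2 / NEON / VSX builds):
#define CV__SIMD_FORWARD 128
#include "opencv2/core/hal/intrin_forward.hpp"

// 256-bit forward declarations (done right before including intrin_avx.hpp):
#define CV__SIMD_FORWARD 256
#include "opencv2/core/hal/intrin_forward.hpp"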

View File

@@ -435,10 +435,8 @@ OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
-OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
-OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)

@@ -476,6 +474,37 @@ inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
}
#endif
// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpwvec c, d; \
v_mul_expand(a, b, c, d); \
return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int16x8, v_int32x4)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint16x8, v_uint32x4)
// Multiply and expand
inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
v_int16x8& c, v_int16x8& d)
{
c.val = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
d.val = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
}
inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
v_uint16x8& c, v_uint16x8& d)
{
c.val = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
d.val = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
}
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{

@@ -714,6 +743,10 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_mul_wrap, vmulq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16)

// TODO: absdiff for signed integers
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)

@@ -1056,6 +1089,14 @@ inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
    b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
    b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
} \
+inline _Tpwvec v_expand_low(const _Tpvec& a) \
+{ \
+    return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
+} \
+inline _Tpwvec v_expand_high(const _Tpvec& a) \
+{ \
+    return _Tpwvec(vmovl_##suffix(vget_high_##suffix(a.val))); \
+} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
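With this change operator * on 8-bit and 16-bit vectors saturates (widen, multiply, pack), while the old wrap-around behaviour stays available as v_mul_wrap. A scalar model of the per-lane difference, plain standalone C++ and not part of the patch:

#include <cstdint>
#include <cassert>

// Per-lane model of the new saturating 16-bit unsigned multiply:
// widen, multiply, then clamp back to the lane type.
static inline uint16_t mul_sat_u16(uint16_t a, uint16_t b)
{
    uint32_t p = (uint32_t)a * b;
    return p > 65535u ? (uint16_t)65535u : (uint16_t)p;
}

// Per-lane model of v_mul_wrap (the previous operator* behaviour): keep the low bits.
static inline uint16_t mul_wrap_u16(uint16_t a, uint16_t b)
{
    return (uint16_t)(a * b);   // truncation == product modulo 2^16
}

int main()
{
    assert(mul_sat_u16 (300, 300) == 65535);  // 90000 clamps
    assert(mul_wrap_u16(300, 300) == 24464);  // 90000 mod 65536
    return 0;
}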

View File

@@ -59,6 +59,8 @@ namespace cv
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

+///////// Types ////////////
+
struct v_uint8x16
{
    typedef uchar lane_type;

@@ -436,13 +438,7 @@ inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
}

inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
-{
-    __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
-    __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
-    __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32);
-    __m128i r = _mm_packs_epi32(a1, b1);
-    return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
-}
+{ return v_uint16x8(_v128_packs_epu32(a.val, b.val)); }

inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
{
@@ -678,14 +674,14 @@ OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
-OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
-OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
@@ -699,35 +695,49 @@ OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)

-inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b)
-{
-    __m128i c0 = _mm_mul_epu32(a.val, b.val);
-    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
-    __m128i d0 = _mm_unpacklo_epi32(c0, c1);
-    __m128i d1 = _mm_unpackhi_epi32(c0, c1);
-    return v_uint32x4(_mm_unpacklo_epi64(d0, d1));
-}
-inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b)
-{
-#if CV_SSE4_1
-    return v_int32x4(_mm_mullo_epi32(a.val, b.val));
-#else
-    __m128i c0 = _mm_mul_epu32(a.val, b.val);
-    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
-    __m128i d0 = _mm_unpacklo_epi32(c0, c1);
-    __m128i d1 = _mm_unpackhi_epi32(c0, c1);
-    return v_int32x4(_mm_unpacklo_epi64(d0, d1));
-#endif
-}
-inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b)
-{
-    a = a * b;
-    return a;
-}
-inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b)
-{
-    a = a * b;
-    return a;
-}
+// saturating multiply 8-bit, 16-bit
+#define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec) \
+    inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
+    { \
+        _Tpwvec c, d; \
+        v_mul_expand(a, b, c, d); \
+        return v_pack(c, d); \
+    } \
+    inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
+    { a = a * b; return a; }
+
+OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8)
+OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint16x8, v_uint32x4)
+OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int16x8, v_int32x4)
+
+inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b)
+{
+    v_uint16x8 c, d;
+    v_mul_expand(a, b, c, d);
+    return v_pack_u(v_reinterpret_as_s16(c), v_reinterpret_as_s16(d));
+}
+inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b)
+{ a = a * b; return a; }
+
+// Multiply and expand
+inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
+                         v_uint16x8& c, v_uint16x8& d)
+{
+    v_uint16x8 a0, a1, b0, b1;
+    v_expand(a, a0, a1);
+    v_expand(b, b0, b1);
+    c = v_mul_wrap(a0, b0);
+    d = v_mul_wrap(a1, b1);
+}
+
+inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
+                         v_int16x8& c, v_int16x8& d)
+{
+    v_int16x8 a0, a1, b0, b1;
+    v_expand(a, a0, a1);
+    v_expand(b, b0, b1);
+    c = v_mul_wrap(a0, b0);
+    d = v_mul_wrap(a1, b1);
+}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
@@ -1018,6 +1028,22 @@ OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_mul_wrap, _mm_mullo_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_mul_wrap, _mm_mullo_epi16)
inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
{
__m128i ad = _mm_srai_epi16(a.val, 8);
__m128i bd = _mm_srai_epi16(b.val, 8);
__m128i p0 = _mm_mullo_epi16(a.val, b.val); // even
__m128i p1 = _mm_slli_epi16(_mm_mullo_epi16(ad, bd), 8); // odd
const __m128i b01 = _mm_set1_epi32(0xFF00FF00);
return v_uint8x16(_v128_blendv_epi8(p0, p1, b01));
}
inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
{
return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
}
#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \
inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \

@@ -1502,70 +1528,39 @@ OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
#endif

-#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \
-inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \
-{ \
-    __m128i z = _mm_setzero_si128(); \
-    b0.val = _mm_unpacklo_##suffix(a.val, z); \
-    b1.val = _mm_unpackhi_##suffix(a.val, z); \
-} \
-inline _Tpwuvec v_load_expand(const _Tpu* ptr) \
-{ \
-    __m128i z = _mm_setzero_si128(); \
-    return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \
-} \
-inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \
-{ \
-    b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \
-    b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \
-} \
-inline _Tpwsvec v_load_expand(const _Tps* ptr) \
-{ \
-    __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
-    return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \
-}
-
-OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8)
-OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16)
-
-inline void v_expand(const v_uint32x4& a, v_uint64x2& b0, v_uint64x2& b1)
-{
-    __m128i z = _mm_setzero_si128();
-    b0.val = _mm_unpacklo_epi32(a.val, z);
-    b1.val = _mm_unpackhi_epi32(a.val, z);
-}
-inline v_uint64x2 v_load_expand(const unsigned* ptr)
-{
-    __m128i z = _mm_setzero_si128();
-    return v_uint64x2(_mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i*)ptr), z));
-}
-inline void v_expand(const v_int32x4& a, v_int64x2& b0, v_int64x2& b1)
-{
-    __m128i s = _mm_srai_epi32(a.val, 31);
-    b0.val = _mm_unpacklo_epi32(a.val, s);
-    b1.val = _mm_unpackhi_epi32(a.val, s);
-}
-inline v_int64x2 v_load_expand(const int* ptr)
-{
-    __m128i a = _mm_loadl_epi64((const __m128i*)ptr);
-    __m128i s = _mm_srai_epi32(a, 31);
-    return v_int64x2(_mm_unpacklo_epi32(a, s));
-}
-
-inline v_uint32x4 v_load_expand_q(const uchar* ptr)
-{
-    __m128i z = _mm_setzero_si128();
-    __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
-    return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z));
-}
-inline v_int32x4 v_load_expand_q(const schar* ptr)
-{
-    __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
-    a = _mm_unpacklo_epi8(a, a);
-    a = _mm_unpacklo_epi8(a, a);
-    return v_int32x4(_mm_srai_epi32(a, 24));
-}
+/* Expand */
+#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+{ \
+    b0.val = intrin(a.val); \
+    b1.val = __CV_CAT(intrin, _high)(a.val); \
+} \
+inline _Tpwvec v_expand_low(const _Tpvec& a) \
+{ return _Tpwvec(intrin(a.val)); } \
+inline _Tpwvec v_expand_high(const _Tpvec& a) \
+{ return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); } \
+inline _Tpwvec v_load_expand(const _Tp* ptr) \
+{ \
+    __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
+    return _Tpwvec(intrin(a)); \
+}
+
+OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, _v128_cvtepu8_epi16)
+OPENCV_HAL_IMPL_SSE_EXPAND(v_int8x16, v_int16x8, schar, _v128_cvtepi8_epi16)
+OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, _v128_cvtepu16_epi32)
+OPENCV_HAL_IMPL_SSE_EXPAND(v_int16x8, v_int32x4, short, _v128_cvtepi16_epi32)
+OPENCV_HAL_IMPL_SSE_EXPAND(v_uint32x4, v_uint64x2, unsigned, _v128_cvtepu32_epi64)
+OPENCV_HAL_IMPL_SSE_EXPAND(v_int32x4, v_int64x2, int, _v128_cvtepi32_epi64)
+
+#define OPENCV_HAL_IMPL_SSE_EXPAND_Q(_Tpvec, _Tp, intrin) \
+inline _Tpvec v_load_expand_q(const _Tp* ptr) \
+{ \
+    __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); \
+    return _Tpvec(intrin(a)); \
+}
+
+OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_uint32x4, uchar, _v128_cvtepu8_epi32)
+OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_int32x4, schar, _v128_cvtepi8_epi32)

#define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \

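The 8-bit v_mul_wrap earlier in this file builds a wrapping byte multiply out of 16-bit multiplies: a plain _mm_mullo_epi16 already produces the correct low byte for the even bytes, the odd bytes come from shifting both operands right by 8, multiplying, shifting the product back left by 8, and the two results are blended on the 0xFF00 mask. The identity it relies on can be checked in plain C++ (standalone sketch, not part of the patch):

#include <cstdint>
#include <cassert>

// The low 8 bits of a 16-bit product depend only on the low 8 bits of its
// operands, so whatever sits in the high bytes cannot corrupt the result.
int main()
{
    for (unsigned a = 0; a < 256; ++a)
        for (unsigned b = 0; b < 256; ++b)
        {
            uint16_t wide = (uint16_t)((a | 0xAB00u) * (b | 0xCD00u)); // junk in the high bytes
            assert((uint8_t)wide == (uint8_t)(a * b));                 // low byte is unaffected
        }
    return 0;
}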
View File

@ -0,0 +1,167 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#ifndef OPENCV_HAL_INTRIN_SSE_EM_HPP
#define OPENCV_HAL_INTRIN_SSE_EM_HPP
namespace cv
{
//! @cond IGNORED
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define OPENCV_HAL_SSE_WRAP_1(fun, tp) \
inline tp _v128_##fun(const tp& a) \
{ return _mm_##fun(a); }
#define OPENCV_HAL_SSE_WRAP_2(fun, tp) \
inline tp _v128_##fun(const tp& a, const tp& b) \
{ return _mm_##fun(a, b); }
#define OPENCV_HAL_SSE_WRAP_3(fun, tp) \
inline tp _v128_##fun(const tp& a, const tp& b, const tp& c) \
{ return _mm_##fun(a, b, c); }
///////////////////////////// XOP /////////////////////////////
// [todo] define CV_XOP
#if 1 // CV_XOP
inline __m128i _v128_comgt_epu32(const __m128i& a, const __m128i& b)
{
const __m128i delta = _mm_set1_epi32((int)0x80000000);
return _mm_cmpgt_epi32(_mm_xor_si128(a, delta), _mm_xor_si128(b, delta));
}
// wrapping XOP
#else
OPENCV_HAL_SSE_WRAP_2(_v128_comgt_epu32, __m128i)
#endif // !CV_XOP
///////////////////////////// SSE4.1 /////////////////////////////
#if !CV_SSE4_1
/** Swizzle **/
inline __m128i _v128_blendv_epi8(const __m128i& a, const __m128i& b, const __m128i& mask)
{ return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(b, a), mask)); }
/** Convert **/
// 8 >> 16
inline __m128i _v128_cvtepu8_epi16(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpacklo_epi8(a, z);
}
inline __m128i _v128_cvtepi8_epi16(const __m128i& a)
{ return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); }
// 8 >> 32
inline __m128i _v128_cvtepu8_epi32(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z);
}
inline __m128i _v128_cvtepi8_epi32(const __m128i& a)
{
__m128i r = _mm_unpacklo_epi8(a, a);
r = _mm_unpacklo_epi8(r, r);
return _mm_srai_epi32(r, 24);
}
// 16 >> 32
inline __m128i _v128_cvtepu16_epi32(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpacklo_epi16(a, z);
}
inline __m128i _v128_cvtepi16_epi32(const __m128i& a)
{ return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); }
// 32 >> 64
inline __m128i _v128_cvtepu32_epi64(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpacklo_epi32(a, z);
}
inline __m128i _v128_cvtepi32_epi64(const __m128i& a)
{ return _mm_unpacklo_epi32(a, _mm_srai_epi32(a, 31)); }
/** Arithmetic **/
inline __m128i _v128_mullo_epi32(const __m128i& a, const __m128i& b)
{
__m128i c0 = _mm_mul_epu32(a, b);
__m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32));
__m128i d0 = _mm_unpacklo_epi32(c0, c1);
__m128i d1 = _mm_unpackhi_epi32(c0, c1);
return _mm_unpacklo_epi64(d0, d1);
}
/** Math **/
inline __m128i _v128_min_epu32(const __m128i& a, const __m128i& b)
{ return _v128_blendv_epi8(a, b, _v128_comgt_epu32(a, b)); }
// wrapping SSE4.1
#else
OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi16, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi16, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepu16_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi16_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepu32_epi64, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi32_epi64, __m128i)
OPENCV_HAL_SSE_WRAP_2(min_epu32, __m128i)
OPENCV_HAL_SSE_WRAP_2(mullo_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_3(blendv_epi8, __m128i)
#endif // !CV_SSE4_1
///////////////////////////// Revolutionary /////////////////////////////
/** Convert **/
// 16 << 8
inline __m128i _v128_cvtepu8_epi16_high(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpackhi_epi8(a, z);
}
inline __m128i _v128_cvtepi8_epi16_high(const __m128i& a)
{ return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8); }
// 32 << 16
inline __m128i _v128_cvtepu16_epi32_high(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpackhi_epi16(a, z);
}
inline __m128i _v128_cvtepi16_epi32_high(const __m128i& a)
{ return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16); }
// 64 << 32
inline __m128i _v128_cvtepu32_epi64_high(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpackhi_epi32(a, z);
}
inline __m128i _v128_cvtepi32_epi64_high(const __m128i& a)
{ return _mm_unpackhi_epi32(a, _mm_srai_epi32(a, 31)); }
/** Miscellaneous **/
inline __m128i _v128_packs_epu32(const __m128i& a, const __m128i& b)
{
const __m128i m = _mm_set1_epi32(65535);
__m128i am = _v128_min_epu32(a, m);
__m128i bm = _v128_min_epu32(b, m);
#if CV_SSE4_1
return _mm_packus_epi32(am, bm);
#else
const __m128i d = _mm_set1_epi32(32768), nd = _mm_set1_epi16(-32768);
am = _mm_sub_epi32(am, d);
bm = _mm_sub_epi32(bm, d);
am = _mm_packs_epi32(am, bm);
return _mm_sub_epi16(am, nd);
#endif
}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond
} // cv::
#endif // OPENCV_HAL_INTRIN_SSE_EM_HPP
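On targets without SSE4.1, _v128_packs_epu32 falls back to the classic bias trick: clamp to [0, 65535], subtract 32768 so every value fits the signed 16-bit range, pack with signed saturation (which can no longer clip anything), then add the 32768 back, written above as subtracting -32768. A scalar model of the round trip (standalone C++, not part of the patch):

#include <cstdint>
#include <cassert>

int main()
{
    for (uint32_t v = 0; v <= 65535u; ++v)   // value already clamped to [0, 65535]
    {
        int32_t  biased   = (int32_t)v - 32768;         // now in [-32768, 32767]
        int16_t  packed   = (int16_t)biased;            // a signed pack cannot saturate here
        uint16_t restored = (uint16_t)(packed + 32768); // undo the bias
        assert(restored == v);
    }
    return 0;
}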

View File

@@ -315,6 +315,10 @@ inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
    b0.val = fh(a.val); \
    b1.val = fl(a.val); \
} \
+inline _Tpwvec v_expand_low(const _Tpvec& a) \
+{ return _Tpwvec(fh(a.val)); } \
+inline _Tpwvec v_expand_high(const _Tpvec& a) \
+{ return _Tpwvec(fl(a.val)); } \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ return _Tpwvec(fh(vec_ld_l8(ptr))); }

@@ -418,10 +422,8 @@ OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs)
-OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint16x8, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs)
-OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int16x8, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul)

@@ -441,16 +443,30 @@ OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub)

-inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, v_int32x4& c, v_int32x4& d)
-{
-    c.val = vec_mul(vec_unpackh(a.val), vec_unpackh(b.val));
-    d.val = vec_mul(vec_unpackl(a.val), vec_unpackl(b.val));
-}
-inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, v_uint32x4& c, v_uint32x4& d)
-{
-    c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val));
-    d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val));
-}
+// saturating multiply
+#define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec) \
+    inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
+    { \
+        _Tpwvec c, d; \
+        v_mul_expand(a, b, c, d); \
+        return v_pack(c, d); \
+    } \
+    inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
+    { a = a * b; return a; }
+
+OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int8x16, v_int16x8)
+OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint8x16, v_uint16x8)
+OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int16x8, v_int32x4)
+OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint16x8, v_uint32x4)
+
+template<typename Tvec, typename Twvec>
+inline void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d)
+{
+    Twvec p0 = Twvec(vec_mule(a.val, b.val));
+    Twvec p1 = Twvec(vec_mulo(a.val, b.val));
+    v_zip(p0, p1, c, d);
+}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c, v_uint64x2& d)
{
    c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val));

@@ -459,17 +475,17 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
-    return v_int16x8(vec_packs(
-        vec_sra(vec_mul(vec_unpackh(a.val), vec_unpackh(b.val)), vec_uint4_sp(16)),
-        vec_sra(vec_mul(vec_unpackl(a.val), vec_unpackl(b.val)), vec_uint4_sp(16))
-    ));
+    vec_int4 p0 = vec_mule(a.val, b.val);
+    vec_int4 p1 = vec_mulo(a.val, b.val);
+    static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
+    return v_int16x8(vec_perm(vec_short8_c(p0), vec_short8_c(p1), perm));
}
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
-    return v_uint16x8(vec_packs(
-        vec_sr(vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val)), vec_uint4_sp(16)),
-        vec_sr(vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val)), vec_uint4_sp(16))
-    ));
+    vec_uint4 p0 = vec_mule(a.val, b.val);
+    vec_uint4 p1 = vec_mulo(a.val, b.val);
+    static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
+    return v_uint16x8(vec_perm(vec_ushort8_c(p0), vec_ushort8_c(p1), perm));
}

@@ -480,6 +496,7 @@ inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)
+OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_mul_wrap, vec_mul)

/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
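The reworked VSX v_mul_hi takes the full 32-bit products of the even and odd lanes (vec_mule / vec_mulo) and gathers their upper 16-bit halves with a single byte permute, instead of widening, multiplying, shifting and re-packing. Per lane the result is still the high half of the widened product; a scalar model (standalone C++, not part of the patch):

#include <cstdint>
#include <cassert>

static inline int16_t mul_hi_s16(int16_t a, int16_t b)
{
    // High 16 bits of the 32-bit product (arithmetic shift, i.e. floored).
    return (int16_t)(((int32_t)a * b) >> 16);
}

int main()
{
    assert(mul_hi_s16( 20000, 3)     == 0);      // 60000 >> 16
    assert(mul_hi_s16( 20000, 20000) == 6103);   // 400000000 >> 16
    assert(mul_hi_s16(-20000, 20000) == -6104);  // negative product, floored shift
    return 0;
}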

View File

@@ -130,19 +130,21 @@ VSX_FINLINE(rt) fnm(const rg& a, const rg& b) \
# undef vec_mul
# endif
/*
- * there's no a direct instruction for supporting 16-bit multiplication in ISA 2.07,
+ * there's no a direct instruction for supporting 8-bit, 16-bit multiplication in ISA 2.07,
* XLC Implement it by using instruction "multiply even", "multiply odd" and "permute"
- * todo: Do I need to support 8-bit ?
**/
-# define VSX_IMPL_MULH(Tvec, Tcast) \
+# define VSX_IMPL_MULH(Tvec, cperm) \
VSX_FINLINE(Tvec) vec_mul(const Tvec& a, const Tvec& b) \
{ \
-    static const vec_uchar16 even_perm = {0, 1, 16, 17, 4, 5, 20, 21, \
-                                          8, 9, 24, 25, 12, 13, 28, 29}; \
-    return vec_perm(Tcast(vec_mule(a, b)), Tcast(vec_mulo(a, b)), even_perm); \
+    static const vec_uchar16 ev_od = {cperm}; \
+    return vec_perm((Tvec)vec_mule(a, b), (Tvec)vec_mulo(a, b), ev_od); \
}
-VSX_IMPL_MULH(vec_short8, vec_short8_c)
-VSX_IMPL_MULH(vec_ushort8, vec_ushort8_c)
+#define VSX_IMPL_MULH_P16 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
+VSX_IMPL_MULH(vec_char16, VSX_IMPL_MULH_P16)
+VSX_IMPL_MULH(vec_uchar16, VSX_IMPL_MULH_P16)
+#define VSX_IMPL_MULH_P8 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29
+VSX_IMPL_MULH(vec_short8, VSX_IMPL_MULH_P8)
+VSX_IMPL_MULH(vec_ushort8, VSX_IMPL_MULH_P8)
// vmuluwm can be used for unsigned or signed integers, that's what they said
VSX_IMPL_2VRG(vec_int4, vec_int4, vmuluwm, vec_mul)
VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul)

View File

@@ -407,10 +407,13 @@ template<typename R> struct TheTest
        Data<Rx2> resB = vx_load_expand(dataA.d);

-        Rx2 c, d;
+        Rx2 c, d, e, f;
        v_expand(a, c, d);
-        Data<Rx2> resC = c, resD = d;
+        e = v_expand_low(a);
+        f = v_expand_high(a);
+
+        Data<Rx2> resC = c, resD = d, resE = e, resF = f;

        const int n = Rx2::nlanes;
        for (int i = 0; i < n; ++i)
        {
@@ -418,6 +421,8 @@ template<typename R> struct TheTest
            EXPECT_EQ(dataA[i], resB[i]);
            EXPECT_EQ(dataA[i], resC[i]);
            EXPECT_EQ(dataA[i + n], resD[i]);
+            EXPECT_EQ(dataA[i], resE[i]);
+            EXPECT_EQ(dataA[i + n], resF[i]);
        }

        return *this;
@@ -455,19 +460,21 @@ template<typename R> struct TheTest
        return *this;
    }

-    TheTest & test_addsub_wrap()
+    TheTest & test_arithm_wrap()
    {
        Data<R> dataA, dataB;
        dataB.reverse();
        R a = dataA, b = dataB;

        Data<R> resC = v_add_wrap(a, b),
-                resD = v_sub_wrap(a, b);
+                resD = v_sub_wrap(a, b),
+                resE = v_mul_wrap(a, b);
        for (int i = 0; i < R::nlanes; ++i)
        {
            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ((LaneType)(dataA[i] + dataB[i]), resC[i]);
            EXPECT_EQ((LaneType)(dataA[i] - dataB[i]), resD[i]);
+            EXPECT_EQ((LaneType)(dataA[i] * dataB[i]), resE[i]);
        }
        return *this;
    }
@@ -475,6 +482,7 @@ template<typename R> struct TheTest
    TheTest & test_mul()
    {
        Data<R> dataA, dataB;
+        dataA[1] = static_cast<LaneType>(std::numeric_limits<LaneType>::max());
        dataB.reverse();
        R a = dataA, b = dataB;

@@ -482,7 +490,7 @@ template<typename R> struct TheTest
        for (int i = 0; i < R::nlanes; ++i)
        {
            SCOPED_TRACE(cv::format("i=%d", i));
-            EXPECT_EQ(dataA[i] * dataB[i], resC[i]);
+            EXPECT_EQ(saturate_cast<LaneType>(dataA[i] * dataB[i]), resC[i]);
        }

        return *this;
@@ -1209,7 +1217,9 @@ void test_hal_intrin_uint8()
        .test_expand()
        .test_expand_q()
        .test_addsub()
-        .test_addsub_wrap()
+        .test_arithm_wrap()
+        .test_mul()
+        .test_mul_expand()
        .test_cmp()
        .test_logic()
        .test_min_max()

@@ -1242,7 +1252,9 @@ void test_hal_intrin_int8()
        .test_expand()
        .test_expand_q()
        .test_addsub()
-        .test_addsub_wrap()
+        .test_arithm_wrap()
+        .test_mul()
+        .test_mul_expand()
        .test_cmp()
        .test_logic()
        .test_min_max()

@@ -1267,7 +1279,7 @@ void test_hal_intrin_uint16()
        .test_interleave()
        .test_expand()
        .test_addsub()
-        .test_addsub_wrap()
+        .test_arithm_wrap()
        .test_mul()
        .test_mul_expand()
        .test_cmp()

@@ -1295,7 +1307,7 @@ void test_hal_intrin_int16()
        .test_interleave()
        .test_expand()
        .test_addsub()
-        .test_addsub_wrap()
+        .test_arithm_wrap()
        .test_mul()
        .test_mul_expand()
        .test_cmp()
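test_mul now forces one lane to the maximum lane value and compares against saturate_cast of the widened product, so the multiply is required to clamp rather than wrap on overflow. Roughly what that expectation reduces to for LaneType = uchar, with hypothetical concrete values (standalone C++, not part of the patch):

#include <cstdint>
#include <cassert>

// What EXPECT_EQ(saturate_cast<LaneType>(dataA[i] * dataB[i]), resC[i])
// boils down to for an unsigned 8-bit lane.
static inline uint8_t saturate_u8(int v)
{
    return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v);
}

int main()
{
    uint8_t a = 255, b = 7;            // dataA[1] is forced to the max lane value
    assert(saturate_u8(a * b) == 255); // 1785 clamps to 255, matching the new operator*
    return 0;
}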

View File

@@ -1,3 +1,3 @@
set(the_description "Image Processing")
-ocv_add_dispatched_file(accum SSE2 AVX NEON)
+ocv_add_dispatched_file(accum SSE4_1 AVX AVX2)
ocv_define_module(imgproc opencv_core WRAP java python js)

View File

@ -5,94 +5,102 @@
namespace opencv_test { namespace opencv_test {
#ifdef HAVE_OPENVX typedef Size_MatType Accumulate;
PERF_TEST_P(Size_MatType, Accumulate,
testing::Combine(
testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
testing::Values(CV_16SC1, CV_32FC1)
)
)
#else
PERF_TEST_P( Size_MatType, Accumulate,
testing::Combine(
testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
testing::Values(CV_32FC1)
)
)
#endif
{
Size sz = get<0>(GetParam());
int dstType = get<1>(GetParam());
Mat src(sz, CV_8UC1); #define MAT_TYPES_ACCUMLATE CV_8UC1, CV_16UC1, CV_32FC1
Mat dst(sz, dstType); #define MAT_TYPES_ACCUMLATE_C MAT_TYPES_ACCUMLATE, CV_8UC3, CV_16UC3, CV_32FC3
#define MAT_TYPES_ACCUMLATE_D MAT_TYPES_ACCUMLATE, CV_64FC1
#define MAT_TYPES_ACCUMLATE_D_C MAT_TYPES_ACCUMLATE_C, CV_64FC1, CV_64FC1
declare.time(100); #define PERF_ACCUMULATE_INIT(_FLTC) \
declare.in(src, WARMUP_RNG).out(dst); const Size srcSize = get<0>(GetParam()); \
const int srcType = get<1>(GetParam()); \
const int dstType = _FLTC(CV_MAT_CN(srcType)); \
Mat src1(srcSize, srcType), dst(srcSize, dstType); \
declare.in(src1, dst, WARMUP_RNG).out(dst);
-    TEST_CYCLE() accumulate(src, dst);
-    SANITY_CHECK_NOTHING();
-}
-#ifdef HAVE_OPENVX
-PERF_TEST_P(Size_MatType, AccumulateSquare,
-            testing::Combine(
-                testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
-                testing::Values(CV_16SC1, CV_32FC1)
-            )
-)
-#else
-PERF_TEST_P( Size_MatType, AccumulateSquare,
-             testing::Combine(
-                 testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
-                 testing::Values(CV_32FC1)
-             )
-)
-#endif
-{
-    Size sz = get<0>(GetParam());
-    int dstType = get<1>(GetParam());
-    Mat src(sz, CV_8UC1);
-    Mat dst(sz, dstType);
-    declare.time(100);
-    declare.in(src, WARMUP_RNG).out(dst);
-    TEST_CYCLE() accumulateSquare(src, dst);
-    SANITY_CHECK_NOTHING();
-}
-#ifdef HAVE_OPENVX
-PERF_TEST_P(Size_MatType, AccumulateWeighted,
-            testing::Combine(
-                testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
-                testing::Values(CV_8UC1, CV_32FC1)
-            )
-)
-#else
-PERF_TEST_P( Size_MatType, AccumulateWeighted,
-             testing::Combine(
-                 testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
-                 testing::Values(CV_32FC1)
-             )
-)
-#endif
-{
-    Size sz = get<0>(GetParam());
-    int dstType = get<1>(GetParam());
-    Mat src(sz, CV_8UC1);
-    Mat dst(sz, dstType);
-    declare.time(100);
-    declare.in(src, WARMUP_RNG).out(dst);
-    TEST_CYCLE() accumulateWeighted(src, dst, 0.314);
-    SANITY_CHECK_NOTHING();
-}
+#define PERF_ACCUMULATE_MASK_INIT(_FLTC) \
+    PERF_ACCUMULATE_INIT(_FLTC)          \
+    Mat mask(srcSize, CV_8UC1);          \
+    declare.in(mask, WARMUP_RNG);
+
+#define PERF_TEST_P_ACCUMULATE(_NAME, _TYPES, _INIT, _FUN)              \
+    PERF_TEST_P(Accumulate, _NAME,                                      \
+        testing::Combine(                                               \
+            testing::Values(sz1080p, sz720p, szVGA, szQVGA, szODD),     \
+            testing::Values(_TYPES)                                     \
+        )                                                               \
+    )                                                                   \
+    {                                                                   \
+        _INIT                                                           \
+        TEST_CYCLE() _FUN;                                              \
+        SANITY_CHECK_NOTHING();                                         \
+    }
+
+/////////////////////////////////// Accumulate ///////////////////////////////////
+
+PERF_TEST_P_ACCUMULATE(Accumulate, MAT_TYPES_ACCUMLATE,
+                       PERF_ACCUMULATE_INIT(CV_32FC), accumulate(src1, dst))
+PERF_TEST_P_ACCUMULATE(AccumulateMask, MAT_TYPES_ACCUMLATE_C,
+                       PERF_ACCUMULATE_MASK_INIT(CV_32FC), accumulate(src1, dst, mask))
+PERF_TEST_P_ACCUMULATE(AccumulateDouble, MAT_TYPES_ACCUMLATE_D,
+                       PERF_ACCUMULATE_INIT(CV_64FC), accumulate(src1, dst))
+PERF_TEST_P_ACCUMULATE(AccumulateDoubleMask, MAT_TYPES_ACCUMLATE_D_C,
+                       PERF_ACCUMULATE_MASK_INIT(CV_64FC), accumulate(src1, dst, mask))
+
+///////////////////////////// AccumulateSquare ///////////////////////////////////
+
+PERF_TEST_P_ACCUMULATE(Square, MAT_TYPES_ACCUMLATE,
+                       PERF_ACCUMULATE_INIT(CV_32FC), accumulateSquare(src1, dst))
+PERF_TEST_P_ACCUMULATE(SquareMask, MAT_TYPES_ACCUMLATE_C,
+                       PERF_ACCUMULATE_MASK_INIT(CV_32FC), accumulateSquare(src1, dst, mask))
+PERF_TEST_P_ACCUMULATE(SquareDouble, MAT_TYPES_ACCUMLATE_D,
+                       PERF_ACCUMULATE_INIT(CV_64FC), accumulateSquare(src1, dst))
+PERF_TEST_P_ACCUMULATE(SquareDoubleMask, MAT_TYPES_ACCUMLATE_D_C,
+                       PERF_ACCUMULATE_MASK_INIT(CV_64FC), accumulateSquare(src1, dst, mask))
+
+///////////////////////////// AccumulateProduct ///////////////////////////////////
+
+#define PERF_ACCUMULATE_INIT_2(_FLTC)    \
+    PERF_ACCUMULATE_INIT(_FLTC)          \
+    Mat src2(srcSize, srcType);          \
+    declare.in(src2);
+
+#define PERF_ACCUMULATE_MASK_INIT_2(_FLTC) \
+    PERF_ACCUMULATE_MASK_INIT(_FLTC)       \
+    Mat src2(srcSize, srcType);            \
+    declare.in(src2);
+
+PERF_TEST_P_ACCUMULATE(Product, MAT_TYPES_ACCUMLATE,
+                       PERF_ACCUMULATE_INIT_2(CV_32FC), accumulateProduct(src1, src2, dst))
+PERF_TEST_P_ACCUMULATE(ProductMask, MAT_TYPES_ACCUMLATE_C,
+                       PERF_ACCUMULATE_MASK_INIT_2(CV_32FC), accumulateProduct(src1, src2, dst, mask))
+PERF_TEST_P_ACCUMULATE(ProductDouble, MAT_TYPES_ACCUMLATE_D,
+                       PERF_ACCUMULATE_INIT_2(CV_64FC), accumulateProduct(src1, src2, dst))
+PERF_TEST_P_ACCUMULATE(ProductDoubleMask, MAT_TYPES_ACCUMLATE_D_C,
+                       PERF_ACCUMULATE_MASK_INIT_2(CV_64FC), accumulateProduct(src1, src2, dst, mask))
+
+///////////////////////////// AccumulateWeighted ///////////////////////////////////
+
+PERF_TEST_P_ACCUMULATE(Weighted, MAT_TYPES_ACCUMLATE,
+                       PERF_ACCUMULATE_INIT(CV_32FC), accumulateWeighted(src1, dst, 0.123))
+PERF_TEST_P_ACCUMULATE(WeightedMask, MAT_TYPES_ACCUMLATE_C,
+                       PERF_ACCUMULATE_MASK_INIT(CV_32FC), accumulateWeighted(src1, dst, 0.123, mask))
+PERF_TEST_P_ACCUMULATE(WeightedDouble, MAT_TYPES_ACCUMLATE_D,
+                       PERF_ACCUMULATE_INIT(CV_64FC), accumulateWeighted(src1, dst, 0.123456))
+PERF_TEST_P_ACCUMULATE(WeightedDoubleMask, MAT_TYPES_ACCUMLATE_D_C,
+                       PERF_ACCUMULATE_MASK_INIT(CV_64FC), accumulateWeighted(src1, dst, 0.123456, mask))
 } // namespace
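For readability, expanding one of the new macro-generated tests by hand, using only the PERF_TEST_P_ACCUMULATE definition shown above, gives roughly the following fixture. PERF_ACCUMULATE_INIT is defined earlier in this file and is not shown here; the comment only sketches what it is assumed to provide.

    // Approximate hand expansion of:
    //   PERF_TEST_P_ACCUMULATE(Weighted, MAT_TYPES_ACCUMLATE,
    //                          PERF_ACCUMULATE_INIT(CV_32FC), accumulateWeighted(src1, dst, 0.123))
    PERF_TEST_P(Accumulate, Weighted,
        testing::Combine(
            testing::Values(sz1080p, sz720p, szVGA, szQVGA, szODD),
            testing::Values(MAT_TYPES_ACCUMLATE)
        )
    )
    {
        PERF_ACCUMULATE_INIT(CV_32FC)   // assumed to declare srcSize/srcType and the src1/dst Mats used below
        TEST_CYCLE() accumulateWeighted(src1, dst, 0.123);
        SANITY_CHECK_NOTHING();
    }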

File diff suppressed because it is too large


@@ -1825,7 +1825,7 @@ void hlineSmooth1N<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufi
     const int VECSZ = v_uint16::nlanes;
     v_uint16 v_mul = vx_setall_u16(*((uint16_t*)m));
     for (; i <= lencn - VECSZ; i += VECSZ)
-        v_store((uint16_t*)dst + i, v_mul*vx_load_expand(src + i));
+        v_store((uint16_t*)dst + i, v_mul_wrap(v_mul, vx_load_expand(src + i)));
 #endif
     for (; i < lencn; i++)
         dst[i] = m[0] * src[i];
@@ -1915,7 +1915,9 @@ void hlineSmooth3N<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufi
     v_uint16 v_mul1 = vx_setall_u16(_m[1]);
     v_uint16 v_mul2 = vx_setall_u16(_m[2]);
     for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
-        v_store((uint16_t*)dst, vx_load_expand(src - cn) * v_mul0 + vx_load_expand(src) * v_mul1 + vx_load_expand(src + cn) * v_mul2);
+        v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - cn), v_mul0) +
+                                v_mul_wrap(vx_load_expand(src), v_mul1) +
+                                v_mul_wrap(vx_load_expand(src + cn), v_mul2));
 #endif
     for (; i < lencn; i++, src++, dst++)
         *dst = m[0] * src[-cn] + m[1] * src[0] + m[2] * src[cn];
@@ -2089,7 +2091,8 @@ void hlineSmooth3Naba<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const
     v_uint16 v_mul0 = vx_setall_u16(_m[0]);
     v_uint16 v_mul1 = vx_setall_u16(_m[1]);
     for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
-        v_store((uint16_t*)dst, (vx_load_expand(src - cn) + vx_load_expand(src + cn)) * v_mul0 + vx_load_expand(src) * v_mul1);
+        v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - cn) + vx_load_expand(src + cn), v_mul0) +
+                                v_mul_wrap(vx_load_expand(src), v_mul1));
 #endif
     for (; i < lencn; i++, src++, dst++)
         *((uint16_t*)dst) = ((uint16_t*)m)[1] * src[0] + ((uint16_t*)m)[0] * ((uint16_t)(src[-cn]) + (uint16_t)(src[cn]));
@@ -2285,7 +2288,11 @@ void hlineSmooth5N<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufi
     v_uint16 v_mul3 = vx_setall_u16(_m[3]);
     v_uint16 v_mul4 = vx_setall_u16(_m[4]);
     for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
-        v_store((uint16_t*)dst, vx_load_expand(src - 2 * cn) * v_mul0 + vx_load_expand(src - cn) * v_mul1 + vx_load_expand(src) * v_mul2 + vx_load_expand(src + cn) * v_mul3 + vx_load_expand(src + 2 * cn) * v_mul4);
+        v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - 2 * cn), v_mul0) +
+                                v_mul_wrap(vx_load_expand(src - cn), v_mul1) +
+                                v_mul_wrap(vx_load_expand(src), v_mul2) +
+                                v_mul_wrap(vx_load_expand(src + cn), v_mul3) +
+                                v_mul_wrap(vx_load_expand(src + 2 * cn), v_mul4));
 #endif
     for (; i < lencn; i++, src++, dst++)
         *dst = m[0] * src[-2*cn] + m[1] * src[-cn] + m[2] * src[0] + m[3] * src[cn] + m[4] * src[2*cn];
@@ -2488,7 +2495,7 @@ void hlineSmooth5N14641<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, cons
     const int VECSZ = v_uint16::nlanes;
     v_uint16 v_6 = vx_setall_u16(6);
     for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
-        v_store((uint16_t*)dst, (vx_load_expand(src) * v_6 + ((vx_load_expand(src - cn) + vx_load_expand(src + cn)) << 2) + vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn)) << 4);
+        v_store((uint16_t*)dst, (v_mul_wrap(vx_load_expand(src), v_6) + ((vx_load_expand(src - cn) + vx_load_expand(src + cn)) << 2) + vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn)) << 4);
 #endif
     for (; i < lencn; i++, src++, dst++)
         *((uint16_t*)dst) = (uint16_t(src[0]) * 6 + ((uint16_t(src[-cn]) + uint16_t(src[cn])) << 2) + uint16_t(src[-2 * cn]) + uint16_t(src[2 * cn])) << 4;
@@ -2689,7 +2696,9 @@ void hlineSmooth5Nabcba<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, cons
     v_uint16 v_mul1 = vx_setall_u16(_m[1]);
     v_uint16 v_mul2 = vx_setall_u16(_m[2]);
     for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
-        v_store((uint16_t*)dst, (vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn)) * v_mul0 + (vx_load_expand(src - cn) + vx_load_expand(src + cn))* v_mul1 + vx_load_expand(src) * v_mul2);
+        v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn), v_mul0) +
+                                v_mul_wrap(vx_load_expand(src - cn) + vx_load_expand(src + cn), v_mul1) +
+                                v_mul_wrap(vx_load_expand(src), v_mul2));
 #endif
     for (; i < lencn; i++, src++, dst++)
         *((uint16_t*)dst) = ((uint16_t*)m)[0] * ((uint16_t)(src[-2 * cn]) + (uint16_t)(src[2 * cn])) + ((uint16_t*)m)[1] * ((uint16_t)(src[-cn]) + (uint16_t)(src[cn])) + ((uint16_t*)m)[2] * src[0];
@@ -2804,9 +2813,9 @@ void hlineSmooth<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufixe
     const int VECSZ = v_uint16::nlanes;
     for (; i <= lencn - VECSZ; i+=VECSZ, src+=VECSZ, dst+=VECSZ)
     {
-        v_uint16 v_res0 = vx_load_expand(src) * vx_setall_u16(*((uint16_t*)m));
+        v_uint16 v_res0 = v_mul_wrap(vx_load_expand(src), vx_setall_u16(*((uint16_t*)m)));
         for (int j = 1; j < n; j++)
-            v_res0 += vx_load_expand(src + j * cn) * vx_setall_u16(*((uint16_t*)(m + j)));
+            v_res0 += v_mul_wrap(vx_load_expand(src + j * cn), vx_setall_u16(*((uint16_t*)(m + j))));
         v_store((uint16_t*)dst, v_res0);
     }
 #endif
@@ -2923,9 +2932,9 @@ void hlineSmoothONa_yzy_a<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, co
     const int VECSZ = v_uint16::nlanes;
     for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
     {
-        v_uint16 v_res0 = vx_load_expand(src + pre_shift * cn) * vx_setall_u16(*((uint16_t*)(m + pre_shift)));
+        v_uint16 v_res0 = v_mul_wrap(vx_load_expand(src + pre_shift * cn), vx_setall_u16(*((uint16_t*)(m + pre_shift))));
         for (int j = 0; j < pre_shift; j ++)
-            v_res0 += (vx_load_expand(src + j * cn) + vx_load_expand(src + (n - 1 - j)*cn)) * vx_setall_u16(*((uint16_t*)(m + j)));
+            v_res0 += v_mul_wrap(vx_load_expand(src + j * cn) + vx_load_expand(src + (n - 1 - j)*cn), vx_setall_u16(*((uint16_t*)(m + j))));
         v_store((uint16_t*)dst, v_res0);
     }
 #endif
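The pattern in all of these hunks is the same: the 16-bit fixed-point filter relies on wrap-around (modulo 2^16) products of ufixedpoint16 values, so the code now calls v_mul_wrap explicitly instead of operator*. A minimal scalar sketch of the two multiply semantics, for illustration only (not OpenCV code):

    #include <algorithm>
    #include <cstdint>

    // Wrapping multiply: keep the low 16 bits of the product, which is what the
    // ufixedpoint16 arithmetic above depends on and what v_mul_wrap does per lane.
    static inline uint16_t mul_wrap_u16(uint16_t a, uint16_t b)
    {
        return static_cast<uint16_t>(static_cast<uint32_t>(a) * b);
    }

    // Saturating multiply: clamp the product to 65535 instead of wrapping.
    static inline uint16_t mul_sat_u16(uint16_t a, uint16_t b)
    {
        return static_cast<uint16_t>(std::min<uint32_t>(static_cast<uint32_t>(a) * b, 65535u));
    }

    // Example: a = 0x0200, b = 0x0200 -> full product 0x40000;
    // mul_wrap_u16 yields 0x0000, mul_sat_u16 yields 0xFFFF.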


@@ -93,7 +93,7 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
         v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x));
         v_int16x8 t1 = s2 - s0;
-        v_int16x8 t0 = (s0 + s2) * c3 + s1 * c10;
+        v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10);
         v_store(trow0 + x, t0);
         v_store(trow1 + x, t1);
@@ -131,7 +131,7 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
         v_int16x8 s4 = v_load(trow1 + x + cn);
         v_int16x8 t0 = s1 - s0;
-        v_int16x8 t1 = ((s2 + s4) * c3) + (s3 * c10);
+        v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10);
         v_store_interleave((drow + x*2), t0, t1);
     }
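For context, a scalar sketch of what one element of the first hunk above computes, assuming (as the names suggest) that c3 and c10 hold the Scharr smoothing constants 3 and 10. This is an illustration, not code from the patch:

    // Per element: t0 = cross-row smoothing with weights 3/10/3, t1 = cross-row difference.
    static inline void scharr_row_pass(short s0, short s1, short s2,
                                       short& t0, short& t1)
    {
        t0 = static_cast<short>((s0 + s2) * 3 + s1 * 10); // truncating 16-bit product, like v_mul_wrap
        t1 = static_cast<short>(s2 - s0);
    }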