Merge pull request #12516 from seiko2plus:changeUnvMultiply16

commit 1cc3f7abbb
@ -139,8 +139,14 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
# undef CV_FP16
#endif

#if CV_SSE2 || CV_NEON || CV_VSX
#define CV__SIMD_FORWARD 128
#include "opencv2/core/hal/intrin_forward.hpp"
#endif

#if CV_SSE2

#include "opencv2/core/hal/intrin_sse_em.hpp"
#include "opencv2/core/hal/intrin_sse.hpp"

#elif CV_NEON

@ -168,6 +174,8 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load())
#if CV_AVX2

#define CV__SIMD_FORWARD 256
#include "opencv2/core/hal/intrin_forward.hpp"
#include "opencv2/core/hal/intrin_avx.hpp"

#endif
@ -82,6 +82,14 @@ inline __m128 _v256_extract_low(const __m256& v)
inline __m128d _v256_extract_low(const __m256d& v)
{ return _mm256_castpd256_pd128(v); }

inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
{
const __m256i m = _mm256_set1_epi32(65535);
__m256i am = _mm256_min_epu32(a, m);
__m256i bm = _mm256_min_epu32(b, m);
return _mm256_packus_epi32(am, bm);
}
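For orientation, a scalar model (an illustration added here, not part of the patch) of the unsigned-saturating 32-to-16 pack that _v256_packs_epu32 emulates; note that the real AVX2 sequence packs per 128-bit lane, so lane ordering differs from this flat loop:

#include <algorithm>
#include <cstdint>

// Each 32-bit input is clamped to 65535 and then narrowed to 16 bits,
// which is what the min_epu32 + packus_epi32 sequence above computes per lane.
static void packs_epu32_model(const uint32_t* a, const uint32_t* b,
                              uint16_t* dst, int nlanes)
{
    for (int i = 0; i < nlanes; i++)
    {
        dst[i]          = (uint16_t)std::min<uint32_t>(a[i], 65535u);
        dst[i + nlanes] = (uint16_t)std::min<uint32_t>(b[i], 65535u);
    }
}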

///////// Types ////////////

struct v_uint8x32
@ -626,10 +634,8 @@ OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint16x16, _mm256_mullo_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int16x16, _mm256_mullo_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32)
@ -650,13 +656,103 @@ OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd)

// saturating multiply 8-bit, 16-bit
inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
{
v_uint16x16 c, d;
v_mul_expand(a, b, c, d);
return v_pack_u(v_reinterpret_as_s16(c), v_reinterpret_as_s16(d));
}
inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
{
v_int16x16 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
{
__m256i pl = _mm256_mullo_epi16(a.val, b.val);
__m256i ph = _mm256_mulhi_epu16(a.val, b.val);
__m256i p0 = _mm256_unpacklo_epi16(pl, ph);
__m256i p1 = _mm256_unpackhi_epi16(pl, ph);
return v_uint16x16(_v256_packs_epu32(p0, p1));
}
inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
{
__m256i pl = _mm256_mullo_epi16(a.val, b.val);
__m256i ph = _mm256_mulhi_epi16(a.val, b.val);
__m256i p0 = _mm256_unpacklo_epi16(pl, ph);
__m256i p1 = _mm256_unpackhi_epi16(pl, ph);
return v_int16x16(_mm256_packs_epi32(p0, p1));
}
inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
{ a = a * b; return a; }
inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
{ a = a * b; return a; }
inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
{ a = a * b; return a; }
inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
{ a = a * b; return a; }

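A per-lane scalar model of the new saturating operator * (an illustration, not part of the patch): the product is formed at double width and clamped when narrowed back, which is what the v_mul_expand + v_pack/v_pack_u and mullo/mulhi + pack sequences above implement.

#include <algorithm>
#include <cstdint>

// Saturating 16-bit unsigned multiply: full 32-bit product, clamped to 65535.
static inline uint16_t mul_sat_u16(uint16_t a, uint16_t b)
{
    uint32_t p = (uint32_t)a * b;
    return (uint16_t)std::min<uint32_t>(p, 65535u);
}

// Saturating 8-bit signed multiply: full product, clamped to [-128, 127].
static inline int8_t mul_sat_s8(int8_t a, int8_t b)
{
    int32_t p = (int32_t)a * b;
    return (int8_t)std::max(-128, std::min(127, p));
}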
/** Non-saturating arithmetics **/
#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }

OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_uint16x16, _mm256_mullo_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_int16x16, _mm256_mullo_epi16)

inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
{
__m256i ad = _mm256_srai_epi16(a.val, 8);
__m256i bd = _mm256_srai_epi16(b.val, 8);
__m256i p0 = _mm256_mullo_epi16(a.val, b.val); // even
__m256i p1 = _mm256_slli_epi16(_mm256_mullo_epi16(ad, bd), 8); // odd

const __m256i b01 = _mm256_set1_epi32(0xFF00FF00);
return v_uint8x32(_mm256_blendv_epi8(p0, p1, b01));
}
inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b)
{
return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
}

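The 8-bit wrapping multiply above builds the result out of 16-bit lane multiplies. A scalar sketch of the even/odd split it relies on (illustration only, not part of the patch): even bytes fall out of a plain 16-bit product in place, odd bytes are multiplied separately and shifted back into position, and the blend on 0xFF00FF00 recombines them; per lane the outcome is simply the low 8 bits of the product.

#include <cstdint>

static void mul_wrap_u8_model(const uint8_t a[16], const uint8_t b[16], uint8_t out[16])
{
    for (int i = 0; i < 16; i += 2)
    {
        uint16_t even = (uint16_t)(a[i]     * b[i]);     // low byte is the lane-i result
        uint16_t odd  = (uint16_t)(a[i + 1] * b[i + 1]); // low byte is the lane-(i+1) result
        out[i]     = (uint8_t)even;
        out[i + 1] = (uint8_t)odd;   // the SIMD code shifts this into the high byte before blending
    }
}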
// Multiply and expand
inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b,
v_uint16x16& c, v_uint16x16& d)
{
v_uint16x16 a0, a1, b0, b1;
v_expand(a, a0, a1);
v_expand(b, b0, b1);
c = v_mul_wrap(a0, b0);
d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b,
v_int16x16& c, v_int16x16& d)
{
v_int16x16 a0, a1, b0, b1;
v_expand(a, a0, a1);
v_expand(b, b0, b1);
c = v_mul_wrap(a0, b0);
d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
v_int32x8& c, v_int32x8& d)
{
v_int16x16 vhi = v_int16x16(_mm256_mulhi_epi16(a.val, b.val));

v_int16x16 v0, v1;
v_zip(a * b, vhi, v0, v1);
v_zip(v_mul_wrap(a, b), vhi, v0, v1);

c = v_reinterpret_as_s32(v0);
d = v_reinterpret_as_s32(v1);
@ -668,7 +764,7 @@ inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
v_uint16x16 vhi = v_uint16x16(_mm256_mulhi_epu16(a.val, b.val));

v_uint16x16 v0, v1;
v_zip(a * b, vhi, v0, v1);
v_zip(v_mul_wrap(a, b), vhi, v0, v1);

c = v_reinterpret_as_u32(v0);
d = v_reinterpret_as_u32(v1);
@ -685,20 +781,6 @@ inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b) { return v_int16x16(_mm256_mulhi_epi16(a.val, b.val)); }
inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); }

/** Non-saturating arithmetics **/
#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }

OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16)

/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
@ -1385,6 +1467,10 @@ OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_ca
b0.val = intrin(_v256_extract_low(a.val)); \
b1.val = intrin(_v256_extract_high(a.val)); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ return _Tpwvec(intrin(_v256_extract_low(a.val))); } \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ return _Tpwvec(intrin(_v256_extract_high(a.val))); } \
inline _Tpwvec v256_load_expand(const _Tp* ptr) \
{ \
__m128i a = _mm_loadu_si128((const __m128i*)ptr); \
@ -1430,7 +1516,12 @@ inline void v_pack_store(schar* ptr, const v_int16x16& a)
{ v_store_low(ptr, v_pack(a, a)); }

inline void v_pack_store(uchar* ptr, const v_uint16x16& a)
{ v_store_low(ptr, v_pack(a, a)); }
{
const __m256i m = _mm256_set1_epi16(255);
__m256i am = _mm256_min_epu16(a.val, m);
am = _v256_shuffle_odd_64(_mm256_packus_epi16(am, am));
v_store_low(ptr, v_uint8x32(am));
}

inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
{ v_store_low(ptr, v_pack_u(a, a)); }
@ -1484,16 +1575,21 @@ inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
{ return v_int16x16(_v256_shuffle_odd_64(_mm256_packs_epi32(a.val, b.val))); }

inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); }
{ return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); }

inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
{ return v_pack(v_reinterpret_as_u32(a), v_reinterpret_as_u32(b)); }
{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); }

inline void v_pack_store(short* ptr, const v_int32x8& a)
{ v_store_low(ptr, v_pack(a, a)); }

inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
{ v_store_low(ptr, v_pack(a, a)); }
{
const __m256i m = _mm256_set1_epi32(65535);
__m256i am = _mm256_min_epu32(a.val, m);
am = _v256_shuffle_odd_64(_mm256_packus_epi32(am, am));
v_store_low(ptr, v_uint16x16(am));
}

inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
{ v_store_low(ptr, v_pack_u(a, a)); }
@ -108,7 +108,7 @@ block and to save contents of the register to memory block.
These operations allow to reorder or recombine elements in one or multiple vectors.

- Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand, @ref v_expand_low, @ref v_expand_high
- Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
@ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
- Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
@ -185,11 +185,14 @@ Regular integers:
|load, store | x | x | x | x | x | x |
|interleave | x | x | x | x | x | x |
|expand | x | x | x | x | x | x |
|expand_low | x | x | x | x | x | x |
|expand_high | x | x | x | x | x | x |
|expand_q | x | x | | | | |
|add, sub | x | x | x | x | x | x |
|add_wrap, sub_wrap | x | x | x | x | | |
|mul | | | x | x | x | x |
|mul_expand | | | x | x | x | |
|mul_wrap | x | x | x | x | | |
|mul | x | x | x | x | x | x |
|mul_expand | x | x | x | x | x | |
|compare | x | x | x | x | x | x |
|shift | | | x | x | x | x |
|dotprod | | | | x | | |
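Following the capability table above, 8- and 16-bit lanes now support mul, mul_expand and the new mul_wrap. A minimal usage sketch of the extended API (illustration only, not part of the patch; assumes a build where CV_SIMD128 is enabled):

#include <opencv2/core/hal/intrin.hpp>

void multiply_u8_lanes(const unsigned char* src, const unsigned char* coef, unsigned char* dst)
{
    using namespace cv;
    v_uint8x16 a = v_load(src), b = v_load(coef);
    v_uint8x16 sat  = a * b;             // saturating 8-bit multiply (new in this patch)
    v_uint8x16 wrap = v_mul_wrap(a, b);  // truncating (wrap-around) multiply (new)
    v_uint16x8 lo, hi;
    v_mul_expand(a, b, lo, hi);          // full 16-bit products (now available for 8-bit lanes)
    v_store(dst, sat);
    (void)wrap; (void)lo; (void)hi;      // unused in this sketch
}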
@ -680,7 +683,7 @@ OPENCV_HAL_IMPL_CMP_OP(!=)

//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \
#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \
template<typename _Tp, int n> \
inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
@ -694,12 +697,17 @@ inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
/** @brief Add values without saturation

For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp)
OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp)

/** @brief Subtract values without saturation

For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp)
OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp)

/** @brief Multiply values without saturation

For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp)

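On the portable fallback path the wrapping operations documented above reduce, per lane, to a plain C operation cast back to the lane type, so results wrap modulo 2^bits instead of saturating. A hedged scalar illustration (not in the patch):

#include <cstdint>

// v_mul_wrap on uint8 lanes: low 8 bits of the product, e.g. 200 * 2 -> 144, not 255.
static inline uint8_t mul_wrap_u8(uint8_t a, uint8_t b)
{
    return (uint8_t)(a * b);
}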
//! @cond IGNORED
template<typename T> inline T _absdiff(T a, T b)
@ -1106,6 +1114,44 @@ template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
}
}

/** @brief Expand lower values to the wider pack type

Same as cv::v_expand, but return lower half of the vector.

Scheme:
@code
int32x4 int64x2
{A B C D} ==> {A B}
@endcode */
template<typename _Tp, int n>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_expand_low(const v_reg<_Tp, n>& a)
{
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
for( int i = 0; i < (n/2); i++ )
b.s[i] = a.s[i];
return b;
}

/** @brief Expand higher values to the wider pack type

Same as cv::v_expand_low, but expand higher half of the vector instead.

Scheme:
@code
int32x4 int64x2
{A B C D} ==> {C D}
@endcode */
template<typename _Tp, int n>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_expand_high(const v_reg<_Tp, n>& a)
{
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
for( int i = 0; i < (n/2); i++ )
b.s[i] = a.s[i+(n/2)];
return b;
}
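A minimal usage sketch of the new v_expand_low / v_expand_high (illustration only, assuming CV_SIMD128 is available): widen the two halves of a uint8 vector separately, mirroring the {A B C D} ==> {A B} / {C D} scheme above.

#include <opencv2/core/hal/intrin.hpp>

void widen_halves(const unsigned char* src, unsigned short* dst)
{
    using namespace cv;
    v_uint8x16 a  = v_load(src);
    v_uint16x8 lo = v_expand_low(a);   // lanes 0..7
    v_uint16x8 hi = v_expand_high(a);  // lanes 8..15
    v_store(dst, lo);
    v_store(dst + 8, hi);
}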

//! @cond IGNORED
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
v_reinterpret_as_int(const v_reg<_Tp, n>& a)

modules/core/include/opencv2/core/hal/intrin_forward.hpp (new file, 158 lines)
@ -0,0 +1,158 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html
|
||||
|
||||
#ifndef CV__SIMD_FORWARD
|
||||
#error "Need to pre-define forward width"
|
||||
#endif
|
||||
|
||||
namespace cv
|
||||
{
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
|
||||
|
||||
/** Types **/
|
||||
#if CV__SIMD_FORWARD == 512
|
||||
// [todo] 512
|
||||
#error "AVX512 Not implemented yet"
|
||||
#elif CV__SIMD_FORWARD == 256
|
||||
// 256
|
||||
#define __CV_VX(fun) v256_##fun
|
||||
#define __CV_V_UINT8 v_uint8x32
|
||||
#define __CV_V_INT8 v_int8x32
|
||||
#define __CV_V_UINT16 v_uint16x16
|
||||
#define __CV_V_INT16 v_int16x16
|
||||
#define __CV_V_UINT32 v_uint32x8
|
||||
#define __CV_V_INT32 v_int32x8
|
||||
#define __CV_V_UINT64 v_uint64x4
|
||||
#define __CV_V_INT64 v_int64x4
|
||||
#define __CV_V_FLOAT32 v_float32x8
|
||||
#define __CV_V_FLOAT64 v_float64x4
|
||||
struct v_uint8x32;
|
||||
struct v_int8x32;
|
||||
struct v_uint16x16;
|
||||
struct v_int16x16;
|
||||
struct v_uint32x8;
|
||||
struct v_int32x8;
|
||||
struct v_uint64x4;
|
||||
struct v_int64x4;
|
||||
struct v_float32x8;
|
||||
struct v_float64x4;
|
||||
#else
|
||||
// 128
|
||||
#define __CV_VX(fun) v_##fun
|
||||
#define __CV_V_UINT8 v_uint8x16
|
||||
#define __CV_V_INT8 v_int8x16
|
||||
#define __CV_V_UINT16 v_uint16x8
|
||||
#define __CV_V_INT16 v_int16x8
|
||||
#define __CV_V_UINT32 v_uint32x4
|
||||
#define __CV_V_INT32 v_int32x4
|
||||
#define __CV_V_UINT64 v_uint64x2
|
||||
#define __CV_V_INT64 v_int64x2
|
||||
#define __CV_V_FLOAT32 v_float32x4
|
||||
#define __CV_V_FLOAT64 v_float64x2
|
||||
struct v_uint8x16;
|
||||
struct v_int8x16;
|
||||
struct v_uint16x8;
|
||||
struct v_int16x8;
|
||||
struct v_uint32x4;
|
||||
struct v_int32x4;
|
||||
struct v_uint64x2;
|
||||
struct v_int64x2;
|
||||
struct v_float32x4;
|
||||
struct v_float64x2;
|
||||
#endif
|
||||
|
||||
/** Value reordering **/
|
||||
|
||||
// Expansion
|
||||
void v_expand(const __CV_V_UINT8&, __CV_V_UINT16&, __CV_V_UINT16&);
|
||||
void v_expand(const __CV_V_INT8&, __CV_V_INT16&, __CV_V_INT16&);
|
||||
void v_expand(const __CV_V_UINT16&, __CV_V_UINT32&, __CV_V_UINT32&);
|
||||
void v_expand(const __CV_V_INT16&, __CV_V_INT32&, __CV_V_INT32&);
|
||||
void v_expand(const __CV_V_UINT32&, __CV_V_UINT64&, __CV_V_UINT64&);
|
||||
void v_expand(const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&);
|
||||
// Low Expansion
|
||||
__CV_V_UINT16 v_expand_low(const __CV_V_UINT8&);
|
||||
__CV_V_INT16 v_expand_low(const __CV_V_INT8&);
|
||||
__CV_V_UINT32 v_expand_low(const __CV_V_UINT16&);
|
||||
__CV_V_INT32 v_expand_low(const __CV_V_INT16&);
|
||||
__CV_V_UINT64 v_expand_low(const __CV_V_UINT32&);
|
||||
__CV_V_INT64 v_expand_low(const __CV_V_INT32&);
|
||||
// High Expansion
|
||||
__CV_V_UINT16 v_expand_high(const __CV_V_UINT8&);
|
||||
__CV_V_INT16 v_expand_high(const __CV_V_INT8&);
|
||||
__CV_V_UINT32 v_expand_high(const __CV_V_UINT16&);
|
||||
__CV_V_INT32 v_expand_high(const __CV_V_INT16&);
|
||||
__CV_V_UINT64 v_expand_high(const __CV_V_UINT32&);
|
||||
__CV_V_INT64 v_expand_high(const __CV_V_INT32&);
|
||||
// Load & Low Expansion
|
||||
__CV_V_UINT16 __CV_VX(load_expand)(const uchar*);
|
||||
__CV_V_INT16 __CV_VX(load_expand)(const schar*);
|
||||
__CV_V_UINT32 __CV_VX(load_expand)(const ushort*);
|
||||
__CV_V_INT32 __CV_VX(load_expand)(const short*);
|
||||
__CV_V_UINT64 __CV_VX(load_expand)(const uint*);
|
||||
__CV_V_INT64 __CV_VX(load_expand)(const int*);
|
||||
// Load lower 8-bit and expand into 32-bit
|
||||
__CV_V_UINT32 __CV_VX(load_expand_q)(const uchar*);
|
||||
__CV_V_INT32 __CV_VX(load_expand_q)(const schar*);
|
||||
|
||||
// Saturating Pack
|
||||
__CV_V_UINT8 v_pack(const __CV_V_UINT16&, const __CV_V_UINT16&);
|
||||
__CV_V_INT8 v_pack(const __CV_V_INT16&, const __CV_V_INT16&);
|
||||
__CV_V_UINT16 v_pack(const __CV_V_UINT32&, const __CV_V_UINT32&);
|
||||
__CV_V_INT16 v_pack(const __CV_V_INT32&, const __CV_V_INT32&);
|
||||
// Non-saturating Pack
|
||||
__CV_V_UINT32 v_pack(const __CV_V_UINT64&, const __CV_V_UINT64&);
|
||||
__CV_V_INT32 v_pack(const __CV_V_INT64&, const __CV_V_INT64&);
|
||||
// Pack signed integers with unsigned saturation
|
||||
__CV_V_UINT8 v_pack_u(const __CV_V_INT16&, const __CV_V_INT16&);
|
||||
__CV_V_UINT16 v_pack_u(const __CV_V_INT32&, const __CV_V_INT32&);
|
||||
|
||||
/** Arithmetic, bitwise and comparison operations **/
|
||||
|
||||
// Non-saturating multiply
|
||||
#if CV_VSX
|
||||
template<typename Tvec>
|
||||
Tvec v_mul_wrap(const Tvec& a, const Tvec& b);
|
||||
#else
|
||||
__CV_V_UINT8 v_mul_wrap(const __CV_V_UINT8&, const __CV_V_UINT8&);
|
||||
__CV_V_INT8 v_mul_wrap(const __CV_V_INT8&, const __CV_V_INT8&);
|
||||
__CV_V_UINT16 v_mul_wrap(const __CV_V_UINT16&, const __CV_V_UINT16&);
|
||||
__CV_V_INT16 v_mul_wrap(const __CV_V_INT16&, const __CV_V_INT16&);
|
||||
#endif
|
||||
|
||||
// Multiply and expand
|
||||
#if CV_VSX
|
||||
template<typename Tvec, typename Twvec>
|
||||
void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d);
|
||||
#else
|
||||
void v_mul_expand(const __CV_V_UINT8&, const __CV_V_UINT8&, __CV_V_UINT16&, __CV_V_UINT16&);
|
||||
void v_mul_expand(const __CV_V_INT8&, const __CV_V_INT8&, __CV_V_INT16&, __CV_V_INT16&);
|
||||
void v_mul_expand(const __CV_V_UINT16&, const __CV_V_UINT16&, __CV_V_UINT32&, __CV_V_UINT32&);
|
||||
void v_mul_expand(const __CV_V_INT16&, const __CV_V_INT16&, __CV_V_INT32&, __CV_V_INT32&);
|
||||
void v_mul_expand(const __CV_V_UINT32&, const __CV_V_UINT32&, __CV_V_UINT64&, __CV_V_UINT64&);
|
||||
void v_mul_expand(const __CV_V_INT32&, const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&);
|
||||
#endif
|
||||
|
||||
/** Cleanup **/
|
||||
#undef CV__SIMD_FORWARD
|
||||
#undef __CV_VX
|
||||
#undef __CV_V_UINT8
|
||||
#undef __CV_V_INT8
|
||||
#undef __CV_V_UINT16
|
||||
#undef __CV_V_INT16
|
||||
#undef __CV_V_UINT32
|
||||
#undef __CV_V_INT32
|
||||
#undef __CV_V_UINT64
|
||||
#undef __CV_V_INT64
|
||||
#undef __CV_V_FLOAT32
|
||||
#undef __CV_V_FLOAT64
|
||||
|
||||
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
|
||||
|
||||
//! @endcond
|
||||
|
||||
} // cv::
|
@ -435,10 +435,8 @@ OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
|
||||
@ -476,6 +474,37 @@ inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
|
||||
}
|
||||
#endif
|
||||
|
||||
// saturating multiply 8-bit, 16-bit
|
||||
#define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec) \
|
||||
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
_Tpwvec c, d; \
|
||||
v_mul_expand(a, b, c, d); \
|
||||
return v_pack(c, d); \
|
||||
} \
|
||||
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
|
||||
{ a = a * b; return a; }
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16, v_int16x8)
|
||||
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8)
|
||||
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int16x8, v_int32x4)
|
||||
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint16x8, v_uint32x4)
|
||||
|
||||
// Multiply and expand
|
||||
inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
|
||||
v_int16x8& c, v_int16x8& d)
|
||||
{
|
||||
c.val = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
|
||||
d.val = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
|
||||
}
|
||||
|
||||
inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
|
||||
v_uint16x8& c, v_uint16x8& d)
|
||||
{
|
||||
c.val = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
|
||||
d.val = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
|
||||
}
|
||||
|
||||
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
|
||||
v_int32x4& c, v_int32x4& d)
|
||||
{
|
||||
@ -714,6 +743,10 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_mul_wrap, vmulq_u8)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16)
|
||||
|
||||
// TODO: absdiff for signed integers
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
|
||||
@ -1056,6 +1089,14 @@ inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
|
||||
b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
|
||||
b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
|
||||
} \
|
||||
inline _Tpwvec v_expand_low(const _Tpvec& a) \
|
||||
{ \
|
||||
return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
|
||||
} \
|
||||
inline _Tpwvec v_expand_high(const _Tpvec& a) \
|
||||
{ \
|
||||
return _Tpwvec(vmovl_##suffix(vget_high_##suffix(a.val))); \
|
||||
} \
|
||||
inline _Tpwvec v_load_expand(const _Tp* ptr) \
|
||||
{ \
|
||||
return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
|
||||
|
@ -59,6 +59,8 @@ namespace cv
|
||||
|
||||
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
|
||||
|
||||
///////// Types ////////////
|
||||
|
||||
struct v_uint8x16
|
||||
{
|
||||
typedef uchar lane_type;
|
||||
@ -436,13 +438,7 @@ inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
|
||||
}
|
||||
|
||||
inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
|
||||
{
|
||||
__m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
|
||||
__m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
|
||||
__m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32);
|
||||
__m128i r = _mm_packs_epi32(a1, b1);
|
||||
return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
|
||||
}
|
||||
{ return v_uint16x8(_v128_packs_epu32(a.val, b.val)); }
|
||||
|
||||
inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
|
||||
{
|
||||
@ -678,14 +674,14 @@ OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
|
||||
@ -699,35 +695,49 @@ OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
|
||||
|
||||
inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b)
|
||||
// saturating multiply 8-bit, 16-bit
|
||||
#define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec) \
|
||||
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
_Tpwvec c, d; \
|
||||
v_mul_expand(a, b, c, d); \
|
||||
return v_pack(c, d); \
|
||||
} \
|
||||
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
|
||||
{ a = a * b; return a; }
|
||||
|
||||
OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8)
|
||||
OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint16x8, v_uint32x4)
|
||||
OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int16x8, v_int32x4)
|
||||
|
||||
inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b)
|
||||
{
|
||||
__m128i c0 = _mm_mul_epu32(a.val, b.val);
|
||||
__m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
|
||||
__m128i d0 = _mm_unpacklo_epi32(c0, c1);
|
||||
__m128i d1 = _mm_unpackhi_epi32(c0, c1);
|
||||
return v_uint32x4(_mm_unpacklo_epi64(d0, d1));
|
||||
v_uint16x8 c, d;
|
||||
v_mul_expand(a, b, c, d);
|
||||
return v_pack_u(v_reinterpret_as_s16(c), v_reinterpret_as_s16(d));
|
||||
}
|
||||
inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b)
|
||||
inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b)
|
||||
{ a = a * b; return a; }
|
||||
|
||||
// Multiply and expand
|
||||
inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
|
||||
v_uint16x8& c, v_uint16x8& d)
|
||||
{
|
||||
#if CV_SSE4_1
|
||||
return v_int32x4(_mm_mullo_epi32(a.val, b.val));
|
||||
#else
|
||||
__m128i c0 = _mm_mul_epu32(a.val, b.val);
|
||||
__m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
|
||||
__m128i d0 = _mm_unpacklo_epi32(c0, c1);
|
||||
__m128i d1 = _mm_unpackhi_epi32(c0, c1);
|
||||
return v_int32x4(_mm_unpacklo_epi64(d0, d1));
|
||||
#endif
|
||||
v_uint16x8 a0, a1, b0, b1;
|
||||
v_expand(a, a0, a1);
|
||||
v_expand(b, b0, b1);
|
||||
c = v_mul_wrap(a0, b0);
|
||||
d = v_mul_wrap(a1, b1);
|
||||
}
|
||||
inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b)
|
||||
|
||||
inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
|
||||
v_int16x8& c, v_int16x8& d)
|
||||
{
|
||||
a = a * b;
|
||||
return a;
|
||||
}
|
||||
inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b)
|
||||
{
|
||||
a = a * b;
|
||||
return a;
|
||||
v_int16x8 a0, a1, b0, b1;
|
||||
v_expand(a, a0, a1);
|
||||
v_expand(b, b0, b1);
|
||||
c = v_mul_wrap(a0, b0);
|
||||
d = v_mul_wrap(a1, b1);
|
||||
}
|
||||
|
||||
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
|
||||
@ -1018,6 +1028,22 @@ OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_mul_wrap, _mm_mullo_epi16)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_mul_wrap, _mm_mullo_epi16)
|
||||
|
||||
inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{
|
||||
__m128i ad = _mm_srai_epi16(a.val, 8);
|
||||
__m128i bd = _mm_srai_epi16(b.val, 8);
|
||||
__m128i p0 = _mm_mullo_epi16(a.val, b.val); // even
|
||||
__m128i p1 = _mm_slli_epi16(_mm_mullo_epi16(ad, bd), 8); // odd
|
||||
const __m128i b01 = _mm_set1_epi32(0xFF00FF00);
|
||||
return v_uint8x16(_v128_blendv_epi8(p0, p1, b01));
|
||||
}
|
||||
inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
|
||||
}
|
||||
|
||||
#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \
|
||||
inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \
|
||||
@ -1502,70 +1528,39 @@ OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
|
||||
OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \
|
||||
inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \
|
||||
/* Expand */
|
||||
#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
|
||||
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
|
||||
{ \
|
||||
__m128i z = _mm_setzero_si128(); \
|
||||
b0.val = _mm_unpacklo_##suffix(a.val, z); \
|
||||
b1.val = _mm_unpackhi_##suffix(a.val, z); \
|
||||
b0.val = intrin(a.val); \
|
||||
b1.val = __CV_CAT(intrin, _high)(a.val); \
|
||||
} \
|
||||
inline _Tpwuvec v_load_expand(const _Tpu* ptr) \
|
||||
{ \
|
||||
__m128i z = _mm_setzero_si128(); \
|
||||
return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \
|
||||
} \
|
||||
inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \
|
||||
{ \
|
||||
b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \
|
||||
b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \
|
||||
} \
|
||||
inline _Tpwsvec v_load_expand(const _Tps* ptr) \
|
||||
inline _Tpwvec v_expand_low(const _Tpvec& a) \
|
||||
{ return _Tpwvec(intrin(a.val)); } \
|
||||
inline _Tpwvec v_expand_high(const _Tpvec& a) \
|
||||
{ return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); } \
|
||||
inline _Tpwvec v_load_expand(const _Tp* ptr) \
|
||||
{ \
|
||||
__m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
|
||||
return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \
|
||||
return _Tpwvec(intrin(a)); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8)
|
||||
OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16)
|
||||
OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, _v128_cvtepu8_epi16)
|
||||
OPENCV_HAL_IMPL_SSE_EXPAND(v_int8x16, v_int16x8, schar, _v128_cvtepi8_epi16)
|
||||
OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, _v128_cvtepu16_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_EXPAND(v_int16x8, v_int32x4, short, _v128_cvtepi16_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_EXPAND(v_uint32x4, v_uint64x2, unsigned, _v128_cvtepu32_epi64)
|
||||
OPENCV_HAL_IMPL_SSE_EXPAND(v_int32x4, v_int64x2, int, _v128_cvtepi32_epi64)
|
||||
|
||||
inline void v_expand(const v_uint32x4& a, v_uint64x2& b0, v_uint64x2& b1)
|
||||
{
|
||||
__m128i z = _mm_setzero_si128();
|
||||
b0.val = _mm_unpacklo_epi32(a.val, z);
|
||||
b1.val = _mm_unpackhi_epi32(a.val, z);
|
||||
}
|
||||
inline v_uint64x2 v_load_expand(const unsigned* ptr)
|
||||
{
|
||||
__m128i z = _mm_setzero_si128();
|
||||
return v_uint64x2(_mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i*)ptr), z));
|
||||
}
|
||||
inline void v_expand(const v_int32x4& a, v_int64x2& b0, v_int64x2& b1)
|
||||
{
|
||||
__m128i s = _mm_srai_epi32(a.val, 31);
|
||||
b0.val = _mm_unpacklo_epi32(a.val, s);
|
||||
b1.val = _mm_unpackhi_epi32(a.val, s);
|
||||
}
|
||||
inline v_int64x2 v_load_expand(const int* ptr)
|
||||
{
|
||||
__m128i a = _mm_loadl_epi64((const __m128i*)ptr);
|
||||
__m128i s = _mm_srai_epi32(a, 31);
|
||||
return v_int64x2(_mm_unpacklo_epi32(a, s));
|
||||
#define OPENCV_HAL_IMPL_SSE_EXPAND_Q(_Tpvec, _Tp, intrin) \
|
||||
inline _Tpvec v_load_expand_q(const _Tp* ptr) \
|
||||
{ \
|
||||
__m128i a = _mm_cvtsi32_si128(*(const int*)ptr); \
|
||||
return _Tpvec(intrin(a)); \
|
||||
}
|
||||
|
||||
inline v_uint32x4 v_load_expand_q(const uchar* ptr)
|
||||
{
|
||||
__m128i z = _mm_setzero_si128();
|
||||
__m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
|
||||
return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z));
|
||||
}
|
||||
|
||||
inline v_int32x4 v_load_expand_q(const schar* ptr)
|
||||
{
|
||||
__m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
|
||||
a = _mm_unpacklo_epi8(a, a);
|
||||
a = _mm_unpacklo_epi8(a, a);
|
||||
return v_int32x4(_mm_srai_epi32(a, 24));
|
||||
}
|
||||
OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_uint32x4, uchar, _v128_cvtepu8_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_int32x4, schar, _v128_cvtepi8_epi32)
|
||||
|
||||
#define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
|
||||
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
|
||||
|
167
modules/core/include/opencv2/core/hal/intrin_sse_em.hpp
Normal file
167
modules/core/include/opencv2/core/hal/intrin_sse_em.hpp
Normal file
@ -0,0 +1,167 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html
|
||||
|
||||
#ifndef OPENCV_HAL_INTRIN_SSE_EM_HPP
|
||||
#define OPENCV_HAL_INTRIN_SSE_EM_HPP
|
||||
|
||||
namespace cv
|
||||
{
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
|
||||
|
||||
#define OPENCV_HAL_SSE_WRAP_1(fun, tp) \
|
||||
inline tp _v128_##fun(const tp& a) \
|
||||
{ return _mm_##fun(a); }
|
||||
|
||||
#define OPENCV_HAL_SSE_WRAP_2(fun, tp) \
|
||||
inline tp _v128_##fun(const tp& a, const tp& b) \
|
||||
{ return _mm_##fun(a, b); }
|
||||
|
||||
#define OPENCV_HAL_SSE_WRAP_3(fun, tp) \
|
||||
inline tp _v128_##fun(const tp& a, const tp& b, const tp& c) \
|
||||
{ return _mm_##fun(a, b, c); }
|
||||
|
||||
///////////////////////////// XOP /////////////////////////////
|
||||
|
||||
// [todo] define CV_XOP
|
||||
#if 1 // CV_XOP
|
||||
inline __m128i _v128_comgt_epu32(const __m128i& a, const __m128i& b)
|
||||
{
|
||||
const __m128i delta = _mm_set1_epi32((int)0x80000000);
|
||||
return _mm_cmpgt_epi32(_mm_xor_si128(a, delta), _mm_xor_si128(b, delta));
|
||||
}
|
||||
// wrapping XOP
|
||||
#else
|
||||
OPENCV_HAL_SSE_WRAP_2(_v128_comgt_epu32, __m128i)
|
||||
#endif // !CV_XOP
|
||||
|
||||
///////////////////////////// SSE4.1 /////////////////////////////
|
||||
|
||||
#if !CV_SSE4_1
|
||||
|
||||
/** Swizzle **/
|
||||
inline __m128i _v128_blendv_epi8(const __m128i& a, const __m128i& b, const __m128i& mask)
|
||||
{ return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(b, a), mask)); }
|
||||
|
||||
/** Convert **/
|
||||
// 8 >> 16
|
||||
inline __m128i _v128_cvtepu8_epi16(const __m128i& a)
|
||||
{
|
||||
const __m128i z = _mm_setzero_si128();
|
||||
return _mm_unpacklo_epi8(a, z);
|
||||
}
|
||||
inline __m128i _v128_cvtepi8_epi16(const __m128i& a)
|
||||
{ return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); }
|
||||
// 8 >> 32
|
||||
inline __m128i _v128_cvtepu8_epi32(const __m128i& a)
|
||||
{
|
||||
const __m128i z = _mm_setzero_si128();
|
||||
return _mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z);
|
||||
}
|
||||
inline __m128i _v128_cvtepi8_epi32(const __m128i& a)
|
||||
{
|
||||
__m128i r = _mm_unpacklo_epi8(a, a);
|
||||
r = _mm_unpacklo_epi8(r, r);
|
||||
return _mm_srai_epi32(r, 24);
|
||||
}
|
||||
// 16 >> 32
|
||||
inline __m128i _v128_cvtepu16_epi32(const __m128i& a)
|
||||
{
|
||||
const __m128i z = _mm_setzero_si128();
|
||||
return _mm_unpacklo_epi16(a, z);
|
||||
}
|
||||
inline __m128i _v128_cvtepi16_epi32(const __m128i& a)
|
||||
{ return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); }
|
||||
// 32 >> 64
|
||||
inline __m128i _v128_cvtepu32_epi64(const __m128i& a)
|
||||
{
|
||||
const __m128i z = _mm_setzero_si128();
|
||||
return _mm_unpacklo_epi32(a, z);
|
||||
}
|
||||
inline __m128i _v128_cvtepi32_epi64(const __m128i& a)
|
||||
{ return _mm_unpacklo_epi32(a, _mm_srai_epi32(a, 31)); }
|
||||
|
||||
/** Arithmetic **/
|
||||
inline __m128i _v128_mullo_epi32(const __m128i& a, const __m128i& b)
|
||||
{
|
||||
__m128i c0 = _mm_mul_epu32(a, b);
|
||||
__m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32));
|
||||
__m128i d0 = _mm_unpacklo_epi32(c0, c1);
|
||||
__m128i d1 = _mm_unpackhi_epi32(c0, c1);
|
||||
return _mm_unpacklo_epi64(d0, d1);
|
||||
}
|
||||
|
||||
/** Math **/
|
||||
inline __m128i _v128_min_epu32(const __m128i& a, const __m128i& b)
|
||||
{ return _v128_blendv_epi8(a, b, _v128_comgt_epu32(a, b)); }
|
||||
|
||||
// wrapping SSE4.1
|
||||
#else
|
||||
OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi16, __m128i)
|
||||
OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi16, __m128i)
|
||||
OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi32, __m128i)
|
||||
OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi32, __m128i)
|
||||
OPENCV_HAL_SSE_WRAP_1(cvtepu16_epi32, __m128i)
|
||||
OPENCV_HAL_SSE_WRAP_1(cvtepi16_epi32, __m128i)
|
||||
OPENCV_HAL_SSE_WRAP_1(cvtepu32_epi64, __m128i)
|
||||
OPENCV_HAL_SSE_WRAP_1(cvtepi32_epi64, __m128i)
|
||||
OPENCV_HAL_SSE_WRAP_2(min_epu32, __m128i)
|
||||
OPENCV_HAL_SSE_WRAP_2(mullo_epi32, __m128i)
|
||||
OPENCV_HAL_SSE_WRAP_3(blendv_epi8, __m128i)
|
||||
#endif // !CV_SSE4_1
|
||||
|
||||
///////////////////////////// Revolutionary /////////////////////////////
|
||||
|
||||
/** Convert **/
|
||||
// 16 << 8
|
||||
inline __m128i _v128_cvtepu8_epi16_high(const __m128i& a)
|
||||
{
|
||||
const __m128i z = _mm_setzero_si128();
|
||||
return _mm_unpackhi_epi8(a, z);
|
||||
}
|
||||
inline __m128i _v128_cvtepi8_epi16_high(const __m128i& a)
|
||||
{ return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8); }
|
||||
// 32 << 16
|
||||
inline __m128i _v128_cvtepu16_epi32_high(const __m128i& a)
|
||||
{
|
||||
const __m128i z = _mm_setzero_si128();
|
||||
return _mm_unpackhi_epi16(a, z);
|
||||
}
|
||||
inline __m128i _v128_cvtepi16_epi32_high(const __m128i& a)
|
||||
{ return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16); }
|
||||
// 64 << 32
|
||||
inline __m128i _v128_cvtepu32_epi64_high(const __m128i& a)
|
||||
{
|
||||
const __m128i z = _mm_setzero_si128();
|
||||
return _mm_unpackhi_epi32(a, z);
|
||||
}
|
||||
inline __m128i _v128_cvtepi32_epi64_high(const __m128i& a)
|
||||
{ return _mm_unpackhi_epi32(a, _mm_srai_epi32(a, 31)); }
|
||||
|
||||
/** Miscellaneous **/
|
||||
inline __m128i _v128_packs_epu32(const __m128i& a, const __m128i& b)
|
||||
{
|
||||
const __m128i m = _mm_set1_epi32(65535);
|
||||
__m128i am = _v128_min_epu32(a, m);
|
||||
__m128i bm = _v128_min_epu32(b, m);
|
||||
#if CV_SSE4_1
|
||||
return _mm_packus_epi32(am, bm);
|
||||
#else
|
||||
const __m128i d = _mm_set1_epi32(32768), nd = _mm_set1_epi16(-32768);
|
||||
am = _mm_sub_epi32(am, d);
|
||||
bm = _mm_sub_epi32(bm, d);
|
||||
am = _mm_packs_epi32(am, bm);
|
||||
return _mm_sub_epi16(am, nd);
|
||||
#endif
|
||||
}
|
||||
|
||||
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
|
||||
|
||||
//! @endcond
|
||||
|
||||
} // cv::
|
||||
|
||||
#endif // OPENCV_HAL_INTRIN_SSE_EM_HPP
|
@ -315,6 +315,10 @@ inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
|
||||
b0.val = fh(a.val); \
|
||||
b1.val = fl(a.val); \
|
||||
} \
|
||||
inline _Tpwvec v_expand_low(const _Tpvec& a) \
|
||||
{ return _Tpwvec(fh(a.val)); } \
|
||||
inline _Tpwvec v_expand_high(const _Tpvec& a) \
|
||||
{ return _Tpwvec(fl(a.val)); } \
|
||||
inline _Tpwvec v_load_expand(const _Tp* ptr) \
|
||||
{ return _Tpwvec(fh(vec_ld_l8(ptr))); }
|
||||
|
||||
@ -418,10 +422,8 @@ OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint16x8, vec_mul)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int16x8, vec_mul)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul)
|
||||
@ -441,16 +443,30 @@ OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub)
|
||||
|
||||
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, v_int32x4& c, v_int32x4& d)
|
||||
// saturating multiply
|
||||
#define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec) \
|
||||
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
_Tpwvec c, d; \
|
||||
v_mul_expand(a, b, c, d); \
|
||||
return v_pack(c, d); \
|
||||
} \
|
||||
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
|
||||
{ a = a * b; return a; }
|
||||
|
||||
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int8x16, v_int16x8)
|
||||
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint8x16, v_uint16x8)
|
||||
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int16x8, v_int32x4)
|
||||
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint16x8, v_uint32x4)
|
||||
|
||||
template<typename Tvec, typename Twvec>
|
||||
inline void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d)
|
||||
{
|
||||
c.val = vec_mul(vec_unpackh(a.val), vec_unpackh(b.val));
|
||||
d.val = vec_mul(vec_unpackl(a.val), vec_unpackl(b.val));
|
||||
}
|
||||
inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, v_uint32x4& c, v_uint32x4& d)
|
||||
{
|
||||
c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val));
|
||||
d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val));
|
||||
Twvec p0 = Twvec(vec_mule(a.val, b.val));
|
||||
Twvec p1 = Twvec(vec_mulo(a.val, b.val));
|
||||
v_zip(p0, p1, c, d);
|
||||
}
|
||||
|
||||
inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c, v_uint64x2& d)
|
||||
{
|
||||
c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val));
|
||||
@ -459,17 +475,17 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c
|
||||
|
||||
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
return v_int16x8(vec_packs(
|
||||
vec_sra(vec_mul(vec_unpackh(a.val), vec_unpackh(b.val)), vec_uint4_sp(16)),
|
||||
vec_sra(vec_mul(vec_unpackl(a.val), vec_unpackl(b.val)), vec_uint4_sp(16))
|
||||
));
|
||||
vec_int4 p0 = vec_mule(a.val, b.val);
|
||||
vec_int4 p1 = vec_mulo(a.val, b.val);
|
||||
static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
|
||||
return v_int16x8(vec_perm(vec_short8_c(p0), vec_short8_c(p1), perm));
|
||||
}
|
||||
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{
|
||||
return v_uint16x8(vec_packs(
|
||||
vec_sr(vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val)), vec_uint4_sp(16)),
|
||||
vec_sr(vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val)), vec_uint4_sp(16))
|
||||
));
|
||||
vec_uint4 p0 = vec_mule(a.val, b.val);
|
||||
vec_uint4 p1 = vec_mulo(a.val, b.val);
|
||||
static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
|
||||
return v_uint16x8(vec_perm(vec_ushort8_c(p0), vec_ushort8_c(p1), perm));
|
||||
}
|
||||
|
||||
/** Non-saturating arithmetics **/
|
||||
@ -480,6 +496,7 @@ inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
|
||||
|
||||
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_mul_wrap, vec_mul)
|
||||
|
||||
/** Bitwise shifts **/
|
||||
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
|
||||
|
@ -130,19 +130,21 @@ VSX_FINLINE(rt) fnm(const rg& a, const rg& b) \
# undef vec_mul
# endif
/*
* there's no a direct instruction for supporting 16-bit multiplication in ISA 2.07,
* there's no a direct instruction for supporting 8-bit, 16-bit multiplication in ISA 2.07,
* XLC Implement it by using instruction "multiply even", "multiply odd" and "permute"
* todo: Do I need to support 8-bit ?
**/
# define VSX_IMPL_MULH(Tvec, Tcast) \
# define VSX_IMPL_MULH(Tvec, cperm) \
VSX_FINLINE(Tvec) vec_mul(const Tvec& a, const Tvec& b) \
{ \
static const vec_uchar16 even_perm = {0, 1, 16, 17, 4, 5, 20, 21, \
8, 9, 24, 25, 12, 13, 28, 29}; \
return vec_perm(Tcast(vec_mule(a, b)), Tcast(vec_mulo(a, b)), even_perm); \
static const vec_uchar16 ev_od = {cperm}; \
return vec_perm((Tvec)vec_mule(a, b), (Tvec)vec_mulo(a, b), ev_od); \
}
VSX_IMPL_MULH(vec_short8, vec_short8_c)
VSX_IMPL_MULH(vec_ushort8, vec_ushort8_c)
#define VSX_IMPL_MULH_P16 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
VSX_IMPL_MULH(vec_char16, VSX_IMPL_MULH_P16)
VSX_IMPL_MULH(vec_uchar16, VSX_IMPL_MULH_P16)
#define VSX_IMPL_MULH_P8 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29
VSX_IMPL_MULH(vec_short8, VSX_IMPL_MULH_P8)
VSX_IMPL_MULH(vec_ushort8, VSX_IMPL_MULH_P8)
// vmuluwm can be used for unsigned or signed integers, that's what they said
VSX_IMPL_2VRG(vec_int4, vec_int4, vmuluwm, vec_mul)
VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul)
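A scalar picture of the even/odd trick above (illustration only, not part of the patch): vec_mule/vec_mulo produce widened products of the even/odd lanes, and the permute pattern then selects the low half of each widened product back into element order, giving a truncating low-half multiply.

#include <cstdint>

static void mul_lo16_via_even_odd(const int16_t a[8], const int16_t b[8], int16_t out[8])
{
    for (int i = 0; i < 8; i += 2)
    {
        int32_t even = (int32_t)a[i]     * b[i];     // vec_mule lane
        int32_t odd  = (int32_t)a[i + 1] * b[i + 1]; // vec_mulo lane
        out[i]     = (int16_t)even;                  // low 16 bits, as picked by the permute
        out[i + 1] = (int16_t)odd;
    }
}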
|
||||
|
@ -407,10 +407,13 @@ template<typename R> struct TheTest
|
||||
|
||||
Data<Rx2> resB = vx_load_expand(dataA.d);
|
||||
|
||||
Rx2 c, d;
|
||||
Rx2 c, d, e, f;
|
||||
v_expand(a, c, d);
|
||||
|
||||
Data<Rx2> resC = c, resD = d;
|
||||
e = v_expand_low(a);
|
||||
f = v_expand_high(a);
|
||||
|
||||
Data<Rx2> resC = c, resD = d, resE = e, resF = f;
|
||||
const int n = Rx2::nlanes;
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
@ -418,6 +421,8 @@ template<typename R> struct TheTest
|
||||
EXPECT_EQ(dataA[i], resB[i]);
|
||||
EXPECT_EQ(dataA[i], resC[i]);
|
||||
EXPECT_EQ(dataA[i + n], resD[i]);
|
||||
EXPECT_EQ(dataA[i], resE[i]);
|
||||
EXPECT_EQ(dataA[i + n], resF[i]);
|
||||
}
|
||||
|
||||
return *this;
|
||||
@ -455,19 +460,21 @@ template<typename R> struct TheTest
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_addsub_wrap()
|
||||
TheTest & test_arithm_wrap()
|
||||
{
|
||||
Data<R> dataA, dataB;
|
||||
dataB.reverse();
|
||||
R a = dataA, b = dataB;
|
||||
|
||||
Data<R> resC = v_add_wrap(a, b),
|
||||
resD = v_sub_wrap(a, b);
|
||||
resD = v_sub_wrap(a, b),
|
||||
resE = v_mul_wrap(a, b);
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
SCOPED_TRACE(cv::format("i=%d", i));
|
||||
EXPECT_EQ((LaneType)(dataA[i] + dataB[i]), resC[i]);
|
||||
EXPECT_EQ((LaneType)(dataA[i] - dataB[i]), resD[i]);
|
||||
EXPECT_EQ((LaneType)(dataA[i] * dataB[i]), resE[i]);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
@ -475,6 +482,7 @@ template<typename R> struct TheTest
|
||||
TheTest & test_mul()
|
||||
{
|
||||
Data<R> dataA, dataB;
|
||||
dataA[1] = static_cast<LaneType>(std::numeric_limits<LaneType>::max());
|
||||
dataB.reverse();
|
||||
R a = dataA, b = dataB;
|
||||
|
||||
@ -482,7 +490,7 @@ template<typename R> struct TheTest
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
SCOPED_TRACE(cv::format("i=%d", i));
|
||||
EXPECT_EQ(dataA[i] * dataB[i], resC[i]);
|
||||
EXPECT_EQ(saturate_cast<LaneType>(dataA[i] * dataB[i]), resC[i]);
|
||||
}
|
||||
|
||||
return *this;
|
||||
@ -1209,7 +1217,9 @@ void test_hal_intrin_uint8()
|
||||
.test_expand()
|
||||
.test_expand_q()
|
||||
.test_addsub()
|
||||
.test_addsub_wrap()
|
||||
.test_arithm_wrap()
|
||||
.test_mul()
|
||||
.test_mul_expand()
|
||||
.test_cmp()
|
||||
.test_logic()
|
||||
.test_min_max()
|
||||
@ -1242,7 +1252,9 @@ void test_hal_intrin_int8()
|
||||
.test_expand()
|
||||
.test_expand_q()
|
||||
.test_addsub()
|
||||
.test_addsub_wrap()
|
||||
.test_arithm_wrap()
|
||||
.test_mul()
|
||||
.test_mul_expand()
|
||||
.test_cmp()
|
||||
.test_logic()
|
||||
.test_min_max()
|
||||
@ -1267,7 +1279,7 @@ void test_hal_intrin_uint16()
|
||||
.test_interleave()
|
||||
.test_expand()
|
||||
.test_addsub()
|
||||
.test_addsub_wrap()
|
||||
.test_arithm_wrap()
|
||||
.test_mul()
|
||||
.test_mul_expand()
|
||||
.test_cmp()
|
||||
@ -1295,7 +1307,7 @@ void test_hal_intrin_int16()
|
||||
.test_interleave()
|
||||
.test_expand()
|
||||
.test_addsub()
|
||||
.test_addsub_wrap()
|
||||
.test_arithm_wrap()
|
||||
.test_mul()
|
||||
.test_mul_expand()
|
||||
.test_cmp()
|
||||
|
@ -1,3 +1,3 @@
set(the_description "Image Processing")
ocv_add_dispatched_file(accum SSE2 AVX NEON)
ocv_add_dispatched_file(accum SSE4_1 AVX AVX2)
ocv_define_module(imgproc opencv_core WRAP java python js)
@ -5,94 +5,102 @@
|
||||
|
||||
namespace opencv_test {
|
||||
|
||||
#ifdef HAVE_OPENVX
|
||||
PERF_TEST_P(Size_MatType, Accumulate,
|
||||
testing::Combine(
|
||||
testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
|
||||
testing::Values(CV_16SC1, CV_32FC1)
|
||||
)
|
||||
)
|
||||
#else
|
||||
PERF_TEST_P( Size_MatType, Accumulate,
|
||||
testing::Combine(
|
||||
testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
|
||||
testing::Values(CV_32FC1)
|
||||
)
|
||||
)
|
||||
#endif
|
||||
{
|
||||
Size sz = get<0>(GetParam());
|
||||
int dstType = get<1>(GetParam());
|
||||
typedef Size_MatType Accumulate;
|
||||
|
||||
Mat src(sz, CV_8UC1);
|
||||
Mat dst(sz, dstType);
|
||||
#define MAT_TYPES_ACCUMLATE CV_8UC1, CV_16UC1, CV_32FC1
|
||||
#define MAT_TYPES_ACCUMLATE_C MAT_TYPES_ACCUMLATE, CV_8UC3, CV_16UC3, CV_32FC3
|
||||
#define MAT_TYPES_ACCUMLATE_D MAT_TYPES_ACCUMLATE, CV_64FC1
|
||||
#define MAT_TYPES_ACCUMLATE_D_C MAT_TYPES_ACCUMLATE_C, CV_64FC1, CV_64FC1
|
||||
|
||||
declare.time(100);
|
||||
declare.in(src, WARMUP_RNG).out(dst);
|
||||
#define PERF_ACCUMULATE_INIT(_FLTC) \
|
||||
const Size srcSize = get<0>(GetParam()); \
|
||||
const int srcType = get<1>(GetParam()); \
|
||||
const int dstType = _FLTC(CV_MAT_CN(srcType)); \
|
||||
Mat src1(srcSize, srcType), dst(srcSize, dstType); \
|
||||
declare.in(src1, dst, WARMUP_RNG).out(dst);
|
||||
|
||||
TEST_CYCLE() accumulate(src, dst);
|
||||
#define PERF_ACCUMULATE_MASK_INIT(_FLTC) \
|
||||
PERF_ACCUMULATE_INIT(_FLTC) \
|
||||
Mat mask(srcSize, CV_8UC1); \
|
||||
declare.in(mask, WARMUP_RNG);
|
||||
|
||||
SANITY_CHECK_NOTHING();
|
||||
#define PERF_TEST_P_ACCUMULATE(_NAME, _TYPES, _INIT, _FUN) \
|
||||
PERF_TEST_P(Accumulate, _NAME, \
|
||||
testing::Combine( \
|
||||
testing::Values(sz1080p, sz720p, szVGA, szQVGA, szODD), \
|
||||
testing::Values(_TYPES) \
|
||||
) \
|
||||
) \
|
||||
{ \
|
||||
_INIT \
|
||||
    TEST_CYCLE() _FUN; \
    SANITY_CHECK_NOTHING(); \
}

#ifdef HAVE_OPENVX
PERF_TEST_P(Size_MatType, AccumulateSquare,
            testing::Combine(
                testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
                testing::Values(CV_16SC1, CV_32FC1)
            )
)
#else
PERF_TEST_P(Size_MatType, AccumulateSquare,
            testing::Combine(
                testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
                testing::Values(CV_32FC1)
            )
)
#endif
{
    Size sz = get<0>(GetParam());
    int dstType = get<1>(GetParam());
/////////////////////////////////// Accumulate ///////////////////////////////////

    Mat src(sz, CV_8UC1);
    Mat dst(sz, dstType);
PERF_TEST_P_ACCUMULATE(Accumulate, MAT_TYPES_ACCUMLATE,
                       PERF_ACCUMULATE_INIT(CV_32FC), accumulate(src1, dst))

    declare.time(100);
    declare.in(src, WARMUP_RNG).out(dst);
PERF_TEST_P_ACCUMULATE(AccumulateMask, MAT_TYPES_ACCUMLATE_C,
                       PERF_ACCUMULATE_MASK_INIT(CV_32FC), accumulate(src1, dst, mask))

    TEST_CYCLE() accumulateSquare(src, dst);
PERF_TEST_P_ACCUMULATE(AccumulateDouble, MAT_TYPES_ACCUMLATE_D,
                       PERF_ACCUMULATE_INIT(CV_64FC), accumulate(src1, dst))

    SANITY_CHECK_NOTHING();
}
PERF_TEST_P_ACCUMULATE(AccumulateDoubleMask, MAT_TYPES_ACCUMLATE_D_C,
                       PERF_ACCUMULATE_MASK_INIT(CV_64FC), accumulate(src1, dst, mask))

#ifdef HAVE_OPENVX
PERF_TEST_P(Size_MatType, AccumulateWeighted,
            testing::Combine(
                testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
                testing::Values(CV_8UC1, CV_32FC1)
            )
)
#else
PERF_TEST_P(Size_MatType, AccumulateWeighted,
            testing::Combine(
                testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
                testing::Values(CV_32FC1)
            )
)
#endif
{
    Size sz = get<0>(GetParam());
    int dstType = get<1>(GetParam());
///////////////////////////// AccumulateSquare ///////////////////////////////////

    Mat src(sz, CV_8UC1);
    Mat dst(sz, dstType);
PERF_TEST_P_ACCUMULATE(Square, MAT_TYPES_ACCUMLATE,
                       PERF_ACCUMULATE_INIT(CV_32FC), accumulateSquare(src1, dst))

    declare.time(100);
    declare.in(src, WARMUP_RNG).out(dst);
PERF_TEST_P_ACCUMULATE(SquareMask, MAT_TYPES_ACCUMLATE_C,
                       PERF_ACCUMULATE_MASK_INIT(CV_32FC), accumulateSquare(src1, dst, mask))

    TEST_CYCLE() accumulateWeighted(src, dst, 0.314);
PERF_TEST_P_ACCUMULATE(SquareDouble, MAT_TYPES_ACCUMLATE_D,
                       PERF_ACCUMULATE_INIT(CV_64FC), accumulateSquare(src1, dst))

    SANITY_CHECK_NOTHING();
}
PERF_TEST_P_ACCUMULATE(SquareDoubleMask, MAT_TYPES_ACCUMLATE_D_C,
                       PERF_ACCUMULATE_MASK_INIT(CV_64FC), accumulateSquare(src1, dst, mask))

///////////////////////////// AccumulateProduct ///////////////////////////////////

#define PERF_ACCUMULATE_INIT_2(_FLTC) \
    PERF_ACCUMULATE_INIT(_FLTC) \
    Mat src2(srcSize, srcType); \
    declare.in(src2);

#define PERF_ACCUMULATE_MASK_INIT_2(_FLTC) \
    PERF_ACCUMULATE_MASK_INIT(_FLTC) \
    Mat src2(srcSize, srcType); \
    declare.in(src2);

PERF_TEST_P_ACCUMULATE(Product, MAT_TYPES_ACCUMLATE,
                       PERF_ACCUMULATE_INIT_2(CV_32FC), accumulateProduct(src1, src2, dst))

PERF_TEST_P_ACCUMULATE(ProductMask, MAT_TYPES_ACCUMLATE_C,
                       PERF_ACCUMULATE_MASK_INIT_2(CV_32FC), accumulateProduct(src1, src2, dst, mask))

PERF_TEST_P_ACCUMULATE(ProductDouble, MAT_TYPES_ACCUMLATE_D,
                       PERF_ACCUMULATE_INIT_2(CV_64FC), accumulateProduct(src1, src2, dst))

PERF_TEST_P_ACCUMULATE(ProductDoubleMask, MAT_TYPES_ACCUMLATE_D_C,
                       PERF_ACCUMULATE_MASK_INIT_2(CV_64FC), accumulateProduct(src1, src2, dst, mask))

///////////////////////////// AccumulateWeighted ///////////////////////////////////

PERF_TEST_P_ACCUMULATE(Weighted, MAT_TYPES_ACCUMLATE,
                       PERF_ACCUMULATE_INIT(CV_32FC), accumulateWeighted(src1, dst, 0.123))

PERF_TEST_P_ACCUMULATE(WeightedMask, MAT_TYPES_ACCUMLATE_C,
                       PERF_ACCUMULATE_MASK_INIT(CV_32FC), accumulateWeighted(src1, dst, 0.123, mask))

PERF_TEST_P_ACCUMULATE(WeightedDouble, MAT_TYPES_ACCUMLATE_D,
                       PERF_ACCUMULATE_INIT(CV_64FC), accumulateWeighted(src1, dst, 0.123456))

PERF_TEST_P_ACCUMULATE(WeightedDoubleMask, MAT_TYPES_ACCUMLATE_D_C,
                       PERF_ACCUMULATE_MASK_INIT(CV_64FC), accumulateWeighted(src1, dst, 0.123456, mask))

} // namespace
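
This hunk replaces the old hand-written AccumulateSquare/AccumulateWeighted perf bodies (the `#ifdef HAVE_OPENVX` blocks and the `declare`/`TEST_CYCLE`/`SANITY_CHECK_NOTHING` lines) with one-line invocations of a PERF_TEST_P_ACCUMULATE helper macro; only the macro's tail (`TEST_CYCLE() _FUN; SANITY_CHECK_NOTHING();`) is visible at the top of the excerpt. The sketch below is a plausible reconstruction of how such a generator macro could be assembled from the pieces shown here, assuming the usual opencv2/ts perf framework; the exact size list, the MAT_TYPES_ACCUMLATE* type lists and the PERF_ACCUMULATE_INIT* bodies are assumptions, not the actual OpenCV source.

    // Hypothetical reconstruction only -- shapes a PERF_TEST_P_ACCUMULATE-style
    // generator from the fragments visible in this diff.
    #define MAT_TYPES_ACCUMLATE  testing::Values(CV_8UC1, CV_16UC1, CV_32FC1)   // guessed type list

    #define PERF_ACCUMULATE_INIT(_FLTC) \
        const Size srcSize = get<0>(GetParam()); \
        const int srcType = get<1>(GetParam()); \
        Mat src1(srcSize, srcType), dst(srcSize, _FLTC(CV_MAT_CN(srcType))); \
        declare.in(src1, dst, WARMUP_RNG).out(dst);

    #define PERF_ACCUMULATE_MASK_INIT(_FLTC) \
        PERF_ACCUMULATE_INIT(_FLTC) \
        Mat mask(srcSize, CV_8UC1); \
        declare.in(mask, WARMUP_RNG);

    #define PERF_TEST_P_ACCUMULATE(_NAME, _TYPES, _INIT, _FUN) \
        PERF_TEST_P(Size_MatType, _NAME, \
            testing::Combine(testing::Values(::perf::szVGA, ::perf::sz1080p), _TYPES)) \
        { \
            _INIT \
            TEST_CYCLE() _FUN; \
            SANITY_CHECK_NOTHING(); \
        }

Each invocation in the diff then only has to name the test, pick a type list, choose an init macro (plain, masked, or the _2 two-source variants defined above) and give the call to benchmark.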
File diff suppressed because it is too large
@ -1825,7 +1825,7 @@ void hlineSmooth1N<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufi
        const int VECSZ = v_uint16::nlanes;
        v_uint16 v_mul = vx_setall_u16(*((uint16_t*)m));
        for (; i <= lencn - VECSZ; i += VECSZ)
            v_store((uint16_t*)dst + i, v_mul*vx_load_expand(src + i));
            v_store((uint16_t*)dst + i, v_mul_wrap(v_mul, vx_load_expand(src + i)));
#endif
        for (; i < lencn; i++)
            dst[i] = m[0] * src[i];
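
Every hlineSmooth* hunk in this file (the one above and those that follow) makes the same one-for-one substitution: the plain `*` on v_uint16 lanes becomes `v_mul_wrap`. That matches the PR making `operator*` on 8- and 16-bit universal-intrinsic vectors a saturating multiply, so these fixed-point row filters, which rely on the old truncate-to-the-low-16-bits product, now have to ask for wrapping explicitly. The snippet below is a minimal scalar model of the two semantics; it is standalone C++ with helper names of my own (`mul_wrap`, `mul_sat`), not OpenCV API, and it models a single lane.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Per-lane model of the two unsigned 16-bit multiplies discussed in this diff.
    // mul_wrap ~ v_mul_wrap : keep the low 16 bits of the product (wrap modulo 2^16).
    // mul_sat  ~ operator*  : clamp the full product to 65535 (saturate).
    static uint16_t mul_wrap(uint16_t a, uint16_t b)
    {
        return static_cast<uint16_t>(static_cast<uint32_t>(a) * b);
    }

    static uint16_t mul_sat(uint16_t a, uint16_t b)
    {
        return static_cast<uint16_t>(std::min<uint32_t>(static_cast<uint32_t>(a) * b, 65535u));
    }

    int main()
    {
        uint16_t a = 300, b = 256;  // product overflows 16 bits, so the two semantics differ
        std::printf("wrap: %u  sat: %u\n",
                    unsigned(mul_wrap(a, b)), unsigned(mul_sat(a, b)));  // wrap: 11264  sat: 65535
        return 0;
    }

The wrapping form also typically maps straight to the low-half multiply instruction of the target ISA (e.g. `_mm_mullo_epi16` on SSE2), whereas saturation needs an extra widen-and-pack step, so keeping `v_mul_wrap` in these hot loops preserves both the old results and the cheaper code path.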
@ -1915,7 +1915,9 @@ void hlineSmooth3N<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufi
        v_uint16 v_mul1 = vx_setall_u16(_m[1]);
        v_uint16 v_mul2 = vx_setall_u16(_m[2]);
        for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
            v_store((uint16_t*)dst, vx_load_expand(src - cn) * v_mul0 + vx_load_expand(src) * v_mul1 + vx_load_expand(src + cn) * v_mul2);
            v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - cn), v_mul0) +
                                    v_mul_wrap(vx_load_expand(src), v_mul1) +
                                    v_mul_wrap(vx_load_expand(src + cn), v_mul2));
#endif
        for (; i < lencn; i++, src++, dst++)
            *dst = m[0] * src[-cn] + m[1] * src[0] + m[2] * src[cn];

@ -2089,7 +2091,8 @@ void hlineSmooth3Naba<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const
        v_uint16 v_mul0 = vx_setall_u16(_m[0]);
        v_uint16 v_mul1 = vx_setall_u16(_m[1]);
        for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
            v_store((uint16_t*)dst, (vx_load_expand(src - cn) + vx_load_expand(src + cn)) * v_mul0 + vx_load_expand(src) * v_mul1);
            v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - cn) + vx_load_expand(src + cn), v_mul0) +
                                    v_mul_wrap(vx_load_expand(src), v_mul1));
#endif
        for (; i < lencn; i++, src++, dst++)
            *((uint16_t*)dst) = ((uint16_t*)m)[1] * src[0] + ((uint16_t*)m)[0] * ((uint16_t)(src[-cn]) + (uint16_t)(src[cn]));

@ -2285,7 +2288,11 @@ void hlineSmooth5N<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufi
        v_uint16 v_mul3 = vx_setall_u16(_m[3]);
        v_uint16 v_mul4 = vx_setall_u16(_m[4]);
        for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
            v_store((uint16_t*)dst, vx_load_expand(src - 2 * cn) * v_mul0 + vx_load_expand(src - cn) * v_mul1 + vx_load_expand(src) * v_mul2 + vx_load_expand(src + cn) * v_mul3 + vx_load_expand(src + 2 * cn) * v_mul4);
            v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - 2 * cn), v_mul0) +
                                    v_mul_wrap(vx_load_expand(src - cn), v_mul1) +
                                    v_mul_wrap(vx_load_expand(src), v_mul2) +
                                    v_mul_wrap(vx_load_expand(src + cn), v_mul3) +
                                    v_mul_wrap(vx_load_expand(src + 2 * cn), v_mul4));
#endif
        for (; i < lencn; i++, src++, dst++)
            *dst = m[0] * src[-2*cn] + m[1] * src[-cn] + m[2] * src[0] + m[3] * src[cn] + m[4] * src[2*cn];

@ -2488,7 +2495,7 @@ void hlineSmooth5N14641<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, cons
        const int VECSZ = v_uint16::nlanes;
        v_uint16 v_6 = vx_setall_u16(6);
        for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
            v_store((uint16_t*)dst, (vx_load_expand(src) * v_6 + ((vx_load_expand(src - cn) + vx_load_expand(src + cn)) << 2) + vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn)) << 4);
            v_store((uint16_t*)dst, (v_mul_wrap(vx_load_expand(src), v_6) + ((vx_load_expand(src - cn) + vx_load_expand(src + cn)) << 2) + vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn)) << 4);
#endif
        for (; i < lencn; i++, src++, dst++)
            *((uint16_t*)dst) = (uint16_t(src[0]) * 6 + ((uint16_t(src[-cn]) + uint16_t(src[cn])) << 2) + uint16_t(src[-2 * cn]) + uint16_t(src[2 * cn])) << 4;

@ -2689,7 +2696,9 @@ void hlineSmooth5Nabcba<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, cons
        v_uint16 v_mul1 = vx_setall_u16(_m[1]);
        v_uint16 v_mul2 = vx_setall_u16(_m[2]);
        for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
            v_store((uint16_t*)dst, (vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn)) * v_mul0 + (vx_load_expand(src - cn) + vx_load_expand(src + cn))* v_mul1 + vx_load_expand(src) * v_mul2);
            v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn), v_mul0) +
                                    v_mul_wrap(vx_load_expand(src - cn) + vx_load_expand(src + cn), v_mul1) +
                                    v_mul_wrap(vx_load_expand(src), v_mul2));
#endif
        for (; i < lencn; i++, src++, dst++)
            *((uint16_t*)dst) = ((uint16_t*)m)[0] * ((uint16_t)(src[-2 * cn]) + (uint16_t)(src[2 * cn])) + ((uint16_t*)m)[1] * ((uint16_t)(src[-cn]) + (uint16_t)(src[cn])) + ((uint16_t*)m)[2] * src[0];

@ -2804,9 +2813,9 @@ void hlineSmooth<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufixe
        const int VECSZ = v_uint16::nlanes;
        for (; i <= lencn - VECSZ; i+=VECSZ, src+=VECSZ, dst+=VECSZ)
        {
            v_uint16 v_res0 = vx_load_expand(src) * vx_setall_u16(*((uint16_t*)m));
            v_uint16 v_res0 = v_mul_wrap(vx_load_expand(src), vx_setall_u16(*((uint16_t*)m)));
            for (int j = 1; j < n; j++)
                v_res0 += vx_load_expand(src + j * cn) * vx_setall_u16(*((uint16_t*)(m + j)));
                v_res0 += v_mul_wrap(vx_load_expand(src + j * cn), vx_setall_u16(*((uint16_t*)(m + j))));
            v_store((uint16_t*)dst, v_res0);
        }
#endif

@ -2923,9 +2932,9 @@ void hlineSmoothONa_yzy_a<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, co
        const int VECSZ = v_uint16::nlanes;
        for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
        {
            v_uint16 v_res0 = vx_load_expand(src + pre_shift * cn) * vx_setall_u16(*((uint16_t*)(m + pre_shift)));
            v_uint16 v_res0 = v_mul_wrap(vx_load_expand(src + pre_shift * cn), vx_setall_u16(*((uint16_t*)(m + pre_shift))));
            for (int j = 0; j < pre_shift; j ++)
                v_res0 += (vx_load_expand(src + j * cn) + vx_load_expand(src + (n - 1 - j)*cn)) * vx_setall_u16(*((uint16_t*)(m + j)));
                v_res0 += v_mul_wrap(vx_load_expand(src + j * cn) + vx_load_expand(src + (n - 1 - j)*cn), vx_setall_u16(*((uint16_t*)(m + j))));
            v_store((uint16_t*)dst, v_res0);
        }
#endif

@ -93,7 +93,7 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
            v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x));

            v_int16x8 t1 = s2 - s0;
            v_int16x8 t0 = (s0 + s2) * c3 + s1 * c10;
            v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10);

            v_store(trow0 + x, t0);
            v_store(trow1 + x, t1);

@ -131,7 +131,7 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
            v_int16x8 s4 = v_load(trow1 + x + cn);

            v_int16x8 t0 = s1 - s0;
            v_int16x8 t1 = ((s2 + s4) * c3) + (s3 * c10);
            v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10);

            v_store_interleave((drow + x*2), t0, t1);
        }
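
The calcSharrDeriv hunks apply the same `*` -> `v_mul_wrap` substitution to signed 16-bit lanes. Assuming `c3` and `c10` are vectors of the Scharr weights 3 and 10 (their setup is outside this excerpt, so that is an inference from the function name and its customary scalar path), each lane still evaluates the usual 3-10-3 smoothing; only the multiply is now spelled as an explicitly wrapping one. A per-lane scalar sketch under that assumption:

    #include <cstdint>
    #include <cstdio>

    // Hypothetical per-lane model of the updated rows in calcSharrDeriv,
    // assuming c3 == 3 and c10 == 10.  The cast back to int16_t mirrors the
    // wrap-around behaviour of v_mul_wrap on v_int16x8 lanes.
    static int16_t smooth_3_10_3(int16_t a, int16_t b, int16_t c)
    {
        return static_cast<int16_t>((a + c) * 3 + b * 10);  // t0 in the first hunk, t1 in the second
    }

    int main()
    {
        std::printf("%d\n", smooth_3_10_3(10, 20, 30));      // (10 + 30) * 3 + 20 * 10 = 320
        return 0;
    }

For the magnitudes these rows see from 8-bit input, the products stay comfortably inside the int16 range, so wrapping and saturating multiplies coincide here; the switch simply keeps the expression's original meaning now that `operator*` saturates.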