Merge pull request #12064 from seiko2plus:coreUnvintrinArithm2
commit bb7cfcbcdb
@@ -2,6 +2,7 @@ set(the_description "The Core Functionality")
 ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2)
 ocv_add_dispatched_file(stat SSE4_2 AVX2)
+ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2)

 # dispatching for accuracy tests
 ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2)

@@ -661,7 +661,7 @@ inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
 {
     v_uint16x16 c, d;
     v_mul_expand(a, b, c, d);
-    return v_pack_u(v_reinterpret_as_s16(c), v_reinterpret_as_s16(d));
+    return v_pack(c, d);
 }
 inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
 {
@@ -1291,6 +1291,16 @@ inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
 inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
 { return v_abs(a - b); }

+/** Saturating absolute difference **/
+inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
+{
+    v_int8x32 d = a - b;
+    v_int8x32 m = a < b;
+    return (d ^ m) - m;
+}
+inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
+{ return v_max(a, b) - v_min(a, b); }
+
 ////////// Conversions /////////

 /** Rounding **/
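The `(d ^ m) - m` idiom above is branchless conditional negation: `m` is an all-ones lane mask where a < b, and XOR with minus one followed by subtracting minus one is two's-complement negation. Because `-` on 8-bit signed vectors saturates, the result is saturate(|a - b|). A scalar model of one lane (an illustrative sketch, not part of the patch):

// One 8-bit lane of v_absdiffs, modeled in scalar code.
static inline signed char absdiffs_lane(signed char a, signed char b)
{
    int d = (int)a - (int)b;
    d = d < -128 ? -128 : (d > 127 ? 127 : d);   // saturating a - b, like the vector subtract
    int m = (a < b) ? -1 : 0;                    // per-lane comparison mask
    int r = (d ^ m) - m;                         // == -d when m == -1, == d otherwise
    return (signed char)(r > 127 ? 127 : r);     // the final subtract also saturates
}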
@@ -1300,6 +1310,12 @@ inline v_int32x8 v_round(const v_float32x8& a)
 inline v_int32x8 v_round(const v_float64x4& a)
 { return v_int32x8(_mm256_castsi128_si256(_mm256_cvtpd_epi32(a.val))); }

+inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b)
+{
+    __m128i ai = _mm256_cvtpd_epi32(a.val), bi = _mm256_cvtpd_epi32(b.val);
+    return v_int32x8(_v256_combine(ai, bi));
+}
+
 inline v_int32x8 v_trunc(const v_float32x8& a)
 { return v_int32x8(_mm256_cvttps_epi32(a.val)); }

@@ -1689,6 +1705,40 @@ void v_rshr_pack_store(int* ptr, const v_int64x4& a)
     v_pack_store(ptr, (a + delta) >> n);
 }

+// pack boolean
+inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b)
+{
+    __m256i ab = _mm256_packs_epi16(a.val, b.val);
+    return v_uint8x32(_v256_shuffle_odd_64(ab));
+}
+
+inline v_uint8x32 v_pack_b(const v_uint32x8& a, const v_uint32x8& b,
+                           const v_uint32x8& c, const v_uint32x8& d)
+{
+    __m256i ab = _mm256_packs_epi32(a.val, b.val);
+    __m256i cd = _mm256_packs_epi32(c.val, d.val);
+
+    __m256i abcd = _v256_shuffle_odd_64(_mm256_packs_epi16(ab, cd));
+    return v_uint8x32(_mm256_shuffle_epi32(abcd, _MM_SHUFFLE(3, 1, 2, 0)));
+}
+
+inline v_uint8x32 v_pack_b(const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
+                           const v_uint64x4& d, const v_uint64x4& e, const v_uint64x4& f,
+                           const v_uint64x4& g, const v_uint64x4& h)
+{
+    __m256i ab = _mm256_packs_epi32(a.val, b.val);
+    __m256i cd = _mm256_packs_epi32(c.val, d.val);
+    __m256i ef = _mm256_packs_epi32(e.val, f.val);
+    __m256i gh = _mm256_packs_epi32(g.val, h.val);
+
+    __m256i abcd = _mm256_packs_epi32(ab, cd);
+    __m256i efgh = _mm256_packs_epi32(ef, gh);
+    __m256i pkall = _v256_shuffle_odd_64(_mm256_packs_epi16(abcd, efgh));
+
+    __m256i rev = _mm256_alignr_epi8(pkall, pkall, 8);
+    return v_uint8x32(_mm256_unpacklo_epi16(pkall, rev));
+}
+
 /* Recombine */
 // it's up there with load and store operations

@@ -109,7 +109,7 @@ These operations allow to reorder or recombine elements in one or multiple vecto
 - Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
 - Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand, @ref v_expand_low, @ref v_expand_high
-- Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
+- Pack: @ref v_pack, @ref v_pack_u, @ref v_pack_b, @ref v_rshr_pack, @ref v_rshr_pack_u,
   @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
 - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
 - Extract: @ref v_extract
@@ -159,7 +159,7 @@ Most of these operations return only one value.
 ### Other math

 - Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
-- Absolute values: @ref v_abs, @ref v_absdiff
+- Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs

 ### Conversions

@@ -199,10 +199,12 @@ Regular integers:
 |logical | x | x | x | x | x | x |
 |min, max | x | x | x | x | x | x |
 |absdiff | x | x | x | x | x | x |
+|absdiffs | | x | | x | | |
 |reduce | | | | | x | x |
 |mask | x | x | x | x | x | x |
 |pack | x | x | x | x | x | x |
 |pack_u | x | | x | | | |
+|pack_b | x | | | | | |
 |unpack | x | x | x | x | x | x |
 |extract | x | x | x | x | x | x |
 |rotate (lanes) | x | x | x | x | x | x |
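As a usage illustration of the new v_absdiffs entry above, here is a sketch (not part of the patch; it assumes the fixed 128-bit types and v_load/v_store from intrin.hpp) showing the saturating form on signed 16-bit data, where a plain subtract would wrap:

#include "opencv2/core/hal/intrin.hpp"
#include <cstdlib>

// Sketch: saturating |a - b| over a short array, 8 lanes at a time.
void absdiff_s16(const short* a, const short* b, short* dst, int n)
{
    using namespace cv;
    int i = 0;
    for (; i <= n - 8; i += 8)
    {
        v_int16x8 va = v_load(a + i), vb = v_load(b + i);
        v_store(dst + i, v_absdiffs(va, vb)); // saturates instead of wrapping
    }
    for (; i < n; ++i) // scalar tail
        dst[i] = saturate_cast<short>(std::abs((int)a[i] - (int)b[i]));
}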
@@ -762,6 +764,19 @@ inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
     return c;
 }

+/** @brief Saturating absolute difference
+
+Returns \f$ saturate(|a - b|) \f$.
+For 8-, 16-bit signed integer source types. */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_absdiffs(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = saturate_cast<_Tp>(std::abs(a.s[i] - b.s[i]));
+    return c;
+}
+
 /** @brief Inverse square root

 Returns \f$ 1/sqrt(a) \f$
@@ -1613,6 +1628,18 @@ template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
     return c;
 }

+/** @overload */
+template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a, const v_reg<double, n>& b)
+{
+    v_reg<int, n*2> c;
+    for( int i = 0; i < n; i++ )
+    {
+        c.s[i] = cvRound(a.s[i]);
+        c.s[i+n] = cvRound(b.s[i]);
+    }
+    return c;
+}
+
 /** @brief Floor

 Floor each value. Input type is float vector ==> output type is int vector.*/
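The two-argument overload exists because a double vector has half as many lanes as an int vector of the same width, so a full int32 register needs two double registers. A usage sketch (illustrative, not from the patch; it assumes the v_float64x2/v_int32x4 types defined in this header):

// Sketch: round 4 doubles to 4 ints with a single store.
void round_f64_to_s32(const double* src, int* dst) // processes 4 values
{
    using namespace cv;
    v_float64x2 lo = v_load(src), hi = v_load(src + 2);
    v_store(dst, v_round(lo, hi)); // lanes hold round(src[0..3])
}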
@@ -2059,6 +2086,103 @@ OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, s
 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
 //! @}

+//! @cond IGNORED
+template<typename _Tpm, typename _Tp, int n>
+inline void _pack_b(_Tpm* mptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    for (int i = 0; i < n; ++i)
+    {
+        mptr[i] = (_Tpm)a.s[i];
+        mptr[i + n] = (_Tpm)b.s[i];
+    }
+}
+//! @endcond
+
+//! @name Pack boolean values
+//! @{
+//! @brief Pack boolean values from multiple vectors to one unsigned 8-bit integer vector
+//!
+//! @note Must provide valid boolean values to guarantee same result for all architectures.
+
+/** @brief For 16-bit boolean values
+
+Scheme:
+@code
+a {0xFFFF 0 0 0xFFFF 0 0xFFFF 0xFFFF 0}
+b {0xFFFF 0 0xFFFF 0 0 0xFFFF 0 0xFFFF}
+===============
+{
+   0xFF 0 0 0xFF 0 0xFF 0xFF 0
+   0xFF 0 0xFF 0 0 0xFF 0 0xFF
+}
+@endcode */
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{
+    v_uint8x16 mask;
+    _pack_b(mask.s, a, b);
+    return mask;
+}
+
+/** @overload
+For 32-bit boolean values
+
+Scheme:
+@code
+a {0xFFFF.. 0 0 0xFFFF..}
+b {0 0xFFFF.. 0xFFFF.. 0}
+c {0xFFFF.. 0 0xFFFF.. 0}
+d {0 0xFFFF.. 0 0xFFFF..}
+===============
+{
+   0xFF 0 0 0xFF 0 0xFF 0xFF 0
+   0xFF 0 0xFF 0 0 0xFF 0 0xFF
+}
+@endcode */
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    v_uint8x16 mask;
+    _pack_b(mask.s, a, b);
+    _pack_b(mask.s + 8, c, d);
+    return mask;
+}
+
+/** @overload
+For 64-bit boolean values
+
+Scheme:
+@code
+a {0xFFFF.. 0}
+b {0 0xFFFF..}
+c {0xFFFF.. 0}
+d {0 0xFFFF..}
+
+e {0xFFFF.. 0}
+f {0xFFFF.. 0}
+g {0 0xFFFF..}
+h {0 0xFFFF..}
+===============
+{
+   0xFF 0 0 0xFF 0xFF 0 0 0xFF
+   0xFF 0 0xFF 0 0 0xFF 0 0xFF
+}
+@endcode */
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{
+    v_uint8x16 mask;
+    _pack_b(mask.s, a, b);
+    _pack_b(mask.s + 4, c, d);
+    _pack_b(mask.s + 8, e, f);
+    _pack_b(mask.s + 12, g, h);
+    return mask;
+}
+//! @}
+
 /** @brief Matrix multiplication

 Scheme:
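A typical consumer of v_pack_b is mask narrowing: lane comparisons yield all-ones/all-zeros masks in the source width, and v_pack_b compresses them to byte masks. A sketch of the intended use (not from the patch; assumes the 128-bit types above):

// Sketch: 16 mask bytes (0xFF / 0x00) from two 16-bit comparisons.
void mask_eq_u16(const ushort* a, const ushort* b, uchar* mask /* 16 bytes */)
{
    using namespace cv;
    v_uint16x8 m0 = (v_load(a)     == v_load(b));
    v_uint16x8 m1 = (v_load(a + 8) == v_load(b + 8));
    v_store(mask, v_pack_b(m0, m1)); // 0xFFFF lanes become 0xFF bytes
}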
@@ -394,6 +394,35 @@ OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn
 OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16)
 OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32)

+// pack boolean
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{
+    uint8x16_t ab = vcombine_u8(vmovn_u16(a.val), vmovn_u16(b.val));
+    return v_uint8x16(ab);
+}
+
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    uint16x8_t nab = vcombine_u16(vmovn_u32(a.val), vmovn_u32(b.val));
+    uint16x8_t ncd = vcombine_u16(vmovn_u32(c.val), vmovn_u32(d.val));
+    return v_uint8x16(vcombine_u8(vmovn_u16(nab), vmovn_u16(ncd)));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{
+    uint32x4_t ab = vcombine_u32(vmovn_u64(a.val), vmovn_u64(b.val));
+    uint32x4_t cd = vcombine_u32(vmovn_u64(c.val), vmovn_u64(d.val));
+    uint32x4_t ef = vcombine_u32(vmovn_u64(e.val), vmovn_u64(f.val));
+    uint32x4_t gh = vcombine_u32(vmovn_u64(g.val), vmovn_u64(h.val));
+
+    uint16x8_t abcd = vcombine_u16(vmovn_u32(ab), vmovn_u32(cd));
+    uint16x8_t efgh = vcombine_u16(vmovn_u32(ef), vmovn_u32(gh));
+    return v_uint8x16(vcombine_u8(vmovn_u16(abcd), vmovn_u16(efgh)));
+}
+
 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                             const v_float32x4& m1, const v_float32x4& m2,
                             const v_float32x4& m3)
@@ -748,7 +777,6 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16)

-// TODO: absdiff for signed integers
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
@@ -757,6 +785,12 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
 #endif

+/** Saturating absolute difference **/
+inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
+{ return v_int8x16(vqabsq_s8(vqsubq_s8(a.val, b.val))); }
+inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
+{ return v_int16x8(vqabsq_s16(vqsubq_s16(a.val, b.val))); }
+
 #define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
 inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
 { \
@@ -1242,6 +1276,11 @@ inline v_int32x4 v_round(const v_float64x2& a)
     return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
 }

+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
+{
+    return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), vmovn_s64(vcvtaq_s64_f64(b.val))));
+}
+
 inline v_int32x4 v_floor(const v_float64x2& a)
 {
     static const int32x2_t zero = vdup_n_s32(0);
@@ -634,6 +634,35 @@ void v_rshr_pack_store(int* ptr, const v_int64x2& a)
     _mm_storel_epi64((__m128i*)ptr, a2);
 }

+// pack boolean
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{
+    __m128i ab = _mm_packs_epi16(a.val, b.val);
+    return v_uint8x16(ab);
+}
+
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    __m128i ab = _mm_packs_epi32(a.val, b.val);
+    __m128i cd = _mm_packs_epi32(c.val, d.val);
+    return v_uint8x16(_mm_packs_epi16(ab, cd));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{
+    __m128i ab = _mm_packs_epi32(a.val, b.val);
+    __m128i cd = _mm_packs_epi32(c.val, d.val);
+    __m128i ef = _mm_packs_epi32(e.val, f.val);
+    __m128i gh = _mm_packs_epi32(g.val, h.val);
+
+    __m128i abcd = _mm_packs_epi32(ab, cd);
+    __m128i efgh = _mm_packs_epi32(ef, gh);
+    return v_uint8x16(_mm_packs_epi16(abcd, efgh));
+}
+
 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                             const v_float32x4& m1, const v_float32x4& m2,
                             const v_float32x4& m3)
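Signed saturation is what makes _mm_packs_epi16 a valid boolean pack here: a mask lane is either 0 or 0xFFFF, which as signed int16 is 0 or -1, and both survive the signed 16-to-8 pack unchanged as 0x00 and 0xFF. Arbitrary non-mask values would saturate instead, which is why the documentation above requires valid boolean inputs. A scalar model of one lane (illustrative only):

// Scalar model of one pack lane: signed saturate 16 -> 8,
// then reinterpret as an unsigned byte.
static inline unsigned char pack_b_lane(unsigned short m)
{
    short s = (short)m;                             // 0x0000 -> 0, 0xFFFF -> -1
    int v = s < -128 ? -128 : (s > 127 ? 127 : s);  // signed saturation
    return (unsigned char)v;                        // 0 -> 0x00, -1 -> 0xFF
}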
@@ -706,19 +735,11 @@ OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
 inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
 { a = a * b; return a; }

+OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16, v_uint16x8)
 OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8)
 OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint16x8, v_uint32x4)
 OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int16x8, v_int32x4)

-inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b)
-{
-    v_uint16x8 c, d;
-    v_mul_expand(a, b, c, d);
-    return v_pack_u(v_reinterpret_as_s16(c), v_reinterpret_as_s16(d));
-}
-inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b)
-{ a = a * b; return a; }
-
 // Multiply and expand
 inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                          v_uint16x8& c, v_uint16x8& d)
@@ -1045,34 +1066,43 @@ inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
     return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
 }

-#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \
-inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \
-{ \
-    return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \
-} \
-inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \
-{ \
-    __m128i smask = _mm_set1_epi32(smask32); \
-    __m128i a1 = _mm_xor_si128(a.val, smask); \
-    __m128i b1 = _mm_xor_si128(b.val, smask); \
-    return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \
-}
-
-OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080)
-OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000)
+/** Absolute difference **/
+
+inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
+{ return v_add_wrap(a - b, b - a); }
+inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_add_wrap(a - b, b - a); }
 inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
-{
-    return v_max(a, b) - v_min(a, b);
-}
+{ return v_max(a, b) - v_min(a, b); }

+inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
+{
+    v_int8x16 d = v_sub_wrap(a, b);
+    v_int8x16 m = a < b;
+    return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
+}
+inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
+{
+    return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)));
+}
 inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
 {
-    __m128i d = _mm_sub_epi32(a.val, b.val);
-    __m128i m = _mm_cmpgt_epi32(b.val, a.val);
-    return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
+    v_int32x4 d = a - b;
+    v_int32x4 m = a < b;
+    return v_reinterpret_as_u32((d ^ m) - m);
 }

+/** Saturating absolute difference **/
+inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
+{
+    v_int8x16 d = a - b;
+    v_int8x16 m = a < b;
+    return (d ^ m) - m;
+}
+inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
+{ return v_max(a, b) - v_min(a, b); }
+
 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
 {
     return a * b + c;
@@ -1623,6 +1653,12 @@ inline v_int32x4 v_trunc(const v_float32x4& a)
 inline v_int32x4 v_round(const v_float64x2& a)
 { return v_int32x4(_mm_cvtpd_epi32(a.val)); }

+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
+{
+    __m128i ai = _mm_cvtpd_epi32(a.val), bi = _mm_cvtpd_epi32(b.val);
+    return v_int32x4(_mm_unpacklo_epi64(ai, bi));
+}
+
 inline v_int32x4 v_floor(const v_float64x2& a)
 {
     __m128i a1 = _mm_cvtpd_epi32(a.val);
@@ -383,6 +383,35 @@ OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_int32x4, unsigned int, int,
 //OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_int64x2, unsigned long long, long long,
 //                         vec_sra, vec_packsu, vec_add, pack_u)

+// pack boolean
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{
+    vec_uchar16 ab = vec_pack(a.val, b.val);
+    return v_uint8x16(ab);
+}
+
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    vec_ushort8 ab = vec_pack(a.val, b.val);
+    vec_ushort8 cd = vec_pack(c.val, d.val);
+    return v_uint8x16(vec_pack(ab, cd));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{
+    vec_uint4 ab = vec_pack(a.val, b.val);
+    vec_uint4 cd = vec_pack(c.val, d.val);
+    vec_uint4 ef = vec_pack(e.val, f.val);
+    vec_uint4 gh = vec_pack(g.val, h.val);
+
+    vec_ushort8 abcd = vec_pack(ab, cd);
+    vec_ushort8 efgh = vec_pack(ef, gh);
+    return v_uint8x16(vec_pack(abcd, efgh));
+}
+
 /* Recombine */
 template <typename _Tpvec>
 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1)
@@ -834,16 +863,27 @@ inline v_float32x4 v_abs(const v_float32x4& x)
 inline v_float64x2 v_abs(const v_float64x2& x)
 { return v_float64x2(vec_abs(x.val)); }

+/** Absolute difference **/
+// unsigned
 OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_absdiff, vec_absd)

-#define OPENCV_HAL_IMPL_VSX_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
-inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
-{ return _Tpvec2(cast(intrin(a.val, b.val))); }
+inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
+{ return v_reinterpret_as_u8(v_sub_wrap(v_max(a, b), v_min(a, b))); }
+inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
+{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
+inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
+{ return v_reinterpret_as_u32(v_max(a, b) - v_min(a, b)); }

-OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int8x16, v_uint8x16, vec_uchar16_c, v_absdiff, vec_absd)
-OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int16x8, v_uint16x8, vec_ushort8_c, v_absdiff, vec_absd)
-OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int32x4, v_uint32x4, vec_uint4_c, v_absdiff, vec_absd)
-OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int64x2, v_uint64x2, vec_udword2_c, v_absdiff, vec_absd)
+inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
+{ return v_abs(a - b); }
+inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
+{ return v_abs(a - b); }

+/** Absolute difference for signed integers **/
+inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
+{ return v_int8x16(vec_abss(vec_subs(a.val, b.val))); }
+inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
+{ return v_int16x8(vec_abss(vec_subs(a.val, b.val))); }
+
 ////////// Conversions /////////

@@ -854,6 +894,9 @@ inline v_int32x4 v_round(const v_float32x4& a)
 inline v_int32x4 v_round(const v_float64x2& a)
 { return v_int32x4(vec_mergesqo(vec_ctso(vec_rint(a.val)), vec_int4_z)); }

+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
+{ return v_int32x4(vec_mergesqo(vec_ctso(vec_rint(a.val)), vec_ctso(vec_rint(b.val)))); }
+
 inline v_int32x4 v_floor(const v_float32x4& a)
 { return v_int32x4(vec_cts(vec_floor(a.val))); }

File diff suppressed because it is too large.

modules/core/src/arithm.dispatch.cpp (new file, 11 lines)
@@ -0,0 +1,11 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "precomp.hpp"
+#include "arithm_ipp.hpp"
+#include "arithm.simd.hpp"
+#include "arithm.simd_declarations.hpp"
+
+#define ARITHM_DISPATCHING_ONLY
+#include "arithm.simd.hpp"
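arithm.simd.hpp is written to be included twice: the first include declares the per-ISA implementations that CMake's ocv_add_dispatched_file(arithm ...) compiles once per enabled instruction set, and the second include, with ARITHM_DISPATCHING_ONLY defined, emits only the thin run-time dispatchers. A schematic of that header layout (a sketch of the pattern using OpenCV's dispatch macros, not the actual 1937-line file):

// Schematic of the double-include pattern (illustrative, not the real file):
#ifndef ARITHM_DISPATCHING_ONLY
namespace cv { namespace hal {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
// per-ISA implementation, compiled once per dispatched target
void add8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int);
CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // cv::hal
#else
namespace cv { namespace hal {
void add8u(const uchar* s1, size_t st1, const uchar* s2, size_t st2,
           uchar* d, size_t st, int w, int h)
{
    // picks SSE2/SSE4.1/AVX2/... at run time based on CPU features
    CV_CPU_DISPATCH(add8u, (s1, st1, s2, st2, d, st, w, h),
                    CV_CPU_DISPATCH_MODES_ALL);
}
}} // cv::hal
#endif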
modules/core/src/arithm.simd.hpp (new file, 1937 lines; diff suppressed because it is too large)
@@ -1,623 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                          License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Copyright (C) 2015, Itseez Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_ARITHM_CORE_HPP__
-#define __OPENCV_ARITHM_CORE_HPP__
-
-#include "arithm_simd.hpp"
-
-namespace cv {
-
-template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
-{
-    typedef T1 type1;
-    typedef T2 type2;
-    typedef T3 rtype;
-    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a + b); }
-};
-
-template<typename T1, typename T2=T1, typename T3=T1> struct OpSub
-{
-    typedef T1 type1;
-    typedef T2 type2;
-    typedef T3 rtype;
-    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a - b); }
-};
-
-template<typename T1, typename T2=T1, typename T3=T1> struct OpRSub
-{
-    typedef T1 type1;
-    typedef T2 type2;
-    typedef T3 rtype;
-    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(b - a); }
-};
-
-template<typename T> struct OpMin
-{
-    typedef T type1;
-    typedef T type2;
-    typedef T rtype;
-    T operator ()(const T a, const T b) const { return std::min(a, b); }
-};
-
-template<typename T> struct OpMax
-{
-    typedef T type1;
-    typedef T type2;
-    typedef T rtype;
-    T operator ()(const T a, const T b) const { return std::max(a, b); }
-};
-
-template<typename T> struct OpAbsDiff
-{
-    typedef T type1;
-    typedef T type2;
-    typedef T rtype;
-    T operator()(T a, T b) const { return a > b ? a - b : b - a; }
-};
-
-// specializations to prevent "-0" results
-template<> struct OpAbsDiff<float>
-{
-    typedef float type1;
-    typedef float type2;
-    typedef float rtype;
-    float operator()(float a, float b) const { return std::abs(a - b); }
-};
-template<> struct OpAbsDiff<double>
-{
-    typedef double type1;
-    typedef double type2;
-    typedef double rtype;
-    double operator()(double a, double b) const { return std::abs(a - b); }
-};
-
-template<typename T> struct OpAnd
-{
-    typedef T type1;
-    typedef T type2;
-    typedef T rtype;
-    T operator()( T a, T b ) const { return a & b; }
-};
-
-template<typename T> struct OpOr
-{
-    typedef T type1;
-    typedef T type2;
-    typedef T rtype;
-    T operator()( T a, T b ) const { return a | b; }
-};
-
-template<typename T> struct OpXor
-{
-    typedef T type1;
-    typedef T type2;
-    typedef T rtype;
-    T operator()( T a, T b ) const { return a ^ b; }
-};
-
-template<typename T> struct OpNot
-{
-    typedef T type1;
-    typedef T type2;
-    typedef T rtype;
-    T operator()( T a, T ) const { return ~a; }
-};
-
-//=============================================================================
-
-template<typename T, class Op, class VOp>
-void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, int width, int height)
-{
-#if CV_SSE2 || CV_NEON
-    VOp vop;
-#endif
-    Op op;
-
-    for( ; height--; src1 = (const T *)((const uchar *)src1 + step1),
-                     src2 = (const T *)((const uchar *)src2 + step2),
-                     dst = (T *)((uchar *)dst + step) )
-    {
-        int x = 0;
-
-#if CV_NEON || CV_SSE2
-#if CV_AVX2
-        if( USE_AVX2 )
-        {
-            for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) )
-            {
-                typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
-                r0 = vop(r0, VLoadStore256<T>::load(src2 + x));
-                VLoadStore256<T>::store(dst + x, r0);
-            }
-        }
-#else
-#if CV_SSE2
-        if( USE_SSE2 )
-        {
-#endif // CV_SSE2
-            for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) )
-            {
-                typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x);
-                typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 16/sizeof(T));
-                r0 = vop(r0, VLoadStore128<T>::load(src2 + x));
-                r1 = vop(r1, VLoadStore128<T>::load(src2 + x + 16/sizeof(T)));
-                VLoadStore128<T>::store(dst + x, r0);
-                VLoadStore128<T>::store(dst + x + 16/sizeof(T), r1);
-            }
-#if CV_SSE2
-        }
-#endif // CV_SSE2
-#endif // CV_AVX2
-#endif // CV_NEON || CV_SSE2
-
-#if CV_AVX2
-        // nothing
-#elif CV_SSE2
-        if( USE_SSE2 )
-        {
-            for( ; x <= width - 8/(int)sizeof(T); x += 8/sizeof(T) )
-            {
-                typename VLoadStore64<T>::reg_type r = VLoadStore64<T>::load(src1 + x);
-                r = vop(r, VLoadStore64<T>::load(src2 + x));
-                VLoadStore64<T>::store(dst + x, r);
-            }
-        }
-#endif
-
-#if CV_ENABLE_UNROLLED
-        for( ; x <= width - 4; x += 4 )
-        {
-            T v0 = op(src1[x], src2[x]);
-            T v1 = op(src1[x+1], src2[x+1]);
-            dst[x] = v0; dst[x+1] = v1;
-            v0 = op(src1[x+2], src2[x+2]);
-            v1 = op(src1[x+3], src2[x+3]);
-            dst[x+2] = v0; dst[x+3] = v1;
-        }
-#endif
-
-        for( ; x < width; x++ )
-            dst[x] = op(src1[x], src2[x]);
-    }
-}
-
-template<typename T, class Op, class Op32>
-void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
-              T* dst, size_t step, int width, int height)
-{
-#if CV_SSE2 || CV_NEON
-    Op32 op32;
-#endif
-    Op op;
-
-    for( ; height--; src1 = (const T *)((const uchar *)src1 + step1),
-                     src2 = (const T *)((const uchar *)src2 + step2),
-                     dst = (T *)((uchar *)dst + step) )
-    {
-        int x = 0;
-
-#if CV_AVX2
-        if( USE_AVX2 )
-        {
-            if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
-            {
-                for( ; x <= width - 8; x += 8 )
-                {
-                    typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
-                    r0 = op32(r0, VLoadStore256Aligned<T>::load(src2 + x));
-                    VLoadStore256Aligned<T>::store(dst + x, r0);
-                }
-            }
-        }
-#elif CV_SSE2
-        if( USE_SSE2 )
-        {
-            if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
-            {
-                for( ; x <= width - 8; x += 8 )
-                {
-                    typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x);
-                    typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 4);
-                    r0 = op32(r0, VLoadStore128Aligned<T>::load(src2 + x));
-                    r1 = op32(r1, VLoadStore128Aligned<T>::load(src2 + x + 4));
-                    VLoadStore128Aligned<T>::store(dst + x, r0);
-                    VLoadStore128Aligned<T>::store(dst + x + 4, r1);
-                }
-            }
-        }
-#endif // CV_AVX2
-
-#if CV_NEON || CV_SSE2
-#if CV_AVX2
-        if( USE_AVX2 )
-        {
-            for( ; x <= width - 8; x += 8 )
-            {
-                typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
-                r0 = op32(r0, VLoadStore256<T>::load(src2 + x));
-                VLoadStore256<T>::store(dst + x, r0);
-            }
-        }
-#else
-#if CV_SSE2
-        if( USE_SSE2 )
-        {
-#endif // CV_SSE2
-            for( ; x <= width - 8; x += 8 )
-            {
-                typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x);
-                typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 4);
-                r0 = op32(r0, VLoadStore128<T>::load(src2 + x));
-                r1 = op32(r1, VLoadStore128<T>::load(src2 + x + 4));
-                VLoadStore128<T>::store(dst + x, r0);
-                VLoadStore128<T>::store(dst + x + 4, r1);
-            }
-#if CV_SSE2
-        }
-#endif // CV_SSE2
-#endif // CV_AVX2
-#endif // CV_NEON || CV_SSE2
-
-#if CV_ENABLE_UNROLLED
-        for( ; x <= width - 4; x += 4 )
-        {
-            T v0 = op(src1[x], src2[x]);
-            T v1 = op(src1[x+1], src2[x+1]);
-            dst[x] = v0; dst[x+1] = v1;
-            v0 = op(src1[x+2], src2[x+2]);
-            v1 = op(src1[x+3], src2[x+3]);
-            dst[x+2] = v0; dst[x+3] = v1;
-        }
-#endif
-
-        for( ; x < width; x++ )
-            dst[x] = op(src1[x], src2[x]);
-    }
-}
-
-
-template<typename T, class Op, class Op64>
-void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2,
-              T* dst, size_t step, int width, int height)
-{
-#if CV_SSE2
-    Op64 op64;
-#endif
-    Op op;
-
-    for( ; height--; src1 = (const T *)((const uchar *)src1 + step1),
-                     src2 = (const T *)((const uchar *)src2 + step2),
-                     dst = (T *)((uchar *)dst + step) )
-    {
-        int x = 0;
-
-#if CV_AVX2
-        if( USE_AVX2 )
-        {
-            if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
-            {
-                for( ; x <= width - 4; x += 4 )
-                {
-                    typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
-                    r0 = op64(r0, VLoadStore256Aligned<T>::load(src2 + x));
-                    VLoadStore256Aligned<T>::store(dst + x, r0);
-                }
-            }
-        }
-#elif CV_SSE2
-        if( USE_SSE2 )
-        {
-            if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
-            {
-                for( ; x <= width - 4; x += 4 )
-                {
-                    typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x);
-                    typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 2);
-                    r0 = op64(r0, VLoadStore128Aligned<T>::load(src2 + x));
-                    r1 = op64(r1, VLoadStore128Aligned<T>::load(src2 + x + 2));
-                    VLoadStore128Aligned<T>::store(dst + x, r0);
-                    VLoadStore128Aligned<T>::store(dst + x + 2, r1);
-                }
-            }
-        }
-#endif
-
-        for( ; x <= width - 4; x += 4 )
-        {
-            T v0 = op(src1[x], src2[x]);
-            T v1 = op(src1[x+1], src2[x+1]);
-            dst[x] = v0; dst[x+1] = v1;
-            v0 = op(src1[x+2], src2[x+2]);
-            v1 = op(src1[x+3], src2[x+3]);
-            dst[x+2] = v0; dst[x+3] = v1;
-        }
-
-        for( ; x < width; x++ )
-            dst[x] = op(src1[x], src2[x]);
-    }
-}
-
-template<typename T> static void
-cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
-     uchar* dst, size_t step, int width, int height, int code)
-{
-    step1 /= sizeof(src1[0]);
-    step2 /= sizeof(src2[0]);
-    if( code == CMP_GE || code == CMP_LT )
-    {
-        std::swap(src1, src2);
-        std::swap(step1, step2);
-        code = code == CMP_GE ? CMP_LE : CMP_GT;
-    }
-
-    Cmp_SIMD<T> vop(code);
-
-    if( code == CMP_GT || code == CMP_LE )
-    {
-        int m = code == CMP_GT ? 0 : 255;
-        for( ; height--; src1 += step1, src2 += step2, dst += step )
-        {
-            int x = vop(src1, src2, dst, width);
-#if CV_ENABLE_UNROLLED
-            for( ; x <= width - 4; x += 4 )
-            {
-                int t0, t1;
-                t0 = -(src1[x] > src2[x]) ^ m;
-                t1 = -(src1[x+1] > src2[x+1]) ^ m;
-                dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
-                t0 = -(src1[x+2] > src2[x+2]) ^ m;
-                t1 = -(src1[x+3] > src2[x+3]) ^ m;
-                dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
-            }
-#endif
-            for( ; x < width; x++ )
-                dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
-        }
-    }
-    else if( code == CMP_EQ || code == CMP_NE )
-    {
-        int m = code == CMP_EQ ? 0 : 255;
-        for( ; height--; src1 += step1, src2 += step2, dst += step )
-        {
-            int x = 0;
-#if CV_ENABLE_UNROLLED
-            for( ; x <= width - 4; x += 4 )
-            {
-                int t0, t1;
-                t0 = -(src1[x] == src2[x]) ^ m;
-                t1 = -(src1[x+1] == src2[x+1]) ^ m;
-                dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
-                t0 = -(src1[x+2] == src2[x+2]) ^ m;
-                t1 = -(src1[x+3] == src2[x+3]) ^ m;
-                dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
-            }
-#endif
-            for( ; x < width; x++ )
-                dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
-        }
-    }
-}
-
-template<typename T, typename WT> static void
-mul_( const T* src1, size_t step1, const T* src2, size_t step2,
-      T* dst, size_t step, int width, int height, WT scale )
-{
-    step1 /= sizeof(src1[0]);
-    step2 /= sizeof(src2[0]);
-    step /= sizeof(dst[0]);
-
-    Mul_SIMD<T, WT> vop;
-
-    if( scale == (WT)1. )
-    {
-        for( ; height--; src1 += step1, src2 += step2, dst += step )
-        {
-            int i = vop(src1, src2, dst, width, scale);
-#if CV_ENABLE_UNROLLED
-            for(; i <= width - 4; i += 4 )
-            {
-                T t0;
-                T t1;
-                t0 = saturate_cast<T>(src1[i  ] * src2[i  ]);
-                t1 = saturate_cast<T>(src1[i+1] * src2[i+1]);
-                dst[i  ] = t0;
-                dst[i+1] = t1;
-
-                t0 = saturate_cast<T>(src1[i+2] * src2[i+2]);
-                t1 = saturate_cast<T>(src1[i+3] * src2[i+3]);
-                dst[i+2] = t0;
-                dst[i+3] = t1;
-            }
-#endif
-            for( ; i < width; i++ )
-                dst[i] = saturate_cast<T>(src1[i] * src2[i]);
-        }
-    }
-    else
-    {
-        for( ; height--; src1 += step1, src2 += step2, dst += step )
-        {
-            int i = vop(src1, src2, dst, width, scale);
-#if CV_ENABLE_UNROLLED
-            for(; i <= width - 4; i += 4 )
-            {
-                T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
-                T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]);
-                dst[i] = t0; dst[i+1] = t1;
-
-                t0 = saturate_cast<T>(scale*(WT)src1[i+2]*src2[i+2]);
-                t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]);
-                dst[i+2] = t0; dst[i+3] = t1;
-            }
-#endif
-            for( ; i < width; i++ )
-                dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
-        }
-    }
-}
-
-
-template<typename T> static void
-div_i( const T* src1, size_t step1, const T* src2, size_t step2,
-       T* dst, size_t step, int width, int height, double scale )
-{
-    step1 /= sizeof(src1[0]);
-    step2 /= sizeof(src2[0]);
-    step /= sizeof(dst[0]);
-
-    Div_SIMD<T> vop;
-    float scale_f = (float)scale;
-
-    for( ; height--; src1 += step1, src2 += step2, dst += step )
-    {
-        int i = vop(src1, src2, dst, width, scale);
-        for( ; i < width; i++ )
-        {
-            T num = src1[i], denom = src2[i];
-            dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
-        }
-    }
-}
-
-template<typename T> static void
-div_f( const T* src1, size_t step1, const T* src2, size_t step2,
-       T* dst, size_t step, int width, int height, double scale )
-{
-    T scale_f = (T)scale;
-    step1 /= sizeof(src1[0]);
-    step2 /= sizeof(src2[0]);
-    step /= sizeof(dst[0]);
-
-    Div_SIMD<T> vop;
-
-    for( ; height--; src1 += step1, src2 += step2, dst += step )
-    {
-        int i = vop(src1, src2, dst, width, scale);
-        for( ; i < width; i++ )
-        {
-            T num = src1[i], denom = src2[i];
-            dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
-        }
-    }
-}
-
-template<typename T> static void
-recip_i( const T* src2, size_t step2,
-         T* dst, size_t step, int width, int height, double scale )
-{
-    step2 /= sizeof(src2[0]);
-    step /= sizeof(dst[0]);
-
-    Recip_SIMD<T> vop;
-    float scale_f = (float)scale;
-
-    for( ; height--; src2 += step2, dst += step )
-    {
-        int i = vop(src2, dst, width, scale);
-        for( ; i < width; i++ )
-        {
-            T denom = src2[i];
-            dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
-        }
-    }
-}
-
-template<typename T> static void
-recip_f( const T* src2, size_t step2,
-         T* dst, size_t step, int width, int height, double scale )
-{
-    T scale_f = (T)scale;
-    step2 /= sizeof(src2[0]);
-    step /= sizeof(dst[0]);
-
-    Recip_SIMD<T> vop;
-
-    for( ; height--; src2 += step2, dst += step )
-    {
-        int i = vop(src2, dst, width, scale);
-        for( ; i < width; i++ )
-        {
-            T denom = src2[i];
-            dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
-        }
-    }
-}
-
-template<typename T, typename WT> static void
-addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
-              T* dst, size_t step, int width, int height, void* _scalars )
-{
-    const double* scalars = (const double*)_scalars;
-    WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2];
-    step1 /= sizeof(src1[0]);
-    step2 /= sizeof(src2[0]);
-    step /= sizeof(dst[0]);
-
-    AddWeighted_SIMD<T, WT> vop;
-
-    for( ; height--; src1 += step1, src2 += step2, dst += step )
-    {
-        int x = vop(src1, src2, dst, width, alpha, beta, gamma);
-#if CV_ENABLE_UNROLLED
-        for( ; x <= width - 4; x += 4 )
-        {
-            T t0 = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
-            T t1 = saturate_cast<T>(src1[x+1]*alpha + src2[x+1]*beta + gamma);
-            dst[x] = t0; dst[x+1] = t1;
-
-            t0 = saturate_cast<T>(src1[x+2]*alpha + src2[x+2]*beta + gamma);
-            t1 = saturate_cast<T>(src1[x+3]*alpha + src2[x+3]*beta + gamma);
-            dst[x+2] = t0; dst[x+3] = t1;
-        }
-#endif
-        for( ; x < width; x++ )
-            dst[x] = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
-    }
-}
-
-} // cv::
-
-
-#endif // __OPENCV_ARITHM_CORE_HPP__
modules/core/src/arithm_ipp.hpp (new file, 417 lines)
@ -0,0 +1,417 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html
|
||||
#if ARITHM_USE_IPP
|
||||
|
||||
namespace cv { namespace hal {
|
||||
|
||||
//=======================================
|
||||
// Arithmetic and logical operations
|
||||
// +, -, *, /, &, |, ^, ~, abs ...
|
||||
//=======================================
|
||||
|
||||
#define ARITHM_IPP_BIN(fun, ...) \
|
||||
do { \
|
||||
if (!CV_IPP_CHECK_COND) \
|
||||
return 0; \
|
||||
if (height == 1) \
|
||||
step1 = step2 = step = width * sizeof(dst[0]); \
|
||||
if (0 <= CV_INSTRUMENT_FUN_IPP(fun, __VA_ARGS__)) \
|
||||
{ \
|
||||
CV_IMPL_ADD(CV_IMPL_IPP); \
|
||||
return 1; \
|
||||
} \
|
||||
setIppErrorStatus(); \
|
||||
return 0; \
|
||||
} while(0)
|
||||
|
||||
//=======================================
|
||||
// Addition
|
||||
//=======================================
|
||||
|
||||
inline int arithm_ipp_add8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
|
||||
uchar* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_BIN(ippiAdd_8u_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0);
|
||||
}
|
||||
|
||||
inline int arithm_ipp_add16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
|
||||
ushort* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_BIN(ippiAdd_16u_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0);
|
||||
}
|
||||
|
||||
inline int arithm_ipp_add16s(const short* src1, size_t step1, const short* src2, size_t step2,
|
||||
short* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_BIN(ippiAdd_16s_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0);
|
||||
}
|
||||
|
||||
inline int arithm_ipp_add32f(const float* src1, size_t step1, const float* src2, size_t step2,
|
||||
float* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_BIN(ippiAdd_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
|
||||
}
|
||||
|
||||
#define arithm_ipp_add8s(...) 0
|
||||
#define arithm_ipp_add32s(...) 0
|
||||
#define arithm_ipp_add64f(...) 0
|
||||
|
||||
//=======================================
|
||||
// Subtract
|
||||
//=======================================
|
||||
|
||||
inline int arithm_ipp_sub8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
|
||||
uchar* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_BIN(ippiSub_8u_C1RSfs, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0);
|
||||
}
|
||||
|
||||
inline int arithm_ipp_sub16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
|
||||
ushort* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_BIN(ippiSub_16u_C1RSfs, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0);
|
||||
}
|
||||
|
||||
inline int arithm_ipp_sub16s(const short* src1, size_t step1, const short* src2, size_t step2,
|
||||
short* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_BIN(ippiSub_16s_C1RSfs, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0);
|
||||
}
|
||||
|
||||
inline int arithm_ipp_sub32f(const float* src1, size_t step1, const float* src2, size_t step2,
|
||||
float* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_BIN(ippiSub_32f_C1R, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height));
|
||||
}
|
||||
|
||||
#define arithm_ipp_sub8s(...) 0
|
||||
#define arithm_ipp_sub32s(...) 0
|
||||
#define arithm_ipp_sub64f(...) 0
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define ARITHM_IPP_MIN_MAX(fun, type) \
|
||||
do { \
|
||||
if (!CV_IPP_CHECK_COND) \
|
||||
return 0; \
|
||||
type* s1 = (type*)src1; \
|
||||
type* s2 = (type*)src2; \
|
||||
type* d = dst; \
|
||||
if (height == 1) \
|
||||
step1 = step2 = step = width * sizeof(dst[0]); \
|
||||
int i = 0; \
|
||||
for(; i < height; i++) \
|
||||
{ \
|
||||
if (0 > CV_INSTRUMENT_FUN_IPP(fun, s1, s2, d, width)) \
|
||||
break; \
|
||||
s1 = (type*)((uchar*)s1 + step1); \
|
||||
s2 = (type*)((uchar*)s2 + step2); \
|
||||
d = (type*)((uchar*)d + step); \
|
||||
} \
|
||||
if (i == height) \
|
||||
{ \
|
||||
CV_IMPL_ADD(CV_IMPL_IPP); \
|
||||
return 1; \
|
||||
} \
|
||||
setIppErrorStatus(); \
|
||||
return 0; \
|
||||
} while(0)
|
||||
|
||||
//=======================================
|
||||
// Max
|
||||
//=======================================
|
||||
|
||||
inline int arithm_ipp_max8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
|
||||
uchar* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_MIN_MAX(ippsMaxEvery_8u, uchar);
|
||||
}
|
||||
|
||||
inline int arithm_ipp_max16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
|
||||
ushort* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_MIN_MAX(ippsMaxEvery_16u, ushort);
|
||||
}
|
||||
|
||||
inline int arithm_ipp_max32f(const float* src1, size_t step1, const float* src2, size_t step2,
|
||||
float* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_MIN_MAX(ippsMaxEvery_32f, float);
|
||||
}
|
||||
|
||||
inline int arithm_ipp_max64f(const double* src1, size_t step1, const double* src2, size_t step2,
|
||||
double* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_MIN_MAX(ippsMaxEvery_64f, double);
|
||||
}
|
||||
|
||||
#define arithm_ipp_max8s(...) 0
|
||||
#define arithm_ipp_max16s(...) 0
|
||||
#define arithm_ipp_max32s(...) 0
|
||||
|
||||
//=======================================
|
||||
// Min
|
||||
//=======================================
|
||||
|
||||
inline int arithm_ipp_min8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
|
||||
uchar* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_MIN_MAX(ippsMinEvery_8u, uchar);
|
||||
}
|
||||
|
||||
inline int arithm_ipp_min16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
|
||||
ushort* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_MIN_MAX(ippsMinEvery_16u, ushort);
|
||||
}
|
||||
|
||||
inline int arithm_ipp_min32f(const float* src1, size_t step1, const float* src2,size_t step2,
|
||||
float* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_MIN_MAX(ippsMinEvery_32f, float);
|
||||
}
|
||||
|
||||
inline int arithm_ipp_min64f(const double* src1, size_t step1, const double* src2, size_t step2,
|
||||
double* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_MIN_MAX(ippsMinEvery_64f, double);
|
||||
}
|
||||
|
||||
#define arithm_ipp_min8s(...) 0
|
||||
#define arithm_ipp_min16s(...) 0
|
||||
#define arithm_ipp_min32s(...) 0
|
||||
|
||||
//=======================================
|
||||
// AbsDiff
|
||||
//=======================================
|
||||
|
||||
inline int arithm_ipp_absdiff8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
|
||||
uchar* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_BIN(ippiAbsDiff_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
|
||||
}
|
||||
|
||||
inline int arithm_ipp_absdiff16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
|
||||
ushort* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_BIN(ippiAbsDiff_16u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
|
||||
}
|
||||
|
||||
inline int arithm_ipp_absdiff32f(const float* src1, size_t step1, const float* src2, size_t step2,
|
||||
float* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_BIN(ippiAbsDiff_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
|
||||
}
|
||||
#define arithm_ipp_absdiff8s(...) 0
|
||||
#define arithm_ipp_absdiff16s(...) 0
|
||||
#define arithm_ipp_absdiff32s(...) 0
|
||||
#define arithm_ipp_absdiff64f(...) 0
|
||||
|
||||
//=======================================
|
||||
// Logical
|
||||
//=======================================
|
||||
|
||||
inline int arithm_ipp_and8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
|
||||
uchar* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_BIN(ippiAnd_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
|
||||
}
|
||||
|
||||
inline int arithm_ipp_or8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
|
||||
uchar* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_BIN(ippiOr_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
|
||||
}
|
||||
|
||||
inline int arithm_ipp_xor8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
|
||||
uchar* dst, size_t step, int width, int height)
|
||||
{
|
||||
ARITHM_IPP_BIN(ippiXor_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
|
||||
}
|
||||
|
||||
inline int arithm_ipp_not8u(const uchar* src1, size_t step1, uchar* dst, size_t step, int width, int height)
|
||||
{
|
||||
if (!CV_IPP_CHECK_COND)
|
||||
return 0;
|
||||
if (height == 1)
|
||||
step1 = step = width * sizeof(dst[0]);
|
||||
if (0 <= CV_INSTRUMENT_FUN_IPP(ippiNot_8u_C1R, src1, (int)step1, dst, (int)step, ippiSize(width, height)))
|
||||
{
|
||||
CV_IMPL_ADD(CV_IMPL_IPP);
|
||||
return 1;
|
||||
}
|
||||
setIppErrorStatus();
|
||||
return 0;
|
||||
}
|
||||
|
||||
//=======================================
|
||||
// Compare
|
||||
//=======================================
|
||||
|
||||
#define ARITHM_IPP_CMP(fun, ...) \
|
||||
do { \
|
||||
if (!CV_IPP_CHECK_COND) \
|
||||
return 0; \
|
||||
IppCmpOp op = arithm_ipp_convert_cmp(cmpop); \
|
||||
if (op < 0) \
|
||||
return 0; \
|
||||
if (height == 1) \
|
||||
step1 = step2 = step = width * sizeof(dst[0]); \
|
||||
if (0 <= CV_INSTRUMENT_FUN_IPP(fun, __VA_ARGS__, op)) \
|
||||
{ \
|
||||
CV_IMPL_ADD(CV_IMPL_IPP); \
|
||||
return 1; \
|
||||
} \
|
||||
setIppErrorStatus(); \
|
||||
return 0; \
|
||||
} while(0)
|
||||
|
||||
inline IppCmpOp arithm_ipp_convert_cmp(int cmpop)
|
||||
{
|
||||
switch(cmpop)
|
||||
{
|
||||
case CMP_EQ: return ippCmpEq;
|
||||
case CMP_GT: return ippCmpGreater;
|
||||
case CMP_GE: return ippCmpGreaterEq;
|
||||
case CMP_LT: return ippCmpLess;
|
||||
case CMP_LE: return ippCmpLessEq;
|
||||
default: return (IppCmpOp)-1;
|
||||
}
|
||||
}
|
||||
|
||||
inline int arithm_ipp_cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
|
||||
uchar* dst, size_t step, int width, int height, int cmpop)
|
||||
{
|
||||
ARITHM_IPP_CMP(ippiCompare_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
|
||||
}
|
||||
|
||||
inline int arithm_ipp_cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
|
||||
uchar* dst, size_t step, int width, int height, int cmpop)
|
||||
{
|
||||
ARITHM_IPP_CMP(ippiCompare_16u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
|
||||
}
|
||||
|
||||
inline int arithm_ipp_cmp16s(const short* src1, size_t step1, const short* src2, size_t step2,
|
||||
uchar* dst, size_t step, int width, int height, int cmpop)
|
||||
{
|
||||
ARITHM_IPP_CMP(ippiCompare_16s_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
|
||||
}
|
||||
|
||||
inline int arithm_ipp_cmp32f(const float* src1, size_t step1, const float* src2, size_t step2,
|
||||
uchar* dst, size_t step, int width, int height, int cmpop)
|
||||
{
|
||||
ARITHM_IPP_CMP(ippiCompare_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
|
||||
}
|
||||
|
||||
#define arithm_ipp_cmp8s(...) 0
|
||||
#define arithm_ipp_cmp32s(...) 0
|
||||
#define arithm_ipp_cmp64f(...) 0
|
||||
|
||||
//=======================================
// Multiply
//=======================================

#define ARITHM_IPP_MUL(fun, ...)                      \
do {                                                  \
    if (!CV_IPP_CHECK_COND)                           \
        return 0;                                     \
    float fscale = (float)scale;                      \
    /* IPP path handles scale == 1 only; scaled multiply falls back */ \
    if (std::fabs(fscale - 1) > FLT_EPSILON)          \
        return 0;                                     \
    if (0 <= CV_INSTRUMENT_FUN_IPP(fun, __VA_ARGS__)) \
    {                                                 \
        CV_IMPL_ADD(CV_IMPL_IPP);                     \
        return 1;                                     \
    }                                                 \
    setIppErrorStatus();                              \
    return 0;                                         \
} while(0)

inline int arithm_ipp_mul8u(const uchar *src1, size_t step1, const uchar *src2, size_t step2,
                            uchar *dst, size_t step, int width, int height, double scale)
{
    ARITHM_IPP_MUL(ippiMul_8u_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0);
}
inline int arithm_ipp_mul16u(const ushort *src1, size_t step1, const ushort *src2, size_t step2,
                             ushort *dst, size_t step, int width, int height, double scale)
{
    ARITHM_IPP_MUL(ippiMul_16u_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0);
}

inline int arithm_ipp_mul16s(const short *src1, size_t step1, const short *src2, size_t step2,
                             short *dst, size_t step, int width, int height, double scale)
{
    ARITHM_IPP_MUL(ippiMul_16s_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0);
}

inline int arithm_ipp_mul32f(const float *src1, size_t step1, const float *src2, size_t step2,
                             float *dst, size_t step, int width, int height, double scale)
{
    ARITHM_IPP_MUL(ippiMul_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}

#define arithm_ipp_mul8s(...)  0
#define arithm_ipp_mul32s(...) 0
#define arithm_ipp_mul64f(...) 0

//=======================================
// Div
//=======================================

#define arithm_ipp_div8u(...)  0
#define arithm_ipp_div8s(...)  0
#define arithm_ipp_div16u(...) 0
#define arithm_ipp_div16s(...) 0
#define arithm_ipp_div32s(...) 0
#define arithm_ipp_div32f(...) 0
#define arithm_ipp_div64f(...) 0

//=======================================
// AddWeighted
//=======================================

#define arithm_ipp_addWeighted8u(...)  0
#define arithm_ipp_addWeighted8s(...)  0
#define arithm_ipp_addWeighted16u(...) 0
#define arithm_ipp_addWeighted16s(...) 0
#define arithm_ipp_addWeighted32s(...) 0
#define arithm_ipp_addWeighted32f(...) 0
#define arithm_ipp_addWeighted64f(...) 0

//=======================================
// Reciprocal
//=======================================

#define arithm_ipp_recip8u(...)  0
#define arithm_ipp_recip8s(...)  0
#define arithm_ipp_recip16u(...) 0
#define arithm_ipp_recip16s(...) 0
#define arithm_ipp_recip32s(...) 0
#define arithm_ipp_recip32f(...) 0
#define arithm_ipp_recip64f(...) 0

/** empty block, in case you have "fun"
#define arithm_ipp_8u(...)  0
#define arithm_ipp_8s(...)  0
#define arithm_ipp_16u(...) 0
#define arithm_ipp_16s(...) 0
#define arithm_ipp_32s(...) 0
#define arithm_ipp_32f(...) 0
#define arithm_ipp_64f(...) 0
**/

}} // cv::hal::

#define ARITHM_CALL_IPP(fun, ...)      \
{                                      \
    if (__CV_EXPAND(fun(__VA_ARGS__))) \
        return;                        \
}

#endif // ARITHM_USE_IPP


#if !ARITHM_USE_IPP
#define ARITHM_CALL_IPP(...)
#endif
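
In other words, with IPP enabled ARITHM_CALL_IPP(fun, ...) expands to an
early-return guard, and with IPP disabled it expands to nothing, so the code
placed after the call site always serves as the fallback. A sketch of the two
expansions (illustrative only):

    // ARITHM_USE_IPP:
    //   { if (__CV_EXPAND(arithm_ipp_mul8u(args...))) return; }
    // !ARITHM_USE_IPP:
    //   (nothing -- the fallback below the call site always runs)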
File diff suppressed because it is too large
@ -86,7 +86,6 @@
#include "opencv2/core/sse_utils.hpp"
#include "opencv2/core/neon_utils.hpp"
#include "opencv2/core/vsx_utils.hpp"
#include "arithm_core.hpp"
#include "hal_replacement.hpp"

#ifdef HAVE_TEGRA_OPTIMIZATION
@ -110,6 +109,102 @@ extern const uchar g_Saturate8u[];
#define CV_MIN_8U(a,b)       ((a) - CV_FAST_CAST_8U((a) - (b)))
#define CV_MAX_8U(a,b)       ((a) + CV_FAST_CAST_8U((b) - (a)))

template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
{
    typedef T1 type1;
    typedef T2 type2;
    typedef T3 rtype;
    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a + b); }
};

template<typename T1, typename T2=T1, typename T3=T1> struct OpSub
{
    typedef T1 type1;
    typedef T2 type2;
    typedef T3 rtype;
    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a - b); }
};

template<typename T1, typename T2=T1, typename T3=T1> struct OpRSub
{
    typedef T1 type1;
    typedef T2 type2;
    typedef T3 rtype;
    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(b - a); }
};

template<typename T> struct OpMin
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator ()(const T a, const T b) const { return std::min(a, b); }
};

template<typename T> struct OpMax
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator ()(const T a, const T b) const { return std::max(a, b); }
};

template<typename T> struct OpAbsDiff
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()(T a, T b) const { return a > b ? a - b : b - a; }
};

// specializations to prevent "-0" results
template<> struct OpAbsDiff<float>
{
    typedef float type1;
    typedef float type2;
    typedef float rtype;
    float operator()(float a, float b) const { return std::abs(a - b); }
};
template<> struct OpAbsDiff<double>
{
    typedef double type1;
    typedef double type2;
    typedef double rtype;
    double operator()(double a, double b) const { return std::abs(a - b); }
};
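
A quick illustration of the "-0" case the specializations avoid (standalone
snippet, not from this patch): for a == +0.f and b == -0.f, the generic template
takes the b - a branch, because 0.f > -0.f is false, and so produces negative
zero.

    float a = +0.f, b = -0.f;
    float generic = a > b ? a - b : b - a; // -0.f (negative zero)
    float fixed   = std::abs(a - b);       // +0.f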

template<typename T> struct OpAnd
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T a, T b ) const { return a & b; }
};

template<typename T> struct OpOr
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T a, T b ) const { return a | b; }
};

template<typename T> struct OpXor
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T a, T b ) const { return a ^ b; }
};

template<typename T> struct OpNot
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T a, T ) const { return ~a; }
};

template<> inline uchar OpAdd<uchar>::operator ()(uchar a, uchar b) const
{ return CV_FAST_CAST_8U(a + b); }
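
The specialization above trades saturate_cast<> for a branch-free lookup through
the g_Saturate8u table declared earlier in this hunk. The exact macro definition
lives elsewhere in this file; a plausible shape, for illustration only:

    // hypothetical shape of the macro, not its verbatim definition:
    // #define CV_FAST_CAST_8U(t) g_Saturate8u[(t)+256]
    // e.g. 200 + 100 -> index 556 -> 255 (clamped); 10 + 20 -> index 286 -> 30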
@ -119,10 +119,14 @@ template <typename R> struct Data
            d[i] += (LaneType)m;
        return *this;
    }
    void fill(LaneType val, int s, int c = R::nlanes)
    {
        for (int i = s; i < c; ++i)
            d[i] = val;
    }
    void fill(LaneType val)
    {
        for (int i = 0; i < R::nlanes; ++i)
            d[i] = val;
        fill(val, 0);
    }
    void reverse()
    {
@ -739,6 +743,23 @@ template<typename R> struct TheTest
        return *this;
    }

    TheTest & test_absdiffs()
    {
        Data<R> dataA(std::numeric_limits<LaneType>::max()),
                dataB(std::numeric_limits<LaneType>::min());
        dataA[0] = (LaneType)-1;
        dataB[0] = 1;
        dataA[1] = 2;
        dataB[1] = (LaneType)-2;
        R a = dataA, b = dataB;
        Data<R> resC = v_absdiffs(a, b);
        for (int i = 0; i < R::nlanes; ++i)
        {
            EXPECT_EQ(saturate_cast<LaneType>(std::abs(dataA[i] - dataB[i])), resC[i]);
        }
        return *this;
    }
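
    // Worked expectation, e.g. for v_int8 (LaneType = schar): lane 0 gives
    // |(-1) - 1| = 2 and lane 1 gives |2 - (-2)| = 4, while the remaining lanes
    // give |127 - (-128)| = 255, which v_absdiffs saturates to 127 -- exactly
    // saturate_cast<schar>(std::abs(...)) in the EXPECT_EQ above.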

    TheTest & test_reduce()
    {
        Data<R> dataA;
@ -874,6 +895,81 @@ template<typename R> struct TheTest
        return *this;
    }

    // v_uint8 only
    TheTest & test_pack_b()
    {
        // 16-bit
        Data<R> dataA, dataB;
        dataB.fill(0, R::nlanes / 2);

        R a = dataA, b = dataB;
        Data<R> maskA = a == b, maskB = a != b;

        a = maskA; b = maskB;
        Data<R> res = v_pack_b(v_reinterpret_as_u16(a), v_reinterpret_as_u16(b));
        for (int i = 0; i < v_uint16::nlanes; ++i)
        {
            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(maskA[i * 2], res[i]);
            EXPECT_EQ(maskB[i * 2], res[i + v_uint16::nlanes]);
        }

        // 32-bit
        Data<R> dataC, dataD;
        dataD.fill(0, R::nlanes / 2);

        R c = dataC, d = dataD;
        Data<R> maskC = c == d, maskD = c != d;

        c = maskC; d = maskD;
        res = v_pack_b
        (
            v_reinterpret_as_u32(a), v_reinterpret_as_u32(b),
            v_reinterpret_as_u32(c), v_reinterpret_as_u32(d)
        );

        for (int i = 0; i < v_uint32::nlanes; ++i)
        {
            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(maskA[i * 4], res[i]);
            EXPECT_EQ(maskB[i * 4], res[i + v_uint32::nlanes]);
            EXPECT_EQ(maskC[i * 4], res[i + v_uint32::nlanes * 2]);
            EXPECT_EQ(maskD[i * 4], res[i + v_uint32::nlanes * 3]);
        }

        // 64-bit
        Data<R> dataE, dataF, dataG(0), dataH(0xFF);
        dataF.fill(0, R::nlanes / 2);

        R e = dataE, f = dataF, g = dataG, h = dataH;
        Data<R> maskE = e == f, maskF = e != f;

        e = maskE; f = maskF;
        res = v_pack_b
        (
            v_reinterpret_as_u64(a), v_reinterpret_as_u64(b),
            v_reinterpret_as_u64(c), v_reinterpret_as_u64(d),
            v_reinterpret_as_u64(e), v_reinterpret_as_u64(f),
            v_reinterpret_as_u64(g), v_reinterpret_as_u64(h)
        );

        for (int i = 0; i < v_uint64::nlanes; ++i)
        {
            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(maskA[i * 8], res[i]);
            EXPECT_EQ(maskB[i * 8], res[i + v_uint64::nlanes]);
            EXPECT_EQ(maskC[i * 8], res[i + v_uint64::nlanes * 2]);
            EXPECT_EQ(maskD[i * 8], res[i + v_uint64::nlanes * 3]);

            EXPECT_EQ(maskE[i * 8], res[i + v_uint64::nlanes * 4]);
            EXPECT_EQ(maskF[i * 8], res[i + v_uint64::nlanes * 5]);
            EXPECT_EQ(dataG[i * 8], res[i + v_uint64::nlanes * 6]);
            EXPECT_EQ(dataH[i * 8], res[i + v_uint64::nlanes * 7]);
        }

        return *this;
    }
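
    // Layout reference for v_pack_b, assuming 128-bit vectors (so
    // v_uint16::nlanes == 8): the two-argument form packs 16 u16 mask lanes
    // into 16 u8 lanes, a's lanes landing at res[0..7] and b's at res[8..15],
    // so res[i] must equal the low byte of a's i-th 16-bit lane -- hence the
    // maskA[i * 2] indexing above; the 4- and 8-argument forms extend the same
    // scheme to u32 (byte stride 4) and u64 (byte stride 8) inputs.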

    TheTest & test_unpack()
    {
        Data<R> dataA, dataB;
@ -1228,6 +1324,7 @@ void test_hal_intrin_uint8()
        .test_popcount()
        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
        .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
        .test_pack_b()
        .test_unpack()
        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
@ -1259,6 +1356,7 @@ void test_hal_intrin_int8()
        .test_logic()
        .test_min_max()
        .test_absdiff()
        .test_absdiffs()
        .test_abs()
        .test_mask()
        .test_popcount()
@ -1317,6 +1415,7 @@ void test_hal_intrin_int16()
        .test_logic()
        .test_min_max()
        .test_absdiff()
        .test_absdiffs()
        .test_abs()
        .test_reduce()
        .test_mask()