mirror of
https://github.com/opencv/opencv.git
synced 2024-11-24 19:20:28 +08:00
added v_reduce_sum4() universal intrinsic; corrected number of threads in cv::getNumThreads() in the case of GCD
This commit is contained in:
parent
a3189e36c0
commit
fbafc700ea
@ -907,6 +907,27 @@ template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_redu
|
||||
return c;
|
||||
}
|
||||
|
||||
/** @brief Sums all elements of each input vector, returns the vector of sums
|
||||
|
||||
Scheme:
|
||||
@code
|
||||
result[0] = a[0] + a[1] + a[2] + a[3]
|
||||
result[1] = b[0] + b[1] + b[2] + b[3]
|
||||
result[2] = c[0] + c[1] + c[2] + c[3]
|
||||
result[3] = d[0] + d[1] + d[2] + d[3]
|
||||
@endcode
|
||||
*/
|
||||
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
|
||||
const v_float32x4& c, const v_float32x4& d)
|
||||
{
|
||||
v_float32x4 r;
|
||||
r.s[0] = a.s[0] + a.s[1] + a.s[2] + a.s[3];
|
||||
r.s[1] = b.s[0] + b.s[1] + b.s[2] + b.s[3];
|
||||
r.s[2] = c.s[0] + c.s[1] + c.s[2] + c.s[3];
|
||||
r.s[3] = d.s[0] + d.s[1] + d.s[2] + d.s[3];
|
||||
return r;
|
||||
}
|
||||
|
||||
/** @brief Get negative values mask
|
||||
|
||||
Returned value is a bit mask in which bit i is set to 1 when the packed value at index i is negative.
|
||||
|
@ -815,6 +815,21 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
|
||||
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
|
||||
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
|
||||
|
||||
/** Sums the four lanes of each input vector and returns the sums packed as
    { sum(a), sum(b), sum(c), sum(d) }. */
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    // Transpose pairs so that neighbouring lanes of one input land in the same
    // lane of two different registers, then add those registers.
    const float32x4x2_t ab_t = vtrnq_f32(a.val, b.val);
    const float32x4x2_t cd_t = vtrnq_f32(c.val, d.val);

    const float32x4_t ab_pairs = vaddq_f32(ab_t.val[0], ab_t.val[1]); // a0+a1 b0+b1 a2+a3 b2+b3
    const float32x4_t cd_pairs = vaddq_f32(cd_t.val[0], cd_t.val[1]); // c0+c1 d0+d1 c2+c3 d2+d3

    // Fold the upper half of each register onto its lower half.
    const float32x2_t ab_sum = vadd_f32(vget_low_f32(ab_pairs), vget_high_f32(ab_pairs)); // sum(a) sum(b)
    const float32x2_t cd_sum = vadd_f32(vget_low_f32(cd_pairs), vget_high_f32(cd_pairs)); // sum(c) sum(d)

    return v_float32x4(vcombine_f32(ab_sum, cd_sum));
}
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
|
||||
inline v_uint32x4 v_popcount(const _Tpvec& a) \
|
||||
{ \
|
||||
|
@ -1126,6 +1126,14 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV
|
||||
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
|
||||
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
|
||||
|
||||
/** Sums the four lanes of each input vector and returns the sums packed as
    { sum(a), sum(b), sum(c), sum(d) }.

    _mm_hadd_ps is an SSE3 instruction, so it is used only when CV_SSE3 is set.
    Otherwise the same reduction is built from SSE unpack/add operations, which
    keeps the function usable on a plain SSE2 target.
    NOTE(review): CV_SSE3 is assumed to be the project's SSE3 feature macro; if it
    is not defined, #if treats it as 0 and the portable path is selected. */
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
#if CV_SSE3
    __m128 ab = _mm_hadd_ps(a.val, b.val); // a0+a1 a2+a3 b0+b1 b2+b3
    __m128 cd = _mm_hadd_ps(c.val, d.val); // c0+c1 c2+c3 d0+d1 d2+d3
    return v_float32x4(_mm_hadd_ps(ab, cd));
#else
    // Interleave a with c and b with d, then add low and high interleavings:
    __m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val),
                           _mm_unpackhi_ps(a.val, c.val)); // a0+a2 c0+c2 a1+a3 c1+c3
    __m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val),
                           _mm_unpackhi_ps(b.val, d.val)); // b0+b2 d0+d2 b1+b3 d1+d3
    // A second interleave/add finishes the reduction and restores a, b, c, d order.
    return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd),
                                  _mm_unpackhi_ps(ac, bd)));
#endif
}
|
||||
|
||||
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
|
||||
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
|
||||
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
|
||||
|
@ -425,7 +425,7 @@ int cv::getNumThreads(void)
|
||||
|
||||
#elif defined HAVE_GCD
|
||||
|
||||
return 512; // the GCD thread pool limit
|
||||
return cv::getNumberOfCPUs(); // the GCD thread pool limit
|
||||
|
||||
#elif defined WINRT
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user