Merge pull request #14916 from terfendail:wsignmask_deprecated

* Avoid using v_signmask universal intrinsic and mark it as deprecated

* Renamed v_find_negative to v_scan_forward
Vitaly Tuzov 2019-07-01 19:53:51 +03:00 committed by Alexander Alekhin
parent 3e4a195b61
commit 9befb7a1d7
13 changed files with 305 additions and 336 deletions
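In a nutshell, call sites that previously combined v_signmask with cv::trailingZeros32 migrate to v_check_any and the new v_scan_forward. A minimal sketch of the two patterns (illustrative helpers, not code from this commit; v_scan_forward returns the lane index directly, so callers no longer have to account for how many mask bits each lane contributes on a given ISA):

#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

// "is any lane set?" -- formerly if (v_signmask(mask)) ...
inline bool any_lane_set(const v_int32x4& mask)
{
    return v_check_any(mask);
}

// "index of the first set lane" -- formerly trailingZeros32(v_signmask(...)),
// scaled by the per-lane bit count of the target ISA
inline int first_set_lane(const v_int32x4& mask)
{
    return v_scan_forward(mask);
}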

View File

@ -534,12 +534,12 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
v_expand(sad8, sad4_l, sad4_h); v_expand(sad8, sad4_l, sad4_h);
mask4 = thresh4 > sad4_l; mask4 = thresh4 > sad4_l;
mask4 = mask4 & ((d1 > d4) | (d4 > d2)); mask4 = mask4 & ((d1 > d4) | (d4 > d2));
if( v_signmask(mask4) ) if( v_check_any(mask4) )
break; break;
d4 += dd_4; d4 += dd_4;
mask4 = thresh4 > sad4_h; mask4 = thresh4 > sad4_h;
mask4 = mask4 & ((d1 > d4) | (d4 > d2)); mask4 = mask4 & ((d1 > d4) | (d4 > d2));
if( v_signmask(mask4) ) if( v_check_any(mask4) )
break; break;
d4 += dd_4; d4 += dd_4;
} }

View File

@ -2013,14 +2013,14 @@ void SGBM3WayMainLoop::operator () (const Range& range) const
mask = cost1 < thresh_reg; mask = cost1 < thresh_reg;
mask = mask & ( (cur_d<d1) | (cur_d>d2) ); mask = mask & ( (cur_d<d1) | (cur_d>d2) );
if( v_signmask(mask) ) if( v_check_any(mask) )
break; break;
cur_d = cur_d+eight_reg; cur_d = cur_d+eight_reg;
mask = cost2 < thresh_reg; mask = cost2 < thresh_reg;
mask = mask & ( (cur_d<d1) | (cur_d>d2) ); mask = mask & ( (cur_d<d1) | (cur_d>d2) );
if( v_signmask(mask) ) if( v_check_any(mask) )
break; break;
cur_d = cur_d+eight_reg; cur_d = cur_d+eight_reg;

View File

@ -55,6 +55,34 @@
#define OPENCV_HAL_NOP(a) (a) #define OPENCV_HAL_NOP(a) (a)
#define OPENCV_HAL_1ST(a, b) (a) #define OPENCV_HAL_1ST(a, b) (a)
namespace {
inline unsigned int trailingZeros32(unsigned int value) {
#if defined(_MSC_VER)
#if (_MSC_VER < 1700) || defined(_M_ARM)
unsigned long index = 0;
_BitScanForward(&index, value);
return (unsigned int)index;
#elif defined(__clang__)
// clang-cl doesn't export _tzcnt_u32 for non BMI systems
return value ? __builtin_ctz(value) : 32;
#else
return _tzcnt_u32(value);
#endif
#elif defined(__GNUC__) || defined(__GNUG__)
return __builtin_ctz(value);
#elif defined(__ICC) || defined(__INTEL_COMPILER)
return _bit_scan_forward(value);
#elif defined(__clang__)
return llvm.cttz.i32(value, true);
#else
static const int MultiplyDeBruijnBitPosition[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 };
return MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27];
#endif
}
}
// unlike HAL API, which is in cv::hal, // unlike HAL API, which is in cv::hal,
// we put intrinsics into cv namespace to make its // we put intrinsics into cv namespace to make its
// access from within opencv code more accessible // access from within opencv code more accessible
@ -419,32 +447,6 @@ namespace CV__SIMD_NAMESPACE {
using namespace CV__SIMD_NAMESPACE; using namespace CV__SIMD_NAMESPACE;
#endif #endif
inline unsigned int trailingZeros32(unsigned int value) {
#if defined(_MSC_VER)
#if (_MSC_VER < 1700) || defined(_M_ARM)
unsigned long index = 0;
_BitScanForward(&index, value);
return (unsigned int)index;
#elif defined(__clang__)
// clang-cl doesn't export _tzcnt_u32 for non BMI systems
return value ? __builtin_ctz(value) : 32;
#else
return _tzcnt_u32(value);
#endif
#elif defined(__GNUC__) || defined(__GNUG__)
return __builtin_ctz(value);
#elif defined(__ICC) || defined(__INTEL_COMPILER)
return _bit_scan_forward(value);
#elif defined(__clang__)
return llvm.cttz.i32(value, true);
#else
static const int MultiplyDeBruijnBitPosition[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 };
return MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27];
#endif
}
#ifndef CV_DOXYGEN #ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
#endif #endif

View File

@ -1244,6 +1244,17 @@ inline int v_signmask(const v_float32x8& a)
inline int v_signmask(const v_float64x4& a) inline int v_signmask(const v_float64x4& a)
{ return _mm256_movemask_pd(a.val); } { return _mm256_movemask_pd(a.val); }
inline int v_scan_forward(const v_int8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
inline int v_scan_forward(const v_uint8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
inline int v_scan_forward(const v_int16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
inline int v_scan_forward(const v_uint16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
inline int v_scan_forward(const v_int32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_uint32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_float32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_int64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
inline int v_scan_forward(const v_uint64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
/** Checks **/ /** Checks **/
#define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, and_op, allmask) \ #define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, and_op, allmask) \
inline bool v_check_all(const _Tpvec& a) \ inline bool v_check_all(const _Tpvec& a) \

View File

@ -2719,7 +2719,7 @@ OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float64x8, double, f64, v_uint64x8
////////// Mask and checks ///////// ////////// Mask and checks /////////
/** Mask **/ /** Mask **/
inline int64 v_signmask(const v_int8x64& a) { return (int64)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } inline int64 v_signmask(const v_int8x64& a) { return (int64)_mm512_movepi8_mask(a.val); }
inline int v_signmask(const v_int16x32& a) { return (int)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } inline int v_signmask(const v_int16x32& a) { return (int)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
inline int v_signmask(const v_int32x16& a) { return (int)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } inline int v_signmask(const v_int32x16& a) { return (int)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
inline int v_signmask(const v_int64x8& a) { return (int)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } inline int v_signmask(const v_int64x8& a) { return (int)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
@ -2733,7 +2733,7 @@ inline int v_signmask(const v_float64x8& a) { return v_signmask(v_reinterpret_as
/** Checks **/ /** Checks **/
inline bool v_check_all(const v_int8x64& a) { return !(bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } inline bool v_check_all(const v_int8x64& a) { return !(bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_movepi8_mask(a.val); }
inline bool v_check_all(const v_int16x32& a) { return !(bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } inline bool v_check_all(const v_int16x32& a) { return !(bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
inline bool v_check_any(const v_int16x32& a) { return (bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } inline bool v_check_any(const v_int16x32& a) { return (bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
inline bool v_check_all(const v_int32x16& a) { return !(bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } inline bool v_check_all(const v_int32x16& a) { return !(bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
@ -2754,6 +2754,22 @@ inline bool v_check_any(const v_uint16x32& a) { return v_check_any(v_reinterpret
inline bool v_check_any(const v_uint32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); } inline bool v_check_any(const v_uint32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_uint64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); } inline bool v_check_any(const v_uint64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); }
inline int v_scan_forward(const v_int8x64& a)
{
int64 mask = _mm512_movepi8_mask(a.val);
int mask32 = (int)mask;
return mask != 0 ? mask32 != 0 ? trailingZeros32(mask32) : 32 + trailingZeros32((int)(mask >> 32)) : 0;
}
inline int v_scan_forward(const v_uint8x64& a) { return v_scan_forward(v_reinterpret_as_s8(a)); }
inline int v_scan_forward(const v_int16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); }
inline int v_scan_forward(const v_uint16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); }
inline int v_scan_forward(const v_int32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
inline int v_scan_forward(const v_uint32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
inline int v_scan_forward(const v_float32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
inline int v_scan_forward(const v_int64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
inline int v_scan_forward(const v_uint64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
inline void v512_cleanup() { _mm256_zeroall(); } inline void v512_cleanup() { _mm256_zeroall(); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

View File

@ -1072,6 +1072,7 @@ template<typename _Tp, int n> inline typename V_TypeTraits< typename V_TypeTrait
} }
/** @brief Get negative values mask /** @brief Get negative values mask
@deprecated v_signmask depends heavily on the lane count and therefore isn't universal enough
Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes. Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
Example: Example:
@ -1088,6 +1089,23 @@ template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
return mask; return mask;
} }
/** @brief Get first negative lane index
Returned value is the index of the first negative lane (undefined for an input with no negative lanes)
Example:
@code{.cpp}
v_int32x4 r; // set to {0, 0, -1, -1}
int idx = v_scan_forward(r); // idx = 2
@endcode
*/
template <typename _Tp, int n> inline int v_scan_forward(const v_reg<_Tp, n>& a)
{
for (int i = 0; i < n; i++)
if(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0)
return i;
return 0;
}
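A typical use of the new intrinsic, sketching the pattern the contour and Hough call sites below are rewritten to (the buffer name and bounds are illustrative, not taken from this commit):

inline int find_first_nonzero(const uchar* data, int len)
{
    v_uint8x16 v_zero = v_setzero_u8();
    int j = 0;
    for (; j <= len - v_uint8x16::nlanes; j += v_uint8x16::nlanes)
    {
        v_uint8x16 vmask = (v_load(data + j) != v_zero);
        if (v_check_any(vmask))
            return j + v_scan_forward(vmask); // lane index of the first non-zero byte
    }
    for (; j < len; ++j) // scalar tail
        if (data[j])
            return j;
    return len; // nothing found
}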
/** @brief Check if all packed values are less than zero /** @brief Check if all packed values are less than zero
Unsigned values will be casted to signed: `uchar 254 => char -2`. Unsigned values will be casted to signed: `uchar 254 => char -2`.

View File

@ -1096,17 +1096,32 @@ inline int v_signmask(const v_int32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); } { return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_float32x4& a) inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); } { return v_signmask(v_reinterpret_as_u32(a)); }
#if CV_SIMD128_64F
inline int v_signmask(const v_uint64x2& a) inline int v_signmask(const v_uint64x2& a)
{ {
int64x1_t m0 = vdup_n_s64(0); int64x1_t m0 = vdup_n_s64(0);
uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0)); uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1); return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
} }
inline int v_signmask(const v_int64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
#if CV_SIMD128_64F
inline int v_signmask(const v_float64x2& a) inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); } { return v_signmask(v_reinterpret_as_u64(a)); }
#endif #endif
inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
#if CV_SIMD128_64F
inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
#endif
#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \ #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
inline bool v_check_all(const v_##_Tpvec& a) \ inline bool v_check_all(const v_##_Tpvec& a) \
{ \ { \

View File

@ -1617,6 +1617,17 @@ OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND,
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15) OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3) OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3)
inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
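The divisions by 2, 4, and 8 above convert a byte-granularity sign mask into a lane index: v_signmask on the s8 reinterpretation yields one bit per byte, so for a comparison mask (every lane all-ones or all-zeros) the lowest set bit sits at byte position lane * sizeof(lane), and integer division by the lane size recovers the lane. A hypothetical scalar model of that computation (assumed helper, not part of the header):

#include <cstdint>

inline int scan_forward_model(uint32_t byte_sign_mask, int lane_size_bytes)
{
    int tz = 0; // portable stand-in for trailingZeros32
    while (tz < 32 && !(byte_sign_mask & (1u << tz)))
        ++tz;
    return tz / lane_size_bytes; // byte index -> lane index
}

// Example: a v_int32x4 mask {0, 0, -1, -1} reinterpreted as s8 gives byte mask
// 0xFF00 (bits 8..15 set), tz = 8, 8 / 4 = 2 -> first negative lane is lane 2.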
#if CV_SSE4_1 #if CV_SSE4_1
#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) \ #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \

View File

@ -891,6 +891,17 @@ inline int v_signmask(const v_uint64x2& a)
inline int v_signmask(const v_float64x2& a) inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_s64(a)); } { return v_signmask(v_reinterpret_as_s64(a)); }
inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
template<typename _Tpvec> template<typename _Tpvec>
inline bool v_check_all(const _Tpvec& a) inline bool v_check_all(const _Tpvec& a)
{ return vec_all_lt(a.val, _Tpvec().val); } { return vec_all_lt(a.val, _Tpvec().val); }

View File

@ -132,10 +132,9 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
m1 = m1 | ((x3 < v1) & (x0 < v1)); m1 = m1 | ((x3 < v1) & (x0 < v1));
m0 = m0 | m1; m0 = m0 | m1;
int mask = v_signmask(m0); if( !v_check_any(m0) )
if( mask == 0 )
continue; continue;
if( (mask & 255) == 0 ) if( !v_check_any(v_combine_low(m0, m0)) )
{ {
j -= 8; j -= 8;
ptr -= 8; ptr -= 8;
@ -159,16 +158,36 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
max1 = v_max(max1, v_reinterpret_as_u8(c1)); max1 = v_max(max1, v_reinterpret_as_u8(c1));
} }
max0 = v_max(max0, max1); max0 = K16 < v_max(max0, max1);
int m = v_signmask(K16 < max0); int m = -v_reduce_sum(v_reinterpret_as_s8(max0));
uchar mflag[16];
v_store(mflag, max0);
for( k = 0; m > 0 && k < 16; k++, m >>= 1 ) for( k = 0; m > 0 && k < 16; k++ )
{ {
if(m & 1) if(mflag[k])
{ {
--m;
cornerpos[ncorners++] = j+k; cornerpos[ncorners++] = j+k;
if(nonmax_suppression) if(nonmax_suppression)
curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold); {
short d[25];
for (int _k = 0; _k < 25; _k++)
d[_k] = (short)(ptr[k] - ptr[k + pixel[_k]]);
v_int16x8 a0, b0, a1, b1;
a0 = b0 = a1 = b1 = v_load(d + 8);
for(int shift = 0; shift < 8; ++shift)
{
v_int16x8 v_nms = v_load(d + shift);
a0 = v_min(a0, v_nms);
b0 = v_max(b0, v_nms);
v_nms = v_load(d + 9 + shift);
a1 = v_min(a1, v_nms);
b1 = v_max(b1, v_nms);
}
curr[j + k] = (uchar)(v_reduce_max(v_max(v_max(a0, a1), v_setzero_s16() - v_min(b0, b1))) - 1);
}
} }
} }
} }

View File

@ -47,10 +47,6 @@
#include "opencv2/core/openvx/ovx_defs.hpp" #include "opencv2/core/openvx/ovx_defs.hpp"
#if CV_SIMD128
#define CV_MALLOC_SIMD128 16
#endif
namespace cv namespace cv
{ {
@ -296,18 +292,11 @@ static bool ocl_Canny(InputArray _src, const UMat& dx_, const UMat& dy_, OutputA
#define CANNY_PUSH(map, stack) *map = 2, stack.push_back(map) #define CANNY_PUSH(map, stack) *map = 2, stack.push_back(map)
#define CANNY_CHECK_SIMD(m, high, map, stack) \
if (m > high) \
CANNY_PUSH(map, stack); \
else \
*map = 0
#define CANNY_CHECK(m, high, map, stack) \ #define CANNY_CHECK(m, high, map, stack) \
if (m > high) \ if (m > high) \
CANNY_PUSH(map, stack); \ CANNY_PUSH(map, stack); \
else \ else \
*map = 0; \ *map = 0
continue
class parallelCanny : public ParallelLoopBody class parallelCanny : public ParallelLoopBody
{ {
@ -317,9 +306,14 @@ public:
src(_src), src2(_src), map(_map), _borderPeaksParallel(borderPeaksParallel), src(_src), src2(_src), map(_map), _borderPeaksParallel(borderPeaksParallel),
low(_low), high(_high), aperture_size(_aperture_size), L2gradient(_L2gradient) low(_low), high(_high), aperture_size(_aperture_size), L2gradient(_L2gradient)
{ {
#if CV_SIMD128 #if CV_SIMD
for(int i = 0; i < v_int8::nlanes; ++i)
{
smask[i] = 0;
smask[i + v_int8::nlanes] = (schar)-1;
}
if (true) if (true)
_map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_MALLOC_SIMD128 + 1), CV_MALLOC_SIMD128), CV_8UC1); _map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_SIMD_WIDTH + 1), CV_SIMD_WIDTH), CV_8UC1);
else else
#endif #endif
_map.create(src.rows + 2, src.cols + 2, CV_8UC1); _map.create(src.rows + 2, src.cols + 2, CV_8UC1);
@ -336,9 +330,14 @@ public:
src(_dx), src2(_dy), map(_map), _borderPeaksParallel(borderPeaksParallel), src(_dx), src2(_dy), map(_map), _borderPeaksParallel(borderPeaksParallel),
low(_low), high(_high), aperture_size(0), L2gradient(_L2gradient) low(_low), high(_high), aperture_size(0), L2gradient(_L2gradient)
{ {
#if CV_SIMD128 #if CV_SIMD
for(int i = 0; i < v_int8::nlanes; ++i)
{
smask[i] = 0;
smask[i + v_int8::nlanes] = (schar)-1;
}
if (true) if (true)
_map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_MALLOC_SIMD128 + 1), CV_MALLOC_SIMD128), CV_8UC1); _map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_SIMD_WIDTH + 1), CV_SIMD_WIDTH), CV_8UC1);
else else
#endif #endif
_map.create(src.rows + 2, src.cols + 2, CV_8UC1); _map.create(src.rows + 2, src.cols + 2, CV_8UC1);
@ -397,11 +396,11 @@ public:
} }
// _mag_p: previous row, _mag_a: actual row, _mag_n: next row // _mag_p: previous row, _mag_a: actual row, _mag_n: next row
#if CV_SIMD128 #if CV_SIMD
AutoBuffer<int> buffer(3 * (mapstep * cn + CV_MALLOC_SIMD128)); AutoBuffer<int> buffer(3 * (mapstep * cn + CV_SIMD_WIDTH));
_mag_p = alignPtr(buffer.data() + 1, CV_MALLOC_SIMD128); _mag_p = alignPtr(buffer.data() + 1, CV_SIMD_WIDTH);
_mag_a = alignPtr(_mag_p + mapstep * cn, CV_MALLOC_SIMD128); _mag_a = alignPtr(_mag_p + mapstep * cn, CV_SIMD_WIDTH);
_mag_n = alignPtr(_mag_a + mapstep * cn, CV_MALLOC_SIMD128); _mag_n = alignPtr(_mag_a + mapstep * cn, CV_SIMD_WIDTH);
#else #else
AutoBuffer<int> buffer(3 * (mapstep * cn)); AutoBuffer<int> buffer(3 * (mapstep * cn));
_mag_p = buffer.data() + 1; _mag_p = buffer.data() + 1;
@ -437,21 +436,19 @@ public:
if (L2gradient) if (L2gradient)
{ {
int j = 0, width = src.cols * cn; int j = 0, width = src.cols * cn;
#if CV_SIMD128 #if CV_SIMD
for ( ; j <= width - v_int16::nlanes; j += v_int16::nlanes)
{ {
for ( ; j <= width - 8; j += 8) v_int16 v_dx = vx_load((const short*)(_dx + j));
{ v_int16 v_dy = vx_load((const short*)(_dy + j));
v_int16x8 v_dx = v_load((const short*)(_dx + j));
v_int16x8 v_dy = v_load((const short*)(_dy + j));
v_int32x4 v_dxp_low, v_dxp_high; v_int32 v_dxp_low, v_dxp_high;
v_int32x4 v_dyp_low, v_dyp_high; v_int32 v_dyp_low, v_dyp_high;
v_expand(v_dx, v_dxp_low, v_dxp_high); v_expand(v_dx, v_dxp_low, v_dxp_high);
v_expand(v_dy, v_dyp_low, v_dyp_high); v_expand(v_dy, v_dyp_low, v_dyp_high);
v_store_aligned((int *)(_mag_n + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low); v_store_aligned((int *)(_mag_n + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low);
v_store_aligned((int *)(_mag_n + j + 4), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high); v_store_aligned((int *)(_mag_n + j + v_int32::nlanes), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high);
}
} }
#endif #endif
for ( ; j < width; ++j) for ( ; j < width; ++j)
@ -460,23 +457,21 @@ public:
else else
{ {
int j = 0, width = src.cols * cn; int j = 0, width = src.cols * cn;
#if CV_SIMD128 #if CV_SIMD
for(; j <= width - v_int16::nlanes; j += v_int16::nlanes)
{ {
for(; j <= width - 8; j += 8) v_int16 v_dx = vx_load((const short *)(_dx + j));
{ v_int16 v_dy = vx_load((const short *)(_dy + j));
v_int16x8 v_dx = v_load((const short *)(_dx + j));
v_int16x8 v_dy = v_load((const short *)(_dy + j));
v_dx = v_reinterpret_as_s16(v_abs(v_dx)); v_dx = v_reinterpret_as_s16(v_abs(v_dx));
v_dy = v_reinterpret_as_s16(v_abs(v_dy)); v_dy = v_reinterpret_as_s16(v_abs(v_dy));
v_int32x4 v_dx_ml, v_dy_ml, v_dx_mh, v_dy_mh; v_int32 v_dx_ml, v_dy_ml, v_dx_mh, v_dy_mh;
v_expand(v_dx, v_dx_ml, v_dx_mh); v_expand(v_dx, v_dx_ml, v_dx_mh);
v_expand(v_dy, v_dy_ml, v_dy_mh); v_expand(v_dy, v_dy_ml, v_dy_mh);
v_store_aligned((int *)(_mag_n + j), v_dx_ml + v_dy_ml); v_store_aligned((int *)(_mag_n + j), v_dx_ml + v_dy_ml);
v_store_aligned((int *)(_mag_n + j + 4), v_dx_mh + v_dy_mh); v_store_aligned((int *)(_mag_n + j + v_int32::nlanes), v_dx_mh + v_dy_mh);
}
} }
#endif #endif
for ( ; j < width; ++j) for ( ; j < width; ++j)
@ -520,9 +515,9 @@ public:
// From here actual src row is (i - 1) // From here actual src row is (i - 1)
// Set left and right border to 1 // Set left and right border to 1
#if CV_SIMD128 #if CV_SIMD
if (true) if (true)
_pmap = map.ptr<uchar>(i) + CV_MALLOC_SIMD128; _pmap = map.ptr<uchar>(i) + CV_SIMD_WIDTH;
else else
#endif #endif
_pmap = map.ptr<uchar>(i) + 1; _pmap = map.ptr<uchar>(i) + 1;
@ -542,167 +537,60 @@ public:
const int TG22 = 13573; const int TG22 = 13573;
int j = 0; int j = 0;
#if CV_SIMD128 #if CV_SIMD
{ {
const v_int32x4 v_low = v_setall_s32(low); const v_int32 v_low = vx_setall_s32(low);
const v_int8x16 v_one = v_setall_s8(1); const v_int8 v_one = vx_setall_s8(1);
for (; j <= src.cols - 32; j += 32) for (; j <= src.cols - v_int8::nlanes; j += v_int8::nlanes)
{ {
v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j));
v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4));
v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8));
v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12));
v_int32x4 v_cmp1 = v_m1 > v_low;
v_int32x4 v_cmp2 = v_m2 > v_low;
v_int32x4 v_cmp3 = v_m3 > v_low;
v_int32x4 v_cmp4 = v_m4 > v_low;
v_m1 = v_load_aligned((const int*)(_mag_a + j + 16));
v_m2 = v_load_aligned((const int*)(_mag_a + j + 20));
v_m3 = v_load_aligned((const int*)(_mag_a + j + 24));
v_m4 = v_load_aligned((const int*)(_mag_a + j + 28));
v_store_aligned((signed char*)(_pmap + j), v_one); v_store_aligned((signed char*)(_pmap + j), v_one);
v_store_aligned((signed char*)(_pmap + j + 16), v_one); v_int8 v_cmp = v_pack(v_pack(vx_load_aligned((const int*)(_mag_a + j )) > v_low,
vx_load_aligned((const int*)(_mag_a + j + v_int32::nlanes)) > v_low),
v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2); v_pack(vx_load_aligned((const int*)(_mag_a + j + 2*v_int32::nlanes)) > v_low,
v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4); vx_load_aligned((const int*)(_mag_a + j + 3*v_int32::nlanes)) > v_low));
while (v_check_any(v_cmp))
v_cmp1 = v_m1 > v_low;
v_cmp2 = v_m2 > v_low;
v_cmp3 = v_m3 > v_low;
v_cmp4 = v_m4 > v_low;
v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);
v_cmp80 = v_pack(v_cmp1, v_cmp2);
v_cmp81 = v_pack(v_cmp3, v_cmp4);
unsigned int mask = v_signmask(v_cmp);
v_cmp = v_pack(v_cmp80, v_cmp81);
mask |= v_signmask(v_cmp) << 16;
if (mask)
{ {
int k = j; int l = v_scan_forward(v_cmp);
v_cmp &= vx_load(smask + v_int8::nlanes - 1 - l);
int k = j + l;
do int m = _mag_a[k];
short xs = _dx[k];
short ys = _dy[k];
int x = (int)std::abs(xs);
int y = (int)std::abs(ys) << 15;
int tg22x = x * TG22;
if (y < tg22x)
{ {
int l = trailingZeros32(mask); if (m > _mag_a[k - 1] && m >= _mag_a[k + 1])
k += l;
mask >>= l;
int m = _mag_a[k];
short xs = _dx[k];
short ys = _dy[k];
int x = (int)std::abs(xs);
int y = (int)std::abs(ys) << 15;
int tg22x = x * TG22;
if (y < tg22x)
{ {
if (m > _mag_a[k - 1] && m >= _mag_a[k + 1]) CANNY_CHECK(m, high, (_pmap+k), stack);
}
}
else
{
int tg67x = tg22x + (x << 16);
if (y > tg67x)
{
if (m > _mag_p[k] && m >= _mag_n[k])
{ {
CANNY_CHECK_SIMD(m, high, (_pmap+k), stack); CANNY_CHECK(m, high, (_pmap+k), stack);
} }
} }
else else
{ {
int tg67x = tg22x + (x << 16); int s = (xs ^ ys) < 0 ? -1 : 1;
if (y > tg67x) if(m > _mag_p[k - s] && m > _mag_n[k + s])
{ {
if (m > _mag_p[k] && m >= _mag_n[k]) CANNY_CHECK(m, high, (_pmap+k), stack);
{
CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
}
}
else
{
int s = (xs ^ ys) < 0 ? -1 : 1;
if(m > _mag_p[k - s] && m > _mag_n[k + s])
{
CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
}
} }
} }
++k; }
} while((mask >>= 1));
} }
} }
if (j <= src.cols - 16)
{
v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j));
v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4));
v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8));
v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12));
v_store_aligned((signed char*)(_pmap + j), v_one);
v_int32x4 v_cmp1 = v_m1 > v_low;
v_int32x4 v_cmp2 = v_m2 > v_low;
v_int32x4 v_cmp3 = v_m3 > v_low;
v_int32x4 v_cmp4 = v_m4 > v_low;
v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);
v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);
unsigned int mask = v_signmask(v_cmp);
if (mask)
{
int k = j;
do
{
int l = trailingZeros32(mask);
k += l;
mask >>= l;
int m = _mag_a[k];
short xs = _dx[k];
short ys = _dy[k];
int x = (int)std::abs(xs);
int y = (int)std::abs(ys) << 15;
int tg22x = x * TG22;
if (y < tg22x)
{
if (m > _mag_a[k - 1] && m >= _mag_a[k + 1])
{
CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
}
}
else
{
int tg67x = tg22x + (x << 16);
if (y > tg67x)
{
if (m > _mag_p[k] && m >= _mag_n[k])
{
CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
}
}
else
{
int s = (xs ^ ys) < 0 ? -1 : 1;
if(m > _mag_p[k - s] && m > _mag_n[k + s])
{
CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
}
}
}
++k;
} while((mask >>= 1));
}
j += 16;
}
} }
#endif #endif
for (; j < src.cols; j++) for (; j < src.cols; j++)
@ -723,6 +611,7 @@ public:
if (m > _mag_a[j - 1] && m >= _mag_a[j + 1]) if (m > _mag_a[j - 1] && m >= _mag_a[j + 1])
{ {
CANNY_CHECK(m, high, (_pmap+j), stack); CANNY_CHECK(m, high, (_pmap+j), stack);
continue;
} }
} }
else else
@ -733,6 +622,7 @@ public:
if (m > _mag_p[j] && m >= _mag_n[j]) if (m > _mag_p[j] && m >= _mag_n[j])
{ {
CANNY_CHECK(m, high, (_pmap+j), stack); CANNY_CHECK(m, high, (_pmap+j), stack);
continue;
} }
} }
else else
@ -741,6 +631,7 @@ public:
if(m > _mag_p[j - s] && m > _mag_n[j + s]) if(m > _mag_p[j - s] && m > _mag_n[j + s])
{ {
CANNY_CHECK(m, high, (_pmap+j), stack); CANNY_CHECK(m, high, (_pmap+j), stack);
continue;
} }
} }
} }
@ -802,6 +693,9 @@ private:
ptrdiff_t mapstep; ptrdiff_t mapstep;
int cn; int cn;
mutable Mutex mutex; mutable Mutex mutex;
#if CV_SIMD
schar smask[2*v_int8::nlanes];
#endif
}; };
class finalPass : public ParallelLoopBody class finalPass : public ParallelLoopBody
@ -824,31 +718,31 @@ public:
int j = 0; int j = 0;
uchar *pdst = dst.ptr<uchar>(i); uchar *pdst = dst.ptr<uchar>(i);
const uchar *pmap = map.ptr<uchar>(i + 1); const uchar *pmap = map.ptr<uchar>(i + 1);
#if CV_SIMD128 #if CV_SIMD
if (true) if (true)
pmap += CV_MALLOC_SIMD128; pmap += CV_SIMD_WIDTH;
else else
#endif #endif
pmap += 1; pmap += 1;
#if CV_SIMD128 #if CV_SIMD
{ {
const v_uint8x16 v_zero = v_setzero_u8(); const v_uint8 v_zero = vx_setzero_u8();
const v_uint8x16 v_ff = ~v_zero; const v_uint8 v_ff = ~v_zero;
const v_uint8x16 v_two(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); const v_uint8 v_two = vx_setall_u8(2);
for (; j <= dst.cols - 16; j += 16) for (; j <= dst.cols - v_uint8::nlanes; j += v_uint8::nlanes)
{ {
v_uint8x16 v_pmap = v_load_aligned((const unsigned char*)(pmap + j)); v_uint8 v_pmap = vx_load_aligned((const unsigned char*)(pmap + j));
v_pmap = v_select(v_pmap == v_two, v_ff, v_zero); v_pmap = v_select(v_pmap == v_two, v_ff, v_zero);
v_store((pdst + j), v_pmap); v_store((pdst + j), v_pmap);
} }
if (j <= dst.cols - 8) if (j <= dst.cols - v_uint8::nlanes/2)
{ {
v_uint8x16 v_pmap = v_load_low((const unsigned char*)(pmap + j)); v_uint8 v_pmap = vx_load_low((const unsigned char*)(pmap + j));
v_pmap = v_select(v_pmap == v_two, v_ff, v_zero); v_pmap = v_select(v_pmap == v_two, v_ff, v_zero);
v_store_low((pdst + j), v_pmap); v_store_low((pdst + j), v_pmap);
j += 8; j += v_uint8::nlanes/2;
} }
} }
#endif #endif

View File

@ -1061,19 +1061,13 @@ cvFindNextContour( CvContourScanner scanner )
} }
else else
{ {
#if CV_SIMD_WIDTH > 16 v_uint8 v_prev = vx_setall_u8((uchar)prev);
v_uint8 vx_prev = vx_setall_u8((uchar)prev); for (; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
while (x <= width - v_uint8::nlanes &&
v_check_all(vx_load((uchar*)(img + x)) == vx_prev))
x += v_uint8::nlanes;
#endif
v_uint8x16 v_prev = v_setall_u8((uchar)prev);
for (; x <= width - v_uint8x16::nlanes; x += v_uint8x16::nlanes)
{ {
unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(img + x)) != v_prev); v_uint8 vmask = (vx_load((uchar*)(img + x)) != v_prev);
if (mask) if (v_check_any(vmask))
{ {
p = img[(x += cv::trailingZeros32(mask))]; p = img[(x += v_scan_forward(vmask))];
goto _next_contour; goto _next_contour;
} }
} }
@ -1334,19 +1328,13 @@ CvLinkedRunPoint;
inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j) inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j)
{ {
#if CV_SIMD #if CV_SIMD
#if CV_SIMD_WIDTH > 16 v_uint8 v_zero = vx_setzero_u8();
v_uint8 vx_zero = vx_setzero_u8(); for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes)
while (j <= img_size.width - v_uint8::nlanes &&
v_check_all(vx_load((uchar*)(src_data + j)) == vx_zero))
j += v_uint8::nlanes;
#endif
v_uint8x16 v_zero = v_setzero_u8();
for (; j <= img_size.width - v_uint8x16::nlanes; j += v_uint8x16::nlanes)
{ {
unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(src_data + j)) != v_zero); v_uint8 vmask = (vx_load((uchar*)(src_data + j)) != v_zero);
if (mask) if (v_check_any(vmask))
{ {
j += cv::trailingZeros32(mask); j += v_scan_forward(vmask);
return j; return j;
} }
} }
@ -1365,19 +1353,13 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j)
} }
else else
{ {
#if CV_SIMD_WIDTH > 16 v_uint8 v_zero = vx_setzero_u8();
v_uint8 vx_zero = vx_setzero_u8();
while (j <= img_size.width - v_uint8::nlanes &&
v_check_all(vx_load((uchar*)(src_data + j)) != vx_zero))
j += v_uint8::nlanes;
#endif
v_uint8x16 v_zero = v_setzero_u8();
for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes) for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes)
{ {
unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(src_data + j)) == v_zero); v_uint8 vmask = (vx_load((uchar*)(src_data + j)) == v_zero);
if (mask) if (v_check_any(vmask))
{ {
j += cv::trailingZeros32(mask); j += v_scan_forward(vmask);
return j; return j;
} }
} }

View File

@ -1139,32 +1139,23 @@ public:
for(; x < numCols; ++x ) for(; x < numCols; ++x )
{ {
#if CV_SIMD128 #if CV_SIMD
{ {
v_uint8x16 v_zero = v_setzero_u8(); v_uint8 v_zero = vx_setzero_u8();
for(; x <= numCols - 32; x += 32) { for(; x <= numCols - 2*v_uint8::nlanes; x += 2*v_uint8::nlanes) {
v_uint8x16 v_edge1 = v_load(edgeData + x); v_uint8 v_edge1 = (vx_load(edgeData + x ) != v_zero);
v_uint8x16 v_edge2 = v_load(edgeData + x + 16); v_uint8 v_edge2 = (vx_load(edgeData + x + v_uint8::nlanes) != v_zero);
v_uint8x16 v_cmp1 = (v_edge1 == v_zero); if(v_check_any(v_edge1))
v_uint8x16 v_cmp2 = (v_edge2 == v_zero);
unsigned int mask1 = v_signmask(v_cmp1);
unsigned int mask2 = v_signmask(v_cmp2);
mask1 ^= 0x0000ffff;
mask2 ^= 0x0000ffff;
if(mask1)
{ {
x += trailingZeros32(mask1); x += v_scan_forward(v_edge1);
goto _next_step; goto _next_step;
} }
if(mask2) if(v_check_any(v_edge2))
{ {
x += trailingZeros32(mask2 << 16); x += v_uint8::nlanes + v_scan_forward(v_edge2);
goto _next_step; goto _next_step;
} }
} }
@ -1175,7 +1166,7 @@ public:
if(x == numCols) if(x == numCols)
continue; continue;
#if CV_SIMD128 #if CV_SIMD
_next_step: _next_step:
#endif #endif
float vx, vy; float vx, vy;
@ -1506,36 +1497,35 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointList>::filterCircles(const Po
int nzCount = 0; int nzCount = 0;
const Point* nz_ = &nz[0]; const Point* nz_ = &nz[0];
int j = 0; int j = 0;
#if CV_SIMD128 #if CV_SIMD
{ {
const v_float32x4 v_minRadius2 = v_setall_f32(minRadius2); const v_float32 v_minRadius2 = vx_setall_f32(minRadius2);
const v_float32x4 v_maxRadius2 = v_setall_f32(maxRadius2); const v_float32 v_maxRadius2 = vx_setall_f32(maxRadius2);
v_float32x4 v_curCenterX = v_setall_f32(curCenter.x); v_float32 v_curCenterX = vx_setall_f32(curCenter.x);
v_float32x4 v_curCenterY = v_setall_f32(curCenter.y); v_float32 v_curCenterY = vx_setall_f32(curCenter.y);
float CV_DECL_ALIGNED(16) rbuf[4]; float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[v_float32::nlanes];
for(; j <= nzSz - 4; j += 4) int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[v_int32::nlanes];
for(; j <= nzSz - v_float32::nlanes; j += v_float32::nlanes)
{ {
v_float32x4 v_nzX, v_nzY; v_float32 v_nzX, v_nzY;
v_load_deinterleave((const float*)&nz_[j], v_nzX, v_nzY); // FIXIT use proper datatype v_load_deinterleave((const float*)&nz_[j], v_nzX, v_nzY); // FIXIT use proper datatype
v_float32x4 v_x = v_cvt_f32(v_reinterpret_as_s32(v_nzX)); v_float32 v_x = v_cvt_f32(v_reinterpret_as_s32(v_nzX));
v_float32x4 v_y = v_cvt_f32(v_reinterpret_as_s32(v_nzY)); v_float32 v_y = v_cvt_f32(v_reinterpret_as_s32(v_nzY));
v_float32x4 v_dx = v_x - v_curCenterX; v_float32 v_dx = v_x - v_curCenterX;
v_float32x4 v_dy = v_y - v_curCenterY; v_float32 v_dy = v_y - v_curCenterY;
v_float32x4 v_r2 = (v_dx * v_dx) + (v_dy * v_dy); v_float32 v_r2 = (v_dx * v_dx) + (v_dy * v_dy);
v_float32x4 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2); v_float32 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2);
unsigned int mask = v_signmask(vmask); if (v_check_any(vmask))
if (mask)
{ {
v_store_aligned(rmask, v_reinterpret_as_s32(vmask));
v_store_aligned(rbuf, v_r2); v_store_aligned(rbuf, v_r2);
if (mask & 1) ddata[nzCount++] = rbuf[0]; for (int i = 0; i < v_int32::nlanes; ++i)
if (mask & 2) ddata[nzCount++] = rbuf[1]; if (rmask[i]) ddata[nzCount++] = rbuf[i];
if (mask & 4) ddata[nzCount++] = rbuf[2];
if (mask & 8) ddata[nzCount++] = rbuf[3];
} }
} }
} }
@ -1566,12 +1556,13 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointSet>::filterCircles(const Poi
const Range xOuter = Range(std::max(int(curCenter.x - rOuter), 0), std::min(int(curCenter.x + rOuter), positions.cols)); const Range xOuter = Range(std::max(int(curCenter.x - rOuter), 0), std::min(int(curCenter.x + rOuter), positions.cols));
const Range yOuter = Range(std::max(int(curCenter.y - rOuter), 0), std::min(int(curCenter.y + rOuter), positions.rows)); const Range yOuter = Range(std::max(int(curCenter.y - rOuter), 0), std::min(int(curCenter.y + rOuter), positions.rows));
#if CV_SIMD128 #if CV_SIMD
const int numSIMDPoints = 4; float v_seq[v_float32::nlanes];
for (int i = 0; i < v_float32::nlanes; ++i)
const v_float32x4 v_minRadius2 = v_setall_f32(minRadius2); v_seq[i] = (float)i;
const v_float32x4 v_maxRadius2 = v_setall_f32(maxRadius2); const v_float32 v_minRadius2 = vx_setall_f32(minRadius2);
const v_float32x4 v_curCenterX_0123 = v_setall_f32(curCenter.x) - v_float32x4(0.0f, 1.0f, 2.0f, 3.0f); const v_float32 v_maxRadius2 = vx_setall_f32(maxRadius2);
const v_float32 v_curCenterX_0123 = vx_setall_f32(curCenter.x) - vx_load(v_seq);
#endif #endif
for (int y = yOuter.start; y < yOuter.end; y++) for (int y = yOuter.start; y < yOuter.end; y++)
@ -1581,29 +1572,28 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointSet>::filterCircles(const Poi
float dy2 = dy * dy; float dy2 = dy * dy;
int x = xOuter.start; int x = xOuter.start;
#if CV_SIMD128 #if CV_SIMD
{ {
const v_float32x4 v_dy2 = v_setall_f32(dy2); const v_float32 v_dy2 = vx_setall_f32(dy2);
const v_uint32x4 v_zero_u32 = v_setall_u32(0); const v_uint32 v_zero_u32 = vx_setall_u32(0);
float CV_DECL_ALIGNED(16) rbuf[4]; float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[v_float32::nlanes];
for (; x <= xOuter.end - 4; x += numSIMDPoints) int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[v_int32::nlanes];
for (; x <= xOuter.end - v_float32::nlanes; x += v_float32::nlanes)
{ {
v_uint32x4 v_mask = v_load_expand_q(ptr + x); v_uint32 v_mask = vx_load_expand_q(ptr + x);
v_mask = v_mask != v_zero_u32; v_mask = v_mask != v_zero_u32;
v_float32x4 v_x = v_cvt_f32(v_setall_s32(x)); v_float32 v_x = v_cvt_f32(vx_setall_s32(x));
v_float32x4 v_dx = v_x - v_curCenterX_0123; v_float32 v_dx = v_x - v_curCenterX_0123;
v_float32x4 v_r2 = (v_dx * v_dx) + v_dy2; v_float32 v_r2 = (v_dx * v_dx) + v_dy2;
v_float32x4 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2) & v_reinterpret_as_f32(v_mask); v_float32 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2) & v_reinterpret_as_f32(v_mask);
unsigned int mask = v_signmask(vmask); if (v_check_any(vmask))
if (mask)
{ {
v_store_aligned(rmask, v_reinterpret_as_s32(vmask));
v_store_aligned(rbuf, v_r2); v_store_aligned(rbuf, v_r2);
if (mask & 1) ddata[nzCount++] = rbuf[0]; for (int i = 0; i < v_int32::nlanes; ++i)
if (mask & 2) ddata[nzCount++] = rbuf[1]; if (rmask[i]) ddata[nzCount++] = rbuf[i];
if (mask & 4) ddata[nzCount++] = rbuf[2];
if (mask & 8) ddata[nzCount++] = rbuf[3];
} }
} }
} }