mirror of
https://github.com/opencv/opencv.git
synced 2025-07-22 20:39:41 +08:00
core: vectorize countNonZero64f
Improves performance: roughly 2.2x faster on P9 (POWER9) and 2-3x faster on Coffee Lake x86-64.
This commit is contained in:
parent
dd4f591d54
commit
ec91a3d59d
@ -179,7 +179,25 @@ static int countNonZero32f( const float* src, int len )
|
|||||||
|
|
||||||
/*
 * countNonZero64f: returns the number of non-zero elements among the `len`
 * doubles starting at `src`.
 *
 * This span is a side-by-side diff. The first line of each pair is the
 * pre-change code (a plain call to countNonZero_ over the whole array); the
 * lines following a `|` separator are the post-change, vectorized code.
 *
 * Post-change strategy: when CV_SIMD_64F is enabled, the leading part of the
 * array is processed two vectors at a time by counting the elements that ARE
 * zero, and that count is subtracted from the number of elements processed.
 * Whatever is left over (or the whole array when CV_SIMD_64F is disabled,
 * since `i` then stays 0) is handed to countNonZero_, which is defined
 * outside this view.
 */
static int countNonZero64f( const double* src, int len )
|
static int countNonZero64f( const double* src, int len )
|
||||||
{
|
{
|
||||||
// Pre-change body: delegate the entire array to countNonZero_.
return countNonZero_(src, len);
|
// nz: running result; i: number of elements consumed by the SIMD loop.
int nz = 0, i = 0;
|
||||||
|
#if CV_SIMD_64F
|
||||||
|
// Two separate 64-bit integer accumulators, one per load in the loop body.
v_int64 sum1 = vx_setzero_s64();
|
||||||
|
v_int64 sum2 = vx_setzero_s64();
|
||||||
|
// Comparison operand for the `== zero` lane tests below.
v_float64 zero = vx_setzero_f64();
|
||||||
|
// Elements consumed per loop iteration: two vectors' worth of doubles.
int step = v_float64::nlanes * 2;
|
||||||
|
// len rounded down to a multiple of step (relies on step being a power of two).
int len0 = len & -step;
|
||||||
|
|
||||||
|
for(i = 0; i < len0; i += step )
|
||||||
|
{
|
||||||
|
// Compare one vector against zero and add the reinterpreted lane masks to sum1.
sum1 += v_reinterpret_as_s64(vx_load(&src[i]) == zero);
|
||||||
|
// Same for the second vector of this iteration, located step / 2 elements further on.
sum2 += v_reinterpret_as_s64(vx_load(&src[i + step / 2]) == zero);
|
||||||
|
}
|
||||||
|
|
||||||
|
// N.B. each lane that compared equal to zero added -1 (0xF...F) to its accumulator, so the reduced sum is minus the number of zeros and i + sum is the non-zero count among the first i elements
|
||||||
|
nz = i + (int)v_reduce_sum(sum1 + sum2);
|
||||||
|
// NOTE(review): v_cleanup() is defined elsewhere; presumably it resets SIMD state after vector code -- confirm.
v_cleanup();
|
||||||
|
#endif
|
||||||
|
// Count the remaining len - i elements with countNonZero_ and add them to the SIMD result.
return nz + countNonZero_(src + i, len - i);
|
||||||
}
|
}
|
||||||
|
|
||||||
CountNonZeroFunc getCountNonZeroTab(int depth)
|
CountNonZeroFunc getCountNonZeroTab(int depth)
|
||||||
|
Loading…
Reference in New Issue
Block a user