mirror of
https://github.com/opencv/opencv.git
synced 2025-07-22 20:39:41 +08:00
core: vectorize countNonZero64f
Improves performance a bit: 2.2x on P9 and 2-3x on Coffee Lake x86-64.
This commit is contained in:
parent
dd4f591d54
commit
ec91a3d59d
@ -179,7 +179,25 @@ static int countNonZero32f( const float* src, int len )
|
||||
|
||||
// Returns the number of elements in src[0 .. len) that are not equal to zero.
// NOTE(review): this text is a rendered diff hunk (header "-179,7 +179,25").
// The lone `return countNonZero_(src, len);` directly after the opening brace
// appears to be the removed pre-commit body, and every statement after it the
// added replacement -- confirm against the real file before reading it as
// sequential code.
static int countNonZero64f( const double* src, int len )
|
||||
{
|
||||
return countNonZero_(src, len);
|
||||
// i stays 0 when the SIMD section is compiled out, so the final call then
// covers the whole array.
int nz = 0, i = 0;
|
||||
#if CV_SIMD_64F
|
||||
// Two independent accumulators, one for each vector load in the loop body.
v_int64 sum1 = vx_setzero_s64();
|
||||
v_int64 sum2 = vx_setzero_s64();
|
||||
v_float64 zero = vx_setzero_f64();
|
||||
// Each loop iteration consumes two vectors' worth of doubles.
int step = v_float64::nlanes * 2;
|
||||
// Rounds len down to a multiple of step; this bit trick assumes step is a
// power of two -- TODO confirm for every supported vector width.
int len0 = len & -step;
|
||||
|
||||
for(i = 0; i < len0; i += step )
|
||||
{
|
||||
// The comparison result is reinterpreted as signed 64-bit lanes and added
// to the running sums.
sum1 += v_reinterpret_as_s64(vx_load(&src[i]) == zero);
|
||||
sum2 += v_reinterpret_as_s64(vx_load(&src[i + step / 2]) == zero);
|
||||
}
|
||||
|
||||
// N.B. each element that compares equal to zero adds -1 (0xF...F) to its
// lane, so the reduced sum is minus the number of zero elements among the
// i elements scanned; adding it to i leaves the non-zero count.
|
||||
nz = i + (int)v_reduce_sum(sum1 + sum2);
|
||||
v_cleanup();
|
||||
#endif
|
||||
// The remaining len - i elements are counted by countNonZero_ (defined
// elsewhere in the file).
return nz + countNonZero_(src + i, len - i);
|
||||
}
|
||||
|
||||
CountNonZeroFunc getCountNonZeroTab(int depth)
|
||||
|
Loading…
Reference in New Issue
Block a user