core: vectorize countNonZero64f

Improves performance: roughly 2.2x on P9 and 2x-3x on Coffee Lake x86-64.
2025-07-22 20:39:41 +08:00 · 2019-08-28 12:09:07 -05:00 · 2019-08-28 12:09:07 -05:00 · ec91a3d59d
commit ec91a3d59d
parent dd4f591d54
1 changed files with 19 additions and 1 deletions
--- a/modules/core/src/count_non_zero.simd.hpp
+++ b/modules/core/src/count_non_zero.simd.hpp
@ -179,7 +179,25 @@ static int countNonZero32f( const float* src, int len )
 // Returns the number of non-zero elements among the first `len` doubles of `src`.
 // When CV_SIMD_64F is enabled, the leading part of the array is processed two
 // vectors at a time; whatever is left over (or the whole array when the SIMD
 // path is compiled out, since `i` and `nz` stay 0) is handed to countNonZero_.
 static int countNonZero64f( const double* src, int len )
 {
-    return countNonZero_(src, len);
+    int nz = 0, i = 0;
 #if CV_SIMD_64F
    // One 64-bit integer accumulator for each of the two vectors loaded per iteration.
    v_int64 sum1 = vx_setzero_s64();
    v_int64 sum2 = vx_setzero_s64();
    v_float64 zero = vx_setzero_f64();
    // Number of doubles consumed per loop iteration (two full vectors).
    int step = v_float64::nlanes * 2;
    // Rounds len down to a multiple of step; the mask form assumes step is a
    // power of two -- NOTE(review): confirm for every supported vector width.
    int len0 = len & -step;
    for(i = 0; i < len0; i += step )
        {
        // Compare each lane against 0.0 and accumulate the comparison result
        // reinterpreted as a signed 64-bit integer.
        sum1 += v_reinterpret_as_s64(vx_load(&src[i]) == zero);
        sum2 += v_reinterpret_as_s64(vx_load(&src[i + step / 2]) == zero);
        }
    // N.B. each lane that compared equal to zero added -1 (0xF...F) to its
    // accumulator, so the reduced sum is minus the number of zero elements.
    // Adding it to i (the number of elements examined so far) leaves the
    // count of non-zero elements in the vectorized prefix.
    nz = i + (int)v_reduce_sum(sum1 + sum2);
    // NOTE(review): presumably restores/clears SIMD state after vector code -- confirm.
    v_cleanup();
 #endif
    // Count the remaining len - i elements with the generic helper.
    return nz + countNonZero_(src + i, len - i);
 }
 CountNonZeroFunc getCountNonZeroTab(int depth)