Relax the loop condition to process the final batch.

commit fd62bd0991
parent 71fe903121
Author: Liutong HAN
Date:   2025-03-13 07:54:41 +00:00

3 changed files with 7 additions and 7 deletions
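Why the change matters: with step elements consumed per iteration, the old condition i < len - step exits while exactly one full batch (len - i == step) still remains, handing those elements to the slower scalar tail; i <= len - step lets the vector loop take that final batch too. A minimal sketch of the off-by-one (hypothetical values, not OpenCV code):

    #include <cstdio>

    int main()
    {
        const int len = 8, step = 4;   // e.g. 8 elements, 4 lanes per batch
        int old_batches = 0, new_batches = 0;

        for (int i = 0; i < len - step; i += step)   // old: stops at i == len - step
            ++old_batches;
        for (int i = 0; i <= len - step; i += step)  // new: still takes the final full batch
            ++new_batches;

        // Prints "old: 1, new: 2" -- the old condition leaves elements 4..7
        // to the scalar tail even though they fill a whole SIMD batch.
        std::printf("old: %d, new: %d\n", old_batches, new_batches);
        return 0;
    }

Note that the <= form relies on the index and length being signed ints (as in the signatures below, e.g. int len in dotProd_32s), so len - step can go negative without wrapping; with unsigned indices it would need a guard.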

View File

@@ -2544,7 +2544,7 @@ double dotProd_32s(const int* src1, const int* src2, int len)
#if CV_SIMD_WIDTH == 16
const int wstep = step * 2;
v_float64 v_sum1 = vx_setzero_f64();
-for (; i < len - wstep; i += wstep, src1 += wstep, src2 += wstep)
+for (; i <= len - wstep; i += wstep, src1 += wstep, src2 += wstep)
{
v_int32 v_src10 = vx_load(src1);
v_int32 v_src20 = vx_load(src2);
@@ -2555,7 +2555,7 @@ double dotProd_32s(const int* src1, const int* src2, int len)
}
v_sum0 = v_add(v_sum0, v_sum1);
#endif
-for (; i < len - step; i += step, src1 += step, src2 += step)
+for (; i <= len - step; i += step, src1 += step, src2 += step)
{
v_int32 v_src1 = vx_load(src1);
v_int32 v_src2 = vx_load(src2);

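For reference, dotProd_32s applies the relaxed condition at two widths: first a loop over wstep = 2*step elements, then one over step, before the scalar remainder. A scalar stand-in for that cascade (simplified, not the actual SIMD code):

    #include <cstdio>

    // Hypothetical scalar stand-in for the tiered loops in dotProd_32s:
    // take 2*step elements per iteration while possible, then step,
    // then fall through to a scalar remainder.
    double dot(const int* a, const int* b, int len, int step)
    {
        double sum = 0.0;
        int i = 0;
        const int wstep = step * 2;
        for (; i <= len - wstep; i += wstep)   // widest batches
            for (int k = 0; k < wstep; ++k)
                sum += (double)a[i + k] * b[i + k];
        for (; i <= len - step; i += step)     // one more full batch if it fits
            for (int k = 0; k < step; ++k)
                sum += (double)a[i + k] * b[i + k];
        for (; i < len; ++i)                   // scalar tail
            sum += (double)a[i] * b[i];
        return sum;
    }

    int main()
    {
        int a[10], b[10];
        for (int i = 0; i < 10; ++i) { a[i] = i; b[i] = 1; }
        std::printf("%f\n", dot(a, b, 10, 4)); // 45.000000
        return 0;
    }

The same cascade shows up in the resize kernels further down, where a v_int16-wide loop is followed by a v_float32-wide tail loop.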
View File

@@ -1953,7 +1953,7 @@ struct RGB2Lab_f
{
const int vsize = VTraits<v_float32>::vlanes();
static const int nPixels = vsize*2;
-for(; i < n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
+for(; i <= n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
{
v_float32 rvec0, gvec0, bvec0, rvec1, gvec1, bvec1;
if(scn == 3)
@@ -3297,7 +3297,7 @@ struct RGB2Luvinterpolate
{
const int vsize = VTraits<v_uint16>::vlanes();
static const int nPixels = vsize*2;
-for(; i < n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
+for(; i <= n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
{
/*
int R = src[bIdx], G = src[1], B = src[bIdx^2];

View File

@@ -1325,7 +1325,7 @@ struct VResizeLinearVec_32s8u
v_store(dst + x, v_rshr_pack_u<2>(v_add(v_mul_hi(v_pack(v_shr<4>(vx_load(S0 + x)), v_shr<4>(vx_load(S0 + x + VTraits<v_int32>::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load(S1 + x)), v_shr<4>(vx_load(S1 + x + VTraits<v_int32>::vlanes()))), b1)),
v_add(v_mul_hi(v_pack(v_shr<4>(vx_load(S0 + x + 2 * VTraits<v_int32>::vlanes())), v_shr<4>(vx_load(S0 + x + 3 * VTraits<v_int32>::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load(S1 + x + 2 * VTraits<v_int32>::vlanes())), v_shr<4>(vx_load(S1 + x + 3 * VTraits<v_int32>::vlanes()))), b1))));
-for( ; x < width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
+for( ; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
v_rshr_pack_u_store<2>(dst + x, v_add(v_mul_hi(v_pack(v_shr<4>(vx_load(S0 + x)), v_shr<4>(vx_load(S0 + x + VTraits<v_int32>::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load(S1 + x)), v_shr<4>(vx_load(S1 + x + VTraits<v_int32>::vlanes()))), b1)));
return x;
@@ -1349,7 +1349,7 @@ struct VResizeLinearVec_32f16u
for (; x <= width - VTraits<v_uint16>::vlanes(); x += VTraits<v_uint16>::vlanes())
v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, v_mul(vx_load(S1 + x), b1))),
v_round(v_muladd(vx_load(S0 + x + VTraits<v_float32>::vlanes()), b0, v_mul(vx_load(S1 + x + VTraits<v_float32>::vlanes()), b1)))));
-for( ; x < width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
+for( ; x <= width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
{
v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, v_mul(vx_load(S1 + x), b1)));
v_store_low(dst + x, v_pack_u(t0, t0));
@@ -1376,7 +1376,7 @@ struct VResizeLinearVec_32f16s
for (; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, v_mul(vx_load(S1 + x), b1))),
v_round(v_muladd(vx_load(S0 + x + VTraits<v_float32>::vlanes()), b0, v_mul(vx_load(S1 + x + VTraits<v_float32>::vlanes()), b1)))));
-for( ; x < width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
+for( ; x <= width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
{
v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, v_mul(vx_load(S1 + x), b1)));
v_store_low(dst + x, v_pack(t0, t0));