Relax the loop condition to process the final batch.

commit fd62bd0991
parent 71fe903121
Author: Liutong HAN
Date:   2025-03-13 07:54:41 +00:00

3 changed files with 7 additions and 7 deletions
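Why the change matters: with step elements consumed per iteration, the old condition i < len - step exits while exactly one full batch (len - i == step) still remains, handing those elements to the slower scalar tail; i <= len - step lets the vector loop take that final batch too. A minimal sketch of the off-by-one (hypothetical values, not OpenCV code):

    #include <cstdio>

    int main()
    {
        const int len = 8, step = 4;   // e.g. 8 elements, 4 lanes per batch
        int old_batches = 0, new_batches = 0;

        for (int i = 0; i < len - step; i += step)   // old: stops at i == len - step
            ++old_batches;
        for (int i = 0; i <= len - step; i += step)  // new: still takes the final full batch
            ++new_batches;

        // Prints "old: 1, new: 2" -- the old condition leaves elements 4..7
        // to the scalar tail even though they fill a whole SIMD batch.
        std::printf("old: %d, new: %d\n", old_batches, new_batches);
        return 0;
    }

Note that the <= form relies on the index and length being signed ints (as in the signatures below, e.g. int len in dotProd_32s), so len - step can go negative without wrapping; with unsigned indices it would need a guard.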

View File

@@ -2544,7 +2544,7 @@ double dotProd_32s(const int* src1, const int* src2, int len)
#if CV_SIMD_WIDTH == 16
const int wstep = step * 2;
v_float64 v_sum1 = vx_setzero_f64();
-for (; i < len - wstep; i += wstep, src1 += wstep, src2 += wstep)
+for (; i <= len - wstep; i += wstep, src1 += wstep, src2 += wstep)
{
v_int32 v_src10 = vx_load(src1);
v_int32 v_src20 = vx_load(src2);
@@ -2555,7 +2555,7 @@ double dotProd_32s(const int* src1, const int* src2, int len)
}
v_sum0 = v_add(v_sum0, v_sum1);
#endif
-for (; i < len - step; i += step, src1 += step, src2 += step)
+for (; i <= len - step; i += step, src1 += step, src2 += step)
{
v_int32 v_src1 = vx_load(src1);
v_int32 v_src2 = vx_load(src2);

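For reference, dotProd_32s applies the relaxed condition at two widths: first a loop over wstep = 2*step elements, then one over step, before the scalar remainder. A scalar stand-in for that cascade (simplified, not the actual SIMD code):

    #include <cstdio>

    // Hypothetical scalar stand-in for the tiered loops in dotProd_32s:
    // take 2*step elements per iteration while possible, then step,
    // then fall through to a scalar remainder.
    double dot(const int* a, const int* b, int len, int step)
    {
        double sum = 0.0;
        int i = 0;
        const int wstep = step * 2;
        for (; i <= len - wstep; i += wstep)   // widest batches
            for (int k = 0; k < wstep; ++k)
                sum += (double)a[i + k] * b[i + k];
        for (; i <= len - step; i += step)     // one more full batch if it fits
            for (int k = 0; k < step; ++k)
                sum += (double)a[i + k] * b[i + k];
        for (; i < len; ++i)                   // scalar tail
            sum += (double)a[i] * b[i];
        return sum;
    }

    int main()
    {
        int a[10], b[10];
        for (int i = 0; i < 10; ++i) { a[i] = i; b[i] = 1; }
        std::printf("%f\n", dot(a, b, 10, 4)); // 45.000000
        return 0;
    }

The same cascade shows up in the resize kernels further down, where a v_int16-wide loop is followed by a v_float32-wide tail loop.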
View File

@@ -1953,7 +1953,7 @@ struct RGB2Lab_f
{
const int vsize = VTraits<v_float32>::vlanes();
static const int nPixels = vsize*2;
-for(; i < n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
+for(; i <= n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
{
v_float32 rvec0, gvec0, bvec0, rvec1, gvec1, bvec1;
if(scn == 3)
@@ -3297,7 +3297,7 @@ struct RGB2Luvinterpolate
{
const int vsize = VTraits<v_uint16>::vlanes();
static const int nPixels = vsize*2;
-for(; i < n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
+for(; i <= n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
{
/*
int R = src[bIdx], G = src[1], B = src[bIdx^2];

View File

@@ -1325,7 +1325,7 @@ struct VResizeLinearVec_32s8u
v_store(dst + x, v_rshr_pack_u<2>(v_add(v_mul_hi(v_pack(v_shr<4>(vx_load(S0 + x)), v_shr<4>(vx_load(S0 + x + VTraits<v_int32>::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load(S1 + x)), v_shr<4>(vx_load(S1 + x + VTraits<v_int32>::vlanes()))), b1)),
v_add(v_mul_hi(v_pack(v_shr<4>(vx_load(S0 + x + 2 * VTraits<v_int32>::vlanes())), v_shr<4>(vx_load(S0 + x + 3 * VTraits<v_int32>::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load(S1 + x + 2 * VTraits<v_int32>::vlanes())), v_shr<4>(vx_load(S1 + x + 3 * VTraits<v_int32>::vlanes()))), b1))));
-for( ; x < width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
+for( ; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
v_rshr_pack_u_store<2>(dst + x, v_add(v_mul_hi(v_pack(v_shr<4>(vx_load(S0 + x)), v_shr<4>(vx_load(S0 + x + VTraits<v_int32>::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load(S1 + x)), v_shr<4>(vx_load(S1 + x + VTraits<v_int32>::vlanes()))), b1)));
return x;
@@ -1349,7 +1349,7 @@ struct VResizeLinearVec_32f16u
for (; x <= width - VTraits<v_uint16>::vlanes(); x += VTraits<v_uint16>::vlanes())
v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, v_mul(vx_load(S1 + x), b1))),
v_round(v_muladd(vx_load(S0 + x + VTraits<v_float32>::vlanes()), b0, v_mul(vx_load(S1 + x + VTraits<v_float32>::vlanes()), b1)))));
-for( ; x < width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
+for( ; x <= width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
{
v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, v_mul(vx_load(S1 + x), b1)));
v_store_low(dst + x, v_pack_u(t0, t0));
@@ -1376,7 +1376,7 @@ struct VResizeLinearVec_32f16s
for (; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, v_mul(vx_load(S1 + x), b1))),
v_round(v_muladd(vx_load(S0 + x + VTraits<v_float32>::vlanes()), b0, v_mul(vx_load(S1 + x + VTraits<v_float32>::vlanes()), b1)))));
-for( ; x < width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
+for( ; x <= width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
{
v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, v_mul(vx_load(S1 + x), b1)));
v_store_low(dst + x, v_pack(t0, t0));