mirror of
https://github.com/opencv/opencv.git
synced 2025-07-31 01:47:12 +08:00
Relax the SIMD loop conditions from `<` to `<=` so that the final full vector-width batch is also processed by the vectorized loop.
This commit is contained in:
parent
71fe903121
commit
fd62bd0991
@ -2544,7 +2544,7 @@ double dotProd_32s(const int* src1, const int* src2, int len)
|
||||
#if CV_SIMD_WIDTH == 16
|
||||
const int wstep = step * 2;
|
||||
v_float64 v_sum1 = vx_setzero_f64();
|
||||
for (; i < len - wstep; i += wstep, src1 += wstep, src2 += wstep)
|
||||
for (; i <= len - wstep; i += wstep, src1 += wstep, src2 += wstep)
|
||||
{
|
||||
v_int32 v_src10 = vx_load(src1);
|
||||
v_int32 v_src20 = vx_load(src2);
|
||||
@ -2555,7 +2555,7 @@ double dotProd_32s(const int* src1, const int* src2, int len)
|
||||
}
|
||||
v_sum0 = v_add(v_sum0, v_sum1);
|
||||
#endif
|
||||
for (; i < len - step; i += step, src1 += step, src2 += step)
|
||||
for (; i <= len - step; i += step, src1 += step, src2 += step)
|
||||
{
|
||||
v_int32 v_src1 = vx_load(src1);
|
||||
v_int32 v_src2 = vx_load(src2);
|
||||
|
@ -1953,7 +1953,7 @@ struct RGB2Lab_f
|
||||
{
|
||||
const int vsize = VTraits<v_float32>::vlanes();
|
||||
static const int nPixels = vsize*2;
|
||||
for(; i < n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
|
||||
for(; i <= n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
|
||||
{
|
||||
v_float32 rvec0, gvec0, bvec0, rvec1, gvec1, bvec1;
|
||||
if(scn == 3)
|
||||
@ -3297,7 +3297,7 @@ struct RGB2Luvinterpolate
|
||||
{
|
||||
const int vsize = VTraits<v_uint16>::vlanes();
|
||||
static const int nPixels = vsize*2;
|
||||
for(; i < n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
|
||||
for(; i <= n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
|
||||
{
|
||||
/*
|
||||
int R = src[bIdx], G = src[1], B = src[bIdx^2];
|
||||
|
@ -1325,7 +1325,7 @@ struct VResizeLinearVec_32s8u
|
||||
v_store(dst + x, v_rshr_pack_u<2>(v_add(v_mul_hi(v_pack(v_shr<4>(vx_load(S0 + x)), v_shr<4>(vx_load(S0 + x + VTraits<v_int32>::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load(S1 + x)), v_shr<4>(vx_load(S1 + x + VTraits<v_int32>::vlanes()))), b1)),
|
||||
v_add(v_mul_hi(v_pack(v_shr<4>(vx_load(S0 + x + 2 * VTraits<v_int32>::vlanes())), v_shr<4>(vx_load(S0 + x + 3 * VTraits<v_int32>::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load(S1 + x + 2 * VTraits<v_int32>::vlanes())), v_shr<4>(vx_load(S1 + x + 3 * VTraits<v_int32>::vlanes()))), b1))));
|
||||
|
||||
for( ; x < width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
|
||||
for( ; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
|
||||
v_rshr_pack_u_store<2>(dst + x, v_add(v_mul_hi(v_pack(v_shr<4>(vx_load(S0 + x)), v_shr<4>(vx_load(S0 + x + VTraits<v_int32>::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load(S1 + x)), v_shr<4>(vx_load(S1 + x + VTraits<v_int32>::vlanes()))), b1)));
|
||||
|
||||
return x;
|
||||
@ -1349,7 +1349,7 @@ struct VResizeLinearVec_32f16u
|
||||
for (; x <= width - VTraits<v_uint16>::vlanes(); x += VTraits<v_uint16>::vlanes())
|
||||
v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, v_mul(vx_load(S1 + x), b1))),
|
||||
v_round(v_muladd(vx_load(S0 + x + VTraits<v_float32>::vlanes()), b0, v_mul(vx_load(S1 + x + VTraits<v_float32>::vlanes()), b1)))));
|
||||
for( ; x < width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
|
||||
for( ; x <= width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
|
||||
{
|
||||
v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, v_mul(vx_load(S1 + x), b1)));
|
||||
v_store_low(dst + x, v_pack_u(t0, t0));
|
||||
@ -1376,7 +1376,7 @@ struct VResizeLinearVec_32f16s
|
||||
for (; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
|
||||
v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, v_mul(vx_load(S1 + x), b1))),
|
||||
v_round(v_muladd(vx_load(S0 + x + VTraits<v_float32>::vlanes()), b0, v_mul(vx_load(S1 + x + VTraits<v_float32>::vlanes()), b1)))));
|
||||
for( ; x < width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
|
||||
for( ; x <= width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
|
||||
{
|
||||
v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, v_mul(vx_load(S1 + x), b1)));
|
||||
v_store_low(dst + x, v_pack(t0, t0));
|
||||
|
Loading…
Reference in New Issue
Block a user