mirror of
https://github.com/opencv/opencv.git
synced 2024-11-28 21:20:18 +08:00
Merge pull request #15136 from ChipKerchner:dotProd_unroll
* Unroll multiply and add instructions in dotProd_32f - 35% faster.
* Eliminate unnecessary v_reduce_sum instructions.
This commit is contained in:
parent
ac425f67e4
commit
0db4fb1835
@ -2511,6 +2511,27 @@ double dotProd_32f(const float* src1, const float* src2, int len)
|
||||
|
||||
int j = 0;
|
||||
int cWidth = v_float32::nlanes;
|
||||
|
||||
#if CV_ENABLE_UNROLLED
|
||||
v_float32 v_sum1 = vx_setzero_f32();
|
||||
v_float32 v_sum2 = vx_setzero_f32();
|
||||
v_float32 v_sum3 = vx_setzero_f32();
|
||||
|
||||
for (; j <= blockSize - (cWidth * 4); j += (cWidth * 4))
|
||||
{
|
||||
v_sum = v_muladd(vx_load(src1 + j),
|
||||
vx_load(src2 + j), v_sum);
|
||||
v_sum1 = v_muladd(vx_load(src1 + j + cWidth),
|
||||
vx_load(src2 + j + cWidth), v_sum1);
|
||||
v_sum2 = v_muladd(vx_load(src1 + j + (cWidth * 2)),
|
||||
vx_load(src2 + j + (cWidth * 2)), v_sum2);
|
||||
v_sum3 = v_muladd(vx_load(src1 + j + (cWidth * 3)),
|
||||
vx_load(src2 + j + (cWidth * 3)), v_sum3);
|
||||
}
|
||||
|
||||
v_sum += v_sum1 + v_sum2 + v_sum3;
|
||||
#endif
|
||||
|
||||
for (; j <= blockSize - cWidth; j += cWidth)
|
||||
v_sum = v_muladd(vx_load(src1 + j), vx_load(src2 + j), v_sum);
|
||||
|
||||
@ -2532,4 +2553,4 @@ double dotProd_64f(const double* src1, const double* src2, int len)
|
||||
|
||||
#endif
|
||||
CV_CPU_OPTIMIZATION_NAMESPACE_END
|
||||
} // namespace
|
||||
} // namespace
|
||||
|
Loading…
Reference in New Issue
Block a user