mirror of
https://github.com/opencv/opencv.git
synced 2025-06-12 20:42:53 +08:00
norm.cpp(normL2Sqr_): improve performance of pipeline
Most target machines dedicate one kind of CPU execution unit to each kind of instruction: for example, all vx_load calls use the load/store unit, while v_muladd calls use the mul/mula unit. We therefore interleave the vx_load and v_muladd calls so that both units are kept busy, which improves performance on most targets, such as RISC-V or ARM.
This commit is contained in:
parent
564d1a0f79
commit
12b8d542b7
@ -152,10 +152,10 @@ float normL2Sqr_(const float* a, const float* b, int n)
|
|||||||
{
|
{
|
||||||
v_float32 t0 = vx_load(a + j) - vx_load(b + j);
|
v_float32 t0 = vx_load(a + j) - vx_load(b + j);
|
||||||
v_float32 t1 = vx_load(a + j + v_float32::nlanes) - vx_load(b + j + v_float32::nlanes);
|
v_float32 t1 = vx_load(a + j + v_float32::nlanes) - vx_load(b + j + v_float32::nlanes);
|
||||||
v_float32 t2 = vx_load(a + j + 2 * v_float32::nlanes) - vx_load(b + j + 2 * v_float32::nlanes);
|
|
||||||
v_float32 t3 = vx_load(a + j + 3 * v_float32::nlanes) - vx_load(b + j + 3 * v_float32::nlanes);
|
|
||||||
v_d0 = v_muladd(t0, t0, v_d0);
|
v_d0 = v_muladd(t0, t0, v_d0);
|
||||||
|
v_float32 t2 = vx_load(a + j + 2 * v_float32::nlanes) - vx_load(b + j + 2 * v_float32::nlanes);
|
||||||
v_d1 = v_muladd(t1, t1, v_d1);
|
v_d1 = v_muladd(t1, t1, v_d1);
|
||||||
|
v_float32 t3 = vx_load(a + j + 3 * v_float32::nlanes) - vx_load(b + j + 3 * v_float32::nlanes);
|
||||||
v_d2 = v_muladd(t2, t2, v_d2);
|
v_d2 = v_muladd(t2, t2, v_d2);
|
||||||
v_d3 = v_muladd(t3, t3, v_d3);
|
v_d3 = v_muladd(t3, t3, v_d3);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user