mirror of
https://github.com/opencv/opencv.git
synced 2024-11-25 11:40:44 +08:00
core: vectorize dotProd_32s
Use 4x FMA chains to sum on SIMD 128 FP64 targets. On x86 this showed about a 1.4x improvement. For PPC, do a full multiply (32x32->64b), convert to DP, then accumulate. This may be slightly less precise for some inputs, but it is about 1.5x faster than the FMA approach above, which is itself about 1.5x faster than the previous code, for a ~2.5x overall speedup.
This commit is contained in:
parent
7295983964
commit
33fb253a66
@ -1051,6 +1051,15 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a)
|
|||||||
inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
|
inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
|
||||||
{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
|
{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
|
||||||
|
|
||||||
|
// The altivec intrinsic is missing for this 2.06 insn
// Overload of v_cvt_f64 taking a v_int64x2: the conversion is done by emitting
// the xvcvsxddp instruction directly through inline asm, and the result is
// wrapped in a v_float64x2.
// NOTE(review): presumably each signed 64-bit lane becomes one double lane,
// rounded when the value is not exactly representable -- confirm against the Power ISA.
|
||||||
|
inline v_float64x2 v_cvt_f64(const v_int64x2& a)
|
||||||
|
{
|
||||||
|
vec_double2 out;
|
||||||
|
|
|
||||||
|
// Both operands use the "wa" constraint and are printed with the %x modifier;
// NOTE(review): presumably this lets the compiler pick any VSX register -- confirm
// against the GCC PowerPC machine-constraint documentation.
__asm__ ("xvcvsxddp %x0,%x1" : "=wa"(out) : "wa"(a.val));
|
||||||
|
return v_float64x2(out);
|
||||||
|
}
|
||||||
|
|
||||||
////////////// Lookup table access ////////////////////
|
////////////// Lookup table access ////////////////////
|
||||||
|
|
||||||
inline v_int8x16 v_lut(const schar* tab, const int* idx)
|
inline v_int8x16 v_lut(const schar* tab, const int* idx)
|
||||||
|
@ -2493,7 +2493,36 @@ double dotProd_16s(const short* src1, const short* src2, int len)
|
|||||||
|
|
||||||
// Dot product of the int arrays src1 and src2 (len elements each), returned as a double.
// When CV_SIMD128_64F is set, elements are consumed v_int32x4::nlanes at a time and
// summed into two v_float64x2 accumulators; whatever is left over is handed to dotProd_.
// Without CV_SIMD128_64F the whole computation is delegated to dotProd_.
double dotProd_32s(const int* src1, const int* src2, int len)
|
double dotProd_32s(const int* src1, const int* src2, int len)
|
||||||
{
|
{
|
||||||
|
#if CV_SIMD128_64F
|
||||||
|
double r = 0.0;
|
||||||
|
int i = 0;
|
||||||
|
// Count of elements covered by the vector loop: len with its low bits cleared,
// i.e. rounded down to a multiple of v_int32x4::nlanes
// (assumes nlanes is a power of two and len is non-negative -- TODO confirm).
int lenAligned = len & -v_int32x4::nlanes;
|
||||||
|
// Two independent running sums; they are merged after the loop.
v_float64x2 a(0.0, 0.0);
|
||||||
|
v_float64x2 b(0.0, 0.0);
|
||||||
|
|
|
||||||
|
for( i = 0; i < lenAligned; i += v_int32x4::nlanes )
|
||||||
|
{
|
||||||
|
v_int32x4 s1 = v_load(src1);
|
||||||
|
v_int32x4 s2 = v_load(src2);
|
||||||
|
|
|
||||||
|
#if CV_VSX
|
||||||
|
// Do 32x32->64 multiplies, convert/round to double, accumulate
|
||||||
|
// Potentially less precise than FMA, but 1.5x faster than fma below.
|
||||||
|
// vec_mule products feed accumulator a, vec_mulo products feed accumulator b.
a += v_cvt_f64(v_int64(vec_mule(s1.val, s2.val)));
|
||||||
|
b += v_cvt_f64(v_int64(vec_mulo(s1.val, s2.val)));
|
||||||
|
#else
|
||||||
|
// Convert both inputs to double first, then fused multiply-add:
// the v_cvt_f64 lanes go to a, the v_cvt_f64_high lanes go to b.
a = v_fma(v_cvt_f64(s1), v_cvt_f64(s2), a);
|
||||||
|
b = v_fma(v_cvt_f64_high(s1), v_cvt_f64_high(s2), b);
|
||||||
|
#endif
|
||||||
|
// Advance the input pointers so the tail call below starts at the first unprocessed element.
src1 += v_int32x4::nlanes;
|
||||||
|
src2 += v_int32x4::nlanes;
|
||||||
|
}
|
||||||
|
// Merge the two accumulators and collapse the vector into a scalar.
a += b;
|
||||||
|
r = v_reduce_sum(a);
|
||||||
|
// i equals lenAligned here, so len - i is the number of leftover elements summed by dotProd_.
return r + dotProd_(src1, src2, len - i);
|
||||||
|
#else
|
||||||
return dotProd_(src1, src2, len);
|
return dotProd_(src1, src2, len);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
double dotProd_32f(const float* src1, const float* src2, int len)
|
double dotProd_32f(const float* src1, const float* src2, int len)
|
||||||
|
Loading…
Reference in New Issue
Block a user