Mirror of https://github.com/tesseract-ocr/tesseract.git, synced 2025-01-18 22:43:45 +08:00
Merge pull request #2257 from stweil/dp
Optimize calculation of dot product for double vectors with AVX
This commit is contained in:
commit b6868599dc
@@ -29,70 +29,30 @@ namespace tesseract {
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel AVX intrinsics to access the SIMD instruction set.
 double DotProductAVX(const double* u, const double* v, int n) {
-  int max_offset = n - 4;
-  int offset = 0;
-  // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
-  // v, and multiplying them together in parallel.
-  __m256d sum = _mm256_setzero_pd();
-  if (offset <= max_offset) {
-    offset = 4;
-    // Aligned load is reputedly faster but requires 32 byte aligned input.
-    if ((reinterpret_cast<uintptr_t>(u) & 31) == 0 &&
-        (reinterpret_cast<uintptr_t>(v) & 31) == 0) {
-      // Use aligned load.
-      __m256d floats1 = _mm256_load_pd(u);
-      __m256d floats2 = _mm256_load_pd(v);
-      // Multiply.
-      sum = _mm256_mul_pd(floats1, floats2);
-      while (offset <= max_offset) {
-        floats1 = _mm256_load_pd(u + offset);
-        floats2 = _mm256_load_pd(v + offset);
-        offset += 4;
-        __m256d product = _mm256_mul_pd(floats1, floats2);
-        sum = _mm256_add_pd(sum, product);
-      }
-    } else {
-      // Use unaligned load.
-      __m256d floats1 = _mm256_loadu_pd(u);
-      __m256d floats2 = _mm256_loadu_pd(v);
-      // Multiply.
-      sum = _mm256_mul_pd(floats1, floats2);
-      while (offset <= max_offset) {
-        floats1 = _mm256_loadu_pd(u + offset);
-        floats2 = _mm256_loadu_pd(v + offset);
-        offset += 4;
-        __m256d product = _mm256_mul_pd(floats1, floats2);
-        sum = _mm256_add_pd(sum, product);
-      }
-    }
-  }
+  const unsigned quot = n / 8;
+  const unsigned rem = n % 8;
+  __m256d t0 = _mm256_setzero_pd();
+  __m256d t1 = _mm256_setzero_pd();
+  for (unsigned k = 0; k < quot; k++) {
+    __m256d f0 = _mm256_loadu_pd(u);
+    __m256d f1 = _mm256_loadu_pd(v);
+    f0 = _mm256_mul_pd(f0, f1);
+    t0 = _mm256_add_pd(t0, f0);
+    u += 4;
+    v += 4;
+    __m256d f2 = _mm256_loadu_pd(u);
+    __m256d f3 = _mm256_loadu_pd(v);
+    f2 = _mm256_mul_pd(f2, f3);
+    t1 = _mm256_add_pd(t1, f2);
+    u += 4;
+    v += 4;
+  }
-  // Add the 4 product sums together horizontally. Not so easy as with sse, as
-  // there is no add across the upper/lower 128 bit boundary, so permute to
-  // move the upper 128 bits to lower in another register.
-  __m256d sum2 = _mm256_permute2f128_pd(sum, sum, 1);
-  sum = _mm256_hadd_pd(sum, sum2);
-  sum = _mm256_hadd_pd(sum, sum);
-  double result;
-  // _mm256_extract_f64 doesn't exist, but resist the temptation to use an sse
-  // instruction, as that introduces a 70 cycle delay. All this casting is to
-  // fool the intrinsics into thinking we are extracting the bottom int64.
-  auto cast_sum = _mm256_castpd_si256(sum);
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-  *(reinterpret_cast<int64_t*>(&result)) =
-#if defined(_WIN32) || defined(__i386__)
-    // This is a very simple workaround that is activated
-    // for all platforms that do not have _mm256_extract_epi64.
-    // _mm256_extract_epi64(X, Y) == ((uint64_t*)&X)[Y]
-    ((uint64_t*)&cast_sum)[0]
-#else
-    _mm256_extract_epi64(cast_sum, 0)
-#endif
-  ;
-#pragma GCC diagnostic pop
-  while (offset < n) {
-    result += u[offset] * v[offset];
-    ++offset;
-  }
+  t0 = _mm256_hadd_pd(t0, t1);
+  alignas(32) double tmp[4];
+  _mm256_store_pd(tmp, t0);
+  double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
+  for (unsigned k = 0; k < rem; k++) {
+    result += *u++ * *v++;
+  }
   return result;
 }
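The rewritten kernel consumes eight doubles per iteration and keeps two independent accumulators (t0, t1), presumably so the two vaddpd dependency chains can execute in parallel rather than serializing on a single register; it also drops the old aligned/unaligned branch and uses _mm256_loadu_pd unconditionally, which on recent CPUs costs nothing extra when the data happens to be aligned. The epilogue merges the accumulators with a single _mm256_hadd_pd and finishes the reduction through the stack. A minimal standalone demonstration of that hadd lane arithmetic, not part of the commit and using illustrative values (compile with -mavx):

#include <immintrin.h>
#include <cstdio>

int main() {
  // Illustrative values: t0 holds lanes {1,2,3,4}, t1 holds lanes {5,6,7,8}.
  __m256d t0 = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);
  __m256d t1 = _mm256_set_pd(8.0, 7.0, 6.0, 5.0);
  // _mm256_hadd_pd(a, b) adds adjacent lane pairs within each 128-bit half:
  //   { a0+a1, b0+b1, a2+a3, b2+b3 }  ->  { 3, 11, 7, 15 } here.
  __m256d h = _mm256_hadd_pd(t0, t1);
  alignas(32) double tmp[4];
  _mm256_store_pd(tmp, h);
  // Every lane of t0 and t1 appears exactly once, so the scalar sum of the
  // four stored values is the total across both accumulators.
  printf("%g (expected 36)\n", tmp[0] + tmp[1] + tmp[2] + tmp[3]);
  return 0;
}

Compared with the old epilogue (a cross-lane permute, two hadds, and the int64 extraction trick), the store-to-stack reduction is simpler and sidesteps the strict-aliasing pragmas entirely.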
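The new code can be sanity-checked against plain scalar code with a small harness like the one below. This is a sketch, not part of the commit: it assumes the kernel above is compiled (with AVX enabled) and linked in, and it picks n deliberately not divisible by 8 so the scalar tail (rem = n % 8) is exercised as well.

#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

// The AVX kernel from the commit above, assumed to be linked in.
double DotProductAVX(const double* u, const double* v, int n);

int main() {
  std::mt19937 rng(42);
  std::uniform_real_distribution<double> dist(-1.0, 1.0);
  const int n = 1003;  // not a multiple of 8, so the remainder loop runs
  std::vector<double> u(n), v(n);
  for (int i = 0; i < n; i++) {
    u[i] = dist(rng);
    v[i] = dist(rng);
  }
  double scalar = 0.0;
  for (int i = 0; i < n; i++) scalar += u[i] * v[i];
  double avx = DotProductAVX(u.data(), v.data(), n);
  // The vectorized version sums in a different order, so compare with a
  // small tolerance rather than exact equality.
  printf("scalar=%.17g avx=%.17g diff=%g\n",
         scalar, avx, std::fabs(scalar - avx));
  return 0;
}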