Merge pull request #2257 from stweil/dp

Optimize calculation of dot product for double vectors with AVX
This commit is contained in:
zdenop 2019-02-21 07:40:22 +01:00 committed by GitHub
commit b6868599dc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -29,70 +29,30 @@ namespace tesseract {
// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel AVX intrinsics to access the SIMD instruction set.
double DotProductAVX(const double* u, const double* v, int n) {
int max_offset = n - 4;
int offset = 0;
// Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
// v, and multiplying them together in parallel.
__m256d sum = _mm256_setzero_pd();
if (offset <= max_offset) {
offset = 4;
// Aligned load is reputedly faster but requires 32 byte aligned input.
if ((reinterpret_cast<uintptr_t>(u) & 31) == 0 &&
(reinterpret_cast<uintptr_t>(v) & 31) == 0) {
// Use aligned load.
__m256d floats1 = _mm256_load_pd(u);
__m256d floats2 = _mm256_load_pd(v);
// Multiply.
sum = _mm256_mul_pd(floats1, floats2);
while (offset <= max_offset) {
floats1 = _mm256_load_pd(u + offset);
floats2 = _mm256_load_pd(v + offset);
offset += 4;
__m256d product = _mm256_mul_pd(floats1, floats2);
sum = _mm256_add_pd(sum, product);
}
} else {
// Use unaligned load.
__m256d floats1 = _mm256_loadu_pd(u);
__m256d floats2 = _mm256_loadu_pd(v);
// Multiply.
sum = _mm256_mul_pd(floats1, floats2);
while (offset <= max_offset) {
floats1 = _mm256_loadu_pd(u + offset);
floats2 = _mm256_loadu_pd(v + offset);
offset += 4;
__m256d product = _mm256_mul_pd(floats1, floats2);
sum = _mm256_add_pd(sum, product);
}
}
const unsigned quot = n / 8;
const unsigned rem = n % 8;
__m256d t0 = _mm256_setzero_pd();
__m256d t1 = _mm256_setzero_pd();
for (unsigned k = 0; k < quot; k++) {
__m256d f0 = _mm256_loadu_pd(u);
__m256d f1 = _mm256_loadu_pd(v);
f0 = _mm256_mul_pd(f0, f1);
t0 = _mm256_add_pd(t0, f0);
u += 4;
v += 4;
__m256d f2 = _mm256_loadu_pd(u);
__m256d f3 = _mm256_loadu_pd(v);
f2 = _mm256_mul_pd(f2, f3);
t1 = _mm256_add_pd(t1, f2);
u += 4;
v += 4;
}
// Add the 4 product sums together horizontally. Not so easy as with sse, as
// there is no add across the upper/lower 128 bit boundary, so permute to
// move the upper 128 bits to lower in another register.
__m256d sum2 = _mm256_permute2f128_pd(sum, sum, 1);
sum = _mm256_hadd_pd(sum, sum2);
sum = _mm256_hadd_pd(sum, sum);
double result;
// _mm256_extract_f64 doesn't exist, but resist the temptation to use an sse
// instruction, as that introduces a 70 cycle delay. All this casting is to
// fool the intrinsics into thinking we are extracting the bottom int64.
auto cast_sum = _mm256_castpd_si256(sum);
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
*(reinterpret_cast<int64_t*>(&result)) =
#if defined(_WIN32) || defined(__i386__)
// This is a very simple workaround that is activated
// for all platforms that do not have _mm256_extract_epi64.
// _mm256_extract_epi64(X, Y) == ((uint64_t*)&X)[Y]
((uint64_t*)&cast_sum)[0]
#else
_mm256_extract_epi64(cast_sum, 0)
#endif
;
#pragma GCC diagnostic pop
while (offset < n) {
result += u[offset] * v[offset];
++offset;
t0 = _mm256_hadd_pd(t0, t1);
alignas(32) double tmp[4];
_mm256_store_pd(tmp, t0);
double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
for (unsigned k = 0; k < rem; k++) {
result += *u++ * *v++;
}
return result;
}