/////////////////////////////////////////////////////////////////////// // File: dotproductavx.cpp // Description: Architecture-specific dot-product function. // Author: Ray Smith // Created: Wed Jul 22 10:48:05 PDT 2015 // // (C) Copyright 2015, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. /////////////////////////////////////////////////////////////////////// #if !defined(__AVX__) // Implementation for non-avx archs. #include "dotproductavx.h" #include #include namespace tesseract { double DotProductAVX(const double* u, const double* v, int n) { fprintf(stderr, "DotProductAVX can't be used on Android\n"); abort(); } } // namespace tesseract #else // !defined(__AVX__) // Implementation for avx capable archs. #include #include #include "dotproductavx.h" #include "host.h" namespace tesseract { // Computes and returns the dot product of the n-vectors u and v. // Uses Intel AVX intrinsics to access the SIMD instruction set. double DotProductAVX(const double* u, const double* v, int n) { int max_offset = n - 4; int offset = 0; // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and // v, and multiplying them together in parallel. __m256d sum = _mm256_setzero_pd(); if (offset <= max_offset) { offset = 4; // Aligned load is reputedly faster but requires 32 byte aligned input. if ((reinterpret_cast(u) & 31) == 0 && (reinterpret_cast(v) & 31) == 0) { // Use aligned load. __m256d floats1 = _mm256_load_pd(u); __m256d floats2 = _mm256_load_pd(v); // Multiply. sum = _mm256_mul_pd(floats1, floats2); while (offset <= max_offset) { floats1 = _mm256_load_pd(u + offset); floats2 = _mm256_load_pd(v + offset); offset += 4; __m256d product = _mm256_mul_pd(floats1, floats2); sum = _mm256_add_pd(sum, product); } } else { // Use unaligned load. __m256d floats1 = _mm256_loadu_pd(u); __m256d floats2 = _mm256_loadu_pd(v); // Multiply. sum = _mm256_mul_pd(floats1, floats2); while (offset <= max_offset) { floats1 = _mm256_loadu_pd(u + offset); floats2 = _mm256_loadu_pd(v + offset); offset += 4; __m256d product = _mm256_mul_pd(floats1, floats2); sum = _mm256_add_pd(sum, product); } } } // Add the 4 product sums together horizontally. Not so easy as with sse, as // there is no add across the upper/lower 128 bit boundary, so permute to // move the upper 128 bits to lower in another register. __m256d sum2 = _mm256_permute2f128_pd(sum, sum, 1); sum = _mm256_hadd_pd(sum, sum2); sum = _mm256_hadd_pd(sum, sum); double result; // _mm256_extract_f64 doesn't exist, but resist the temptation to use an sse // instruction, as that introduces a 70 cycle delay. All this casting is to // fool the intrinsics into thinking we are extracting the bottom int64. auto cast_sum = _mm256_castpd_si256(sum); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing") *(reinterpret_cast(&result)) = #if defined(_WIN32) || defined(__i386__) // This is a very simple workaround that is activated // for all platforms that do not have _mm256_extract_epi64. // _mm256_extract_epi64(X, Y) == ((uint64_t*)&X)[Y] ((uint64_t*)&cast_sum)[0] #else _mm256_extract_epi64(cast_sum, 0) #endif ; #pragma GCC diagnostic pop while (offset < n) { result += u[offset] * v[offset]; ++offset; } return result; } } // namespace tesseract. #endif // ANDROID_BUILD