/////////////////////////////////////////////////////////////////////// // File: dotproductsse.cpp // Description: Architecture-specific dot-product function. // Author: Ray Smith // Created: Wed Jul 22 10:57:45 PDT 2015 // // (C) Copyright 2015, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. /////////////////////////////////////////////////////////////////////// #if !defined(__SSE4_1__) // This code can't compile with "-msse4.1", so use dummy stubs. #include "dotproductsse.h" #include #include namespace tesseract { double DotProductSSE(const double* u, const double* v, int n) { fprintf(stderr, "DotProductSSE can't be used on Android\n"); abort(); } int32_t IntDotProductSSE(const int8_t* u, const int8_t* v, int n) { fprintf(stderr, "IntDotProductSSE can't be used on Android\n"); abort(); } } // namespace tesseract #else // !defined(__SSE4_1__) // Non-Android code here #include #include #include #include "dotproductsse.h" #include "host.h" namespace tesseract { // Computes and returns the dot product of the n-vectors u and v. // Uses Intel SSE intrinsics to access the SIMD instruction set. double DotProductSSE(const double* u, const double* v, int n) { int max_offset = n - 2; int offset = 0; // Accumulate a set of 2 sums in sum, by loading pairs of 2 values from u and // v, and multiplying them together in parallel. __m128d sum = _mm_setzero_pd(); if (offset <= max_offset) { offset = 2; // Aligned load is reputedly faster but requires 16 byte aligned input. if ((reinterpret_cast(u) & 15) == 0 && (reinterpret_cast(v) & 15) == 0) { // Use aligned load. sum = _mm_load_pd(u); __m128d floats2 = _mm_load_pd(v); // Multiply. sum = _mm_mul_pd(sum, floats2); while (offset <= max_offset) { __m128d floats1 = _mm_load_pd(u + offset); floats2 = _mm_load_pd(v + offset); offset += 2; floats1 = _mm_mul_pd(floats1, floats2); sum = _mm_add_pd(sum, floats1); } } else { // Use unaligned load. sum = _mm_loadu_pd(u); __m128d floats2 = _mm_loadu_pd(v); // Multiply. sum = _mm_mul_pd(sum, floats2); while (offset <= max_offset) { __m128d floats1 = _mm_loadu_pd(u + offset); floats2 = _mm_loadu_pd(v + offset); offset += 2; floats1 = _mm_mul_pd(floats1, floats2); sum = _mm_add_pd(sum, floats1); } } } // Add the 2 sums in sum horizontally. sum = _mm_hadd_pd(sum, sum); // Extract the low result. double result = _mm_cvtsd_f64(sum); // Add on any left-over products. while (offset < n) { result += u[offset] * v[offset]; ++offset; } return result; } // Computes and returns the dot product of the n-vectors u and v. // Uses Intel SSE intrinsics to access the SIMD instruction set. int32_t IntDotProductSSE(const int8_t* u, const int8_t* v, int n) { int max_offset = n - 8; int offset = 0; // Accumulate a set of 4 32-bit sums in sum, by loading 8 pairs of 8-bit // values, extending to 16 bit, multiplying to make 32 bit results. __m128i sum = _mm_setzero_si128(); if (offset <= max_offset) { offset = 8; __m128i packed1 = _mm_loadl_epi64(reinterpret_cast(u)); __m128i packed2 = _mm_loadl_epi64(reinterpret_cast(v)); sum = _mm_cvtepi8_epi16(packed1); packed2 = _mm_cvtepi8_epi16(packed2); // The magic _mm_add_epi16 is perfect here. It multiplies 8 pairs of 16 bit // ints to make 32 bit results, which are then horizontally added in pairs // to make 4 32 bit results that still fit in a 128 bit register. sum = _mm_madd_epi16(sum, packed2); while (offset <= max_offset) { packed1 = _mm_loadl_epi64(reinterpret_cast(u + offset)); packed2 = _mm_loadl_epi64(reinterpret_cast(v + offset)); offset += 8; packed1 = _mm_cvtepi8_epi16(packed1); packed2 = _mm_cvtepi8_epi16(packed2); packed1 = _mm_madd_epi16(packed1, packed2); sum = _mm_add_epi32(sum, packed1); } } // Sum the 4 packed 32 bit sums and extract the low result. sum = _mm_hadd_epi32(sum, sum); sum = _mm_hadd_epi32(sum, sum); int32_t result = _mm_cvtsi128_si32(sum); while (offset < n) { result += u[offset] * v[offset]; ++offset; } return result; } } // namespace tesseract. #endif // ANDROID_BUILD