Implement DotProductSSE() for FAST_FLOAT

[sw] Formatted commit message
2024-11-28 05:39:35 +08:00 · 2021-07-13 09:20:39 +02:00 · 2021-07-13 09:20:39 +02:00 · 27597883db
commit 27597883db
parent 79e8b4f344
2 changed files with 59 additions and 7 deletions
--- a/src/arch/dotproductsse.cpp
+++ b/src/arch/dotproductsse.cpp
@@ -31,12 +31,63 @@ namespace tesseract {
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel SSE intrinsics to access the SIMD instruction set.
 #if defined(FAST_FLOAT)
-TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {
-  TFloat total = 0.0;
-  for (int k = 0; k < n; ++k) {
-    total += u[k] * v[k];
+float DotProductSSE(const float *u, const float *v, int n) {
+  int max_offset = n - 4;
+  int offset = 0;
+  // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
+  // v, and multiplying them together in parallel.
+  __m128 sum = _mm_setzero_ps();
+  if (offset <= max_offset) {
+    offset = 4;
+    // Aligned load is reputedly faster but requires 16 byte aligned input.
+    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
+        (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
+      // Use aligned load.
+      sum = _mm_load_ps(u);
+      __m128 floats2 = _mm_load_ps(v);
+      // Multiply.
+      sum = _mm_mul_ps(sum, floats2);
+      while (offset <= max_offset) {
+        __m128 floats1 = _mm_load_ps(u + offset);
+        floats2 = _mm_load_ps(v + offset);
+        floats1 = _mm_mul_ps(floats1, floats2);
+        sum = _mm_add_ps(sum, floats1);
+        offset += 4;
      }
-  return total;
+    } else {
+      // Use unaligned load.
+      sum = _mm_loadu_ps(u);
+      __m128 floats2 = _mm_loadu_ps(v);
+      // Multiply.
+      sum = _mm_mul_ps(sum, floats2);
+      while (offset <= max_offset) {
+        __m128 floats1 = _mm_loadu_ps(u + offset);
+        floats2 = _mm_loadu_ps(v + offset);
+        floats1 = _mm_mul_ps(floats1, floats2);
+        sum = _mm_add_ps(sum, floats1);
+        offset += 4;
+      }
+    }
+  }
+  // Add the 4 sums in sum horizontally.
+#if 0
+	alignas(32) float tmp[4];
+	_mm_store_ps(tmp, sum);
+	float result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
+#else
+  __m128 zero = _mm_setzero_ps();
+  // https://www.felixcloutier.com/x86/haddps
+  sum = _mm_hadd_ps(sum, zero);
+  sum = _mm_hadd_ps(sum, zero);
+  // Extract the low result.
+  float result = _mm_cvtss_f32(sum);
+#endif
+  // Add on any left-over products.
+  while (offset < n) {
+    result += u[offset] * v[offset];
+    ++offset;
+  }
+  return result;
 }
 #else
 double DotProductSSE(const double *u, const double *v, int n) {
@@ -48,7 +99,8 @@ double DotProductSSE(const double *u, const double *v, int n) {
  if (offset <= max_offset) {
    offset = 2;
    // Aligned load is reputedly faster but requires 16 byte aligned input.
-    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 && (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
+    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
+        (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
      // Use aligned load.
      sum = _mm_load_pd(u);
      __m128d floats2 = _mm_load_pd(v);
--- a/src/arch/intsimdmatrix.h
+++ b/src/arch/intsimdmatrix.h
@@ -115,7 +115,7 @@ struct TESS_API IntSimdMatrix {
  static const IntSimdMatrix *intSimdMatrix;
  // Only available with NEON.
  static const IntSimdMatrix intSimdMatrixNEON;
-  // Only available with AVX2 / SSE.
+  // Only available with AVX2 / AVX / FMA / SSE.
  static const IntSimdMatrix intSimdMatrixAVX2;
  static const IntSimdMatrix intSimdMatrixSSE;
 };