Merge pull request #2561 from stweil/fma

Implement double dot product using Intel FMA and reduce number of include files
2024-12-04 01:39:16 +08:00 · 2019-07-13 09:51:32 +02:00 · 2019-07-13 09:51:32 +02:00 · 8352194959
commit 8352194959
parent cce26fa197 61eab60fe3
14 changed files with 115 additions and 70 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -220,6 +220,7 @@ endforeach()
 # add definition as expected in src/arch/simddetect.cpp
 set(AVX_OPT OFF)
 set(AVX2_OPT OFF)
+set(FMA_OPT OFF)
 set(SSE41_OPT OFF)
 set(MARCH_NATIVE_OPT OFF)
 foreach(flag ${_enable_vector_unit_list})  # from OptimizeForArchitecture()
@ -227,7 +228,7 @@ foreach(flag ${_enable_vector_unit_list})  # from OptimizeForArchitecture()
    string(REPLACE "\." "_" flag "${flag}")
    set(sim_flags "${sim_flags} -D${flag}")
    string(REPLACE "_" "" flag "${flag}")
-    if("${flag}" MATCHES "AVX|AVX2|SSE41")
+    if("${flag}" MATCHES "AVX|AVX2|FMA|SSE41")
        set("${flag}_OPT" ON)
    endif()
 endforeach(flag)
@ -285,6 +286,7 @@ message( STATUS "Architecture flags: ${Vc_ARCHITECTURE_FLAGS}")
 message( STATUS "Vector unit list: ${_enable_vector_unit_list}")
 message( STATUS "AVX_OPT: ${AVX_OPT}")
 message( STATUS "AVX2_OPT: ${AVX2_OPT}")
+message( STATUS "FMA_OPT: ${FMA_OPT}")
 message( STATUS "SSE41_OPT: ${SSE41_OPT}")
 message( STATUS "MARCH_NATIVE_OPT: ${MARCH_NATIVE_OPT}")
 message( STATUS "sim_flags: ${sim_flags}")
@ -364,6 +366,9 @@ endif(AVX_OPT)
 if(AVX2_OPT)
   list(APPEND tesseract_src src/arch/intsimdmatrixavx2.cpp)
 endif(AVX2_OPT)
+if(FMA_OPT)  # turned ON above when the vector unit list contains FMA
+   list(APPEND tesseract_src src/arch/dotproductfma.cpp)
+endif(FMA_OPT)
 if(SSE41_OPT)
   list(APPEND tesseract_src src/arch/dotproductsse.cpp src/arch/intsimdmatrixsse.cpp)
 endif(SSE41_OPT)
@ -505,8 +510,7 @@ install(FILES
    ${CMAKE_CURRENT_BINARY_DIR}/api/tess_version.h

    #from arch/makefile.am
-    src/arch/dotproductavx.h
-    src/arch/dotproductsse.h
+    src/arch/dotproduct.h
    src/arch/intsimdmatrix.h
    src/arch/simddetect.h

--- a/configure.ac
+++ b/configure.ac
@ -126,6 +126,9 @@ AM_CONDITIONAL([AVX_OPT], ${avx})
 AX_CHECK_COMPILE_FLAG([-mavx2], [avx2=true], [avx2=false], [$WERROR])
 AM_CONDITIONAL([AVX2_OPT], $avx2)

+AX_CHECK_COMPILE_FLAG([-mfma], [fma=true], [fma=false], [$WERROR])
+AM_CONDITIONAL([FMA_OPT], $fma)
+
 AX_CHECK_COMPILE_FLAG([-msse4.1], [sse41=true], [sse41=false], [$WERROR])
 AM_CONDITIONAL([SSE41_OPT], $sse41)

--- a/src/api/Makefile.am
+++ b/src/api/Makefile.am
@ -68,6 +68,9 @@ endif
 if AVX2_OPT
 libtesseract_la_LIBADD += ../arch/libtesseract_avx2.la
 endif
+if FMA_OPT
+libtesseract_la_LIBADD += ../arch/libtesseract_fma.la
+endif
 if SSE41_OPT
 libtesseract_la_LIBADD += ../arch/libtesseract_sse.la
 endif
--- a/src/api/tesseractmain.cpp
+++ b/src/api/tesseractmain.cpp
@ -136,6 +136,7 @@ static void PrintVersionInfo() {
  if (tesseract::SIMDDetect::IsAVX512FAvailable()) printf(" Found AVX512F\n");
  if (tesseract::SIMDDetect::IsAVX2Available()) printf(" Found AVX2\n");
  if (tesseract::SIMDDetect::IsAVXAvailable()) printf(" Found AVX\n");
+  if (tesseract::SIMDDetect::IsFMAAvailable()) printf(" Found FMA\n");
  if (tesseract::SIMDDetect::IsSSEAvailable()) printf(" Found SSE\n");
 #ifdef _OPENMP
  printf(" Found OpenMP %d\n", _OPENMP);
--- a/src/arch/Makefile.am
+++ b/src/arch/Makefile.am
@ -9,7 +9,7 @@ endif

 pkginclude_HEADERS =

-noinst_HEADERS = dotproduct.h dotproductavx.h dotproductsse.h
+noinst_HEADERS = dotproduct.h
 noinst_HEADERS += intsimdmatrix.h
 noinst_HEADERS += simddetect.h

@ -20,6 +20,9 @@ endif
 if AVX2_OPT
 noinst_LTLIBRARIES += libtesseract_avx2.la
 endif
+if FMA_OPT
+noinst_LTLIBRARIES += libtesseract_fma.la
+endif
 if SSE41_OPT
 noinst_LTLIBRARIES += libtesseract_sse.la
 endif
@ -34,6 +37,10 @@ if AVX2_OPT
 libtesseract_arch_la_CPPFLAGS += -DAVX2
 libtesseract_avx2_la_CXXFLAGS = -mavx2
 endif
+if FMA_OPT
+libtesseract_arch_la_CPPFLAGS += -DFMA
+libtesseract_fma_la_CXXFLAGS = -mfma
+endif
 if SSE41_OPT
 libtesseract_arch_la_CPPFLAGS += -DSSE4_1
 libtesseract_sse_la_CXXFLAGS = -msse4.1
@ -55,6 +62,10 @@ if AVX2_OPT
 libtesseract_avx2_la_SOURCES = intsimdmatrixavx2.cpp
 endif

+if FMA_OPT
+libtesseract_fma_la_SOURCES = dotproductfma.cpp
+endif
+
 if SSE41_OPT
 libtesseract_sse_la_SOURCES = dotproductsse.cpp intsimdmatrixsse.cpp
 endif
--- a/src/arch/dotproduct.h
+++ b/src/arch/dotproduct.h
@ -22,6 +22,15 @@ namespace tesseract {
 // Computes and returns the dot product of the n-vectors u and v.
 double DotProductNative(const double* u, const double* v, int n);

+// Uses Intel AVX intrinsics to access the SIMD instruction set.
+double DotProductAVX(const double* u, const double* v, int n);
+
+// Uses Intel FMA intrinsics to access the SIMD instruction set.
+double DotProductFMA(const double* u, const double* v, int n);
+
+// Uses Intel SSE intrinsics to access the SIMD instruction set.
+double DotProductSSE(const double* u, const double* v, int n);
+
 }  // namespace tesseract.

 #endif  // TESSERACT_ARCH_DOTPRODUCT_H_
--- a/src/arch/dotproductavx.cpp
+++ b/src/arch/dotproductavx.cpp
@ -2,7 +2,6 @@
 // File:        dotproductavx.cpp
 // Description: Architecture-specific dot-product function.
 // Author:      Ray Smith
-// Created:     Wed Jul 22 10:48:05 PDT 2015
 //
 // (C) Copyright 2015, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
@ -22,7 +21,7 @@

 #include <immintrin.h>
 #include <cstdint>
-#include "dotproductavx.h"
+#include "dotproduct.h"

 namespace tesseract {

--- a/src/arch/dotproductavx.h
+++ b/src/arch/dotproductavx.h
@ -1,30 +0,0 @@
-///////////////////////////////////////////////////////////////////////
-// File:        dotproductavx.h
-// Description: Architecture-specific dot-product function.
-// Author:      Ray Smith
-// Created:     Wed Jul 22 10:51:05 PDT 2015
-//
-// (C) Copyright 2015, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-///////////////////////////////////////////////////////////////////////
-
-#ifndef TESSERACT_ARCH_DOTPRODUCTAVX_H_
-#define TESSERACT_ARCH_DOTPRODUCTAVX_H_
-
-namespace tesseract {
-
-// Computes and returns the dot product of the n-vectors u and v.
-// Uses Intel AVX intrinsics to access the SIMD instruction set.
-double DotProductAVX(const double* u, const double* v, int n);
-
-}  // namespace tesseract.
-
-#endif  // TESSERACT_ARCH_DOTPRODUCTAVX_H_
--- a/src/arch/dotproductfma.cpp
+++ b/src/arch/dotproductfma.cpp
@ -0,0 +1,57 @@
+///////////////////////////////////////////////////////////////////////
+// File:        dotproductfma.cpp
+// Description: Architecture-specific dot-product function.
+// Author:      Stefan Weil
+//
+// (C) Copyright 2015, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#if !defined(__FMA__)
+#error Implementation only for FMA capable architectures
+#endif
+
+#include <immintrin.h>
+#include <cstdint>
+#include "dotproduct.h"
+
+namespace tesseract {
+
+// Returns the dot product of the n-element vectors u and v.
+// Eight doubles are consumed per iteration using 256-bit FMA intrinsics.
+double DotProductFMA(const double* u, const double* v, int n) {
+  const unsigned blocks = n / 8;
+  const unsigned tail = n % 8;
+  // Two independent accumulators: the first and second four doubles per block.
+  __m256d acc_lo = _mm256_setzero_pd();
+  __m256d acc_hi = _mm256_setzero_pd();
+  for (unsigned b = 0; b < blocks; ++b) {
+    const __m256d u_lo = _mm256_loadu_pd(u);
+    const __m256d v_lo = _mm256_loadu_pd(v);
+    const __m256d u_hi = _mm256_loadu_pd(u + 4);
+    const __m256d v_hi = _mm256_loadu_pd(v + 4);
+    acc_lo = _mm256_fmadd_pd(u_lo, v_lo, acc_lo);
+    acc_hi = _mm256_fmadd_pd(u_hi, v_hi, acc_hi);
+    u += 8;
+    v += 8;
+  }
+  // Add adjacent lane pairs of both accumulators, then sum the four lanes.
+  alignas(32) double lanes[4];
+  _mm256_store_pd(lanes, _mm256_hadd_pd(acc_lo, acc_hi));
+  double sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];
+  // Scalar pass over the elements left after the last full block of eight.
+  for (unsigned i = 0; i < tail; ++i) {
+    sum += u[i] * v[i];
+  }
+  return sum;
+}
+
+}  // namespace tesseract.
--- a/src/arch/dotproductsse.cpp
+++ b/src/arch/dotproductsse.cpp
@ -22,7 +22,7 @@
 #include <emmintrin.h>
 #include <smmintrin.h>
 #include <cstdint>
-#include "dotproductsse.h"
+#include "dotproduct.h"

 namespace tesseract {

--- a/src/arch/dotproductsse.h
+++ b/src/arch/dotproductsse.h
@ -1,29 +0,0 @@
-///////////////////////////////////////////////////////////////////////
-// File:        dotproductsse.h
-// Description: Architecture-specific dot-product function.
-// Author:      Ray Smith
-//
-// (C) Copyright 2015, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-///////////////////////////////////////////////////////////////////////
-
-#ifndef TESSERACT_ARCH_DOTPRODUCTSSE_H_
-#define TESSERACT_ARCH_DOTPRODUCTSSE_H_
-
-namespace tesseract {
-
-// Computes and returns the dot product of the n-vectors u and v.
-// Uses Intel SSE intrinsics to access the SIMD instruction set.
-double DotProductSSE(const double* u, const double* v, int n);
-
-}  // namespace tesseract.
-
-#endif  // TESSERACT_ARCH_DOTPRODUCTSSE_H_
--- a/src/arch/intsimdmatrixsse.cpp
+++ b/src/arch/intsimdmatrixsse.cpp
@ -24,7 +24,6 @@
 #include <cstdint>
 #include <emmintrin.h>
 #include <smmintrin.h>
-#include "dotproductsse.h"

 namespace tesseract {

--- a/src/arch/simddetect.cpp
+++ b/src/arch/simddetect.cpp
@ -18,13 +18,11 @@
 #include <numeric>           // for std::inner_product
 #include "simddetect.h"
 #include "dotproduct.h"
-#include "dotproductavx.h"
-#include "dotproductsse.h"
 #include "intsimdmatrix.h"   // for IntSimdMatrix
 #include "params.h"   // for STRING_VAR
 #include "tprintf.h"  // for tprintf

-#if defined(AVX) || defined(AVX2) || defined(SSE4_1)
+#if defined(AVX) || defined(AVX2) || defined(FMA) || defined(SSE4_1)
 # define HAS_CPUID
 #endif

@ -60,6 +58,8 @@ bool SIMDDetect::avx_available_;
 bool SIMDDetect::avx2_available_;
 bool SIMDDetect::avx512F_available_;
 bool SIMDDetect::avx512BW_available_;
+// If true, then FMA has been detected.
+bool SIMDDetect::fma_available_;
 // If true, then SSe4.1 has been detected.
 bool SIMDDetect::sse_available_;

@ -98,6 +98,9 @@ SIMDDetect::SIMDDetect() {
 #if defined(SSE4_1)
    sse_available_ = (ecx & 0x00080000) != 0;
 #endif
+#if defined(FMA)
+    fma_available_ = (ecx & 0x00001000) != 0;
+#endif
 #if defined(AVX)
    avx_available_ = (ecx & 0x10000000) != 0;
    if (avx_available_) {
@ -121,6 +124,9 @@ SIMDDetect::SIMDDetect() {
 #if defined(SSE4_1)
    sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
 #endif
+#if defined(FMA)
+    fma_available_ = (cpuInfo[2] & 0x00001000) != 0;
+#endif
 #if defined(AVX)
    avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
 #endif
@ -185,6 +191,12 @@ void SIMDDetect::Update() {
    SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
    dotproduct_method = "avx";
 #endif
+#if defined(FMA)
+  } else if (!strcmp(dotproduct.string(), "fma")) {
+    // FMA selected by config variable.
+    SetDotProduct(DotProductFMA, IntSimdMatrix::intSimdMatrix);
+    dotproduct_method = "fma";
+#endif
 #if defined(SSE4_1)
  } else if (!strcmp(dotproduct.string(), "sse")) {
    // SSE selected by config variable.
--- a/src/arch/simddetect.h
+++ b/src/arch/simddetect.h
@ -46,6 +46,10 @@ class SIMDDetect {
  static inline bool IsAVX512BWAvailable() {
    return detector.avx512BW_available_;
  }
+  // Returns true if FMA is available on this system.
+  static inline bool IsFMAAvailable() {
+    return detector.fma_available_;
+  }
  // Returns true if SSE4.1 is available on this system.
  static inline bool IsSSEAvailable() {
    return detector.sse_available_;
@ -66,6 +70,8 @@ class SIMDDetect {
  static TESS_API bool avx2_available_;
  static TESS_API bool avx512F_available_;
  static TESS_API bool avx512BW_available_;
+  // If true, then FMA has been detected.
+  static TESS_API bool fma_available_;
  // If true, then SSe4.1 has been detected.
  static TESS_API bool sse_available_;
 };