Merge pull request #3792 from stweil/avx512f

Add initial support for Intel AVX512F
2025-01-20 07:25:05 +08:00 · 2022-04-15 12:52:18 +02:00 · 2022-04-15 12:52:18 +02:00 · 864ab537bb
commit 864ab537bb
parent b5b5e92694 b0d82879e5
6 changed files with 112 additions and 0 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -167,6 +167,10 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86|x86_64|AMD64|amd64|i386|i686")
    set(AVX2_COMPILE_FLAGS "/arch:AVX2")
    add_definitions("-DHAVE_AVX2")

+    set(HAVE_AVX512F ON)
+    set(AVX512_COMPILE_FLAGS "/arch:AVX512")
+    add_definitions("-DHAVE_AVX512F")
+
    set(HAVE_FMA ON)
    set(FMA_COMPILE_FLAGS "-D__FMA__")
    add_definitions("-DHAVE_FMA")
@ -198,6 +202,12 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86|x86_64|AMD64|amd64|i386|i686")
      add_definitions("-DHAVE_AVX2")
    endif()

+    check_cxx_compiler_flag("-mavx512f" HAVE_AVX512F)
+    if(HAVE_AVX512F)
+      set(AVX512F_COMPILE_FLAGS "-mavx512f")
+      add_definitions("-DHAVE_AVX512F")
+    endif()
+
    check_cxx_compiler_flag("-mfma" HAVE_FMA)
    if(HAVE_FMA)
      set(FMA_COMPILE_FLAGS "-mfma")
@ -215,6 +225,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64.*|AARCH64.*")

  set(HAVE_AVX FALSE)
  set(HAVE_AVX2 FALSE)
+  set(HAVE_AVX512F FALSE)
  set(HAVE_FMA FALSE)
  set(HAVE_SSE4_1 FALSE)

@ -225,6 +236,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm.*")

  set(HAVE_AVX FALSE)
  set(HAVE_AVX2 FALSE)
+  set(HAVE_AVX512F FALSE)
  set(HAVE_FMA FALSE)
  set(HAVE_SSE4_1 FALSE)

@ -238,6 +250,7 @@ else()

  set(HAVE_AVX FALSE)
  set(HAVE_AVX2 FALSE)
+  set(HAVE_AVX512F FALSE)
  set(HAVE_FMA FALSE)
  set(HAVE_NEON FALSE)
  set(HAVE_SSE4_1 FALSE)
@ -480,6 +493,7 @@ message(STATUS "Linker options: ${CMAKE_EXE_LINKER_FLAGS} "
 message(STATUS "Install directory: ${CMAKE_INSTALL_PREFIX}")
 message(STATUS "HAVE_AVX: ${HAVE_AVX}")
 message(STATUS "HAVE_AVX2: ${HAVE_AVX2}")
+message(STATUS "HAVE_AVX512F: ${HAVE_AVX512F}")
 message(STATUS "HAVE_FMA: ${HAVE_FMA}")
 message(STATUS "HAVE_SSE4_1: ${HAVE_SSE4_1}")
 message(STATUS "MARCH_NATIVE_OPT: ${MARCH_NATIVE_OPT}")
@ -641,6 +655,11 @@ if(HAVE_AVX2)
  set_source_files_properties(src/arch/intsimdmatrixavx2.cpp
                              PROPERTIES COMPILE_FLAGS ${AVX2_COMPILE_FLAGS})
 endif(HAVE_AVX2)
+if(HAVE_AVX512F)
+  list(APPEND arch_files_opt src/arch/dotproductavx512.cpp)
+  set_source_files_properties(src/arch/dotproductavx512.cpp
+                              PROPERTIES COMPILE_FLAGS ${AVX512F_COMPILE_FLAGS})
+endif(HAVE_AVX512F)
 if(HAVE_FMA)
  list(APPEND arch_files_opt src/arch/dotproductfma.cpp)
  set_source_files_properties(src/arch/dotproductfma.cpp
--- a/Makefile.am
+++ b/Makefile.am
@ -160,6 +160,14 @@ libtesseract_la_LIBADD += libtesseract_avx2.la
 noinst_LTLIBRARIES += libtesseract_avx2.la
 endif

+if HAVE_AVX512F
+libtesseract_avx512_la_CXXFLAGS = -mavx512f
+libtesseract_avx512_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
+libtesseract_avx512_la_SOURCES = src/arch/dotproductavx512.cpp
+libtesseract_la_LIBADD += libtesseract_avx512.la
+noinst_LTLIBRARIES += libtesseract_avx512.la
+endif
+
 if HAVE_FMA
 libtesseract_fma_la_CXXFLAGS = -mfma
 libtesseract_fma_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
--- a/configure.ac
+++ b/configure.ac
@ -129,6 +129,7 @@ AX_CHECK_COMPILE_FLAG([-Werror=unused-command-line-argument], [WERROR=-Werror=un

 AM_CONDITIONAL([HAVE_AVX], false)
 AM_CONDITIONAL([HAVE_AVX2], false)
+AM_CONDITIONAL([HAVE_AVX512F], false)
 AM_CONDITIONAL([HAVE_FMA], false)
 AM_CONDITIONAL([HAVE_SSE4_1], false)
 AM_CONDITIONAL([HAVE_NEON], false)
@ -149,6 +150,12 @@ case "${host_cpu}" in
      AC_DEFINE([HAVE_AVX2], [1], [Enable AVX2 instructions])
    fi

+    AX_CHECK_COMPILE_FLAG([-mavx512f], [avx512f=true], [avx512f=false], [$WERROR])
+    AM_CONDITIONAL([HAVE_AVX512F], $avx512f)
+    if $avx512f; then
+      AC_DEFINE([HAVE_AVX512F], [1], [Enable AVX512F instructions])
+    fi
+
    AX_CHECK_COMPILE_FLAG([-mfma], [fma=true], [fma=false], [$WERROR])
    AM_CONDITIONAL([HAVE_FMA], $fma)
    if $fma; then
--- a/src/arch/dotproduct.h
+++ b/src/arch/dotproduct.h
@ -27,6 +27,9 @@ TFloat DotProductNative(const TFloat *u, const TFloat *v, int n);
 // Uses Intel AVX intrinsics to access the SIMD instruction set.
 TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n);

+// Uses Intel AVX512F intrinsics to access the SIMD instruction set.
+TFloat DotProductAVX512F(const TFloat *u, const TFloat *v, int n);
+
 // Use Intel FMA.
 TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n);

--- a/src/arch/dotproductavx512.cpp
+++ b/src/arch/dotproductavx512.cpp
@ -0,0 +1,70 @@
+///////////////////////////////////////////////////////////////////////
+// File:        dotproductavx512.cpp
+// Description: Architecture-specific dot-product function.
+// Author:      Stefan Weil
+//
+// (C) Copyright 2022
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#if !defined(__AVX__)
+#  if defined(__i686__) || defined(__x86_64__)
+#    error Implementation only for AVX capable architectures
+#  endif
+#else
+
+#  include <immintrin.h>
+#  include <cstdint>
+#  include "dotproduct.h"
+
+namespace tesseract {
+
+// Computes and returns the dot product of the n-vectors u and v.
+// Uses Intel AVX intrinsics to access the SIMD instruction set.
+#  if defined(FAST_FLOAT)
+float DotProductAVX512F(const float *u, const float *v, int n) {
+  const unsigned quot = n / 16;
+  const unsigned rem = n % 16;
+  __m512 t0 = _mm512_setzero_ps();
+  for (unsigned k = 0; k < quot; k++) {
+    __m512 f0 = _mm512_loadu_ps(u);
+    __m512 f1 = _mm512_loadu_ps(v);
+    t0 = _mm512_fmadd_ps(f0, f1, t0);
+    u += 16;
+    v += 16;
+  }
+  float result = _mm512_reduce_add_ps(t0);
+  for (unsigned k = 0; k < rem; k++) {
+    result += *u++ * *v++;
+  }
+  return result;
+}
+#  else
+double DotProductAVX512F(const double *u, const double *v, int n) {
+  const unsigned quot = n / 8;
+  const unsigned rem = n % 8;
+  __m512d t0 = _mm512_setzero_pd();
+  for (unsigned k = 0; k < quot; k++) {
+    t0 = _mm512_fmadd_pd(_mm512_loadu_pd(u), _mm512_loadu_pd(v), t0);
+    u += 8;
+    v += 8;
+  }
+  double result = _mm512_reduce_add_pd(t0);
+  for (unsigned k = 0; k < rem; k++) {
+    result += *u++ * *v++;
+  }
+  return result;
+}
+#  endif
+
+} // namespace tesseract.
+
+#endif
--- a/src/arch/simddetect.cpp
+++ b/src/arch/simddetect.cpp
@ -225,6 +225,11 @@ SIMDDetect::SIMDDetect() {
  // Select code for calculation of dot product based on autodetection.
  if (false) {
    // This is a dummy to support conditional compilation.
+#if defined(HAVE_AVX512F)
+  } else if (avx512F_available_) {
+    // AVX512F detected.
+    SetDotProduct(DotProductAVX512F, &IntSimdMatrix::intSimdMatrixAVX2);
+#endif
 #if defined(HAVE_AVX2)
  } else if (avx2_available_) {
    // AVX2 detected.