Merge pull request #3792 from stweil/avx512f

Add initial support for Intel AVX512F
This commit is contained in:
zdenop 2022-04-15 12:52:18 +02:00 committed by GitHub
commit 864ab537bb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 112 additions and 0 deletions

View File

@ -167,6 +167,10 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86|x86_64|AMD64|amd64|i386|i686")
set(AVX2_COMPILE_FLAGS "/arch:AVX2")
add_definitions("-DHAVE_AVX2")
set(HAVE_AVX512F ON)
set(AVX512_COMPILE_FLAGS "/arch:AVX512")
add_definitions("-DHAVE_AVX512F")
set(HAVE_FMA ON)
set(FMA_COMPILE_FLAGS "-D__FMA__")
add_definitions("-DHAVE_FMA")
@ -198,6 +202,12 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86|x86_64|AMD64|amd64|i386|i686")
add_definitions("-DHAVE_AVX2")
endif()
check_cxx_compiler_flag("-mavx512f" HAVE_AVX512F)
if(HAVE_AVX512F)
set(AVX512F_COMPILE_FLAGS "-mavx512f")
add_definitions("-DHAVE_AVX512F")
endif()
check_cxx_compiler_flag("-mfma" HAVE_FMA)
if(HAVE_FMA)
set(FMA_COMPILE_FLAGS "-mfma")
@ -215,6 +225,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64.*|AARCH64.*")
set(HAVE_AVX FALSE)
set(HAVE_AVX2 FALSE)
set(HAVE_AVX512F FALSE)
set(HAVE_FMA FALSE)
set(HAVE_SSE4_1 FALSE)
@ -225,6 +236,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm.*")
set(HAVE_AVX FALSE)
set(HAVE_AVX2 FALSE)
set(HAVE_AVX512F FALSE)
set(HAVE_FMA FALSE)
set(HAVE_SSE4_1 FALSE)
@ -238,6 +250,7 @@ else()
set(HAVE_AVX FALSE)
set(HAVE_AVX2 FALSE)
set(HAVE_AVX512F FALSE)
set(HAVE_FMA FALSE)
set(HAVE_NEON FALSE)
set(HAVE_SSE4_1 FALSE)
@ -480,6 +493,7 @@ message(STATUS "Linker options: ${CMAKE_EXE_LINKER_FLAGS} "
message(STATUS "Install directory: ${CMAKE_INSTALL_PREFIX}")
message(STATUS "HAVE_AVX: ${HAVE_AVX}")
message(STATUS "HAVE_AVX2: ${HAVE_AVX2}")
message(STATUS "HAVE_AVX512F: ${HAVE_AVX512F}")
message(STATUS "HAVE_FMA: ${HAVE_FMA}")
message(STATUS "HAVE_SSE4_1: ${HAVE_SSE4_1}")
message(STATUS "MARCH_NATIVE_OPT: ${MARCH_NATIVE_OPT}")
@ -641,6 +655,11 @@ if(HAVE_AVX2)
set_source_files_properties(src/arch/intsimdmatrixavx2.cpp
PROPERTIES COMPILE_FLAGS ${AVX2_COMPILE_FLAGS})
endif(HAVE_AVX2)
if(HAVE_AVX512F)
list(APPEND arch_files_opt src/arch/dotproductavx512.cpp)
set_source_files_properties(src/arch/dotproductavx512.cpp
PROPERTIES COMPILE_FLAGS ${AVX512F_COMPILE_FLAGS})
endif(HAVE_AVX512F)
if(HAVE_FMA)
list(APPEND arch_files_opt src/arch/dotproductfma.cpp)
set_source_files_properties(src/arch/dotproductfma.cpp

View File

@ -160,6 +160,14 @@ libtesseract_la_LIBADD += libtesseract_avx2.la
noinst_LTLIBRARIES += libtesseract_avx2.la
endif
if HAVE_AVX512F
libtesseract_avx512_la_CXXFLAGS = -mavx512f
libtesseract_avx512_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_avx512_la_SOURCES = src/arch/dotproductavx512.cpp
libtesseract_la_LIBADD += libtesseract_avx512.la
noinst_LTLIBRARIES += libtesseract_avx512.la
endif
if HAVE_FMA
libtesseract_fma_la_CXXFLAGS = -mfma
libtesseract_fma_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil

View File

@ -129,6 +129,7 @@ AX_CHECK_COMPILE_FLAG([-Werror=unused-command-line-argument], [WERROR=-Werror=un
AM_CONDITIONAL([HAVE_AVX], false)
AM_CONDITIONAL([HAVE_AVX2], false)
AM_CONDITIONAL([HAVE_AVX512F], false)
AM_CONDITIONAL([HAVE_FMA], false)
AM_CONDITIONAL([HAVE_SSE4_1], false)
AM_CONDITIONAL([HAVE_NEON], false)
@ -149,6 +150,12 @@ case "${host_cpu}" in
AC_DEFINE([HAVE_AVX2], [1], [Enable AVX2 instructions])
fi
AX_CHECK_COMPILE_FLAG([-mavx512f], [avx512f=true], [avx512f=false], [$WERROR])
AM_CONDITIONAL([HAVE_AVX512F], $avx512f)
if $avx512f; then
AC_DEFINE([HAVE_AVX512F], [1], [Enable AVX512F instructions])
fi
AX_CHECK_COMPILE_FLAG([-mfma], [fma=true], [fma=false], [$WERROR])
AM_CONDITIONAL([HAVE_FMA], $fma)
if $fma; then

View File

@ -27,6 +27,9 @@ TFloat DotProductNative(const TFloat *u, const TFloat *v, int n);
// Uses Intel AVX intrinsics to access the SIMD instruction set.
TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n);
// Uses Intel AVX512F intrinsics to access the SIMD instruction set.
TFloat DotProductAVX512F(const TFloat *u, const TFloat *v, int n);
// Use Intel FMA.
TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n);

View File

@ -0,0 +1,70 @@
///////////////////////////////////////////////////////////////////////
// File: dotproductavx512.cpp
// Description: Architecture-specific dot-product function.
// Author: Stefan Weil
//
// (C) Copyright 2022
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#if !defined(__AVX__)
# if defined(__i686__) || defined(__x86_64__)
# error Implementation only for AVX capable architectures
# endif
#else
# include <immintrin.h>
# include <cstdint>
# include "dotproduct.h"
namespace tesseract {
// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel AVX intrinsics to access the SIMD instruction set.
# if defined(FAST_FLOAT)
float DotProductAVX512F(const float *u, const float *v, int n) {
const unsigned quot = n / 16;
const unsigned rem = n % 16;
__m512 t0 = _mm512_setzero_ps();
for (unsigned k = 0; k < quot; k++) {
__m512 f0 = _mm512_loadu_ps(u);
__m512 f1 = _mm512_loadu_ps(v);
t0 = _mm512_fmadd_ps(f0, f1, t0);
u += 16;
v += 16;
}
float result = _mm512_reduce_add_ps(t0);
for (unsigned k = 0; k < rem; k++) {
result += *u++ * *v++;
}
return result;
}
# else
double DotProductAVX512F(const double *u, const double *v, int n) {
const unsigned quot = n / 8;
const unsigned rem = n % 8;
__m512d t0 = _mm512_setzero_pd();
for (unsigned k = 0; k < quot; k++) {
t0 = _mm512_fmadd_pd(_mm512_loadu_pd(u), _mm512_loadu_pd(v), t0);
u += 8;
v += 8;
}
double result = _mm512_reduce_add_pd(t0);
for (unsigned k = 0; k < rem; k++) {
result += *u++ * *v++;
}
return result;
}
# endif
} // namespace tesseract.
#endif

View File

@ -225,6 +225,11 @@ SIMDDetect::SIMDDetect() {
// Select code for calculation of dot product based on autodetection.
if (false) {
// This is a dummy to support conditional compilation.
#if defined(HAVE_AVX512F)
} else if (avx512F_available_) {
// AVX512F detected.
SetDotProduct(DotProductAVX512F, &IntSimdMatrix::intSimdMatrixAVX2);
#endif
#if defined(HAVE_AVX2)
} else if (avx2_available_) {
// AVX2 detected.