Merge pull request #2561 from stweil/fma

Implement double dot product using Intel FMA and reduce number of include files
This commit is contained in:
zdenop 2019-07-13 09:51:32 +02:00 committed by GitHub
commit 8352194959
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 115 additions and 70 deletions

View File

@ -220,6 +220,7 @@ endforeach()
# add definition as expected in src/arch/simddetect.cpp
set(AVX_OPT OFF)
set(AVX2_OPT OFF)
set(FMA_OPT OFF)
set(SSE41_OPT OFF)
set(MARCH_NATIVE_OPT OFF)
foreach(flag ${_enable_vector_unit_list}) # from OptimizeForArchitecture()
@ -227,7 +228,7 @@ foreach(flag ${_enable_vector_unit_list}) # from OptimizeForArchitecture()
string(REPLACE "\." "_" flag "${flag}")
set(sim_flags "${sim_flags} -D${flag}")
string(REPLACE "_" "" flag "${flag}")
if("${flag}" MATCHES "AVX|AVX2|SSE41")
if("${flag}" MATCHES "AVX|AVX2|FMA|SSE41")
set("${flag}_OPT" ON)
endif()
endforeach(flag)
@ -285,6 +286,7 @@ message( STATUS "Architecture flags: ${Vc_ARCHITECTURE_FLAGS}")
message( STATUS "Vector unit list: ${_enable_vector_unit_list}")
message( STATUS "AVX_OPT: ${AVX_OPT}")
message( STATUS "AVX2_OPT: ${AVX2_OPT}")
message( STATUS "FMA_OPT: ${FMA_OPT}")
message( STATUS "SSE41_OPT: ${SSE41_OPT}")
message( STATUS "MARCH_NATIVE_OPT: ${MARCH_NATIVE_OPT}")
message( STATUS "sim_flags: ${sim_flags}")
@ -364,6 +366,9 @@ endif(AVX_OPT)
# When AVX2_OPT is ON, add the AVX2 integer SIMD matrix kernel to the library sources.
if(AVX2_OPT)
  list(APPEND tesseract_src
    src/arch/intsimdmatrixavx2.cpp
  )
endif()
# When FMA_OPT is ON, add the FMA dot product kernel to the library sources.
# endif() is left bare: the legacy argument form must repeat the opening
# condition verbatim, and a mismatched name (e.g. AVX_OPT) is an error-prone
# copy/paste trap.
if(FMA_OPT)
  list(APPEND tesseract_src src/arch/dotproductfma.cpp)
endif()
# When SSE41_OPT is ON, add the SSE dot product and integer SIMD matrix kernels.
if(SSE41_OPT)
  list(APPEND tesseract_src
    src/arch/dotproductsse.cpp
    src/arch/intsimdmatrixsse.cpp
  )
endif()
@ -505,8 +510,7 @@ install(FILES
${CMAKE_CURRENT_BINARY_DIR}/api/tess_version.h
#from arch/makefile.am
src/arch/dotproductavx.h
src/arch/dotproductsse.h
src/arch/dotproduct.h
src/arch/intsimdmatrix.h
src/arch/simddetect.h

View File

@ -126,6 +126,9 @@ AM_CONDITIONAL([AVX_OPT], ${avx})
AX_CHECK_COMPILE_FLAG([-mavx2], [avx2=true], [avx2=false], [$WERROR])
AM_CONDITIONAL([AVX2_OPT], $avx2)
AX_CHECK_COMPILE_FLAG([-mfma], [fma=true], [fma=false], [$WERROR])
AM_CONDITIONAL([FMA_OPT], $fma)
AX_CHECK_COMPILE_FLAG([-msse4.1], [sse41=true], [sse41=false], [$WERROR])
AM_CONDITIONAL([SSE41_OPT], $sse41)

View File

@ -68,6 +68,9 @@ endif
if AVX2_OPT
libtesseract_la_LIBADD += ../arch/libtesseract_avx2.la
endif
if FMA_OPT
libtesseract_la_LIBADD += ../arch/libtesseract_fma.la
endif
if SSE41_OPT
libtesseract_la_LIBADD += ../arch/libtesseract_sse.la
endif

View File

@ -136,6 +136,7 @@ static void PrintVersionInfo() {
if (tesseract::SIMDDetect::IsAVX512FAvailable()) printf(" Found AVX512F\n");
if (tesseract::SIMDDetect::IsAVX2Available()) printf(" Found AVX2\n");
if (tesseract::SIMDDetect::IsAVXAvailable()) printf(" Found AVX\n");
if (tesseract::SIMDDetect::IsFMAAvailable()) printf(" Found FMA\n");
if (tesseract::SIMDDetect::IsSSEAvailable()) printf(" Found SSE\n");
#ifdef _OPENMP
printf(" Found OpenMP %d\n", _OPENMP);

View File

@ -9,7 +9,7 @@ endif
pkginclude_HEADERS =
noinst_HEADERS = dotproduct.h dotproductavx.h dotproductsse.h
noinst_HEADERS = dotproduct.h
noinst_HEADERS += intsimdmatrix.h
noinst_HEADERS += simddetect.h
@ -20,6 +20,9 @@ endif
if AVX2_OPT
noinst_LTLIBRARIES += libtesseract_avx2.la
endif
if FMA_OPT
noinst_LTLIBRARIES += libtesseract_fma.la
endif
if SSE41_OPT
noinst_LTLIBRARIES += libtesseract_sse.la
endif
@ -34,6 +37,10 @@ if AVX2_OPT
libtesseract_arch_la_CPPFLAGS += -DAVX2
libtesseract_avx2_la_CXXFLAGS = -mavx2
endif
if FMA_OPT
libtesseract_arch_la_CPPFLAGS += -DFMA
libtesseract_fma_la_CXXFLAGS = -mfma
endif
if SSE41_OPT
libtesseract_arch_la_CPPFLAGS += -DSSE4_1
libtesseract_sse_la_CXXFLAGS = -msse4.1
@ -55,6 +62,10 @@ if AVX2_OPT
libtesseract_avx2_la_SOURCES = intsimdmatrixavx2.cpp
endif
if FMA_OPT
libtesseract_fma_la_SOURCES = dotproductfma.cpp
endif
if SSE41_OPT
libtesseract_sse_la_SOURCES = dotproductsse.cpp intsimdmatrixsse.cpp
endif

View File

@ -22,6 +22,15 @@ namespace tesseract {
// Computes and returns the dot product of the n-vectors u and v.
double DotProductNative(const double* u, const double* v, int n);
// Uses Intel AVX intrinsics to access the SIMD instruction set.
double DotProductAVX(const double* u, const double* v, int n);
// Uses Intel FMA intrinsics to access the SIMD instruction set.
double DotProductFMA(const double* u, const double* v, int n);
// Uses Intel SSE intrinsics to access the SIMD instruction set.
double DotProductSSE(const double* u, const double* v, int n);
} // namespace tesseract.
#endif // TESSERACT_ARCH_DOTPRODUCT_H_

View File

@ -2,7 +2,6 @@
// File: dotproductavx.cpp
// Description: Architecture-specific dot-product function.
// Author: Ray Smith
// Created: Wed Jul 22 10:48:05 PDT 2015
//
// (C) Copyright 2015, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
@ -22,7 +21,7 @@
#include <immintrin.h>
#include <cstdint>
#include "dotproductavx.h"
#include "dotproduct.h"
namespace tesseract {

View File

@ -1,30 +0,0 @@
///////////////////////////////////////////////////////////////////////
// File: dotproductavx.h
// Description: Architecture-specific dot-product function.
// Author: Ray Smith
// Created: Wed Jul 22 10:51:05 PDT 2015
//
// (C) Copyright 2015, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_ARCH_DOTPRODUCTAVX_H_
#define TESSERACT_ARCH_DOTPRODUCTAVX_H_
namespace tesseract {
// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel AVX intrinsics to access the SIMD instruction set.
double DotProductAVX(const double* u, const double* v, int n);
} // namespace tesseract.
#endif // TESSERACT_ARCH_DOTPRODUCTAVX_H_

View File

@ -0,0 +1,57 @@
///////////////////////////////////////////////////////////////////////
// File: dotproductfma.cpp
// Description: Architecture-specific dot-product function.
// Author: Stefan Weil
//
// (C) Copyright 2015, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#if !defined(__FMA__)
#error Implementation only for FMA capable architectures
#endif
#include <immintrin.h>
#include <cstdint>
#include "dotproduct.h"
namespace tesseract {
// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel FMA intrinsics to access the SIMD instruction set.
double DotProductFMA(const double* u, const double* v, int n) {
const unsigned quot = n / 8;
const unsigned rem = n % 8;
__m256d t0 = _mm256_setzero_pd();
__m256d t1 = _mm256_setzero_pd();
for (unsigned k = 0; k < quot; k++) {
__m256d f0 = _mm256_loadu_pd(u);
__m256d f1 = _mm256_loadu_pd(v);
t0 = _mm256_fmadd_pd(f0, f1, t0);
u += 4;
v += 4;
__m256d f2 = _mm256_loadu_pd(u);
__m256d f3 = _mm256_loadu_pd(v);
t1 = _mm256_fmadd_pd(f2, f3, t1);
u += 4;
v += 4;
}
t0 = _mm256_hadd_pd(t0, t1);
alignas(32) double tmp[4];
_mm256_store_pd(tmp, t0);
double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
for (unsigned k = 0; k < rem; k++) {
result += *u++ * *v++;
}
return result;
}
} // namespace tesseract.

View File

@ -22,7 +22,7 @@
#include <emmintrin.h>
#include <smmintrin.h>
#include <cstdint>
#include "dotproductsse.h"
#include "dotproduct.h"
namespace tesseract {

View File

@ -1,29 +0,0 @@
///////////////////////////////////////////////////////////////////////
// File: dotproductsse.h
// Description: Architecture-specific dot-product function.
// Author: Ray Smith
//
// (C) Copyright 2015, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_ARCH_DOTPRODUCTSSE_H_
#define TESSERACT_ARCH_DOTPRODUCTSSE_H_
namespace tesseract {
// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel SSE intrinsics to access the SIMD instruction set.
double DotProductSSE(const double* u, const double* v, int n);
} // namespace tesseract.
#endif // TESSERACT_ARCH_DOTPRODUCTSSE_H_

View File

@ -24,7 +24,6 @@
#include <cstdint>
#include <emmintrin.h>
#include <smmintrin.h>
#include "dotproductsse.h"
namespace tesseract {

View File

@ -18,13 +18,11 @@
#include <numeric> // for std::inner_product
#include "simddetect.h"
#include "dotproduct.h"
#include "dotproductavx.h"
#include "dotproductsse.h"
#include "intsimdmatrix.h" // for IntSimdMatrix
#include "params.h" // for STRING_VAR
#include "tprintf.h" // for tprintf
#if defined(AVX) || defined(AVX2) || defined(SSE4_1)
#if defined(AVX) || defined(AVX2) || defined(FMA) || defined(SSE4_1)
# define HAS_CPUID
#endif
@ -60,6 +58,8 @@ bool SIMDDetect::avx_available_;
bool SIMDDetect::avx2_available_;
bool SIMDDetect::avx512F_available_;
bool SIMDDetect::avx512BW_available_;
// If true, then FMA has been detected.
bool SIMDDetect::fma_available_;
// If true, then SSE4.1 has been detected.
bool SIMDDetect::sse_available_;
@ -98,6 +98,9 @@ SIMDDetect::SIMDDetect() {
#if defined(SSE4_1)
sse_available_ = (ecx & 0x00080000) != 0;
#endif
#if defined(FMA)
// Tests bit 12 (mask 0x00001000) of ecx; presumably ecx holds the CPUID
// leaf 1 feature flags, where bit 12 is FMA -- confirm against the caller.
// NOTE(review): no OS-support check (OSXSAVE/XGETBV for YMM state) is visible
// for this flag; the FMA kernel uses 256-bit registers -- confirm one exists.
fma_available_ = (ecx & 0x00001000) != 0;
#endif
#if defined(AVX)
avx_available_ = (ecx & 0x10000000) != 0;
if (avx_available_) {
@ -121,6 +124,9 @@ SIMDDetect::SIMDDetect() {
#if defined(SSE4_1)
sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
#endif
#if defined(FMA)
// Tests bit 12 (mask 0x00001000) of cpuInfo[2]; presumably cpuInfo[2] is the
// ECX output of CPUID leaf 1, where bit 12 is FMA -- confirm.
// NOTE(review): no OS-support check (OSXSAVE/XGETBV for YMM state) is visible
// for this flag; the FMA kernel uses 256-bit registers -- confirm one exists.
fma_available_ = (cpuInfo[2] & 0x00001000) != 0;
#endif
#if defined(AVX)
avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
#endif
@ -185,6 +191,12 @@ void SIMDDetect::Update() {
SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
dotproduct_method = "avx";
#endif
#if defined(FMA)
} else if (!strcmp(dotproduct.string(), "fma")) {
// FMA selected by config variable.
SetDotProduct(DotProductFMA, IntSimdMatrix::intSimdMatrix);
dotproduct_method = "fma";
#endif
#if defined(SSE4_1)
} else if (!strcmp(dotproduct.string(), "sse")) {
// SSE selected by config variable.

View File

@ -46,6 +46,10 @@ class SIMDDetect {
static inline bool IsAVX512BWAvailable() {
return detector.avx512BW_available_;
}
// Returns true if FMA is available on this system.
static inline bool IsFMAAvailable() {
return detector.fma_available_;
}
// Returns true if SSE4.1 is available on this system.
static inline bool IsSSEAvailable() {
return detector.sse_available_;
@ -66,6 +70,8 @@ class SIMDDetect {
static TESS_API bool avx2_available_;
static TESS_API bool avx512F_available_;
static TESS_API bool avx512BW_available_;
// If true, then FMA has been detected.
static TESS_API bool fma_available_;
// If true, then SSE4.1 has been detected.
static TESS_API bool sse_available_;
};