mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-04 01:39:16 +08:00
Merge pull request #2561 from stweil/fma
Implement double dot product using Intel FMA and reduce number of include files
This commit is contained in:
commit
8352194959
@ -220,6 +220,7 @@ endforeach()
|
||||
# add definition as expected in src/arch/simddetect.cpp
|
||||
set(AVX_OPT OFF)
|
||||
set(AVX2_OPT OFF)
|
||||
set(FMA_OPT OFF)
|
||||
set(SSE41_OPT OFF)
|
||||
set(MARCH_NATIVE_OPT OFF)
|
||||
foreach(flag ${_enable_vector_unit_list}) # from OptimizeForArchitecture()
|
||||
@ -227,7 +228,7 @@ foreach(flag ${_enable_vector_unit_list}) # from OptimizeForArchitecture()
|
||||
string(REPLACE "\." "_" flag "${flag}")
|
||||
set(sim_flags "${sim_flags} -D${flag}")
|
||||
string(REPLACE "_" "" flag "${flag}")
|
||||
if("${flag}" MATCHES "AVX|AVX2|SSE41")
|
||||
if("${flag}" MATCHES "AVX|AVX2|FMA|SSE41")
|
||||
set("${flag}_OPT" ON)
|
||||
endif()
|
||||
endforeach(flag)
|
||||
@ -285,6 +286,7 @@ message( STATUS "Architecture flags: ${Vc_ARCHITECTURE_FLAGS}")
|
||||
message( STATUS "Vector unit list: ${_enable_vector_unit_list}")
|
||||
message( STATUS "AVX_OPT: ${AVX_OPT}")
|
||||
message( STATUS "AVX2_OPT: ${AVX2_OPT}")
|
||||
message( STATUS "FMA_OPT: ${FMA_OPT}")
|
||||
message( STATUS "SSE41_OPT: ${SSE41_OPT}")
|
||||
message( STATUS "MARCH_NATIVE_OPT: ${MARCH_NATIVE_OPT}")
|
||||
message( STATUS "sim_flags: ${sim_flags}")
|
||||
@ -364,6 +366,9 @@ endif(AVX_OPT)
|
||||
if(AVX2_OPT)
|
||||
list(APPEND tesseract_src src/arch/intsimdmatrixavx2.cpp)
|
||||
endif(AVX2_OPT)
|
||||
if(FMA_OPT)
|
||||
list(APPEND tesseract_src src/arch/dotproductfma.cpp)
|
||||
endif(AVX_OPT)
|
||||
if(SSE41_OPT)
|
||||
list(APPEND tesseract_src src/arch/dotproductsse.cpp src/arch/intsimdmatrixsse.cpp)
|
||||
endif(SSE41_OPT)
|
||||
@ -505,8 +510,7 @@ install(FILES
|
||||
${CMAKE_CURRENT_BINARY_DIR}/api/tess_version.h
|
||||
|
||||
#from arch/makefile.am
|
||||
src/arch/dotproductavx.h
|
||||
src/arch/dotproductsse.h
|
||||
src/arch/dotproduct.h
|
||||
src/arch/intsimdmatrix.h
|
||||
src/arch/simddetect.h
|
||||
|
||||
|
@ -126,6 +126,9 @@ AM_CONDITIONAL([AVX_OPT], ${avx})
|
||||
AX_CHECK_COMPILE_FLAG([-mavx2], [avx2=true], [avx2=false], [$WERROR])
|
||||
AM_CONDITIONAL([AVX2_OPT], $avx2)
|
||||
|
||||
AX_CHECK_COMPILE_FLAG([-mfma], [fma=true], [fma=false], [$WERROR])
|
||||
AM_CONDITIONAL([FMA_OPT], $fma)
|
||||
|
||||
AX_CHECK_COMPILE_FLAG([-msse4.1], [sse41=true], [sse41=false], [$WERROR])
|
||||
AM_CONDITIONAL([SSE41_OPT], $sse41)
|
||||
|
||||
|
@ -68,6 +68,9 @@ endif
|
||||
if AVX2_OPT
|
||||
libtesseract_la_LIBADD += ../arch/libtesseract_avx2.la
|
||||
endif
|
||||
if FMA_OPT
|
||||
libtesseract_la_LIBADD += ../arch/libtesseract_fma.la
|
||||
endif
|
||||
if SSE41_OPT
|
||||
libtesseract_la_LIBADD += ../arch/libtesseract_sse.la
|
||||
endif
|
||||
|
@ -136,6 +136,7 @@ static void PrintVersionInfo() {
|
||||
if (tesseract::SIMDDetect::IsAVX512FAvailable()) printf(" Found AVX512F\n");
|
||||
if (tesseract::SIMDDetect::IsAVX2Available()) printf(" Found AVX2\n");
|
||||
if (tesseract::SIMDDetect::IsAVXAvailable()) printf(" Found AVX\n");
|
||||
if (tesseract::SIMDDetect::IsFMAAvailable()) printf(" Found FMA\n");
|
||||
if (tesseract::SIMDDetect::IsSSEAvailable()) printf(" Found SSE\n");
|
||||
#ifdef _OPENMP
|
||||
printf(" Found OpenMP %d\n", _OPENMP);
|
||||
|
@ -9,7 +9,7 @@ endif
|
||||
|
||||
pkginclude_HEADERS =
|
||||
|
||||
noinst_HEADERS = dotproduct.h dotproductavx.h dotproductsse.h
|
||||
noinst_HEADERS = dotproduct.h
|
||||
noinst_HEADERS += intsimdmatrix.h
|
||||
noinst_HEADERS += simddetect.h
|
||||
|
||||
@ -20,6 +20,9 @@ endif
|
||||
if AVX2_OPT
|
||||
noinst_LTLIBRARIES += libtesseract_avx2.la
|
||||
endif
|
||||
if FMA_OPT
|
||||
noinst_LTLIBRARIES += libtesseract_fma.la
|
||||
endif
|
||||
if SSE41_OPT
|
||||
noinst_LTLIBRARIES += libtesseract_sse.la
|
||||
endif
|
||||
@ -34,6 +37,10 @@ if AVX2_OPT
|
||||
libtesseract_arch_la_CPPFLAGS += -DAVX2
|
||||
libtesseract_avx2_la_CXXFLAGS = -mavx2
|
||||
endif
|
||||
if FMA_OPT
|
||||
libtesseract_arch_la_CPPFLAGS += -DFMA
|
||||
libtesseract_fma_la_CXXFLAGS = -mfma
|
||||
endif
|
||||
if SSE41_OPT
|
||||
libtesseract_arch_la_CPPFLAGS += -DSSE4_1
|
||||
libtesseract_sse_la_CXXFLAGS = -msse4.1
|
||||
@ -55,6 +62,10 @@ if AVX2_OPT
|
||||
libtesseract_avx2_la_SOURCES = intsimdmatrixavx2.cpp
|
||||
endif
|
||||
|
||||
if FMA_OPT
|
||||
libtesseract_fma_la_SOURCES = dotproductfma.cpp
|
||||
endif
|
||||
|
||||
if SSE41_OPT
|
||||
libtesseract_sse_la_SOURCES = dotproductsse.cpp intsimdmatrixsse.cpp
|
||||
endif
|
||||
|
@ -22,6 +22,15 @@ namespace tesseract {
|
||||
// Computes and returns the dot product of the n-vectors u and v.
|
||||
double DotProductNative(const double* u, const double* v, int n);
|
||||
|
||||
// Uses Intel AVX intrinsics to access the SIMD instruction set.
|
||||
double DotProductAVX(const double* u, const double* v, int n);
|
||||
|
||||
// Use Intel FMA.
|
||||
double DotProductFMA(const double* u, const double* v, int n);
|
||||
|
||||
// Uses Intel SSE intrinsics to access the SIMD instruction set.
|
||||
double DotProductSSE(const double* u, const double* v, int n);
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_ARCH_DOTPRODUCT_H_
|
||||
|
@ -2,7 +2,6 @@
|
||||
// File: dotproductavx.cpp
|
||||
// Description: Architecture-specific dot-product function.
|
||||
// Author: Ray Smith
|
||||
// Created: Wed Jul 22 10:48:05 PDT 2015
|
||||
//
|
||||
// (C) Copyright 2015, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -22,7 +21,7 @@
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <cstdint>
|
||||
#include "dotproductavx.h"
|
||||
#include "dotproduct.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
|
@ -1,30 +0,0 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: dotproductavx.h
|
||||
// Description: Architecture-specific dot-product function.
|
||||
// Author: Ray Smith
|
||||
// Created: Wed Jul 22 10:51:05 PDT 2015
|
||||
//
|
||||
// (C) Copyright 2015, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_ARCH_DOTPRODUCTAVX_H_
|
||||
#define TESSERACT_ARCH_DOTPRODUCTAVX_H_
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Computes and returns the dot product of the n-vectors u and v.
|
||||
// Uses Intel AVX intrinsics to access the SIMD instruction set.
|
||||
double DotProductAVX(const double* u, const double* v, int n);
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_ARCH_DOTPRODUCTAVX_H_
|
57
src/arch/dotproductfma.cpp
Normal file
57
src/arch/dotproductfma.cpp
Normal file
@ -0,0 +1,57 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: dotproductfma.cpp
|
||||
// Description: Architecture-specific dot-product function.
|
||||
// Author: Stefan Weil
|
||||
//
|
||||
// (C) Copyright 2015, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if !defined(__FMA__)
|
||||
#error Implementation only for FMA capable architectures
|
||||
#endif
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <cstdint>
|
||||
#include "dotproduct.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Computes and returns the dot product of the n-vectors u and v.
|
||||
// Uses Intel FMA intrinsics to access the SIMD instruction set.
|
||||
double DotProductFMA(const double* u, const double* v, int n) {
|
||||
const unsigned quot = n / 8;
|
||||
const unsigned rem = n % 8;
|
||||
__m256d t0 = _mm256_setzero_pd();
|
||||
__m256d t1 = _mm256_setzero_pd();
|
||||
for (unsigned k = 0; k < quot; k++) {
|
||||
__m256d f0 = _mm256_loadu_pd(u);
|
||||
__m256d f1 = _mm256_loadu_pd(v);
|
||||
t0 = _mm256_fmadd_pd(f0, f1, t0);
|
||||
u += 4;
|
||||
v += 4;
|
||||
__m256d f2 = _mm256_loadu_pd(u);
|
||||
__m256d f3 = _mm256_loadu_pd(v);
|
||||
t1 = _mm256_fmadd_pd(f2, f3, t1);
|
||||
u += 4;
|
||||
v += 4;
|
||||
}
|
||||
t0 = _mm256_hadd_pd(t0, t1);
|
||||
alignas(32) double tmp[4];
|
||||
_mm256_store_pd(tmp, t0);
|
||||
double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
|
||||
for (unsigned k = 0; k < rem; k++) {
|
||||
result += *u++ * *v++;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
@ -22,7 +22,7 @@
|
||||
#include <emmintrin.h>
|
||||
#include <smmintrin.h>
|
||||
#include <cstdint>
|
||||
#include "dotproductsse.h"
|
||||
#include "dotproduct.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
|
@ -1,29 +0,0 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: dotproductsse.h
|
||||
// Description: Architecture-specific dot-product function.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2015, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_ARCH_DOTPRODUCTSSE_H_
|
||||
#define TESSERACT_ARCH_DOTPRODUCTSSE_H_
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Computes and returns the dot product of the n-vectors u and v.
|
||||
// Uses Intel SSE intrinsics to access the SIMD instruction set.
|
||||
double DotProductSSE(const double* u, const double* v, int n);
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_ARCH_DOTPRODUCTSSE_H_
|
@ -24,7 +24,6 @@
|
||||
#include <cstdint>
|
||||
#include <emmintrin.h>
|
||||
#include <smmintrin.h>
|
||||
#include "dotproductsse.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
|
@ -18,13 +18,11 @@
|
||||
#include <numeric> // for std::inner_product
|
||||
#include "simddetect.h"
|
||||
#include "dotproduct.h"
|
||||
#include "dotproductavx.h"
|
||||
#include "dotproductsse.h"
|
||||
#include "intsimdmatrix.h" // for IntSimdMatrix
|
||||
#include "params.h" // for STRING_VAR
|
||||
#include "tprintf.h" // for tprintf
|
||||
|
||||
#if defined(AVX) || defined(AVX2) || defined(SSE4_1)
|
||||
#if defined(AVX) || defined(AVX2) || defined(FMA) || defined(SSE4_1)
|
||||
# define HAS_CPUID
|
||||
#endif
|
||||
|
||||
@ -60,6 +58,8 @@ bool SIMDDetect::avx_available_;
|
||||
bool SIMDDetect::avx2_available_;
|
||||
bool SIMDDetect::avx512F_available_;
|
||||
bool SIMDDetect::avx512BW_available_;
|
||||
// If true, then FMA has been detected.
|
||||
bool SIMDDetect::fma_available_;
|
||||
// If true, then SSe4.1 has been detected.
|
||||
bool SIMDDetect::sse_available_;
|
||||
|
||||
@ -98,6 +98,9 @@ SIMDDetect::SIMDDetect() {
|
||||
#if defined(SSE4_1)
|
||||
sse_available_ = (ecx & 0x00080000) != 0;
|
||||
#endif
|
||||
#if defined(FMA)
|
||||
fma_available_ = (ecx & 0x00001000) != 0;
|
||||
#endif
|
||||
#if defined(AVX)
|
||||
avx_available_ = (ecx & 0x10000000) != 0;
|
||||
if (avx_available_) {
|
||||
@ -121,6 +124,9 @@ SIMDDetect::SIMDDetect() {
|
||||
#if defined(SSE4_1)
|
||||
sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
|
||||
#endif
|
||||
#if defined(FMA)
|
||||
fma_available_ = (cpuInfo[2] & 0x00001000) != 0;
|
||||
#endif
|
||||
#if defined(AVX)
|
||||
avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
|
||||
#endif
|
||||
@ -185,6 +191,12 @@ void SIMDDetect::Update() {
|
||||
SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
|
||||
dotproduct_method = "avx";
|
||||
#endif
|
||||
#if defined(FMA)
|
||||
} else if (!strcmp(dotproduct.string(), "fma")) {
|
||||
// FMA selected by config variable.
|
||||
SetDotProduct(DotProductFMA, IntSimdMatrix::intSimdMatrix);
|
||||
dotproduct_method = "fma";
|
||||
#endif
|
||||
#if defined(SSE4_1)
|
||||
} else if (!strcmp(dotproduct.string(), "sse")) {
|
||||
// SSE selected by config variable.
|
||||
|
@ -46,6 +46,10 @@ class SIMDDetect {
|
||||
static inline bool IsAVX512BWAvailable() {
|
||||
return detector.avx512BW_available_;
|
||||
}
|
||||
// Returns true if FMA is available on this system.
|
||||
static inline bool IsFMAAvailable() {
|
||||
return detector.fma_available_;
|
||||
}
|
||||
// Returns true if SSE4.1 is available on this system.
|
||||
static inline bool IsSSEAvailable() {
|
||||
return detector.sse_available_;
|
||||
@ -66,6 +70,8 @@ class SIMDDetect {
|
||||
static TESS_API bool avx2_available_;
|
||||
static TESS_API bool avx512F_available_;
|
||||
static TESS_API bool avx512BW_available_;
|
||||
// If true, then FMA has been detected.
|
||||
static TESS_API bool fma_available_;
|
||||
// If true, then SSe4.1 has been detected.
|
||||
static TESS_API bool sse_available_;
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user