From a73e7b97a414d3a87833157e78d3b1aa93128572 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 2 Aug 2021 13:23:30 +0200 Subject: [PATCH] Add float dotproduct implementation for NEON Signed-off-by: Stefan Weil --- Makefile.am | 5 +++ src/arch/dotproduct.h | 3 ++ src/arch/dotproductneon.cpp | 67 +++++++++++++++++++++++++++++++++++++ src/arch/simddetect.cpp | 4 +-- 4 files changed, 77 insertions(+), 2 deletions(-) create mode 100644 src/arch/dotproductneon.cpp diff --git a/Makefile.am b/Makefile.am index 741cd02b..97f0d434 100644 --- a/Makefile.am +++ b/Makefile.am @@ -187,8 +187,13 @@ endif if HAVE_NEON libtesseract_neon_la_CXXFLAGS = $(NEON_CXXFLAGS) +libtesseract_neon_la_CXXFLAGS += -O3 +if OPENMP_SIMD +libtesseract_neon_la_CXXFLAGS += -fopenmp-simd -DOPENMP_SIMD +endif libtesseract_neon_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil libtesseract_neon_la_SOURCES = src/arch/intsimdmatrixneon.cpp +libtesseract_neon_la_SOURCES += src/arch/dotproductneon.cpp libtesseract_la_LIBADD += libtesseract_neon.la noinst_LTLIBRARIES += libtesseract_neon.la endif diff --git a/src/arch/dotproduct.h b/src/arch/dotproduct.h index 94ba4310..4ee8ddd4 100644 --- a/src/arch/dotproduct.h +++ b/src/arch/dotproduct.h @@ -33,6 +33,9 @@ TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n); // Uses Intel SSE intrinsics to access the SIMD instruction set. TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n); +// Use NEON intrinsics. +TFloat DotProductNEON(const TFloat *u, const TFloat *v, int n); + } // namespace tesseract. #endif // TESSERACT_ARCH_DOTPRODUCT_H_ diff --git a/src/arch/dotproductneon.cpp b/src/arch/dotproductneon.cpp new file mode 100644 index 00000000..de6ea2f8 --- /dev/null +++ b/src/arch/dotproductneon.cpp @@ -0,0 +1,67 @@ +/////////////////////////////////////////////////////////////////////// +// File: dotproductneon.cpp +// Description: Dot product function for ARM NEON. +// Author: Stefan Weil +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include +#include "dotproduct.h" + +namespace tesseract { + +// Documentation: +// https://developer.arm.com/architectures/instruction-sets/intrinsics/ + +#if defined(FAST_FLOAT) && defined(__ARM_ARCH_ISA_A64) + +float DotProductNEON(const float *u, const float *v, int n) { + float32x4_t result0123 = vdupq_n_f32(0.0f); + float32x4_t result4567 = vdupq_n_f32(0.0f); + while (n > 7) { + // Calculate 8 dot products per iteration. + float32x4_t u0 = vld1q_f32(u); + float32x4_t v0 = vld1q_f32(v); + float32x4_t u4 = vld1q_f32(u + 4); + float32x4_t v4 = vld1q_f32(v + 4); + result0123 = vfmaq_f32(result0123, u0, v0); + result4567 = vfmaq_f32(result4567, u4, v4); + u += 8; + v += 8; + n -= 8; + } + float total = vaddvq_f32(result0123); + total += vaddvq_f32(result4567); + while (n > 0) { + total += *u++ * *v++; + n--; + } + return total; +} + +#else + +// Computes and returns the dot product of the two n-vectors u and v. +TFloat DotProductNEON(const TFloat *u, const TFloat *v, int n) { + TFloat total = 0; +#if defined(OPENMP_SIMD) || defined(_OPENMP) +#pragma omp simd reduction(+:total) +#endif + for (int k = 0; k < n; k++) { + total += u[k] * v[k]; + } + return total; +} + +#endif + +} // namespace tesseract diff --git a/src/arch/simddetect.cpp b/src/arch/simddetect.cpp index a8244c82..4bc0ba03 100644 --- a/src/arch/simddetect.cpp +++ b/src/arch/simddetect.cpp @@ -237,7 +237,7 @@ SIMDDetect::SIMDDetect() { #if defined(HAVE_NEON) || defined(__aarch64__) } else if (neon_available_) { // NEON detected. - SetDotProduct(DotProductNative, &IntSimdMatrix::intSimdMatrixNEON); + SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON); #endif } @@ -294,7 +294,7 @@ void SIMDDetect::Update() { #if defined(HAVE_NEON) || defined(__aarch64__) } else if (dotproduct == "neon" && neon_available_) { // NEON selected by config variable. - SetDotProduct(DotProductNative, &IntSimdMatrix::intSimdMatrixNEON); + SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON); dotproduct_method = "neon"; #endif } else if (dotproduct == "std::inner_product") {