Mirror of https://github.com/tesseract-ocr/tesseract.git (synced 2024-11-24 11:09:06 +08:00)
Implement DotProductSSE() for FAST_FLOAT
[sw] Formatted commit message
This commit is contained in:
parent 79e8b4f344
commit 27597883db
@ -31,12 +31,63 @@ namespace tesseract {
|
||||
// Computes and returns the dot product of the n-vectors u and v.
|
||||
// Uses Intel SSE intrinsics to access the SIMD instruction set.
|
||||
#if defined(FAST_FLOAT)
|
||||
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {
|
||||
TFloat total = 0.0;
|
||||
for (int k = 0; k < n; ++k) {
|
||||
total += u[k] * v[k];
|
||||
float DotProductSSE(const float *u, const float *v, int n) {
|
||||
int max_offset = n - 4;
|
||||
int offset = 0;
|
||||
// Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
|
||||
// v, and multiplying them together in parallel.
|
||||
__m128 sum = _mm_setzero_ps();
|
||||
if (offset <= max_offset) {
|
||||
offset = 4;
|
||||
// Aligned load is reputedly faster but requires 16 byte aligned input.
|
||||
if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
|
||||
(reinterpret_cast<uintptr_t>(v) & 15) == 0) {
|
||||
// Use aligned load.
|
||||
sum = _mm_load_ps(u);
|
||||
__m128 floats2 = _mm_load_ps(v);
|
||||
// Multiply.
|
||||
sum = _mm_mul_ps(sum, floats2);
|
||||
while (offset <= max_offset) {
|
||||
__m128 floats1 = _mm_load_ps(u + offset);
|
||||
floats2 = _mm_load_ps(v + offset);
|
||||
floats1 = _mm_mul_ps(floats1, floats2);
|
||||
sum = _mm_add_ps(sum, floats1);
|
||||
offset += 4;
|
||||
}
|
||||
} else {
|
||||
// Use unaligned load.
|
||||
sum = _mm_loadu_ps(u);
|
||||
__m128 floats2 = _mm_loadu_ps(v);
|
||||
// Multiply.
|
||||
sum = _mm_mul_ps(sum, floats2);
|
||||
while (offset <= max_offset) {
|
||||
__m128 floats1 = _mm_loadu_ps(u + offset);
|
||||
floats2 = _mm_loadu_ps(v + offset);
|
||||
floats1 = _mm_mul_ps(floats1, floats2);
|
||||
sum = _mm_add_ps(sum, floats1);
|
||||
offset += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
return total;
|
||||
// Add the 4 sums in sum horizontally.
|
||||
#if 0
|
||||
alignas(32) float tmp[4];
|
||||
_mm_store_ps(tmp, sum);
|
||||
float result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
|
||||
#else
|
||||
__m128 zero = _mm_setzero_ps();
|
||||
// https://www.felixcloutier.com/x86/haddps
|
||||
sum = _mm_hadd_ps(sum, zero);
|
||||
sum = _mm_hadd_ps(sum, zero);
|
||||
// Extract the low result.
|
||||
float result = _mm_cvtss_f32(sum);
|
||||
#endif
|
||||
// Add on any left-over products.
|
||||
while (offset < n) {
|
||||
result += u[offset] * v[offset];
|
||||
++offset;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
#else
|
||||
double DotProductSSE(const double *u, const double *v, int n) {
|
||||
@ -48,7 +99,8 @@ double DotProductSSE(const double *u, const double *v, int n) {
|
||||
if (offset <= max_offset) {
|
||||
offset = 2;
|
||||
// Aligned load is reputedly faster but requires 16 byte aligned input.
|
||||
if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 && (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
|
||||
if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
|
||||
(reinterpret_cast<uintptr_t>(v) & 15) == 0) {
|
||||
// Use aligned load.
|
||||
sum = _mm_load_pd(u);
|
||||
__m128d floats2 = _mm_load_pd(v);
|
||||
|
@ -115,7 +115,7 @@ struct TESS_API IntSimdMatrix {
|
||||
static const IntSimdMatrix *intSimdMatrix;
|
||||
// Only available with NEON.
|
||||
static const IntSimdMatrix intSimdMatrixNEON;
|
||||
// Only available with AVX2 / SSE.
|
||||
// Only available with AVX2 / AVX / FMA / SSE.
|
||||
static const IntSimdMatrix intSimdMatrixAVX2;
|
||||
static const IntSimdMatrix intSimdMatrixSSE;
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user