Merge pull request #738 from stweil/avx

Support AVX for 32 bit platforms
2024-11-27 20:59:36 +08:00 · 2017-02-27 19:35:19 +01:00 · 2017-02-27 19:35:19 +01:00 · e68e48772d
commit e68e48772d
parent 6a7831b06d e663d00fbe
2 changed files with 9 additions and 14 deletions
--- a/arch/dotproductavx.cpp
+++ b/arch/dotproductavx.cpp
@@ -16,9 +16,8 @@
 // limitations under the License.
 ///////////////////////////////////////////////////////////////////////

-#if !defined(__AVX__) || defined(__i386__)
+#if !defined(__AVX__)
 // Implementation for non-avx archs.
-// Also used for 32 bit AVX archs because of missing _mm256_extract_epi64.

 #include "dotproductavx.h"
 #include <stdio.h>
@@ -92,13 +91,13 @@ double DotProductAVX(const double* u, const double* v, int n) {
  // fool the instrinsics into thinking we are extracting the bottom int64.
  auto cast_sum = _mm256_castpd_si256(sum);
  *(reinterpret_cast<inT64*>(&result)) =
-#ifndef _WIN32
-      _mm256_extract_epi64(cast_sum, 0)
-#else
-      // this is a very simple workaround that probably could be activated
-      // for all other platforms that do not have _mm256_extract_epi64
+#if defined(_WIN32) || defined(__i386__)
+      // This is a very simple workaround that is activated
+      // for all platforms that do not have _mm256_extract_epi64.
      // _mm256_extract_epi64(X, Y) == ((uint64_t*)&X)[Y]
      ((uint64_t*)&cast_sum)[0]
+#else
+      _mm256_extract_epi64(cast_sum, 0)
 #endif
      ;
  while (offset < n) {
--- a/configure.ac
+++ b/configure.ac
@@ -119,13 +119,9 @@ esac
 AM_CONDITIONAL([AVX_OPT], false)
 AM_CONDITIONAL([SSE41_OPT], false)

-# The current implementation for AVX uses 64 bit code.
-AC_CHECK_SIZEOF([void *])
-if test "$ac_cv_sizeof_void_p" = "8"; then
-    AX_CHECK_COMPILE_FLAG([-mavx], [avx=true], [avx=false])
-    if $avx; then
-        AM_CONDITIONAL([AVX_OPT], true)
-    fi
+AX_CHECK_COMPILE_FLAG([-mavx], [avx=true], [avx=false])
+if $avx; then
+    AM_CONDITIONAL([AVX_OPT], true)
 fi

 AX_CHECK_COMPILE_FLAG([-msse4.1], [sse41=true], [sse41=false])