mirror of https://github.com/opencv/opencv.git

commit 8d48632ebe ("avx2")
parent 28833421ae
@@ -221,6 +221,7 @@ OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions"
 OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_AVX "Enable AVX instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_AVX2 "Enable AVX2 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_NEON "Enable NEON instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
 OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
 OCV_OPTION(ENABLE_NOISY_WARNINGS "Show all warnings even if they are too noisy" OFF )
@@ -140,6 +140,9 @@ if(CMAKE_COMPILER_IS_GNUCXX)
   if(ENABLE_AVX)
     add_extra_compiler_option(-mavx)
   endif()
+  if(ENABLE_AVX2)
+    add_extra_compiler_option(-mavx2)
+  endif()
 
   # GCC depresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalence for all SSEx instructions when needed.
   if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx")
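Not part of the patch, just for context: a minimal sketch (hypothetical file name, assuming GCC/Clang or MSVC) showing that building with -mavx2 (or /arch:AVX2) makes the compiler predefine __AVX2__, which is what the header change below keys on to turn CV_AVX2 on.

// avx2_macro_check.cpp -- illustrative only; build with: g++ -mavx2 avx2_macro_check.cpp
#include <cstdio>

int main()
{
#ifdef __AVX2__
    std::puts("__AVX2__ predefined: AVX2 intrinsics may be compiled in this translation unit");
#else
    std::puts("__AVX2__ not defined: built without -mavx2 / /arch:AVX2");
#endif
    return 0;
}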
@@ -114,7 +114,8 @@
 #define CV_CPU_SSE4_2 7
 #define CV_CPU_POPCNT 8
 #define CV_CPU_AVX 10
-#define CV_CPU_NEON 11
+#define CV_CPU_AVX2 11
+#define CV_CPU_NEON 12
 // when adding to this list remember to update the enum in core/utility.cpp
 #define CV_HARDWARE_MAX_FEATURE 255
 
@@ -141,7 +142,7 @@
 # include <nmmintrin.h>
 # define CV_SSE4_2 1
 # endif
-# if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
+# if defined __AVX__ || defined __AVX2__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
 // MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
 // See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
 # include <immintrin.h>
@@ -150,6 +151,9 @@
 # define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
 # else
 # define __xgetbv() 0
+# ifdef __AVX2__
+# define CV_AVX2 1
+# endif
 # endif
 # endif
 #endif
@@ -187,6 +191,9 @@
 #ifndef CV_AVX
 # define CV_AVX 0
 #endif
+#ifndef CV_AVX2
+# define CV_AVX2 0
+#endif
 #ifndef CV_NEON
 # define CV_NEON 0
 #endif
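Taken together, the defines above give a compile-time switch (CV_AVX2) and an index for the runtime flag (CV_CPU_AVX2) that the detection code later in this commit fills in. A minimal sketch of how calling code would typically combine the two, assuming an OpenCV build where cv::checkHardwareSupport() is available from the core header (the umbrella header name below is an assumption; 2.4-era code used opencv2/core/core.hpp):

// avx2_guard.cpp -- illustrative only
#include <cstdio>
#include <opencv2/core.hpp>   // assumption: modern umbrella header

int main()
{
#if CV_AVX2
    // The AVX2 path was compiled in; still gate on the runtime CPU check.
    bool avx2 = cv::checkHardwareSupport(CV_CPU_AVX2);
#else
    bool avx2 = false;        // AVX2 path not compiled in at all
#endif
    std::printf("use AVX2 path: %s\n", avx2 ? "yes" : "no");
    return 0;
}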
@@ -2294,26 +2294,44 @@ cvtScale_<short, int, float>( const short* src, size_t sstep,
 {
     int x = 0;
 
-    #if CV_SSE2
-    if(USE_SSE2)//~5X
+    #if CV_AVX2
+    if (USE_AVX2)
     {
-        __m128 scale128 = _mm_set1_ps (scale);
-        __m128 shift128 = _mm_set1_ps (shift);
-        for(; x <= size.width - 8; x += 8 )
-        {
-            __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
-            __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
-            __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
-            __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
-            rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
-            rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
-            r0 = _mm_cvtps_epi32(rf0);
-            r1 = _mm_cvtps_epi32(rf1);
-
-            _mm_storeu_si128((__m128i*)(dst + x), r0);
-            _mm_storeu_si128((__m128i*)(dst + x + 4), r1);
-        }
+        __m256 scale256 = _mm256_set1_ps (scale);
+        __m256 shift256 = _mm256_set1_ps (shift);
+        __m256i zero = _mm256_setzero_si256();
+        for ( ; x <= size.width - 16; x += 16)
+        {
+            __m256i v_src = _mm256_loadu_si256((__m256i const *)(src + x));
+            __m256i v_src_lo = _mm256_unpacklo_epi16(v_src, zero);
+            __m256i v_src_hi = _mm256_unpackhi_epi16(v_src, zero);
+            __m256 v_dst0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (v_src_lo), scale256), shift256);
+            __m256 v_dst1 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (v_src_hi), scale256), shift256);
+            _mm256_storeu_si256 ((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0));
+            _mm256_storeu_si256 ((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1));
+        }
+    }
+    #endif
+    #if CV_SSE2
+    if (USE_SSE2)//~5X
+    {
+        __m128 scale128 = _mm_set1_ps (scale);
+        __m128 shift128 = _mm_set1_ps (shift);
+        for(; x <= size.width - 8; x += 8 )
+        {
+            __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
+            __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
+            __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
+            __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
+            rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
+            rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
+            r0 = _mm_cvtps_epi32(rf0);
+            r1 = _mm_cvtps_epi32(rf1);
+
+            _mm_storeu_si128((__m128i*)(dst + x), r0);
+            _mm_storeu_si128((__m128i*)(dst + x + 4), r1);
+        }
+    }
     #elif CV_NEON
     float32x4_t v_shift = vdupq_n_f32(shift);
     for(; x <= size.width - 8; x += 8 )
@@ -2330,24 +2348,6 @@ cvtScale_<short, int, float>( const short* src, size_t sstep,
     }
     #endif
 
-    //We will wait Haswell
-    /*
-    #if CV_AVX
-    if(USE_AVX)//2X - bad variant
-    {
-        ////TODO:AVX implementation (optimization?) required
-        __m256 scale256 = _mm256_set1_ps (scale);
-        __m256 shift256 = _mm256_set1_ps (shift);
-        for(; x <= size.width - 8; x += 8 )
-        {
-            __m256i buf = _mm256_set_epi32((int)(*(src+x+7)),(int)(*(src+x+6)),(int)(*(src+x+5)),(int)(*(src+x+4)),(int)(*(src+x+3)),(int)(*(src+x+2)),(int)(*(src+x+1)),(int)(*(src+x)));
-            __m256 r0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (buf), scale256), shift256);
-            __m256i res = _mm256_cvtps_epi32(r0);
-            _mm256_storeu_si256 ((__m256i*)(dst+x), res);
-        }
-    }
-    #endif*/
-
     for(; x < size.width; x++ )
         dst[x] = saturate_cast<int>(src[x]*scale + shift);
 }
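For comparison, a self-contained sketch of the same short-to-int convert-with-scale kernel, not taken from OpenCV; the function and file names are made up. It widens with _mm256_cvtepi16_epi32, which sign-extends the 16-bit inputs, whereas the unpack-against-zero approach in the hunk above zero-extends them, so this variant stays closer to what the scalar saturate_cast path computes for negative values.

// cvt_scale_avx2.cpp -- illustrative only; compile with -mavx2 (or /arch:AVX2)
#include <immintrin.h>
#include <cmath>
#include <cstddef>
#include <cstdio>

static void cvtScaleS16ToS32(const short* src, int* dst, std::size_t n,
                             float scale, float shift)
{
#ifdef __AVX2__
    const __m256 vscale = _mm256_set1_ps(scale);
    const __m256 vshift = _mm256_set1_ps(shift);
    std::size_t x = 0;
    for (; x + 8 <= n; x += 8)
    {
        __m128i v16 = _mm_loadu_si128((const __m128i*)(src + x)); // load 8 shorts
        __m256i v32 = _mm256_cvtepi16_epi32(v16);                 // sign-extend to 8 ints
        __m256  vf  = _mm256_cvtepi32_ps(v32);                    // convert to float
        vf = _mm256_add_ps(_mm256_mul_ps(vf, vscale), vshift);    // scale and shift
        _mm256_storeu_si256((__m256i*)(dst + x), _mm256_cvtps_epi32(vf));
    }
    for (; x < n; x++)                                            // scalar tail
        dst[x] = (int)std::lrintf(src[x] * scale + shift);
#else
    for (std::size_t x = 0; x < n; x++)
        dst[x] = (int)std::lrintf(src[x] * scale + shift);
#endif
}

int main()
{
    short in[10] = { -3, -2, -1, 0, 1, 2, 3, 4, 5, 6 };
    int out[10] = { 0 };
    cvtScaleS16ToS32(in, out, 10, 2.0f, 0.25f);
    std::printf("%d %d\n", out[0], out[9]);  // expected: -6 12 (-5.75 and 12.25 rounded to nearest)
    return 0;
}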
@@ -192,6 +192,7 @@ struct NoVec
 extern volatile bool USE_SSE2;
 extern volatile bool USE_SSE4_2;
 extern volatile bool USE_AVX;
+extern volatile bool USE_AVX2;
 
 enum { BLOCK_SIZE = 1024 };
 
@@ -82,6 +82,22 @@
         pop ebx
     }
 }
+static void __cpuidex(int* cpuid_data, int, int)
+{
+    __asm
+    {
+        push edi
+        mov edi, cpuid_data
+        mov eax, 7
+        mov ecx, 0
+        cpuid
+        mov [edi], eax
+        mov [edi + 4], ebx
+        mov [edi + 8], ecx
+        mov [edi + 12], edx
+        pop edi
+    }
+}
 #endif
 #endif
 
@@ -203,7 +219,7 @@ struct HWFeatures
     enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE };
 
     HWFeatures(void)
-    {
+    {
         memset( have, 0, sizeof(have) );
         x86_family = 0;
     }
@@ -251,6 +267,40 @@ struct HWFeatures
         f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
         f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
         f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
+
+        // make the second call to the cpuid command in order to get
+        // information about extended features like AVX2
+    #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+        __cpuidex(cpuid_data, 7, 0);
+    #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    #ifdef __x86_64__
+        asm __volatile__
+        (
+         "movl $7, %%eax\n\t"
+         "movl $0, %%ecx\n\t"
+         "cpuid\n\t"
+         :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
+         :
+         : "cc"
+        );
+    #else
+        asm volatile
+        (
+         "pushl %%eax\n\t"
+         "pushl %%edx\n\t"
+         "movl $7,%%eax\n\t"
+         "movl $0,%%ecx\n\t"
+         "cpuid\n\t"
+         "popl %%edx\n\t"
+         "popl %%eax\n\t"
+         : "=b"(cpuid_data[1]), "=c"(cpuid_data[2])
+         :
+         : "cc"
+        );
+    #endif
+    #endif
+        f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0;
+
     }
 
     return f;
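The added call queries CPUID leaf 7, sub-leaf 0 and tests EBX bit 5, which is the AVX2 feature flag. Below is a standalone sketch of the same query, assuming a GCC/Clang toolchain whose <cpuid.h> provides __get_cpuid_count (MSVC users get the same registers from the __cpuidex intrinsic in <intrin.h>); a fuller check would also verify OSXSAVE/XGETBV so the OS is known to save the YMM state.

// avx2_cpuid_check.cpp -- illustrative only; GCC/Clang with <cpuid.h>
#include <cpuid.h>
#include <cstdio>

static bool cpuHasAvx2()
{
    unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
    // Leaf 7, sub-leaf 0: structured extended feature flags; AVX2 is EBX bit 5,
    // the same bit the patch tests via (cpuid_data[1] & (1<<5)).
    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        return false;                 // CPU does not report leaf 7
    return (ebx & (1u << 5)) != 0;
}

int main()
{
    std::printf("AVX2 reported by CPUID: %s\n", cpuHasAvx2() ? "yes" : "no");
    return 0;
}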
@@ -290,6 +340,7 @@ IPPInitializer ippInitializer;
 volatile bool USE_SSE2 = featuresEnabled.have[CV_CPU_SSE2];
 volatile bool USE_SSE4_2 = featuresEnabled.have[CV_CPU_SSE4_2];
 volatile bool USE_AVX = featuresEnabled.have[CV_CPU_AVX];
+volatile bool USE_AVX2 = featuresEnabled.have[CV_CPU_AVX2];
 
 void setUseOptimized( bool flag )
 {
@@ -3019,6 +3019,9 @@ void printVersionInfo(bool useStdOut)
 #if CV_AVX
     if (checkHardwareSupport(CV_CPU_AVX)) cpu_features += " avx";
 #endif
+#if CV_AVX2
+    if (checkHardwareSupport(CV_CPU_AVX2)) cpu_features += " avx2";
+#endif
 #if CV_NEON
     cpu_features += " neon"; // NEON is currently not checked at runtime
 #endif