This commit is contained in:
Ilya Lavrenov 2015-01-12 10:59:28 +03:00
parent 28833421ae
commit 8d48632ebe
7 changed files with 105 additions and 39 deletions

View File

@ -221,6 +221,7 @@ OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions"
OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_AVX "Enable AVX instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
# AVX2 code generation switch: default OFF; the IF clause limits it to MSVC or GNU C++ on X86/X86_64.
OCV_OPTION(ENABLE_AVX2 "Enable AVX2 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_NEON "Enable NEON instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
OCV_OPTION(ENABLE_NOISY_WARNINGS "Show all warnings even if they are too noisy" OFF )

View File

@ -140,6 +140,9 @@ if(CMAKE_COMPILER_IS_GNUCXX)
if(ENABLE_AVX)
add_extra_compiler_option(-mavx)
endif()
# When ENABLE_AVX2 is set, request the -mavx2 flag through add_extra_compiler_option.
if(ENABLE_AVX2)
add_extra_compiler_option(-mavx2)
endif()
# GCC suppresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalents for all SSEx instructions when needed.
if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx")

View File

@ -114,7 +114,8 @@
#define CV_CPU_SSE4_2 7
#define CV_CPU_POPCNT 8
#define CV_CPU_AVX 10
#define CV_CPU_NEON 11
#define CV_CPU_AVX2 11
#define CV_CPU_NEON 12
// when adding to this list remember to update the enum in core/utility.cpp
#define CV_HARDWARE_MAX_FEATURE 255
@ -141,7 +142,7 @@
# include <nmmintrin.h>
# define CV_SSE4_2 1
# endif
# if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
# if defined __AVX__ || defined __AVX2__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
// MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
// See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
# include <immintrin.h>
@ -150,6 +151,9 @@
# define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
# else
# define __xgetbv() 0
# endif
// AVX2 availability does not depend on how __xgetbv() is provided, so CV_AVX2
// is defined after the conditional that selects the __xgetbv() definition
// has been closed, not inside its fallback branch.
# ifdef __AVX2__
# define CV_AVX2 1
# endif
# endif
#endif
@ -187,6 +191,9 @@
#ifndef CV_AVX
# define CV_AVX 0
#endif
// Fall back to 0 when CV_AVX2 has not been defined earlier.
#ifndef CV_AVX2
# define CV_AVX2 0
#endif
#ifndef CV_NEON
# define CV_NEON 0
#endif

View File

@ -2294,6 +2294,24 @@ cvtScale_<short, int, float>( const short* src, size_t sstep,
{
int x = 0;
#if CV_AVX2
    // AVX2 path: handles 16 source shorts per iteration, computing
    // src[i]*scale + shift in single precision and converting the result to
    // 32-bit integers with _mm256_cvtps_epi32. Elements left over (fewer than
    // 16) are handled by the code that follows this block.
    if (USE_AVX2)
    {
        __m256 scale256 = _mm256_set1_ps(scale);
        __m256 shift256 = _mm256_set1_ps(shift);
        for ( ; x <= size.width - 16; x += 16)
        {
            // Load two groups of 8 shorts and sign-extend each to 8 ints.
            // _mm256_cvtepi16_epi32 preserves both the sign and the element
            // order. Interleaving with a zero register via unpacklo/unpackhi
            // would zero-extend (negative inputs turn into large positive
            // values) and, because the 256-bit unpack works per 128-bit lane,
            // would put elements 8..11 where 4..7 belong and vice versa.
            __m256i v_src_lo = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i *)(src + x)));
            __m256i v_src_hi = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i *)(src + x + 8)));
            __m256 v_dst0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_lo), scale256), shift256);
            __m256 v_dst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_hi), scale256), shift256);
            _mm256_storeu_si256((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0));
            _mm256_storeu_si256((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1));
        }
    }
#endif
#if CV_SSE2
if (USE_SSE2)//~5X
{
@ -2330,24 +2348,6 @@ cvtScale_<short, int, float>( const short* src, size_t sstep,
}
#endif
//We will wait Haswell
/*
#if CV_AVX
if(USE_AVX)//2X - bad variant
{
////TODO:AVX implementation (optimization?) required
__m256 scale256 = _mm256_set1_ps (scale);
__m256 shift256 = _mm256_set1_ps (shift);
for(; x <= size.width - 8; x += 8 )
{
__m256i buf = _mm256_set_epi32((int)(*(src+x+7)),(int)(*(src+x+6)),(int)(*(src+x+5)),(int)(*(src+x+4)),(int)(*(src+x+3)),(int)(*(src+x+2)),(int)(*(src+x+1)),(int)(*(src+x)));
__m256 r0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (buf), scale256), shift256);
__m256i res = _mm256_cvtps_epi32(r0);
_mm256_storeu_si256 ((__m256i*)(dst+x), res);
}
}
#endif*/
for(; x < size.width; x++ )
dst[x] = saturate_cast<int>(src[x]*scale + shift);
}

View File

@ -192,6 +192,7 @@ struct NoVec
// Declarations of the SIMD enable flags. SIMD code paths test them at run time
// before executing (e.g. `if (USE_AVX2)`); the definitions are initialised
// from featuresEnabled.have[].
extern volatile bool USE_SSE2;
extern volatile bool USE_SSE4_2;
extern volatile bool USE_AVX;
extern volatile bool USE_AVX2;
enum { BLOCK_SIZE = 1024 };

View File

@ -82,6 +82,22 @@
pop ebx
}
}
// Inline-assembly stand-in for the __cpuidex intrinsic (the #if that selects
// it lies outside this view; presumably used where the intrinsic is missing).
//
// Executes CPUID with EAX = function_id and ECX = subfunction_id and stores
// the resulting EAX, EBX, ECX, EDX into cpuid_data[0], [1], [2], [3].
// The leaf and sub-leaf are taken from the arguments rather than being
// hard-coded, so the helper behaves like the intrinsic for any query, not
// only for leaf 7 / sub-leaf 0.
static void __cpuidex(int* cpuid_data, int function_id, int subfunction_id)
{
    __asm
    {
        push edi                    // EDI is used as the output pointer below
        mov edi, cpuid_data
        mov eax, function_id        // CPUID leaf
        mov ecx, subfunction_id     // CPUID sub-leaf
        cpuid
        mov [edi], eax
        mov [edi + 4], ebx
        mov [edi + 8], ecx
        mov [edi + 12], edx
        pop edi
    }
}
#endif
#endif
@ -251,6 +267,40 @@ struct HWFeatures
f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
// make the second call to the cpuid command in order to get
// information about extended features like AVX2
// (leaf EAX = 7, sub-leaf ECX = 0; results are written back into cpuid_data)
#if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
__cpuidex(cpuid_data, 7, 0);
#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
#ifdef __x86_64__
// 64-bit build: all four result registers are captured as outputs.
asm __volatile__
(
    "movl $7, %%eax\n\t"
    "movl $0, %%ecx\n\t"
    "cpuid\n\t"
    :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
    :
    : "cc"
);
#else
// 32-bit build: only EBX and ECX are returned; EAX and EDX are not declared
// as outputs, so they are saved and restored around cpuid by hand.
asm volatile
(
    "pushl %%eax\n\t"
    "pushl %%edx\n\t"
    "movl $7,%%eax\n\t"
    "movl $0,%%ecx\n\t"
    "cpuid\n\t"
    "popl %%edx\n\t"
    "popl %%eax\n\t"
    : "=b"(cpuid_data[1]), "=c"(cpuid_data[2])
    :
    : "cc"
);
#endif
#endif
// AVX2 is bit 5 of EBX for this leaf. It is reported only together with AVX:
// the AVX flag computed above already includes the OSXSAVE check, and a CPU
// without AVX cannot provide AVX2, so this also filters out stray bits
// returned by CPUs that do not implement leaf 7.
f.have[CV_CPU_AVX2] = f.have[CV_CPU_AVX] && (cpuid_data[1] & (1<<5)) != 0;
}
return f;
@ -290,6 +340,7 @@ IPPInitializer ippInitializer;
// Definitions of the SIMD enable flags; each one starts out as the matching
// entry of featuresEnabled.have[].
volatile bool USE_SSE2 = featuresEnabled.have[CV_CPU_SSE2];
volatile bool USE_SSE4_2 = featuresEnabled.have[CV_CPU_SSE4_2];
volatile bool USE_AVX = featuresEnabled.have[CV_CPU_AVX];
volatile bool USE_AVX2 = featuresEnabled.have[CV_CPU_AVX2];
void setUseOptimized( bool flag )
{

View File

@ -3019,6 +3019,9 @@ void printVersionInfo(bool useStdOut)
#if CV_AVX
if (checkHardwareSupport(CV_CPU_AVX)) cpu_features += " avx";
#endif
// Append " avx2" to the feature list only when AVX2 support is compiled in
// (CV_AVX2) and checkHardwareSupport(CV_CPU_AVX2) returns true.
#if CV_AVX2
if (checkHardwareSupport(CV_CPU_AVX2)) cpu_features += " avx2";
#endif
#if CV_NEON
cpu_features += " neon"; // NEON is currently not checked at runtime
#endif