diff --git a/CMakeLists.txt b/CMakeLists.txt
index da0b42cb1c..2f4fd3323d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -214,13 +214,13 @@ OCV_OPTION(ENABLE_COVERAGE "Enable coverage collection with GCov"
 OCV_OPTION(ENABLE_OMIT_FRAME_POINTER "Enable -fomit-frame-pointer for GCC" ON IF CMAKE_COMPILER_IS_GNUCXX AND NOT (APPLE AND CMAKE_COMPILER_IS_CLANGCXX) )
 OCV_OPTION(ENABLE_POWERPC "Enable PowerPC for GCC" ON IF (CMAKE_COMPILER_IS_GNUCXX AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) )
 OCV_OPTION(ENABLE_FAST_MATH "Enable -ffast-math (not recommended for GCC 4.6.x)" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_POPCNT "Enable POPCNT instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_SSE "Enable SSE instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_SSE2 "Enable SSE2 instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE3 "Enable SSE3 instructions" ON IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_SSE3 "Enable SSE3 instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_POPCNT "Enable POPCNT instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_AVX "Enable AVX instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_AVX2 "Enable AVX2 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_FMA3 "Enable FMA3 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index 66e16e7863..13559b5c8a 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -122,9 +122,6 @@ if(CMAKE_COMPILER_IS_GNUCXX)
     if(ENABLE_POWERPC)
       add_extra_compiler_option("-mcpu=G3 -mtune=G5")
     endif()
-    if(ENABLE_POPCNT)
-      add_extra_compiler_option(-mpopcnt)
-    endif()
     if(ENABLE_SSE)
       add_extra_compiler_option(-msse)
     endif()
@@ -168,6 +165,10 @@ if(CMAKE_COMPILER_IS_GNUCXX)
         if(ENABLE_SSE42)
           add_extra_compiler_option(-msse4.2)
         endif()
+
+        if(ENABLE_POPCNT)
+          add_extra_compiler_option(-mpopcnt)
+        endif()
       endif()
     endif(NOT MINGW)
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index a9d59c7693..3fdaa6954d 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -166,7 +166,7 @@
 #  endif
 #  define CV_POPCNT 1
 # endif
-# if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600)
+# if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600 && 0)
 // MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
 // See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
 #  include <immintrin.h>
@@ -177,7 +177,7 @@
 #   define __xgetbv() 0
 #  endif
 # endif
-# if defined __AVX2__ || (defined _MSC_VER && _MSC_VER >= 1800)
+# if defined __AVX2__ || (defined _MSC_VER && _MSC_VER >= 1800 && 0)
 #  include <immintrin.h>
 #  define CV_AVX2 1
 #  if defined __FMA__
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index b26308051b..87c423dc3b 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -800,6 +800,137 @@ static CountNonZeroFunc getCountNonZeroTab(int depth)
     return countNonZeroTab[depth];
 }
 
+template <typename T, typename ST, typename SQT>
+struct SumSqr_SIMD
+{
+    int operator () (const T *, const uchar *, ST *, SQT *, int, int) const
+    {
+        return 0;
+    }
+};
+
+#if CV_SSE2
+
+template <>
+struct SumSqr_SIMD<uchar, int, int>
+{
+    int operator () (const uchar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
+    {
+        if (mask || (cn != 1 && cn != 2) || !USE_SSE2)
+            return 0;
+
+        int x = 0;
+        __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero;
+
+        for ( ; x <= len - 16; x += 16)
+        {
+            __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
+            __m128i v_half = _mm_unpacklo_epi8(v_src, v_zero);
+
+            __m128i v_mullo = _mm_mullo_epi16(v_half, v_half);
+            __m128i v_mulhi = _mm_mulhi_epi16(v_half, v_half);
+            v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_half, v_zero));
+            v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_half, v_zero));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
+
+            v_half = _mm_unpackhi_epi8(v_src, v_zero);
+            v_mullo = _mm_mullo_epi16(v_half, v_half);
+            v_mulhi = _mm_mulhi_epi16(v_half, v_half);
+            v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_half, v_zero));
+            v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_half, v_zero));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
+        }
+
+        for ( ; x <= len - 8; x += 8)
+        {
+            __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src0 + x)), v_zero);
+
+            __m128i v_mullo = _mm_mullo_epi16(v_src, v_src);
+            __m128i v_mulhi = _mm_mulhi_epi16(v_src, v_src);
+            v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_src, v_zero));
+            v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_src, v_zero));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
+        }
+
+        int CV_DECL_ALIGNED(16) ar[8];
+        _mm_store_si128((__m128i*)ar, v_sum);
+        _mm_store_si128((__m128i*)(ar + 4), v_sqsum);
+
+        for (int i = 0; i < 4; i += cn)
+            for (int j = 0; j < cn; ++j)
+            {
+                sum[j] += ar[j + i];
+                sqsum[j] += ar[4 + j + i];
+            }
+
+        return x / cn;
+    }
+};
+
+template <>
+struct SumSqr_SIMD<schar, int, int>
+{
+    int operator () (const schar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
+    {
+        if (mask || (cn != 1 && cn != 2) || !USE_SSE2)
+            return 0;
+
+        int x = 0;
+        __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero;
+
+        for ( ; x <= len - 16; x += 16)
+        {
+            __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
+            __m128i v_half = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8);
+
+            __m128i v_mullo = _mm_mullo_epi16(v_half, v_half);
+            __m128i v_mulhi = _mm_mulhi_epi16(v_half, v_half);
+            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
+            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
+
+            v_half = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8);
+            v_mullo = _mm_mullo_epi16(v_half, v_half);
+            v_mulhi = _mm_mulhi_epi16(v_half, v_half);
+            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
+            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
+        }
+
+        for ( ; x <= len - 8; x += 8)
+        {
+            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8);
+
+            __m128i v_mullo = _mm_mullo_epi16(v_src, v_src);
+            __m128i v_mulhi = _mm_mulhi_epi16(v_src, v_src);
+            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
+            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
+        }
+
+        int CV_DECL_ALIGNED(16) ar[8];
+        _mm_store_si128((__m128i*)ar, v_sum);
+        _mm_store_si128((__m128i*)(ar + 4), v_sqsum);
+
+        for (int i = 0; i < 4; i += cn)
+            for (int j = 0; j < cn; ++j)
+            {
+                sum[j] += ar[j + i];
+                sqsum[j] += ar[4 + j + i];
+            }
+
+        return x / cn;
+    }
+};
+
+#endif
+
 template<typename T, typename ST, typename SQT>
 static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int len, int cn )
 {
@@ -807,14 +938,15 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le
 
     if( !mask )
     {
-        int i;
-        int k = cn % 4;
+        SumSqr_SIMD<T, ST, SQT> vop;
+        int i = vop(src0, mask, sum, sqsum, len, cn), k = cn % 4;
+        src += i * cn;
 
         if( k == 1 )
         {
             ST s0 = sum[0];
             SQT sq0 = sqsum[0];
-            for( i = 0; i < len; i++, src += cn )
+            for( ; i < len; i++, src += cn )
             {
                 T v = src[0];
                 s0 += v; sq0 += (SQT)v*v;
@@ -826,7 +958,7 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le
         {
             ST s0 = sum[0], s1 = sum[1];
             SQT sq0 = sqsum[0], sq1 = sqsum[1];
-            for( i = 0; i < len; i++, src += cn )
+            for( ; i < len; i++, src += cn )
             {
                 T v0 = src[0], v1 = src[1];
                 s0 += v0; sq0 += (SQT)v0*v0;
@@ -839,7 +971,7 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le
         {
             ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
             SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
-            for( i = 0; i < len; i++, src += cn )
+            for( ; i < len; i++, src += cn )
             {
                 T v0 = src[0], v1 = src[1], v2 = src[2];
                 s0 += v0; sq0 += (SQT)v0*v0;
@@ -855,7 +987,7 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le
             src = src0 + k;
             ST s0 = sum[k], s1 = sum[k+1], s2 = sum[k+2], s3 = sum[k+3];
             SQT sq0 = sqsum[k], sq1 = sqsum[k+1], sq2 = sqsum[k+2], sq3 = sqsum[k+3];
-            for( i = 0; i < len; i++, src += cn )
+            for( ; i < len; i++, src += cn )
             {
                 T v0, v1;
                 v0 = src[0], v1 = src[1];
diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp
index 5ae1170b43..b900cf1845 100644
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -1598,10 +1598,10 @@ struct RGB2Gray<float>
         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
     }
 
-    void process(__m128 v_r, __m128 v_g, __m128 v_b,
+    void process(__m128 v_b, __m128 v_g, __m128 v_r,
                  __m128 & v_gray) const
     {
-        v_gray = _mm_mul_ps(v_r, v_cb);
+        v_gray = _mm_mul_ps(v_r, v_cr);
         v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_g, v_cg));
         v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_b, v_cb));
     }
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index 304210f84e..fe126fbbd1 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -5016,8 +5016,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                                                           vandq_s32(v_ix1, v_mask)));
                 vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
             }
-            #elif CV_SSE2
-            if (useSSE2)
+            #elif CV_SSE4_1
+            if (useSSE4_1)
             {
                 __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
                 __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp
index 2dc2fbdf7e..ec274259e1 100644
--- a/modules/imgproc/src/smooth.cpp
+++ b/modules/imgproc/src/smooth.cpp
@@ -842,7 +842,7 @@ struct ColumnSum<int, int> :
             {
                 int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
 
-                vst1q_s32(D + i, v_s01);
+                vst1q_s32(D + i, v_s0);
                 vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
             }
 #endif
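
Note on the SumSqr_SIMD specializations: each 16-bit square is widened to 32 bits by pairing _mm_mullo_epi16 with _mm_mulhi_epi16 and re-interleaving the two halves with _mm_unpacklo/hi_epi16. For uchar input the high half is always zero (255*255 fits in 16 bits), but the same pairing reconstructs the full 32-bit product in general, which is why the schar path can reuse it. A minimal standalone check of that trick (illustrative code, not part of the patch; the sqsum8_sse2 harness is made up; build with e.g. g++ -msse2 -O2):

#include <emmintrin.h>  // SSE2 intrinsics
#include <cstdio>

// Sum of squares of 8 unsigned 8-bit values, using the same
// mullo/mulhi pairing as the patch's SumSqr_SIMD<uchar, int, int>.
static int sqsum8_sse2(const unsigned char* src)
{
    __m128i v_zero = _mm_setzero_si128();
    // Widen 8 uchars to 8 x 16-bit lanes.
    __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const*)src), v_zero);

    // 16x16 -> 32-bit squares: low and high 16-bit halves of each product.
    __m128i v_mullo = _mm_mullo_epi16(v_src, v_src);
    __m128i v_mulhi = _mm_mulhi_epi16(v_src, v_src);

    // Interleaving the halves rebuilds four full 32-bit squares per unpack.
    __m128i v_sqsum = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo, v_mulhi),
                                    _mm_unpackhi_epi16(v_mullo, v_mulhi));

    int buf[4];
    _mm_storeu_si128((__m128i*)buf, v_sqsum);
    return buf[0] + buf[1] + buf[2] + buf[3];
}

int main()
{
    unsigned char data[8] = { 0, 1, 2, 3, 250, 251, 254, 255 };
    int scalar = 0;
    for (int i = 0; i < 8; ++i)
        scalar += data[i] * data[i];
    printf("sse2=%d scalar=%d\n", sqsum8_sse2(data), scalar);  // both 255056
    return 0;
}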
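The change to sumsqr_() follows the usual OpenCV pattern of a SIMD prologue with a scalar tail: the functor returns how many pixels per channel it already accumulated, the source pointer is advanced by i * cn, and the existing scalar loops resume from that index instead of zero. A self-contained sketch of that control flow (process_simd_stub is a hypothetical stand-in for SumSqr_SIMD, for illustration only):

#include <cstdio>

// Stand-in for the SIMD functor: pretends to handle the first
// (len & ~7) pixels; returns 0 when the vector path does not apply.
static int process_simd_stub(const unsigned char* src, int* sum, int len, int cn)
{
    if (cn != 1)
        return 0;
    int n = len & ~7;
    for (int i = 0; i < n; i++)
        sum[0] += src[i];
    return n;
}

// Mirrors the patched sumsqr_(): SIMD prologue, then a scalar tail
// that resumes at index i with src advanced by i * cn.
static void sum_with_prologue(const unsigned char* src, int* sum, int len, int cn)
{
    int i = process_simd_stub(src, sum, len, cn);
    src += i * cn;
    for (; i < len; i++, src += cn)
        for (int c = 0; c < cn; c++)
            sum[c] += src[c];
}

int main()
{
    unsigned char data[11] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
    int sum = 0;
    sum_with_prologue(data, &sum, 11, 1);
    printf("sum=%d\n", sum);  // 66
    return 0;
}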