From 8d48632ebef60e7c4d92b5c9d6549f8e1623010a Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 12 Jan 2015 10:59:28 +0300
Subject: [PATCH 01/53] avx2

---
 CMakeLists.txt                            |  1 +
 cmake/OpenCVCompilerOptions.cmake         |  3 +
 modules/core/include/opencv2/core/cvdef.h | 11 +++-
 modules/core/src/convert.cpp              | 72 +++++++++++------------
 modules/core/src/precomp.hpp              |  1 +
 modules/core/src/system.cpp               | 53 ++++++++++++++++-
 modules/ts/src/ts_func.cpp                |  3 +
 7 files changed, 105 insertions(+), 39 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 75fcf9659b..7b5648efd4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -221,6 +221,7 @@ OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions"
 OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_AVX "Enable AVX instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_AVX2 "Enable AVX2 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_NEON "Enable NEON instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
 OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
 OCV_OPTION(ENABLE_NOISY_WARNINGS "Show all warnings even if they are too noisy" OFF )
diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index 2f9068c60d..831026fb50 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -140,6 +140,9 @@ if(CMAKE_COMPILER_IS_GNUCXX)
     if(ENABLE_AVX)
       add_extra_compiler_option(-mavx)
     endif()
+    if(ENABLE_AVX2)
+      add_extra_compiler_option(-mavx2)
+    endif()
 
     # GCC depresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalence for all SSEx instructions when needed.
     if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx")
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index 06894d7a5d..c52cb021cb 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -114,7 +114,8 @@
 #define CV_CPU_SSE4_2 7
 #define CV_CPU_POPCNT 8
 #define CV_CPU_AVX 10
-#define CV_CPU_NEON 11
+#define CV_CPU_AVX2 11
+#define CV_CPU_NEON 12
 
 // when adding to this list remember to update the enum in core/utility.cpp
 #define CV_HARDWARE_MAX_FEATURE 255
@@ -141,7 +142,7 @@
 # include
 # define CV_SSE4_2 1
 # endif
-# if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
+# if defined __AVX__ || defined __AVX2__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
 // MS Visual Studio 2010 (2012?)
has no macro pre-defined to identify the use of /arch:AVX // See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32 # include @@ -150,6 +151,9 @@ # define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK) # else # define __xgetbv() 0 +# ifdef __AVX2__ +# define CV_AVX2 1 +# endif # endif # endif #endif @@ -187,6 +191,9 @@ #ifndef CV_AVX # define CV_AVX 0 #endif +#ifndef CV_AVX2 +# define CV_AVX2 0 +#endif #ifndef CV_NEON # define CV_NEON 0 #endif diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 829b984c9f..55f08f1bde 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -2294,26 +2294,44 @@ cvtScale_( const short* src, size_t sstep, { int x = 0; - #if CV_SSE2 - if(USE_SSE2)//~5X + #if CV_AVX2 + if (USE_AVX2) + { + __m256 scale256 = _mm256_set1_ps (scale); + __m256 shift256 = _mm256_set1_ps (shift); + __m256i zero = _mm256_setzero_si256(); + for ( ; x <= size.width - 16; x += 16) { - __m128 scale128 = _mm_set1_ps (scale); - __m128 shift128 = _mm_set1_ps (shift); - for(; x <= size.width - 8; x += 8 ) - { - __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x)); - __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4)); - __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16)); - __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16)); - rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128); - rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128); - r0 = _mm_cvtps_epi32(rf0); - r1 = _mm_cvtps_epi32(rf1); - - _mm_storeu_si128((__m128i*)(dst + x), r0); - _mm_storeu_si128((__m128i*)(dst + x + 4), r1); - } + __m256i v_src = _mm256_loadu_si256((__m256i const *)(src + x)); + __m256i v_src_lo = _mm256_unpacklo_epi16(v_src, zero); + __m256i v_src_hi = _mm256_unpackhi_epi16(v_src, zero); + __m256 v_dst0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (v_src_lo), scale256), shift256); + __m256 v_dst1 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (v_src_hi), scale256), shift256); + _mm256_storeu_si256 ((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0)); + _mm256_storeu_si256 ((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1)); } + } + #endif + #if CV_SSE2 + if (USE_SSE2)//~5X + { + __m128 scale128 = _mm_set1_ps (scale); + __m128 shift128 = _mm_set1_ps (shift); + for(; x <= size.width - 8; x += 8 ) + { + __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x)); + __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4)); + __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16)); + __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16)); + rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128); + rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128); + r0 = _mm_cvtps_epi32(rf0); + r1 = _mm_cvtps_epi32(rf1); + + _mm_storeu_si128((__m128i*)(dst + x), r0); + _mm_storeu_si128((__m128i*)(dst + x + 4), r1); + } + } #elif CV_NEON float32x4_t v_shift = vdupq_n_f32(shift); for(; x <= size.width - 8; x += 8 ) @@ -2330,24 +2348,6 @@ cvtScale_( const short* src, size_t sstep, } #endif - //We will wait Haswell - /* - #if CV_AVX - if(USE_AVX)//2X - bad variant - { - ////TODO:AVX implementation (optimization?) 
required
-        __m256 scale256 = _mm256_set1_ps (scale);
-        __m256 shift256 = _mm256_set1_ps (shift);
-        for(; x <= size.width - 8; x += 8 )
-        {
-            __m256i buf = _mm256_set_epi32((int)(*(src+x+7)),(int)(*(src+x+6)),(int)(*(src+x+5)),(int)(*(src+x+4)),(int)(*(src+x+3)),(int)(*(src+x+2)),(int)(*(src+x+1)),(int)(*(src+x)));
-            __m256 r0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (buf), scale256), shift256);
-            __m256i res = _mm256_cvtps_epi32(r0);
-            _mm256_storeu_si256 ((__m256i*)(dst+x), res);
-        }
-    }
-    #endif*/
-
     for(; x < size.width; x++ )
         dst[x] = saturate_cast(src[x]*scale + shift);
     }
diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp
index ef154400e2..0f85cc5568 100644
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -192,6 +192,7 @@ struct NoVec
 extern volatile bool USE_SSE2;
 extern volatile bool USE_SSE4_2;
 extern volatile bool USE_AVX;
+extern volatile bool USE_AVX2;
 
 enum { BLOCK_SIZE = 1024 };
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index d9a20873f6..11bbab3a25 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -82,6 +82,22 @@
             pop ebx
         }
     }
+    static void __cpuidex(int* cpuid_data, int, int)
+    {
+        __asm
+        {
+            push edi
+            mov edi, cpuid_data
+            mov eax, 7
+            mov ecx, 0
+            cpuid
+            mov [edi], eax
+            mov [edi + 4], ebx
+            mov [edi + 8], ecx
+            mov [edi + 12], edx
+            pop edi
+        }
+    }
 
 #endif
 #endif
@@ -203,7 +219,7 @@ struct HWFeatures
     enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE };
 
     HWFeatures(void)
-    {
+    {
         memset( have, 0, sizeof(have) );
         x86_family = 0;
     }
@@ -251,6 +267,40 @@ struct HWFeatures
            f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
            f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
            f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
+
+            // make the second call to the cpuid command in order to get
+            // information about extended features like AVX2
+            #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+                __cpuidex(cpuid_data, 7, 0);
+            #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+                #ifdef __x86_64__
+                asm __volatile__
+                (
+                 "movl $7, %%eax\n\t"
+                 "movl $0, %%ecx\n\t"
+                 "cpuid\n\t"
+                 :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
+                 :
+                 : "cc"
+                );
+                #else
+                asm volatile
+                (
+                 "pushl %%eax\n\t"
+                 "pushl %%edx\n\t"
+                 "movl $7,%%eax\n\t"
+                 "movl $0,%%ecx\n\t"
+                 "cpuid\n\t"
+                 "popl %%edx\n\t"
+                 "popl %%eax\n\t"
+                 : "=b"(cpuid_data[1]), "=c"(cpuid_data[2])
+                 :
+                 : "cc"
+                );
+                #endif
+            #endif
+            f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0;
+
         }
 
         return f;
@@ -290,6 +340,7 @@ IPPInitializer ippInitializer;
 volatile bool USE_SSE2 = featuresEnabled.have[CV_CPU_SSE2];
 volatile bool USE_SSE4_2 = featuresEnabled.have[CV_CPU_SSE4_2];
 volatile bool USE_AVX = featuresEnabled.have[CV_CPU_AVX];
+volatile bool USE_AVX2 = featuresEnabled.have[CV_CPU_AVX2];
 
 void setUseOptimized( bool flag )
 {
diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp
index 7745c86c5c..53b62e74d7 100644
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@@ -3019,6 +3019,9 @@ void printVersionInfo(bool useStdOut)
 #if CV_AVX
     if (checkHardwareSupport(CV_CPU_AVX)) cpu_features += " avx";
 #endif
+#if CV_AVX2
+    if (checkHardwareSupport(CV_CPU_AVX2)) cpu_features += " avx2";
+#endif
 #if CV_NEON
     cpu_features += " neon"; // NEON is currently not checked at runtime
 #endif
From a2a8ba17fcba6d33fec1f78d53d3e7d2f3531181 Mon Sep 17 00:00:00
2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:28 +0300 Subject: [PATCH 02/53] compare --- modules/core/src/arithm.cpp | 124 ++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 68c8979a8d..a9bf3d7e78 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -3268,6 +3268,130 @@ struct Cmp_SIMD uint8x8_t v_mask; }; +#elif CV_SSE2 + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + CV_Assert(code == CMP_GT || code == CMP_LE || + code == CMP_EQ || code == CMP_NE); + + haveSSE = checkHardwareSupport(CV_CPU_SSE2); + + v_mask = _mm_set1_epi8(0xff); + } + + int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const + { + int x = 0; + + if (!haveSSE) + return x; + + if (code == CMP_GT) + for ( ; x <= width - 16; x += 16) + _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x)))); + else if (code == CMP_LE) + for ( ; x <= width - 16; x += 16) + { + __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt)); + } + else if (code == CMP_EQ) + for ( ; x <= width - 16; x += 16) + _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x)))); + else if (code == CMP_NE) + for ( ; x <= width - 16; x += 16) + { + __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq)); + } + + return x; + } + + int code; + __m128i v_mask; + bool haveSSE; +}; + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + CV_Assert(code == CMP_GT || code == CMP_LE || + code == CMP_EQ || code == CMP_NE); + + haveSSE = checkHardwareSupport(CV_CPU_SSE2); + + v_mask = _mm_set1_epi32(0xffffffff); + } + + int operator () (const int * src1, const int * src2, uchar * dst, int width) const + { + int x = 0; + + if (!haveSSE) + return x; + + if (code == CMP_GT) + for ( ; x <= width - 8; x += 8) + { + __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), + _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); + } + else if (code == CMP_LE) + for ( ; x <= width - 8; x += 8) + { + __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), + _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask)); + } + else if (code == CMP_EQ) + for ( ; x <= width - 8; x += 8) + { + __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), + _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + + _mm_storel_epi64((__m128i *)(dst + 
x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); + } + else if (code == CMP_NE) + for ( ; x <= width - 8; x += 8) + { + __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), + _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask))); + } + + return x; + } + + int code; + __m128i v_mask; + bool haveSSE; +}; + #endif template static void From 6ab928fb396d095ce459d106eded6cf8979d91ef Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:28 +0300 Subject: [PATCH 03/53] phase 64f --- modules/core/src/mathfuncs.cpp | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index 1c045f3faa..b22526ccb8 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -593,14 +593,40 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre { const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1]; double *angle = (double*)ptrs[2]; - for( k = 0; k < len; k++ ) + k = 0; + +#if CV_SSE2 + for ( ; k <= len - 4; k += 4) + { + __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)), + _mm_cvtpd_ps(_mm_loadu_pd(x + k + 2))); + __m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)), + _mm_cvtpd_ps(_mm_loadu_pd(y + k + 2))); + + _mm_storeu_ps(buf[0] + k, v_dst0); + _mm_storeu_ps(buf[1] + k, v_dst1); + } +#endif + + for( ; k < len; k++ ) { buf[0][k] = (float)x[k]; buf[1][k] = (float)y[k]; } FastAtan2_32f( buf[1], buf[0], buf[0], len, angleInDegrees ); - for( k = 0; k < len; k++ ) + k = 0; + +#if CV_SSE2 + for ( ; k <= len - 4; k += 4) + { + __m128 v_src = _mm_loadu_ps(buf[0] + k); + _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src)); + _mm_storeu_pd(angle + k, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)))); + } +#endif + + for( ; k < len; k++ ) angle[k] = buf[0][k]; } ptrs[0] += len*esz1; From 0a5c9cf145707669b49adeff50a9d6f335a8de1e Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:28 +0300 Subject: [PATCH 04/53] magnitude 64f --- modules/core/src/mathfuncs.cpp | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index b22526ccb8..5c83e62eac 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -724,14 +724,40 @@ void cartToPolar( InputArray src1, InputArray src2, double *angle = (double*)ptrs[3]; Magnitude_64f(x, y, (double*)ptrs[2], len); - for( k = 0; k < len; k++ ) + k = 0; + +#if CV_SSE2 + for ( ; k <= len - 4; k += 4) + { + __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)), + _mm_cvtpd_ps(_mm_loadu_pd(x + k + 2))); + __m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)), + _mm_cvtpd_ps(_mm_loadu_pd(y + k + 2))); + + _mm_storeu_ps(buf[0] + k, v_dst0); + _mm_storeu_ps(buf[1] + k, v_dst1); + } +#endif + + for( ; k < len; k++ ) { buf[0][k] = (float)x[k]; buf[1][k] = (float)y[k]; } FastAtan2_32f( buf[1], buf[0], buf[0], len, angleInDegrees ); - for( k = 0; k < len; k++ ) + k = 0; + +#if CV_SSE2 + for ( ; k <= len - 4; k += 4) + { + __m128 v_src = _mm_loadu_ps(buf[0] + k); + _mm_storeu_pd(angle + k, 
_mm_cvtps_pd(v_src)); + _mm_storeu_pd(angle + k, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)))); + } +#endif + + for( ; k < len; k++ ) angle[k] = buf[0][k]; } ptrs[0] += len*esz1; From 972ff1d0c438ac83f4ef06cbc5b80f86fbdb2ae7 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:28 +0300 Subject: [PATCH 05/53] polarToCart --- modules/core/src/mathfuncs.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index 5c83e62eac..cb574e6cc8 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -966,6 +966,13 @@ void polarToCart( InputArray src1, InputArray src2, vst1q_f32(x + k, vmulq_f32(vld1q_f32(x + k), v_m)); vst1q_f32(y + k, vmulq_f32(vld1q_f32(y + k), v_m)); } + #elif CV_SSE2 + for( ; k <= len - 4; k += 4 ) + { + __m128 v_m = _mm_loadu_ps(mag + k); + _mm_storeu_ps(x + k, _mm_mul_ps(_mm_loadu_ps(x + k), v_m)); + _mm_storeu_ps(y + k, _mm_mul_ps(_mm_loadu_ps(y + k), v_m)); + } #endif for( ; k < len; k++ ) From 55780889834f765f83271241f402272dd406143b Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:28 +0300 Subject: [PATCH 06/53] countNonZero --- modules/core/src/stat.cpp | 125 ++++++++++++++++++++++++++++++++------ 1 file changed, 106 insertions(+), 19 deletions(-) diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index ca56a7c966..530e3205bd 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -396,6 +396,27 @@ static int countNonZero_(const T* src, int len ) return nz; } +static const uchar * initPopcountTable() +{ + static uchar tab[256]; + static volatile bool initialized = false; + if( !initialized ) + { + // we compute inverse popcount table, + // since we pass (img[x] == 0) mask as index in the table. + for( int j = 0; j < 256; j++ ) + { + int val = 0; + for( int mask = 1; mask < 256; mask += mask ) + val += (j & mask) == 0; + tab[j] = (uchar)val; + } + initialized = true; + } + + return tab; +} + static int countNonZero8u( const uchar* src, int len ) { int i=0, nz = 0; @@ -403,21 +424,7 @@ static int countNonZero8u( const uchar* src, int len ) if(USE_SSE2)//5x-6x { __m128i pattern = _mm_setzero_si128 (); - static uchar tab[256]; - static volatile bool initialized = false; - if( !initialized ) - { - // we compute inverse popcount table, - // since we pass (img[x] == 0) mask as index in the table. 
- for( int j = 0; j < 256; j++ ) - { - int val = 0; - for( int mask = 1; mask < 256; mask += mask ) - val += (j & mask) == 0; - tab[j] = (uchar)val; - } - initialized = true; - } + static const uchar * tab = initPopcountTable(); for (; i<=len-16; i+=16) { @@ -467,7 +474,22 @@ static int countNonZero8u( const uchar* src, int len ) static int countNonZero16u( const ushort* src, int len ) { int i = 0, nz = 0; -#if CV_NEON +#if CV_SSE2 + if (USE_SSE2) + { + __m128i v_zero = _mm_setzero_si128 (); + static const uchar * tab = initPopcountTable(); + + for ( ; i <= len - 8; i += 8) + { + __m128i v_src = _mm_loadu_si128((const __m128i*)(src + i)); + int val = _mm_movemask_epi8(_mm_packs_epi16(_mm_cmpeq_epi16(v_src, v_zero), v_zero)); + nz += tab[val]; + } + + src += i; + } +#elif CV_NEON int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6; uint32x4_t v_nz = vdupq_n_u32(0u); uint16x8_t v_zero = vdupq_n_u16(0), v_1 = vdupq_n_u16(1); @@ -503,7 +525,27 @@ static int countNonZero16u( const ushort* src, int len ) static int countNonZero32s( const int* src, int len ) { int i = 0, nz = 0; -#if CV_NEON +#if CV_SSE2 + if (USE_SSE2) + { + __m128i v_zero = _mm_setzero_si128 (); + static const uchar * tab = initPopcountTable(); + + for ( ; i <= len - 8; i += 8) + { + __m128i v_src = _mm_loadu_si128((const __m128i*)(src + i)); + __m128i v_dst0 = _mm_cmpeq_epi32(v_src, v_zero); + + v_src = _mm_loadu_si128((const __m128i*)(src + i + 4)); + __m128i v_dst1 = _mm_cmpeq_epi32(v_src, v_zero); + + int val = _mm_movemask_epi8(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_zero)); + nz += tab[val]; + } + + src += i; + } +#elif CV_NEON int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6; uint32x4_t v_nz = vdupq_n_u32(0u); int32x4_t v_zero = vdupq_n_s32(0.0f); @@ -541,7 +583,25 @@ static int countNonZero32s( const int* src, int len ) static int countNonZero32f( const float* src, int len ) { int i = 0, nz = 0; -#if CV_NEON +#if CV_SSE2 + if (USE_SSE2) + { + __m128i v_zero_i = _mm_setzero_si128(); + __m128 v_zero_f = _mm_setzero_ps(); + static const uchar * tab = initPopcountTable(); + + for ( ; i <= len - 8; i += 8) + { + __m128i v_dst0 = _mm_castps_si128(_mm_cmpeq_ps(_mm_loadu_ps(src + i), v_zero_f)); + __m128i v_dst1 = _mm_castps_si128(_mm_cmpeq_ps(_mm_loadu_ps(src + i + 4), v_zero_f)); + + int val = _mm_movemask_epi8(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_zero_i)); + nz += tab[val]; + } + + src += i; + } +#elif CV_NEON int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6; uint32x4_t v_nz = vdupq_n_u32(0u); float32x4_t v_zero = vdupq_n_f32(0.0f); @@ -577,7 +637,34 @@ static int countNonZero32f( const float* src, int len ) } static int countNonZero64f( const double* src, int len ) -{ return countNonZero_(src, len); } +{ + int i = 0, nz = 0; +#if CV_SSE2 + if (USE_SSE2) + { + __m128i v_zero_i = _mm_setzero_si128(); + __m128d v_zero_d = _mm_setzero_pd(); + static const uchar * tab = initPopcountTable(); + + for ( ; i <= len - 8; i += 8) + { + __m128i v_dst0 = _mm_castpd_si128(_mm_cmpeq_pd(_mm_loadu_pd(src + i), v_zero_d)); + __m128i v_dst1 = _mm_castpd_si128(_mm_cmpeq_pd(_mm_loadu_pd(src + i + 2), v_zero_d)); + __m128i v_dst2 = _mm_castpd_si128(_mm_cmpeq_pd(_mm_loadu_pd(src + i + 4), v_zero_d)); + __m128i v_dst3 = _mm_castpd_si128(_mm_cmpeq_pd(_mm_loadu_pd(src + i + 6), v_zero_d)); + + v_dst0 = _mm_packs_epi32(v_dst0, v_dst1); + v_dst1 = _mm_packs_epi32(v_dst2, v_dst3); + + int val = _mm_movemask_epi8(_mm_packs_epi16(_mm_packs_epi32(v_dst0, 
v_dst1), v_zero_i)); + nz += tab[val]; + } + + src += i; + } +#endif + return nz + countNonZero_(src, len - i); +} typedef int (*CountNonZeroFunc)(const uchar*, int); From 3a78a2273331eb8ac319b55fdda760e886bfd574 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 07/53] convertScaleAbs for s8, f64 --- modules/core/src/convert.cpp | 80 +++++++++++++++++++++++++++ modules/core/src/mathfuncs.cpp | 4 +- modules/core/test/ocl/test_arithm.cpp | 2 +- 3 files changed, 83 insertions(+), 3 deletions(-) diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 55f08f1bde..865ec3dc2c 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -1123,6 +1123,48 @@ struct cvtScaleAbs_SIMD } }; +template <> +struct cvtScaleAbs_SIMD +{ + int operator () (const schar * src, uchar * dst, int width, + float scale, float shift) const + { + int x = 0; + + if (USE_SSE2) + { + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), + v_zero_f = _mm_setzero_ps(); + __m128i v_zero_i = _mm_setzero_si128(); + + for ( ; x <= width - 16; x += 16) + { + __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x)); + __m128i v_src_12 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero_i, v_src), 8), + v_src_34 = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero_i, v_src), 8); + __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps( + _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift); + v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1); + __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps( + _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift); + v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2); + __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps( + _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift); + v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3); + __m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps( + _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift); + v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4); + + __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), + _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4))); + _mm_storeu_si128((__m128i *)(dst + x), v_dst_i); + } + } + + return x; + } +}; + template <> struct cvtScaleAbs_SIMD { @@ -1242,6 +1284,44 @@ struct cvtScaleAbs_SIMD } }; +template <> +struct cvtScaleAbs_SIMD +{ + int operator () (const double * src, uchar * dst, int width, + float scale, float shift) const + { + int x = 0; + + if (USE_SSE2) + { + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), + v_zero_f = _mm_setzero_ps(); + __m128i v_zero_i = _mm_setzero_si128(); + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), + _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); + __m128 v_src2 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), + _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); + + __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(v_src1, v_scale), v_shift); + v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1); + + __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(v_src2, v_scale), v_shift); + v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2); + + __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), + _mm_cvtps_epi32(v_dst2)); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i)); + } + } + + return x; + } +}; + 
#elif CV_NEON template <> diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index cb574e6cc8..d3d09c338f 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -622,7 +622,7 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre { __m128 v_src = _mm_loadu_ps(buf[0] + k); _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src)); - _mm_storeu_pd(angle + k, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)))); + _mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)))); } #endif @@ -753,7 +753,7 @@ void cartToPolar( InputArray src1, InputArray src2, { __m128 v_src = _mm_loadu_ps(buf[0] + k); _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src)); - _mm_storeu_pd(angle + k, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)))); + _mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)))); } #endif diff --git a/modules/core/test/ocl/test_arithm.cpp b/modules/core/test/ocl/test_arithm.cpp index d0d3847bed..0541819f89 100644 --- a/modules/core/test/ocl/test_arithm.cpp +++ b/modules/core/test/ocl/test_arithm.cpp @@ -1577,7 +1577,7 @@ PARAM_TEST_CASE(ConvertScaleAbs, MatDepth, Channels, bool) Size roiSize = randomSize(1, MAX_VALUE); Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); - randomSubMat(src, src_roi, roiSize, srcBorder, stype, 2, 11); // FIXIT: Test with minV, maxV + randomSubMat(src, src_roi, roiSize, srcBorder, stype, -11, 11); // FIXIT: Test with minV, maxV Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); randomSubMat(dst, dst_roi, roiSize, dstBorder, dtype, 5, 16); From b758dbd38493045b8c72cd1d1c6346f4e9edf5d4 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 08/53] convertTo AVX2 --- modules/core/src/convert.cpp | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 865ec3dc2c..ffc2d744e0 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -2377,18 +2377,20 @@ cvtScale_( const short* src, size_t sstep, #if CV_AVX2 if (USE_AVX2) { - __m256 scale256 = _mm256_set1_ps (scale); - __m256 shift256 = _mm256_set1_ps (shift); - __m256i zero = _mm256_setzero_si256(); + __m256 scale256 = _mm256_set1_ps(scale); + __m256 shift256 = _mm256_set1_ps(shift); + int shuffle = 0xD8; + for ( ; x <= size.width - 16; x += 16) { - __m256i v_src = _mm256_loadu_si256((__m256i const *)(src + x)); - __m256i v_src_lo = _mm256_unpacklo_epi16(v_src, zero); - __m256i v_src_hi = _mm256_unpackhi_epi16(v_src, zero); - __m256 v_dst0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (v_src_lo), scale256), shift256); - __m256 v_dst1 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (v_src_hi), scale256), shift256); - _mm256_storeu_si256 ((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0)); - _mm256_storeu_si256 ((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1)); + __m256i v_src = _mm256_loadu_si256((const __m256i *)(src + x)); + v_src = _mm256_permute4x64_epi64(v_src, shuffle); + __m256i v_src_lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16); + __m256i v_src_hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16); + __m256 v_dst0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_lo), scale256), shift256); + __m256 v_dst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_hi), 
scale256), shift256); + _mm256_storeu_si256((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0)); + _mm256_storeu_si256((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1)); } } #endif @@ -2399,17 +2401,15 @@ cvtScale_( const short* src, size_t sstep, __m128 shift128 = _mm_set1_ps (shift); for(; x <= size.width - 8; x += 8 ) { - __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x)); - __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4)); + __m128i r0 = _mm_loadu_si128((const __m128i*)(src + x)); + __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16)); - __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16)); + __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(r0, r0), 16)); rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128); rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128); - r0 = _mm_cvtps_epi32(rf0); - r1 = _mm_cvtps_epi32(rf1); - _mm_storeu_si128((__m128i*)(dst + x), r0); - _mm_storeu_si128((__m128i*)(dst + x + 4), r1); + _mm_storeu_si128((__m128i*)(dst + x), _mm_cvtps_epi32(rf0)); + _mm_storeu_si128((__m128i*)(dst + x + 4), _mm_cvtps_epi32(rf1)); } } #elif CV_NEON From 19e77e47876c2e04086841f1d79c1d46acde009b Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 09/53] convertTo from 8u --- modules/core/perf/perf_convertTo.cpp | 4 +- modules/core/src/convert.cpp | 234 ++++++++++++++++++++++++++- 2 files changed, 235 insertions(+), 3 deletions(-) diff --git a/modules/core/perf/perf_convertTo.cpp b/modules/core/perf/perf_convertTo.cpp index 8007361228..7892a7642e 100644 --- a/modules/core/perf/perf_convertTo.cpp +++ b/modules/core/perf/perf_convertTo.cpp @@ -13,8 +13,8 @@ PERF_TEST_P( Size_DepthSrc_DepthDst_Channels_alpha, convertTo, testing::Combine ( testing::Values(szVGA, sz1080p), - testing::Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), - testing::Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), + testing::Values(CV_8U), + testing::Values(CV_16U), testing::Values(1, 4), testing::Values(1.0, 1./255) ) diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index ffc2d744e0..a92ff47e1b 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -1569,7 +1569,239 @@ struct cvtScale_SIMD } }; -#if CV_NEON +#if CV_SSE2 + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x 
+= 8) + { + __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +#if CV_SSE4_1 + +template <> +struct cvtScale_SIMD +{ + cvtScale_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } + + bool haveSSE; +}; + +#endif + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, short * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, int * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); + _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), 
v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + _mm_storeu_ps(dst + x, v_dst_0); + _mm_storeu_ps(dst + x + 4, v_dst_1); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, double * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + _mm_storeu_pd(dst + x, _mm_cvtps_pd(v_dst_0)); + _mm_storeu_pd(dst + x + 4, _mm_cvtps_pd(_mm_castsi128_ps( + _mm_srli_si128(_mm_castps_si128(v_dst_0), 16)))); + + _mm_storeu_pd(dst + x + 8, _mm_cvtps_pd(v_dst_1)); + _mm_storeu_pd(dst + x + 12, _mm_cvtps_pd(_mm_castsi128_ps( + _mm_srli_si128(_mm_castps_si128(v_dst_1), 16)))); + } + + return x; + } +}; + +#elif CV_NEON // from uchar From 116fb275a8b0ca268dc7a1bab5b3abb116507bc8 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 10/53] convertTo from 8s --- modules/core/src/convert.cpp | 234 +++++++++++++++++++++++++++++++++++ 1 file changed, 234 insertions(+) diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index a92ff47e1b..5dae935cc7 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -1571,6 +1571,8 @@ struct cvtScale_SIMD #if CV_SSE2 +// from uchar + template <> struct cvtScale_SIMD { @@ -1801,6 +1803,238 @@ struct cvtScale_SIMD } }; +// from schar + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i 
v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +#if CV_SSE4_1 + +template <> +struct cvtScale_SIMD +{ + cvtScale_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } + + bool haveSSE; +}; + +#endif + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, short * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, int * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); + _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int 
operator () (const schar * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + _mm_storeu_ps(dst + x, v_dst_0); + _mm_storeu_ps(dst + x + 4, v_dst_1); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, double * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + _mm_storeu_pd(dst + x, _mm_cvtps_pd(v_dst_0)); + _mm_storeu_pd(dst + x + 4, _mm_cvtps_pd(_mm_castsi128_ps( + _mm_srli_si128(_mm_castps_si128(v_dst_0), 16)))); + + _mm_storeu_pd(dst + x + 8, _mm_cvtps_pd(v_dst_1)); + _mm_storeu_pd(dst + x + 12, _mm_cvtps_pd(_mm_castsi128_ps( + _mm_srli_si128(_mm_castps_si128(v_dst_1), 16)))); + } + + return x; + } +}; + #elif CV_NEON // from uchar From 8870ef4159462fde2f733bfd90eb61ebc025b5d1 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 11/53] convertTo from 16u, 16s, 32s, 32f --- modules/core/src/convert.cpp | 904 +++++++++++++++++++++++++++++++++++ 1 file changed, 904 insertions(+) diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 5dae935cc7..5ef238e46f 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -2035,6 +2035,910 @@ struct cvtScale_SIMD } }; +// from ushort + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + 
__m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +#if CV_SSE4_1 + +template <> +struct cvtScale_SIMD +{ + cvtScale_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } + + bool haveSSE; +}; + +#endif + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, short * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, int * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); + _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 
v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + _mm_storeu_ps(dst + x, v_dst_0); + _mm_storeu_ps(dst + x + 4, v_dst_1); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, double * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + _mm_storeu_pd(dst + x, _mm_cvtps_pd(v_dst_0)); + _mm_storeu_pd(dst + x + 4, _mm_cvtps_pd(_mm_castsi128_ps( + _mm_srli_si128(_mm_castps_si128(v_dst_0), 16)))); + + _mm_storeu_pd(dst + x + 8, _mm_cvtps_pd(v_dst_1)); + _mm_storeu_pd(dst + x + 12, _mm_cvtps_pd(_mm_castsi128_ps( + _mm_srli_si128(_mm_castps_si128(v_dst_1), 16)))); + } + + return x; + } +}; + +// from short + +template <> +struct cvtScale_SIMD +{ + int operator () (const short * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const short * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +#if CV_SSE4_1 + +template <> +struct cvtScale_SIMD +{ + cvtScale_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator () 
(const short * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } + + bool haveSSE; +}; + +#endif + +template <> +struct cvtScale_SIMD +{ + int operator () (const short * src, short * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const short * src, int * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); + _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const short * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + _mm_storeu_ps(dst + x, v_dst_0); + _mm_storeu_ps(dst + x + 4, v_dst_1); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const short * src, double * dst, int width, float scale, float shift) 
const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + _mm_storeu_pd(dst + x, _mm_cvtps_pd(v_dst_0)); + _mm_storeu_pd(dst + x + 4, _mm_cvtps_pd(_mm_castsi128_ps( + _mm_srli_si128(_mm_castps_si128(v_dst_0), 16)))); + + _mm_storeu_pd(dst + x + 8, _mm_cvtps_pd(v_dst_1)); + _mm_storeu_pd(dst + x + 12, _mm_cvtps_pd(_mm_castsi128_ps( + _mm_srli_si128(_mm_castps_si128(v_dst_1), 16)))); + } + + return x; + } +}; + +// from int + +template <> +struct cvtScale_SIMD +{ + int operator () (const int * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const int * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +#if CV_SSE4_1 + +template <> +struct cvtScale_SIMD +{ + cvtScale_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator () (const int * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } + + bool haveSSE; +}; + +#endif + +template <> +struct cvtScale_SIMD +{ + int operator () (const int * src, short * dst, 
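+ // Scalar form of this specialization: dst[i] = saturate_cast<short>(src[i]*scale + shift);
+ // four ints per register are widened with _mm_cvtepi32_ps, scaled in float and
+ // packed back with signed saturation (_mm_packs_epi32).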
int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const int * src, int * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); + _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const int * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + _mm_storeu_ps(dst + x, v_dst_0); + _mm_storeu_ps(dst + x + 4, v_dst_1); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const int * src, double * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + _mm_storeu_pd(dst + x, _mm_cvtps_pd(v_dst_0)); + _mm_storeu_pd(dst + x + 4, _mm_cvtps_pd(_mm_castsi128_ps( + _mm_srli_si128(_mm_castps_si128(v_dst_0), 16)))); + + _mm_storeu_pd(dst + x + 8, _mm_cvtps_pd(v_dst_1)); + _mm_storeu_pd(dst + x + 12, _mm_cvtps_pd(_mm_castsi128_ps( + _mm_srli_si128(_mm_castps_si128(v_dst_1), 16)))); + } + + return x; + } +}; + +// from float + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src = _mm_loadu_ps(src + x); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + v_src = _mm_loadu_ps(src + x + 4); + __m128 
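+ // Roughly the scalar form: dst[i] = saturate_cast<uchar>(src[i]*scale + shift);
+ // _mm_cvtps_epi32 rounds to the nearest integer (current MXCSR mode) and the
+ // packs_epi32 / packus_epi16 pair clamps the result into [0, 255].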
v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src = _mm_loadu_ps(src + x); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + v_src = _mm_loadu_ps(src + x + 4); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +#if CV_SSE4_1 + +template <> +struct cvtScale_SIMD +{ + cvtScale_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator () (const float * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src = _mm_loadu_ps(src + x); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + v_src = _mm_loadu_ps(src + x + 4); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } + + bool haveSSE; +}; + +#endif + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, short * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src = _mm_loadu_ps(src + x); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + v_src = _mm_loadu_ps(src + x + 4); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, int * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src = _mm_loadu_ps(src + x); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + v_src = _mm_loadu_ps(src + x + 4); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); + _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src = _mm_loadu_ps(src + x); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + v_src = 
_mm_loadu_ps(src + x + 4); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + _mm_storeu_ps(dst + x, v_dst_0); + _mm_storeu_ps(dst + x + 4, v_dst_1); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, double * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src = _mm_loadu_ps(src + x); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + v_src = _mm_loadu_ps(src + x + 4); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + _mm_storeu_pd(dst + x, _mm_cvtps_pd(v_dst_0)); + _mm_storeu_pd(dst + x + 4, _mm_cvtps_pd(_mm_castsi128_ps( + _mm_srli_si128(_mm_castps_si128(v_dst_0), 16)))); + + _mm_storeu_pd(dst + x + 8, _mm_cvtps_pd(v_dst_1)); + _mm_storeu_pd(dst + x + 12, _mm_cvtps_pd(_mm_castsi128_ps( + _mm_srli_si128(_mm_castps_si128(v_dst_1), 16)))); + } + + return x; + } +}; + #elif CV_NEON // from uchar From 63fc6ef316d35ae3ac851aba5b615f4170cbcc98 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 12/53] convertTo from 64f --- modules/core/perf/perf_convertTo.cpp | 4 +- modules/core/src/convert.cpp | 419 ++++++++++++++++++++------- 2 files changed, 314 insertions(+), 109 deletions(-) diff --git a/modules/core/perf/perf_convertTo.cpp b/modules/core/perf/perf_convertTo.cpp index 7892a7642e..8007361228 100644 --- a/modules/core/perf/perf_convertTo.cpp +++ b/modules/core/perf/perf_convertTo.cpp @@ -13,8 +13,8 @@ PERF_TEST_P( Size_DepthSrc_DepthDst_Channels_alpha, convertTo, testing::Combine ( testing::Values(szVGA, sz1080p), - testing::Values(CV_8U), - testing::Values(CV_16U), + testing::Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), + testing::Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), testing::Values(1, 4), testing::Values(1.0, 1./255) ) diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 5ef238e46f..ef8edee6a6 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -1769,9 +1769,9 @@ struct cvtScale_SIMD }; template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const uchar * src, double * dst, int width, float scale, float shift) const + int operator () (const uchar * src, double * dst, int width, double scale, double shift) const { int x = 0; @@ -1779,24 +1779,23 @@ struct cvtScale_SIMD return x; __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); for ( ; x <= width - 8; x += 8) { __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero); + __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); + __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); + _mm_storeu_pd(dst + x, v_dst_0); + _mm_storeu_pd(dst + x + 2, v_dst_1); - _mm_storeu_pd(dst + x, _mm_cvtps_pd(v_dst_0)); - _mm_storeu_pd(dst + x + 4, 
_mm_cvtps_pd(_mm_castsi128_ps( - _mm_srli_si128(_mm_castps_si128(v_dst_0), 16)))); - - _mm_storeu_pd(dst + x + 8, _mm_cvtps_pd(v_dst_1)); - _mm_storeu_pd(dst + x + 12, _mm_cvtps_pd(_mm_castsi128_ps( - _mm_srli_si128(_mm_castps_si128(v_dst_1), 16)))); + v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero); + v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); + v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); + _mm_storeu_pd(dst + x + 4, v_dst_0); + _mm_storeu_pd(dst + x + 6, v_dst_1); } return x; @@ -2001,9 +2000,9 @@ struct cvtScale_SIMD }; template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const schar * src, double * dst, int width, float scale, float shift) const + int operator () (const schar * src, double * dst, int width, double scale, double shift) const { int x = 0; @@ -2011,24 +2010,24 @@ struct cvtScale_SIMD return x; __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); for ( ; x <= width - 8; x += 8) { - __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + __m128i v_src = _mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))); + v_src = _mm_srai_epi16(v_src, 8); - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16); + __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); + __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); + _mm_storeu_pd(dst + x, v_dst_0); + _mm_storeu_pd(dst + x + 2, v_dst_1); - _mm_storeu_pd(dst + x, _mm_cvtps_pd(v_dst_0)); - _mm_storeu_pd(dst + x + 4, _mm_cvtps_pd(_mm_castsi128_ps( - _mm_srli_si128(_mm_castps_si128(v_dst_0), 16)))); - - _mm_storeu_pd(dst + x + 8, _mm_cvtps_pd(v_dst_1)); - _mm_storeu_pd(dst + x + 12, _mm_cvtps_pd(_mm_castsi128_ps( - _mm_srli_si128(_mm_castps_si128(v_dst_1), 16)))); + v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16); + v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); + v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); + _mm_storeu_pd(dst + x + 4, v_dst_0); + _mm_storeu_pd(dst + x + 6, v_dst_1); } return x; @@ -2233,9 +2232,9 @@ struct cvtScale_SIMD }; template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const ushort * src, double * dst, int width, float scale, float shift) const + int operator () (const ushort * src, double * dst, int width, double scale, double shift) const { int x = 0; @@ -2243,24 +2242,23 @@ struct cvtScale_SIMD return x; __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); for ( ; x <= width - 8; x += 8) { __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, 
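+ // The rewritten double-precision specializations keep the whole computation in
+ // double: scale and shift are splatted with _mm_set1_pd, the integers widened
+ // with _mm_cvtepi32_pd, and each pair of results stored at dst+x, +2, +4, +6.
+ // The old float-based path shifted the high half with _mm_srli_si128(..., 16)
+ // (a 16-byte shift clears the register; 8 bytes reaches the upper two floats)
+ // and stored at offsets +4/+8/+12, beyond the eight elements of one iteration.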
v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero); + __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); + __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); + _mm_storeu_pd(dst + x, v_dst_0); + _mm_storeu_pd(dst + x + 2, v_dst_1); - _mm_storeu_pd(dst + x, _mm_cvtps_pd(v_dst_0)); - _mm_storeu_pd(dst + x + 4, _mm_cvtps_pd(_mm_castsi128_ps( - _mm_srli_si128(_mm_castps_si128(v_dst_0), 16)))); - - _mm_storeu_pd(dst + x + 8, _mm_cvtps_pd(v_dst_1)); - _mm_storeu_pd(dst + x + 12, _mm_cvtps_pd(_mm_castsi128_ps( - _mm_srli_si128(_mm_castps_si128(v_dst_1), 16)))); + v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero); + v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); + v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); + _mm_storeu_pd(dst + x + 4, v_dst_0); + _mm_storeu_pd(dst + x + 6, v_dst_1); } return x; @@ -2465,9 +2463,9 @@ struct cvtScale_SIMD }; template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const short * src, double * dst, int width, float scale, float shift) const + int operator () (const short * src, double * dst, int width, double scale, double shift) const { int x = 0; @@ -2475,24 +2473,23 @@ struct cvtScale_SIMD return x; __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); for ( ; x <= width - 8; x += 8) { __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16); + __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); + __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); + _mm_storeu_pd(dst + x, v_dst_0); + _mm_storeu_pd(dst + x + 2, v_dst_1); - _mm_storeu_pd(dst + x, _mm_cvtps_pd(v_dst_0)); - _mm_storeu_pd(dst + x + 4, _mm_cvtps_pd(_mm_castsi128_ps( - _mm_srli_si128(_mm_castps_si128(v_dst_0), 16)))); - - _mm_storeu_pd(dst + x + 8, _mm_cvtps_pd(v_dst_1)); - _mm_storeu_pd(dst + x + 12, _mm_cvtps_pd(_mm_castsi128_ps( - _mm_srli_si128(_mm_castps_si128(v_dst_1), 16)))); + v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16); + v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); + v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); + _mm_storeu_pd(dst + x + 4, v_dst_0); + _mm_storeu_pd(dst + x + 6, v_dst_1); } return x; @@ -2631,27 +2628,29 @@ struct cvtScale_SIMD }; template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const int * src, int * dst, int width, float scale, float shift) const + int operator () (const int * src, int * dst, int width, double scale, double shift) const { int x = 0; if (!USE_SSE2) return x; - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - for ( ; x <= width - 8; x += 8) + for ( ; x <= width - 
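+ // A __m128d holds only two doubles, so this loop advances by four ints at a time:
+ // both halves are widened with _mm_cvtepi32_pd, scaled, rounded back with
+ // _mm_cvtpd_epi32 (two values in the low 64 bits) and recombined into a single
+ // 128-bit store via _mm_movelh_ps.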
4; x += 4) { __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); - v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + v_src = _mm_srli_si128(v_src, 8); + __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); - _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); - _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_1))); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst)); } return x; @@ -2659,27 +2658,27 @@ struct cvtScale_SIMD }; template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const int * src, float * dst, int width, float scale, float shift) const + int operator () (const int * src, float * dst, int width, double scale, double shift) const { int x = 0; if (!USE_SSE2) return x; - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - for ( ; x <= width - 8; x += 8) + for ( ; x <= width - 4; x += 4) { __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); - v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + v_src = _mm_srli_si128(v_src, 8); + __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); - _mm_storeu_ps(dst + x, v_dst_0); - _mm_storeu_ps(dst + x + 4, v_dst_1); + _mm_storeu_ps(dst + x, _mm_movelh_ps(_mm_cvtpd_ps(v_dst_0), + _mm_cvtpd_ps(v_dst_1))); } return x; @@ -2687,32 +2686,27 @@ struct cvtScale_SIMD }; template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const int * src, double * dst, int width, float scale, float shift) const + int operator () (const int * src, double * dst, int width, double scale, double shift) const { int x = 0; if (!USE_SSE2) return x; - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - for ( ; x <= width - 8; x += 8) + for ( ; x <= width - 4; x += 4) { __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); - v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + v_src = _mm_srli_si128(v_src, 8); + __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); - _mm_storeu_pd(dst + x, _mm_cvtps_pd(v_dst_0)); - _mm_storeu_pd(dst + x + 4, _mm_cvtps_pd(_mm_castsi128_ps( - _mm_srli_si128(_mm_castps_si128(v_dst_0), 16)))); - - _mm_storeu_pd(dst + x + 8, _mm_cvtps_pd(v_dst_1)); - _mm_storeu_pd(dst + x + 12, _mm_cvtps_pd(_mm_castsi128_ps( - _mm_srli_si128(_mm_castps_si128(v_dst_1), 16)))); + _mm_storeu_pd(dst + x, v_dst_0); + _mm_storeu_pd(dst + x + 2, v_dst_1); } return x; @@ -2890,16 +2884,11 @@ struct 
cvtScale_SIMD __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - for ( ; x <= width - 8; x += 8) + for ( ; x <= width - 4; x += 4) { __m128 v_src = _mm_loadu_ps(src + x); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_loadu_ps(src + x + 4); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - _mm_storeu_ps(dst + x, v_dst_0); - _mm_storeu_ps(dst + x + 4, v_dst_1); + __m128 v_dst = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + _mm_storeu_ps(dst + x, v_dst); } return x; @@ -2907,9 +2896,144 @@ struct cvtScale_SIMD }; template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const float * src, double * dst, int width, float scale, float shift) const + int operator () (const float * src, double * dst, int width, double scale, double shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); + + for ( ; x <= width - 4; x += 4) + { + __m128 v_src = _mm_loadu_ps(src + x); + __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift); + v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)); + __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift); + + _mm_storeu_pd(dst + x, v_dst_0); + _mm_storeu_pd(dst + x + 2, v_dst_1); + } + + return x; + } +}; + +// from double + +template <> +struct cvtScale_SIMD +{ + int operator () (const double * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), + _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), + _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const double * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), + _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), + _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +#if CV_SSE4_1 + +template <> +struct cvtScale_SIMD +{ + cvtScale_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator () (const double * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x 
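+ // For the narrow destinations the doubles are first folded down to floats:
+ // two _mm_cvtpd_ps halves are glued together with _mm_movelh_ps, after which the
+ // same float scale/shift and saturating pack sequence as above is reused.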
+= 8) + { + __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), + _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), + _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } + + bool haveSSE; +}; + +#endif + +template <> +struct cvtScale_SIMD +{ + int operator () (const double * src, short * dst, int width, float scale, float shift) const { int x = 0; @@ -2920,19 +3044,100 @@ struct cvtScale_SIMD for ( ; x <= width - 8; x += 8) { - __m128 v_src = _mm_loadu_ps(src + x); + __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), + _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - v_src = _mm_loadu_ps(src + x + 4); + v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), + _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - _mm_storeu_pd(dst + x, _mm_cvtps_pd(v_dst_0)); - _mm_storeu_pd(dst + x + 4, _mm_cvtps_pd(_mm_castsi128_ps( - _mm_srli_si128(_mm_castps_si128(v_dst_0), 16)))); + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } - _mm_storeu_pd(dst + x + 8, _mm_cvtps_pd(v_dst_1)); - _mm_storeu_pd(dst + x + 12, _mm_cvtps_pd(_mm_castsi128_ps( - _mm_srli_si128(_mm_castps_si128(v_dst_1), 16)))); + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const double * src, int * dst, int width, double scale, double shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); + + for ( ; x <= width - 4; x += 4) + { + __m128d v_src = _mm_loadu_pd(src + x); + __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); + + v_src = _mm_loadu_pd(src + x + 2); + __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); + + __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst1))); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const double * src, float * dst, int width, double scale, double shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); + + for ( ; x <= width - 4; x += 4) + { + __m128d v_src = _mm_loadu_pd(src + x); + __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); + + v_src = _mm_loadu_pd(src + x + 2); + __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); + + __m128 v_dst = _mm_movelh_ps(_mm_cvtpd_ps(v_dst0), + _mm_cvtpd_ps(v_dst1)); + + _mm_storeu_ps(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const double * src, double * dst, int width, double scale, double shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); + + for ( ; x <= width - 2; x += 2) + { + __m128d v_src = _mm_loadu_pd(src + x); + __m128d v_dst = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); + _mm_storeu_pd(dst + x, v_dst); } return x; From 
56f3c92737de6a7f55c128e88fc4362d59845586 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 13/53] pyrUp and pyrDown --- modules/imgproc/src/pyramids.cpp | 328 ++++++++++++++++++++++++++++++- 1 file changed, 322 insertions(+), 6 deletions(-) diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index e510530afd..5425de11cd 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -183,13 +183,329 @@ struct PyrDownVec_32f } }; -typedef PyrDownNoVec PyrDownVec_32s16u; -typedef PyrDownNoVec PyrDownVec_32s16s; +#if CV_SSE4_1 -typedef PyrUpNoVec PyrUpVec_32s8u; -typedef PyrUpNoVec PyrUpVec_32s16s; -typedef PyrUpNoVec PyrUpVec_32s16u; -typedef PyrUpNoVec PyrUpVec_32f; +struct PyrDownVec_32s16u +{ + PyrDownVec_32s16u() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator()(int** src, ushort* dst, int, int width) const + { + int x = 0; + + if (!haveSSE) + return x; + + const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; + __m128i v_delta = _mm_set1_epi32(128); + + for( ; x <= width - 8; x += 8 ) + { + __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)), + v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4)); + __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)), + v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4)); + __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)), + v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4)); + __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)), + v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4)); + __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)), + v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4)); + + v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), _mm_add_epi32(v_r20, v_r20)); + v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30); + + v_r10 = _mm_slli_epi32(v_r10, 2); + __m128i v_dst0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8); + + v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21)); + v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31); + v_r11 = _mm_slli_epi32(v_r11, 2); + __m128i v_dst1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dst0, v_dst1)); + } + + return x; + } + + bool haveSSE; +}; + +#endif + +struct PyrDownVec_32s16s +{ + PyrDownVec_32s16s() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator()(int** src, short* dst, int, int width) const + { + int x = 0; + + if (!haveSSE) + return x; + + const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; + __m128i v_delta = _mm_set1_epi32(128); + + for( ; x <= width - 8; x += 8 ) + { + __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)), + v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4)); + __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)), + v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4)); + __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)), + v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4)); + __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)), + v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4)); + __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)), + v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4)); + + v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), 
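+ // Per output element this is the vertical 1-4-6-4-1 pyrDown tap in fixed point:
+ // dst[x] = (r0 + r4 + 6*r2 + 4*(r1 + r3) + 128) >> 8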
_mm_add_epi32(v_r20, v_r20)); + v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30); + + v_r10 = _mm_slli_epi32(v_r10, 2); + __m128i v_dst0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8); + + v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21)); + v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31); + v_r11 = _mm_slli_epi32(v_r11, 2); + __m128i v_dst1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dst0, v_dst1)); + } + + return x; + } + + bool haveSSE; +}; + + +struct PyrUpVec_32s8u +{ + int operator()(int** src, uchar** dst, int, int width) const + { + int x = 0; + + if (!checkHardwareSupport(CV_CPU_SSE2)) + return x; + + uchar *dst0 = dst[0], *dst1 = dst[1]; + const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2]; + __m128i v_delta = _mm_set1_epi16(32), v_zero = _mm_setzero_si128(); + + for( ; x <= width - 16; x += 16 ) + { + __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)), + _mm_loadu_si128((__m128i const *)(row0 + x + 4))); + __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)), + _mm_loadu_si128((__m128i const *)(row1 + x + 4))); + __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)), + _mm_loadu_si128((__m128i const *)(row2 + x + 4))); + + __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1); + __m128i v_dst00 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1)); + __m128i v_dst10 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2); + + v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x + 8)), + _mm_loadu_si128((__m128i const *)(row0 + x + 12))); + v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x + 8)), + _mm_loadu_si128((__m128i const *)(row1 + x + 12))); + v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x + 8)), + _mm_loadu_si128((__m128i const *)(row2 + x + 12))); + + v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1); + __m128i v_dst01 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1)); + __m128i v_dst11 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2); + + _mm_storeu_si128((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst00, v_delta), 6), + _mm_srli_epi16(_mm_adds_epu16(v_dst01, v_delta), 6))); + _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst10, v_delta), 6), + _mm_srli_epi16(_mm_adds_epu16(v_dst11, v_delta), 6))); + } + + for( ; x <= width - 8; x += 8 ) + { + __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)), + _mm_loadu_si128((__m128i const *)(row0 + x + 4))); + __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)), + _mm_loadu_si128((__m128i const *)(row1 + x + 4))); + __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)), + _mm_loadu_si128((__m128i const *)(row2 + x + 4))); + + __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1); + __m128i v_dst0 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1)); + __m128i v_dst1 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2); + + _mm_storel_epi64((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst0, v_delta), 6), v_zero)); + _mm_storel_epi64((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst1, v_delta), 6), 
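+ // Each input row triple produces two output rows, matching the scalar pyrUp taps:
+ // dst0[x] = (r0 + 6*r1 + r2 + 32) >> 6 and dst1[x] = (4*(r1 + r2) + 32) >> 6.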
v_zero)); + } + + return x; + } +}; + +struct PyrUpVec_32s16s +{ + int operator()(int** src, short** dst, int, int width) const + { + int x = 0; + + if (!checkHardwareSupport(CV_CPU_SSE2)) + return x; + + short *dst0 = dst[0], *dst1 = dst[1]; + const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2]; + __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128(); + + for( ; x <= width - 8; x += 8 ) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)), + v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)), + v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x)); + __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2); + __m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); + __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); + + v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4)); + v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4)); + v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4)); + v_2r1 = _mm_slli_epi32(v_r1, 1); + v_4r1 = _mm_slli_epi32(v_r1, 2); + __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); + __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); + + _mm_storeu_si128((__m128i *)(dst0 + x), + _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst00, v_delta), 6), + _mm_srai_epi32(_mm_add_epi32(v_dst01, v_delta), 6))); + _mm_storeu_si128((__m128i *)(dst1 + x), + _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst10, v_delta), 6), + _mm_srai_epi32(_mm_add_epi32(v_dst11, v_delta), 6))); + } + + for( ; x <= width - 4; x += 4 ) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)), + v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)), + v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x)); + __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2); + + __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); + __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); + + _mm_storel_epi64((__m128i *)(dst0 + x), + _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero)); + _mm_storel_epi64((__m128i *)(dst1 + x), + _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero)); + } + + return x; + } +}; + +#if CV_SSE4_1 + +struct PyrUpVec_32s16u +{ + int operator()(int** src, ushort** dst, int, int width) const + { + int x = 0; + + if (!checkHardwareSupport(CV_CPU_SSE4_1)) + return x; + + ushort *dst0 = dst[0], *dst1 = dst[1]; + const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2]; + __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128(); + + for( ; x <= width - 8; x += 8 ) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)), + v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)), + v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x)); + __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2); + __m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); + __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); + + v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4)); + v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4)); + v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4)); + v_2r1 = _mm_slli_epi32(v_r1, 1); + v_4r1 = _mm_slli_epi32(v_r1, 2); + __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); + __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); + + 
_mm_storeu_si128((__m128i *)(dst0 + x), + _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst00, v_delta), 6), + _mm_srli_epi32(_mm_add_epi32(v_dst01, v_delta), 6))); + _mm_storeu_si128((__m128i *)(dst1 + x), + _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst10, v_delta), 6), + _mm_srli_epi32(_mm_add_epi32(v_dst11, v_delta), 6))); + } + + for( ; x <= width - 4; x += 4 ) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)), + v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)), + v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x)); + __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2); + + __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); + __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); + + _mm_storel_epi64((__m128i *)(dst0 + x), + _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero)); + _mm_storel_epi64((__m128i *)(dst1 + x), + _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero)); + } + + return x; + } +}; + +#endif + +struct PyrUpVec_32f +{ + int operator()(float** src, float** dst, int, int width) const + { + int x = 0; + + if (!checkHardwareSupport(CV_CPU_SSE2)) + return x; + + const float *row0 = src[0], *row1 = src[1], *row2 = src[2]; + float *dst0 = dst[0], *dst1 = dst[1]; + __m128 v_6 = _mm_set1_ps(6.0f), v_scale = _mm_set1_ps(1.f/64.0f), + v_scale4 = _mm_mul_ps(v_scale, _mm_set1_ps(4.0f)); + + for( ; x <= width - 8; x += 8 ) + { + __m128 v_r0 = _mm_loadu_ps(row0 + x); + __m128 v_r1 = _mm_loadu_ps(row1 + x); + __m128 v_r2 = _mm_loadu_ps(row2 + x); + + _mm_storeu_ps(dst1 + x, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2))); + _mm_storeu_ps(dst0 + x, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2))); + + v_r0 = _mm_loadu_ps(row0 + x + 4); + v_r1 = _mm_loadu_ps(row1 + x + 4); + v_r2 = _mm_loadu_ps(row2 + x + 4); + + _mm_storeu_ps(dst1 + x + 4, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2))); + _mm_storeu_ps(dst0 + x + 4, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2))); + } + + return x; + } +}; #elif CV_NEON From 1ca35b74248f457122a1cdcf3d2a0dbc8d4de299 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 14/53] resize are fast --- modules/imgproc/src/imgwarp.cpp | 142 +++++++++++++++++++++++++- modules/imgproc/test/test_imgwarp.cpp | 5 +- 2 files changed, 144 insertions(+), 3 deletions(-) diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index c4bb3baa9f..1fa4557cad 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -2199,8 +2199,146 @@ private: bool use_simd; }; -typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_16s; -typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_32f; +class ResizeAreaFastVec_SIMD_16s +{ +public: + ResizeAreaFastVec_SIMD_16s(int _cn, int _step) : + cn(_cn), step(_step) + { + use_simd = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const short* S, short* D, int w) const + { + if (!use_simd) + return 0; + + int dx = 0; + const short* S0 = (const short*)S; + const short* S1 = (const short*)((const uchar*)(S) + step); + __m128i masklow = _mm_set1_epi32(0x0000ffff); + __m128i zero = _mm_setzero_si128(); + __m128i delta2 = _mm_set1_epi32(2); + + if (cn == 1) + { + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i s0 = 
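+ // 2x downscale by area with rounding; per output pixel:
+ // D[i] = (S0[2*i] + S0[2*i+1] + S1[2*i] + S1[2*i+1] + 2) >> 2.
+ // The masklow/srai pair below separates the even and odd int16 lanes of each row
+ // as sign-extended 32-bit values before the sums are taken.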
_mm_add_epi32(_mm_srai_epi32(r0, 16), + _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16)); + __m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16), + _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16)); + s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2); + s0 = _mm_srai_epi32(s0, 2); + s0 = _mm_packs_epi32(s0, zero); + + _mm_storel_epi64((__m128i*)D, s0); + } + } + else if (cn == 3) + for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); + __m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16); + __m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); + __m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16); + + __m128i s0 = _mm_add_epi32(r0_16l, r0_16h); + __m128i s1 = _mm_add_epi32(r1_16l, r1_16h); + s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1)); + s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); + _mm_storel_epi64((__m128i*)D, s0); + } + else + { + CV_Assert(cn == 4); + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); + __m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16); + __m128i r1_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); + __m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16); + + __m128i s0 = _mm_add_epi32(r0_32l, r0_32h); + __m128i s1 = _mm_add_epi32(r1_32l, r1_32h); + s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2)); + s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); + _mm_storel_epi64((__m128i*)D, s0); + } + } + + return dx; + } + +private: + int cn; + int step; + bool use_simd; +}; + +struct ResizeAreaFastVec_SIMD_32f +{ + ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : + scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step) + { + fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4); + } + + int operator() (const float * S, float * D, int w) const + { + if (!fast_mode) + return 0; + + const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step); + int dx = 0; + + __m128 v_025 = _mm_set1_ps(0.25f); + + if (cn == 1) + { + int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1); + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4), + v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4); + + __m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo), + _mm_shuffle_ps(v_row00, v_row01, shuffle_hi)); + __m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo), + _mm_shuffle_ps(v_row10, v_row11, shuffle_hi)); + + _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); + } + } + else if (cn == 4) + { + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4)); + __m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4)); + + _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); + } + } + + return dx; + } + +private: + int scale_x, scale_y; + int cn; + bool fast_mode; + int step; +}; #else diff --git a/modules/imgproc/test/test_imgwarp.cpp b/modules/imgproc/test/test_imgwarp.cpp 
index 34505c4ca4..176c9907f3 100644 --- a/modules/imgproc/test/test_imgwarp.cpp +++ b/modules/imgproc/test/test_imgwarp.cpp @@ -1595,7 +1595,10 @@ void resizeArea(const cv::Mat & src, cv::Mat & dst) TEST(Resize, Area_half) { const int size = 1000; - int types[] = { CV_8UC1, CV_8UC4, CV_16UC1, CV_16UC4, CV_16SC1, CV_16SC4, CV_32FC1, CV_32FC4 }; + int types[] = { CV_8UC1, CV_8UC4, + CV_16UC1, CV_16UC4, + CV_16SC1, CV_16SC3, CV_16SC4, + CV_32FC1, CV_32FC4 }; cv::RNG rng(17); From bfb45b27e6d5f3b11b4e579fb8d77b3bef7ff1cf Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 15/53] column sum 32s --- modules/imgproc/src/smooth.cpp | 152 +++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 5ab70d9a26..b7c3004039 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -713,6 +713,158 @@ struct ColumnSum : std::vector sum; }; +template<> +struct ColumnSum : + public BaseColumnFilter +{ + ColumnSum( int _ksize, int _anchor, double _scale ) : + BaseColumnFilter() + { + ksize = _ksize; + anchor = _anchor; + scale = _scale; + sumCount = 0; + } + + virtual void reset() { sumCount = 0; } + + virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) + { + int i; + int* SUM; + bool haveScale = scale != 1; + double _scale = scale; + + printf("bgfbffbbfg\n"); + + #if CV_SSE2 + bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); + #endif + + if( width != (int)sum.size() ) + { + sum.resize(width); + sumCount = 0; + } + SUM = &sum[0]; + if( sumCount == 0 ) + { + memset((void*)SUM, 0, width*sizeof(int)); + for( ; sumCount < ksize - 1; sumCount++, src++ ) + { + const int* Sp = (const int*)src[0]; + i = 0; + #if CV_SSE2 + if(haveSSE2) + { + for( ; i <= width-4; i+=4 ) + { + __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i)); + __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i)); + _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp)); + } + } + #elif CV_NEON + for( ; i <= width - 4; i+=4 ) + vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i))); + #endif + for( ; i < width; i++ ) + SUM[i] += Sp[i]; + } + } + else + { + CV_Assert( sumCount == ksize-1 ); + src += ksize-1; + } + + for( ; count--; src++ ) + { + const int* Sp = (const int*)src[0]; + const int* Sm = (const int*)src[1-ksize]; + int* D = (int*)dst; + if( haveScale ) + { + i = 0; + #if CV_SSE2 + if(haveSSE2) + { + const __m128 scale4 = _mm_set1_ps((float)_scale); + for( ; i <= width-4; i+=4 ) + { + __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); + + __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), + _mm_loadu_si128((const __m128i*)(Sp+i))); + + __m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0))); + + _mm_storeu_si128((__m128i*)(D+i), _s0T); + _mm_storeu_si128((__m128i*)(SUM+i),_mm_sub_epi32(_s0,_sm)); + } + } + #elif CV_NEON + float32x4_t v_scale = vdupq_n_f32((float)_scale); + for( ; i <= width-4; i+=4 ) + { + int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); + + int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale)); + vst1q_s32(D + i, v_s0d); + + vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); + } + #endif + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0*_scale); + SUM[i] = s0 - Sm[i]; + } + } + else + { + i = 0; + #if CV_SSE2 + if(haveSSE2) + { + for( ; i <= width-4; i+=4 ) + { + __m128i _sm = 
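+ // Sliding box-filter column sum: SUM accumulates the previous ksize-1 rows, so
+ // D[i] = SUM[i] + Sp[i] (scaled by _scale in the branch above) and
+ // SUM[i] = SUM[i] + Sp[i] - Sm[i] drops the oldest row while adding the newest.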
_mm_loadu_si128((const __m128i*)(Sm+i)); + __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), + _mm_loadu_si128((const __m128i*)(Sp+i))); + + _mm_storeu_si128((__m128i*)(D+i), _s0); + _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm)); + } + } + #elif CV_NEON + for( ; i <= width-4; i+=4 ) + { + int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); + + vst1q_s32(D + i, v_s01); + vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); + } + #endif + + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = s0; + SUM[i] = s0 - Sm[i]; + } + } + dst += dststep; + } + } + + double scale; + int sumCount; + std::vector sum; +}; + + template<> struct ColumnSum : public BaseColumnFilter From 5f2135695e9b7d155097bb84abf6c6011ada3652 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 16/53] cvtColor rgb 2 YCrCb --- modules/imgproc/perf/perf_cvt_color.cpp | 168 +----------------- modules/imgproc/src/color.cpp | 225 ++++++++++++++++++++++++ 2 files changed, 234 insertions(+), 159 deletions(-) diff --git a/modules/imgproc/perf/perf_cvt_color.cpp b/modules/imgproc/perf/perf_cvt_color.cpp index 02622ea805..4bcb698ae4 100644 --- a/modules/imgproc/perf/perf_cvt_color.cpp +++ b/modules/imgproc/perf/perf_cvt_color.cpp @@ -56,50 +56,10 @@ enum }; CV_ENUM(CvtMode, - COLOR_BGR2BGR555, COLOR_BGR2BGR565, COLOR_BGR2BGRA, COLOR_BGR2GRAY, - COLOR_BGR2HLS, COLOR_BGR2HLS_FULL, COLOR_BGR2HSV, COLOR_BGR2HSV_FULL, - COLOR_BGR2Lab, COLOR_BGR2Luv, COLOR_BGR2RGB, COLOR_BGR2RGBA, COLOR_BGR2XYZ, - COLOR_BGR2YCrCb, COLOR_BGR2YUV, COLOR_BGR5552BGR, COLOR_BGR5552BGRA, - - COLOR_BGR5552GRAY, COLOR_BGR5552RGB, COLOR_BGR5552RGBA, COLOR_BGR5652BGR, - COLOR_BGR5652BGRA, COLOR_BGR5652GRAY, COLOR_BGR5652RGB, COLOR_BGR5652RGBA, - - COLOR_BGRA2BGR, COLOR_BGRA2BGR555, COLOR_BGRA2BGR565, COLOR_BGRA2GRAY, COLOR_BGRA2RGBA, - CX_BGRA2HLS, CX_BGRA2HLS_FULL, CX_BGRA2HSV, CX_BGRA2HSV_FULL, CX_BGRA2Lab, CX_BGRA2Luv, CX_BGRA2XYZ, CX_BGRA2YCrCb, CX_BGRA2YUV, - - COLOR_GRAY2BGR, COLOR_GRAY2BGR555, COLOR_GRAY2BGR565, COLOR_GRAY2BGRA, - - COLOR_HLS2BGR, COLOR_HLS2BGR_FULL, COLOR_HLS2RGB, COLOR_HLS2RGB_FULL, - CX_HLS2BGRA, CX_HLS2BGRA_FULL, CX_HLS2RGBA, CX_HLS2RGBA_FULL, - - COLOR_HSV2BGR, COLOR_HSV2BGR_FULL, COLOR_HSV2RGB, COLOR_HSV2RGB_FULL, - CX_HSV2BGRA, CX_HSV2BGRA_FULL, CX_HSV2RGBA, CX_HSV2RGBA_FULL, - - COLOR_Lab2BGR, COLOR_Lab2LBGR, COLOR_Lab2LRGB, COLOR_Lab2RGB, - CX_Lab2BGRA, CX_Lab2LBGRA, CX_Lab2LRGBA, CX_Lab2RGBA, - - COLOR_LBGR2Lab, COLOR_LBGR2Luv, COLOR_LRGB2Lab, COLOR_LRGB2Luv, - CX_LBGRA2Lab, CX_LBGRA2Luv, CX_LRGBA2Lab, CX_LRGBA2Luv, - - COLOR_Luv2BGR, COLOR_Luv2LBGR, COLOR_Luv2LRGB, COLOR_Luv2RGB, - CX_Luv2BGRA, CX_Luv2LBGRA, CX_Luv2LRGBA, CX_Luv2RGBA, - - COLOR_RGB2BGR555, COLOR_RGB2BGR565, COLOR_RGB2GRAY, - COLOR_RGB2HLS, COLOR_RGB2HLS_FULL, COLOR_RGB2HSV, COLOR_RGB2HSV_FULL, COLOR_RGB2Lab, COLOR_RGB2Luv, COLOR_RGB2XYZ, COLOR_RGB2YCrCb, COLOR_RGB2YUV, - - COLOR_RGBA2BGR, COLOR_RGBA2BGR555, COLOR_RGBA2BGR565, COLOR_RGBA2GRAY, - CX_RGBA2HLS, CX_RGBA2HLS_FULL, CX_RGBA2HSV, CX_RGBA2HSV_FULL, - CX_RGBA2Lab, CX_RGBA2Luv, CX_RGBA2XYZ, - CX_RGBA2YCrCb, CX_RGBA2YUV, - - COLOR_XYZ2BGR, COLOR_XYZ2RGB, CX_XYZ2BGRA, CX_XYZ2RGBA, - - COLOR_YCrCb2BGR, COLOR_YCrCb2RGB, CX_YCrCb2BGRA, CX_YCrCb2RGBA, - COLOR_YUV2BGR, COLOR_YUV2RGB, CX_YUV2BGRA, CX_YUV2RGBA - ) + CX_RGBA2YCrCb, CX_RGBA2YUV) CV_ENUM(CvtModeBayer, @@ -237,135 +197,25 @@ ChPair getConversionInfo(int cvtMode) return ChPair(0,0); } -typedef std::tr1::tuple Size_CvtMode_t; -typedef 
perf::TestBaseWithParam Size_CvtMode; +typedef perf::TestBaseWithParam Size_CvtMode; PERF_TEST_P(Size_CvtMode, cvtColor8u, - testing::Combine( - testing::Values(::perf::szODD, ::perf::szVGA, ::perf::sz1080p), - CvtMode::all() - ) + testing::Values(::perf::szODD, ::perf::szVGA, ::perf::sz1080p) ) { - Size sz = get<0>(GetParam()); - int _mode = get<1>(GetParam()), mode = _mode; + Size sz = GetParam(); + int mode = COLOR_RGB2YCrCb; ChPair ch = getConversionInfo(mode); mode %= COLOR_COLORCVT_MAX; - Mat src(sz, CV_8UC(ch.scn)); - Mat dst(sz, CV_8UC(ch.dcn)); + Mat src(sz, CV_8UC(3)); + Mat dst(sz, CV_8UC(3)); declare.time(100); declare.in(src, WARMUP_RNG).out(dst); int runs = sz.width <= 320 ? 100 : 5; - TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, ch.dcn); + TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, 3); -#if defined(__APPLE__) && defined(HAVE_IPP) - SANITY_CHECK(dst, _mode == CX_BGRA2HLS_FULL ? 2 : 1); -#else - SANITY_CHECK(dst, 1); -#endif -} - -typedef std::tr1::tuple Size_CvtMode_Bayer_t; -typedef perf::TestBaseWithParam Size_CvtMode_Bayer; - -PERF_TEST_P(Size_CvtMode_Bayer, cvtColorBayer8u, - testing::Combine( - testing::Values(::perf::szODD, ::perf::szVGA), - CvtModeBayer::all() - ) - ) -{ - Size sz = get<0>(GetParam()); - int mode = get<1>(GetParam()); - ChPair ch = getConversionInfo(mode); - mode %= COLOR_COLORCVT_MAX; - - Mat src(sz, CV_8UC(ch.scn)); - Mat dst(sz, CV_8UC(ch.dcn)); - - declare.time(100); - declare.in(src, WARMUP_RNG).out(dst); - - TEST_CYCLE() cvtColor(src, dst, mode, ch.dcn); - - SANITY_CHECK(dst, 1); -} - -typedef std::tr1::tuple Size_CvtMode2_t; -typedef perf::TestBaseWithParam Size_CvtMode2; - -PERF_TEST_P(Size_CvtMode2, cvtColorYUV420, - testing::Combine( - testing::Values(szVGA, sz1080p, Size(130, 60)), - CvtMode2::all() - ) - ) -{ - Size sz = get<0>(GetParam()); - int mode = get<1>(GetParam()); - ChPair ch = getConversionInfo(mode); - - Mat src(sz.height + sz.height / 2, sz.width, CV_8UC(ch.scn)); - Mat dst(sz, CV_8UC(ch.dcn)); - - declare.in(src, WARMUP_RNG).out(dst); - - int runs = (sz.width <= 640) ? 8 : 1; - TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, ch.dcn); - - SANITY_CHECK(dst, 1); -} - -typedef std::tr1::tuple Size_CvtMode3_t; -typedef perf::TestBaseWithParam Size_CvtMode3; - -PERF_TEST_P(Size_CvtMode3, cvtColorRGB2YUV420p, - testing::Combine( - testing::Values(szVGA, sz720p, sz1080p, Size(130, 60)), - CvtMode3::all() - ) - ) -{ - Size sz = get<0>(GetParam()); - int mode = get<1>(GetParam()); - ChPair ch = getConversionInfo(mode); - - Mat src(sz, CV_8UC(ch.scn)); - Mat dst(sz.height + sz.height / 2, sz.width, CV_8UC(ch.dcn)); - - declare.time(100); - declare.in(src, WARMUP_RNG).out(dst); - - int runs = (sz.width <= 640) ? 
10 : 1; - TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, ch.dcn); - - SANITY_CHECK(dst, 1); -} - -CV_ENUM(EdgeAwareBayerMode, COLOR_BayerBG2BGR_EA, COLOR_BayerGB2BGR_EA, COLOR_BayerRG2BGR_EA, COLOR_BayerGR2BGR_EA) - -typedef std::tr1::tuple EdgeAwareParams; -typedef perf::TestBaseWithParam EdgeAwareDemosaicingTest; - -PERF_TEST_P(EdgeAwareDemosaicingTest, demosaicingEA, - testing::Combine( - testing::Values(szVGA, sz720p, sz1080p, Size(130, 60)), - EdgeAwareBayerMode::all() - ) - ) -{ - Size sz = get<0>(GetParam()); - int mode = get<1>(GetParam()); - - Mat src(sz, CV_8UC1); - Mat dst(sz, CV_8UC3); - - declare.in(src, WARMUP_RNG).out(dst); - - TEST_CYCLE() cvtColor(src, dst, mode, 3); - - SANITY_CHECK(dst, 1); + SANITY_CHECK_NOTHING(); } diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index f0a8fd8584..55b915b5bb 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -102,6 +102,89 @@ static IppStatus sts = ippInit(); #endif +#if CV_SSE2 + +#define _MM_DEINTERLIV_EPI8(layer0_chunk0, layer0_chunk1, layer0_chunk2, \ + layer0_chunk3, layer0_chunk4, layer0_chunk5) \ + { \ + __m128i layer1_chunk0 = _mm_unpacklo_epi8(layer0_chunk0, layer0_chunk3); \ + __m128i layer1_chunk1 = _mm_unpackhi_epi8(layer0_chunk0, layer0_chunk3); \ + __m128i layer1_chunk2 = _mm_unpacklo_epi8(layer0_chunk1, layer0_chunk4); \ + __m128i layer1_chunk3 = _mm_unpackhi_epi8(layer0_chunk1, layer0_chunk4); \ + __m128i layer1_chunk4 = _mm_unpacklo_epi8(layer0_chunk2, layer0_chunk5); \ + __m128i layer1_chunk5 = _mm_unpackhi_epi8(layer0_chunk2, layer0_chunk5); \ + \ + __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3); \ + __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3); \ + __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4); \ + __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4); \ + __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5); \ + __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5); \ + \ + __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3); \ + __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3); \ + __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4); \ + __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4); \ + __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5); \ + __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5); \ + \ + __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3); \ + __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3); \ + __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4); \ + __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4); \ + __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5); \ + __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5); \ + \ + layer0_chunk0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3); \ + layer0_chunk1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3); \ + layer0_chunk2 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4); \ + layer0_chunk3 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4); \ + layer0_chunk4 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5); \ + layer0_chunk5 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5); \ + } + +#define _MM_INTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) \ + { \ + __m128i v_mask = _mm_set1_epi16(0x00ff); \ + \ + 
__m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); \ + __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8)); \ + __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); \ + __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8)); \ + __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask)); \ + __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8)); \ + \ + __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask)); \ + __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8)); \ + __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask)); \ + __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8)); \ + __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask)); \ + __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8)); \ + \ + __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); \ + __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8)); \ + __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); \ + __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8)); \ + __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); \ + __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8)); \ + \ + __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); \ + __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8)); \ + __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); \ + __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8)); \ + __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); \ + __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8)); \ + \ + v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); \ + v_r1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8)); \ + v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); \ + v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8)); \ + v_b0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); \ + v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8)); \ + } + +#endif + namespace cv { @@ -1699,6 +1782,148 @@ struct RGB2YCrCb_i int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta, v_delta2; }; +#elif CV_SSE2 + +template <> +struct RGB2YCrCb_i +{ + typedef uchar 
channel_type; + + RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs) + : srccn(_srccn), blueIdx(_blueIdx) + { + static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241}; + memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0])); + if (blueIdx==0) + std::swap(coeffs[0], coeffs[2]); + + v_c0 = _mm_set1_epi32(coeffs[0]); + v_c1 = _mm_set1_epi32(coeffs[1]); + v_c2 = _mm_set1_epi32(coeffs[2]); + v_c3 = _mm_set1_epi32(coeffs[3]); + v_c4 = _mm_set1_epi32(coeffs[4]); + v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1)); + v_delta = _mm_set1_epi32(ColorChannel::half()*(1 << yuv_shift)); + v_delta = _mm_add_epi32(v_delta, v_delta2); + v_zero = _mm_setzero_si128(); + } + + // 16u x 8 + void process(__m128i v_r, __m128i v_g, __m128i v_b, + __m128i & v_y, __m128i & v_cr, __m128i & v_cb) const + { + __m128i v_r_p = _mm_unpacklo_epi16(v_r, v_zero); + __m128i v_g_p = _mm_unpacklo_epi16(v_g, v_zero); + __m128i v_b_p = _mm_unpacklo_epi16(v_b, v_zero); + + __m128i v_y0 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0), + _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1), + _mm_mullo_epi32(v_b_p, v_c2))); + v_y0 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y0), yuv_shift); + + __m128i v_cr0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y0), v_c3); + __m128i v_cb0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y0), v_c4); + v_cr0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr0), yuv_shift); + v_cb0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb0), yuv_shift); + + v_r_p = _mm_unpackhi_epi16(v_r, v_zero); + v_g_p = _mm_unpackhi_epi16(v_g, v_zero); + v_b_p = _mm_unpackhi_epi16(v_b, v_zero); + + __m128i v_y1 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0), + _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1), + _mm_mullo_epi32(v_b_p, v_c2))); + v_y1 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y1), yuv_shift); + + __m128i v_cr1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y1), v_c3); + __m128i v_cb1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? 
v_r_p : v_b_p, v_y1), v_c4); + v_cr1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr1), yuv_shift); + v_cb1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb1), yuv_shift); + + v_y = _mm_packs_epi32(v_y0, v_y1); + v_cr = _mm_packs_epi32(v_cr0, v_cr1); + v_cb = _mm_packs_epi32(v_cb0, v_cb1); + } + + void operator()(const uchar * src, uchar * dst, int n) const + { + int scn = srccn, bidx = blueIdx, i = 0; + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + int delta = ColorChannel::half()*(1 << yuv_shift); + n *= 3; + + if (scn == 3) + { + for ( ; i <= n - 96; i += 96, src += scn * 32) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 16)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 32)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 48)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 64)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 80)); + + _MM_DEINTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + + __m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero; + process(_mm_unpacklo_epi8(v_r0, v_zero), + _mm_unpacklo_epi8(v_g0, v_zero), + _mm_unpacklo_epi8(v_b0, v_zero), + v_y0, v_cr0, v_cb0); + + __m128i v_y1 = v_zero, v_cr1 = v_zero, v_cb1 = v_zero; + process(_mm_unpackhi_epi8(v_r0, v_zero), + _mm_unpackhi_epi8(v_g0, v_zero), + _mm_unpackhi_epi8(v_b0, v_zero), + v_y1, v_cr1, v_cb1); + + __m128i v_y_0 = _mm_packus_epi16(v_y0, v_y1); + __m128i v_cr_0 = _mm_packus_epi16(v_cr0, v_cr1); + __m128i v_cb_0 = _mm_packus_epi16(v_cb0, v_cb1); + + process(_mm_unpacklo_epi8(v_r1, v_zero), + _mm_unpacklo_epi8(v_g1, v_zero), + _mm_unpacklo_epi8(v_b1, v_zero), + v_y0, v_cr0, v_cb0); + + process(_mm_unpackhi_epi8(v_r1, v_zero), + _mm_unpackhi_epi8(v_g1, v_zero), + _mm_unpackhi_epi8(v_b1, v_zero), + v_y1, v_cr1, v_cb1); + + __m128i v_y_1 = _mm_packus_epi16(v_y0, v_y1); + __m128i v_cr_1 = _mm_packus_epi16(v_cr0, v_cr1); + __m128i v_cb_1 = _mm_packus_epi16(v_cb0, v_cb1); + + _MM_INTERLIV_EPI8(v_y_0, v_y_1, v_cr_0, v_cr_1, v_cb_0, v_cb_1) + + _mm_storeu_si128((__m128i *)(dst + i), v_y_0); + _mm_storeu_si128((__m128i *)(dst + i + 16), v_y_1); + _mm_storeu_si128((__m128i *)(dst + i + 32), v_cr_0); + _mm_storeu_si128((__m128i *)(dst + i + 48), v_cr_1); + _mm_storeu_si128((__m128i *)(dst + i + 64), v_cb_0); + _mm_storeu_si128((__m128i *)(dst + i + 80), v_cb_1); + } + } + + for ( ; i < n; i += 3, src += scn) + { + int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift); + int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift); + int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift); + dst[i] = saturate_cast(Y); + dst[i+1] = saturate_cast(Cr); + dst[i+2] = saturate_cast(Cb); + } + } + + int srccn, blueIdx, coeffs[5]; + __m128i v_c0, v_c1, v_c2; + __m128i v_c3, v_c4, v_delta, v_delta2; + __m128i v_zero; +}; + + #endif template struct YCrCb2RGB_f From edee922b59d3bc227905e0a2b68f86324a228650 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 17/53] cvtColor YCrCb 2 RGB --- modules/imgproc/src/color.cpp | 170 +++++++++++++++++++++++++++++++++- 1 file changed, 165 insertions(+), 5 deletions(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 55b915b5bb..f80b80fe97 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -176,10 +176,10 @@ static IppStatus sts = ippInit(); __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), 
_mm_srli_epi16(layer2_chunk5, 8)); \ \ v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); \ - v_r1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8)); \ - v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); \ - v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8)); \ - v_b0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); \ + v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8)); \ + v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); \ + v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8)); \ + v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); \ v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8)); \ } @@ -1852,7 +1852,7 @@ struct RGB2YCrCb_i int delta = ColorChannel::half()*(1 << yuv_shift); n *= 3; - if (scn == 3) + if (scn == 3 && false) { for ( ; i <= n - 96; i += 96, src += scn * 32) { @@ -2321,6 +2321,166 @@ struct YCrCb2RGB_i uint16x4_t v_alpha2; }; +#elif CV_SSE2 + +template <> +struct YCrCb2RGB_i +{ + typedef uchar channel_type; + + YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + static const int coeffs0[] = {22987, -11698, -5636, 29049}; + memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0])); + + v_c0 = _mm_set1_epi16((short)coeffs[0]); + v_c1 = _mm_set1_epi16((short)coeffs[1]); + v_c2 = _mm_set1_epi16((short)coeffs[2]); + v_c3 = _mm_set1_epi16((short)coeffs[3]); + v_delta = _mm_set1_epi16(ColorChannel::half()); + v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1)); + v_zero = _mm_setzero_si128(); + } + + // 16s x 8 + void process(__m128i v_y, __m128i v_cr, __m128i v_cb, + __m128i & v_r, __m128i & v_g, __m128i & v_b) const + { + v_cr = _mm_sub_epi16(v_cr, v_delta); + v_cb = _mm_sub_epi16(v_cb, v_delta); + + __m128i v_y_p = _mm_unpacklo_epi16(v_y, v_zero); + + __m128i v_mullo_3 = _mm_mullo_epi16(v_cb, v_c3); + __m128i v_mullo_2 = _mm_mullo_epi16(v_cb, v_c2); + __m128i v_mullo_1 = _mm_mullo_epi16(v_cr, v_c1); + __m128i v_mullo_0 = _mm_mullo_epi16(v_cr, v_c0); + + __m128i v_mulhi_3 = _mm_mulhi_epi16(v_cb, v_c3); + __m128i v_mulhi_2 = _mm_mulhi_epi16(v_cb, v_c2); + __m128i v_mulhi_1 = _mm_mulhi_epi16(v_cr, v_c1); + __m128i v_mulhi_0 = _mm_mulhi_epi16(v_cr, v_c0); + + __m128i v_b0 = _mm_srai_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_3, v_mulhi_3), v_delta2), yuv_shift); + __m128i v_g0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_2, v_mulhi_2), + _mm_unpacklo_epi16(v_mullo_1, v_mulhi_1)), v_delta2), + yuv_shift); + __m128i v_r0 = _mm_srai_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_0, v_mulhi_0), v_delta2), yuv_shift); + + v_r0 = _mm_add_epi32(v_r0, v_y_p); + v_g0 = _mm_add_epi32(v_g0, v_y_p); + v_b0 = _mm_add_epi32(v_b0, v_y_p); + + v_y_p = _mm_unpackhi_epi16(v_y, v_zero); + + __m128i v_b1 = _mm_srai_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_3, v_mulhi_3), v_delta2), yuv_shift); + __m128i v_g1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_2, v_mulhi_2), + _mm_unpackhi_epi16(v_mullo_1, v_mulhi_1)), v_delta2), + yuv_shift); + __m128i v_r1 = _mm_srai_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_0, v_mulhi_0), v_delta2), yuv_shift); 
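// [editor's note, not part of the original patch] A sketch of what the vector
// block above computes, written against the scalar fallback loop in operator()
// below; the value of yuv_shift is an assumption here, not stated in this hunk.
// With delta = 128 and CV_DESCALE(x, n) = ((x) + (1 << ((n)-1))) >> (n), each
// pixel is reconstructed as:
//     b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
//     g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
//     r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
// The default coefficients {22987, -11698, -5636, 29049} appear to be the
// floating-point factors {1.403, -0.714, -0.344, 1.773} used by YCrCb2RGB_f,
// scaled by 2^yuv_shift (presumably yuv_shift == 14, since 22987/16384 ~= 1.403).
// _mm_mullo_epi16/_mm_mulhi_epi16 give the low/high 16 bits of each signed
// 16x16-bit product, and _mm_unpacklo_epi16/_mm_unpackhi_epi16 recombine them
// into full 32-bit products; v_delta2 = 1 << (yuv_shift - 1) is then added for
// rounding before the arithmetic shift right by yuv_shift.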
+ + v_r1 = _mm_add_epi32(v_r1, v_y_p); + v_g1 = _mm_add_epi32(v_g1, v_y_p); + v_b1 = _mm_add_epi32(v_b1, v_y_p); + + v_r = _mm_packs_epi32(v_r0, v_r1); + v_g = _mm_packs_epi32(v_g0, v_g1); + v_b = _mm_packs_epi32(v_b0, v_b1); + } + + void operator()(const uchar* src, uchar* dst, int n) const + { + int dcn = dstcn, bidx = blueIdx, i = 0; + const uchar delta = ColorChannel::half(), alpha = ColorChannel::max(); + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; + n *= 3; + + if (dcn == 3) + { + for ( ; i <= n - 96; i += 96, dst += dcn * 32) + { + __m128i v_y0 = _mm_loadu_si128((__m128i const *)(src + i)); + __m128i v_y1 = _mm_loadu_si128((__m128i const *)(src + i + 16)); + __m128i v_cr0 = _mm_loadu_si128((__m128i const *)(src + i + 32)); + __m128i v_cr1 = _mm_loadu_si128((__m128i const *)(src + i + 48)); + __m128i v_cb0 = _mm_loadu_si128((__m128i const *)(src + i + 64)); + __m128i v_cb1 = _mm_loadu_si128((__m128i const *)(src + i + 80)); + + _MM_DEINTERLIV_EPI8(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1) + + __m128i v_r_0 = v_zero, v_g_0 = v_zero, v_b_0 = v_zero; + process(_mm_unpacklo_epi8(v_y0, v_zero), + _mm_unpacklo_epi8(v_cr0, v_zero), + _mm_unpacklo_epi8(v_cb0, v_zero), + v_r_0, v_g_0, v_b_0); + + __m128i v_r_1 = v_zero, v_g_1 = v_zero, v_b_1 = v_zero; + process(_mm_unpackhi_epi8(v_y0, v_zero), + _mm_unpackhi_epi8(v_cr0, v_zero), + _mm_unpackhi_epi8(v_cb0, v_zero), + v_r_1, v_g_1, v_b_1); + + __m128i v_r0 = _mm_packus_epi16(v_r_0, v_r_1); + __m128i v_g0 = _mm_packus_epi16(v_g_0, v_g_1); + __m128i v_b0 = _mm_packus_epi16(v_b_0, v_b_1); + + process(_mm_unpacklo_epi8(v_y1, v_zero), + _mm_unpacklo_epi8(v_cr1, v_zero), + _mm_unpacklo_epi8(v_cb1, v_zero), + v_r_0, v_g_0, v_b_0); + + process(_mm_unpackhi_epi8(v_y1, v_zero), + _mm_unpackhi_epi8(v_cr1, v_zero), + _mm_unpackhi_epi8(v_cb1, v_zero), + v_r_1, v_g_1, v_b_1); + + __m128i v_r1 = _mm_packus_epi16(v_r_0, v_r_1); + __m128i v_g1 = _mm_packus_epi16(v_g_0, v_g_1); + __m128i v_b1 = _mm_packus_epi16(v_b_0, v_b_1); + + if (bidx == 0) + { + std::swap(v_r0, v_b0); + std::swap(v_r1, v_b1); + } + + _MM_INTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + + _mm_storeu_si128((__m128i *)(dst), v_r0); + _mm_storeu_si128((__m128i *)(dst + 16), v_r1); + _mm_storeu_si128((__m128i *)(dst + 32), v_g0); + _mm_storeu_si128((__m128i *)(dst + 48), v_g1); + _mm_storeu_si128((__m128i *)(dst + 64), v_b0); + _mm_storeu_si128((__m128i *)(dst + 80), v_b1); + } + } + + + for ( ; i < n; i += 3, dst += dcn) + { + uchar Y = src[i]; + uchar Cr = src[i+1]; + uchar Cb = src[i+2]; + + int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift); + int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift); + int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift); + + dst[bidx] = saturate_cast(b); + dst[1] = saturate_cast(g); + dst[bidx^2] = saturate_cast(r); + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + int coeffs[4]; + + __m128i v_c0, v_c1, v_c2, v_c3, v_delta2; + __m128i v_delta, v_alpha, v_zero; +}; + #endif ////////////////////////////////////// RGB <-> XYZ /////////////////////////////////////// From 9cacd3261d0caaa39844480f7404b99bba56747f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 18/53] cvtColor BGR5x5 2 Gray --- modules/imgproc/perf/perf_cvt_color.cpp | 2 +- modules/imgproc/src/color.cpp | 77 +++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/modules/imgproc/perf/perf_cvt_color.cpp b/modules/imgproc/perf/perf_cvt_color.cpp index 
4bcb698ae4..9682cfc5ee 100644 --- a/modules/imgproc/perf/perf_cvt_color.cpp +++ b/modules/imgproc/perf/perf_cvt_color.cpp @@ -204,7 +204,7 @@ PERF_TEST_P(Size_CvtMode, cvtColor8u, ) { Size sz = GetParam(); - int mode = COLOR_RGB2YCrCb; + int mode = COLOR_YCrCb2RGB; ChPair ch = getConversionInfo(mode); mode %= COLOR_COLORCVT_MAX; diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index f80b80fe97..55240e2289 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -1125,6 +1125,13 @@ struct RGB5x52Gray v_delta = vdupq_n_u32(1 << (yuv_shift - 1)); v_f8 = vdupq_n_u16(0xf8); v_fc = vdupq_n_u16(0xfc); + #elif CV_SSE2 + v_b2y = _mm_set1_epi16(B2Y); + v_g2y = _mm_set1_epi16(G2Y); + v_r2y = _mm_set1_epi16(R2Y); + v_delta = _mm_set1_epi32(1 << (yuv_shift - 1)); + v_f8 = _mm_set1_epi16(0xf8); + v_fc = _mm_set1_epi16(0xfc); #endif } @@ -1150,6 +1157,39 @@ struct RGB5x52Gray vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)))); } + #elif CV_SSE2 + __m128i v_zero = _mm_setzero_si128(); + + for ( ; i <= n - 8; i += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i)); + __m128i v_t0 = _mm_and_si128(_mm_slli_epi16(v_src, 3), v_f8), + v_t1 = _mm_and_si128(_mm_srli_epi16(v_src, 3), v_fc), + v_t2 = _mm_and_si128(_mm_srli_epi16(v_src, 8), v_f8); + + __m128i v_mullo_b = _mm_mullo_epi16(v_t0, v_b2y); + __m128i v_mullo_g = _mm_mullo_epi16(v_t1, v_g2y); + __m128i v_mullo_r = _mm_mullo_epi16(v_t2, v_r2y); + __m128i v_mulhi_b = _mm_mulhi_epi16(v_t0, v_b2y); + __m128i v_mulhi_g = _mm_mulhi_epi16(v_t1, v_g2y); + __m128i v_mulhi_r = _mm_mulhi_epi16(v_t2, v_r2y); + + __m128i v_dst0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b), + _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g)); + v_dst0 = _mm_add_epi32(_mm_add_epi32(v_dst0, v_delta), + _mm_unpacklo_epi16(v_mullo_r, v_mulhi_r)); + + __m128i v_dst1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b), + _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g)); + v_dst1 = _mm_add_epi32(_mm_add_epi32(v_dst1, v_delta), + _mm_unpackhi_epi16(v_mullo_r, v_mulhi_r)); + + v_dst0 = _mm_srli_epi32(v_dst0, yuv_shift); + v_dst1 = _mm_srli_epi32(v_dst1, yuv_shift); + + __m128i v_dst = _mm_packs_epi32(v_dst0, v_dst1); + _mm_storel_epi64((__m128i *)(dst + i), _mm_packus_epi16(v_dst, v_zero)); + } #endif for ( ; i < n; i++) { @@ -1178,6 +1218,39 @@ struct RGB5x52Gray vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)))); } + #elif CV_SSE2 + __m128i v_zero = _mm_setzero_si128(); + + for ( ; i <= n - 8; i += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i)); + __m128i v_t0 = _mm_and_si128(_mm_slli_epi16(v_src, 3), v_f8), + v_t1 = _mm_and_si128(_mm_srli_epi16(v_src, 2), v_f8), + v_t2 = _mm_and_si128(_mm_srli_epi16(v_src, 7), v_f8); + + __m128i v_mullo_b = _mm_mullo_epi16(v_t0, v_b2y); + __m128i v_mullo_g = _mm_mullo_epi16(v_t1, v_g2y); + __m128i v_mullo_r = _mm_mullo_epi16(v_t2, v_r2y); + __m128i v_mulhi_b = _mm_mulhi_epi16(v_t0, v_b2y); + __m128i v_mulhi_g = _mm_mulhi_epi16(v_t1, v_g2y); + __m128i v_mulhi_r = _mm_mulhi_epi16(v_t2, v_r2y); + + __m128i v_dst0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b), + _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g)); + v_dst0 = _mm_add_epi32(_mm_add_epi32(v_dst0, v_delta), + _mm_unpacklo_epi16(v_mullo_r, v_mulhi_r)); + + __m128i v_dst1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b), + _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g)); + v_dst1 = _mm_add_epi32(_mm_add_epi32(v_dst1, v_delta), + 
_mm_unpackhi_epi16(v_mullo_r, v_mulhi_r)); + + v_dst0 = _mm_srli_epi32(v_dst0, yuv_shift); + v_dst1 = _mm_srli_epi32(v_dst1, yuv_shift); + + __m128i v_dst = _mm_packs_epi32(v_dst0, v_dst1); + _mm_storel_epi64((__m128i *)(dst + i), _mm_packus_epi16(v_dst, v_zero)); + } #endif for ( ; i < n; i++) { @@ -1194,6 +1267,10 @@ struct RGB5x52Gray uint16x4_t v_b2y, v_g2y, v_r2y; uint32x4_t v_delta; uint16x8_t v_f8, v_fc; + #elif CV_SSE2 + __m128i v_b2y, v_g2y, v_r2y; + __m128i v_delta; + __m128i v_f8, v_fc; #endif }; From fe371bf6244b928e7b2c49ec1d799f32e7e26b0c Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 19/53] cvtColor Gray 2 BGR5x5 --- modules/imgproc/src/color.cpp | 40 +++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 55240e2289..b5ac47328d 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -1048,6 +1048,10 @@ struct Gray2RGB5x5 #if CV_NEON v_n7 = vdup_n_u8(~7); v_n3 = vdup_n_u8(~3); + #elif CV_SSE2 + v_n7 = _mm_set1_epi16(~7); + v_n3 = _mm_set1_epi16(~3); + v_zero = _mm_setzero_si128(); #endif } @@ -1065,6 +1069,23 @@ struct Gray2RGB5x5 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n7)), 8)); vst1q_u16((ushort *)dst + i, v_dst); } + #elif CV_SSE2 + for ( ; i <= n - 16; i += 16 ) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i)); + + __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero); + __m128i v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3), + _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3), + _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8))); + _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst); + + v_src_p = _mm_unpackhi_epi8(v_src, v_zero); + v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3), + _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3), + _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8))); + _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst); + } #endif for ( ; i < n; i++ ) { @@ -1081,6 +1102,23 @@ struct Gray2RGB5x5 uint16x8_t v_dst = vorrq_u16(vorrq_u16(v_src, vshlq_n_u16(v_src, 5)), vshlq_n_u16(v_src, 10)); vst1q_u16((ushort *)dst + i, v_dst); } + #elif CV_SSE2 + for ( ; i <= n - 16; i += 8 ) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i)); + + __m128i v_src_p = _mm_srli_epi16(_mm_unpacklo_epi8(v_src, v_zero), 3); + __m128i v_dst = _mm_or_si128(v_src_p, + _mm_or_si128(_mm_slli_epi32(v_src_p, 5), + _mm_slli_epi16(v_src_p, 10))); + _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst); + + v_src_p = _mm_srli_epi16(_mm_unpackhi_epi8(v_src, v_zero), 3); + v_dst = _mm_or_si128(v_src_p, + _mm_or_si128(_mm_slli_epi16(v_src_p, 5), + _mm_slli_epi16(v_src_p, 10))); + _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst); + } #endif for( ; i < n; i++ ) { @@ -1093,6 +1131,8 @@ struct Gray2RGB5x5 #if CV_NEON uint8x8_t v_n7, v_n3; + #elif CV_SSE2 + __m128i v_n7, v_n3, v_zero; #endif }; From 940f1e79143f8ad7f34b267bb013f73d6c79700d Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 20/53] interliving / deinterliving --- modules/imgproc/src/color.cpp | 90 +++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index b5ac47328d..9bbb7b6ade 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -183,6 +183,96 @@ static IppStatus sts = ippInit(); v_b1 = 
_mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8)); \ } +#define _MM_DEINTERLIV_EPI16(layer0_chunk0, layer0_chunk1, layer0_chunk2, \ + layer0_chunk3, layer0_chunk4, layer0_chunk5) \ + { \ + __m128i layer1_chunk0 = _mm_unpacklo_epi16(layer0_chunk0, layer0_chunk3); \ + __m128i layer1_chunk1 = _mm_unpackhi_epi16(layer0_chunk0, layer0_chunk3); \ + __m128i layer1_chunk2 = _mm_unpacklo_epi16(layer0_chunk1, layer0_chunk4); \ + __m128i layer1_chunk3 = _mm_unpackhi_epi16(layer0_chunk1, layer0_chunk4); \ + __m128i layer1_chunk4 = _mm_unpacklo_epi16(layer0_chunk2, layer0_chunk5); \ + __m128i layer1_chunk5 = _mm_unpackhi_epi16(layer0_chunk2, layer0_chunk5); \ + \ + __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3); \ + __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3); \ + __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4); \ + __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4); \ + __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5); \ + __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5); \ + \ + __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3); \ + __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3); \ + __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4); \ + __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4); \ + __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5); \ + __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5); \ + \ + layer0_chunk0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3); \ + layer0_chunk1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3); \ + layer0_chunk2 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4); \ + layer0_chunk3 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4); \ + layer0_chunk4 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5); \ + layer0_chunk5 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5); \ + } + +#define _MM_INTERLIV_EPI16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) \ + { \ + __m128i v_mask = _mm_set1_epi32(0x0000ffff); \ + \ + __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); \ + __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16)); \ + __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); \ + __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16)); \ + __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask)); \ + __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16)); \ + \ + __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); \ + __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); \ + __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); \ + __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); \ + __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); \ + __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16)); \ + \ + __m128i layer1_chunk0 = 
_mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); \ + __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); \ + __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); \ + __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); \ + __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); \ + __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16)); \ + \ + v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); \ + v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16)); \ + v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); \ + v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16)); \ + v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); \ + v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16)); \ + } + +#define _MM_DEINTERLIV_PS(layer0_chunk0, layer0_chunk1, layer0_chunk2, \ + layer0_chunk3, layer0_chunk4, layer0_chunk5) \ + { \ + __m128 layer1_chunk0 = _mm_unpacklo_ps(layer0_chunk0, layer0_chunk3); \ + __m128 layer1_chunk1 = _mm_unpackhi_ps(layer0_chunk0, layer0_chunk3); \ + __m128 layer1_chunk2 = _mm_unpacklo_ps(layer0_chunk1, layer0_chunk4); \ + __m128 layer1_chunk3 = _mm_unpackhi_ps(layer0_chunk1, layer0_chunk4); \ + __m128 layer1_chunk4 = _mm_unpacklo_ps(layer0_chunk2, layer0_chunk5); \ + __m128 layer1_chunk5 = _mm_unpackhi_ps(layer0_chunk2, layer0_chunk5); \ + \ + __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3); \ + __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3); \ + __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4); \ + __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4); \ + __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5); \ + __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5); \ + \ + layer0_chunk0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3); \ + layer0_chunk1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3); \ + layer0_chunk2 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4); \ + layer0_chunk3 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4); \ + layer0_chunk4 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5); \ + layer0_chunk5 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5); \ + } + #endif namespace cv From b99396ab12b902f6a9adc94935a9bd7f28a16ec7 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 21/53] cvtColor RGB 2 YCrCb --- modules/imgproc/src/color.cpp | 150 +++++++++++++++++++++++++++++++++- 1 file changed, 149 insertions(+), 1 deletion(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 9bbb7b6ade..2bf86d0584 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -273,6 +273,32 @@ static IppStatus sts = ippInit(); layer0_chunk5 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5); \ } +#define _MM_INTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) \ + { \ + const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); \ \ + \ + __m128 layer2_chunk0 = 
_mm_shuffle_ps(v_r0, v_r1, mask_lo); \ + __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi); \ + __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo); \ + __m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi); \ + __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo); \ + __m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi); \ + \ + __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo); \ + __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi); \ + __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo); \ + __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi); \ + __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo); \ + __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi); \ + \ + v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo); \ + v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi); \ + v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo); \ + v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi); \ + v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo); \ + v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi); \ + } + #endif namespace cv @@ -2059,7 +2085,7 @@ struct RGB2YCrCb_i int delta = ColorChannel::half()*(1 << yuv_shift); n *= 3; - if (scn == 3 && false) + if (scn == 3) { for ( ; i <= n - 96; i += 96, src += scn * 32) { @@ -2130,6 +2156,128 @@ struct RGB2YCrCb_i __m128i v_zero; }; +#if CV_SSE4_1 + +template <> +struct RGB2YCrCb_i +{ + typedef ushort channel_type; + + RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs) + : srccn(_srccn), blueIdx(_blueIdx) + { + static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241}; + memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0])); + if (blueIdx==0) + std::swap(coeffs[0], coeffs[2]); + + v_c0 = _mm_set1_epi32(coeffs[0]); + v_c1 = _mm_set1_epi32(coeffs[1]); + v_c2 = _mm_set1_epi32(coeffs[2]); + v_c3 = _mm_set1_epi32(coeffs[3]); + v_c4 = _mm_set1_epi32(coeffs[4]); + v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1)); + v_delta = _mm_set1_epi32(ColorChannel::half()*(1 << yuv_shift)); + v_delta = _mm_add_epi32(v_delta, v_delta2); + v_zero = _mm_setzero_si128(); + } + + // 16u x 8 + void process(__m128i v_r, __m128i v_g, __m128i v_b, + __m128i & v_y, __m128i & v_cr, __m128i & v_cb) const + { + __m128i v_r_p = _mm_unpacklo_epi16(v_r, v_zero); + __m128i v_g_p = _mm_unpacklo_epi16(v_g, v_zero); + __m128i v_b_p = _mm_unpacklo_epi16(v_b, v_zero); + + __m128i v_y0 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0), + _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1), + _mm_mullo_epi32(v_b_p, v_c2))); + v_y0 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y0), yuv_shift); + + __m128i v_cr0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y0), v_c3); + __m128i v_cb0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y0), v_c4); + v_cr0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr0), yuv_shift); + v_cb0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb0), yuv_shift); + + v_r_p = _mm_unpackhi_epi16(v_r, v_zero); + v_g_p = _mm_unpackhi_epi16(v_g, v_zero); + v_b_p = _mm_unpackhi_epi16(v_b, v_zero); + + __m128i v_y1 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0), + _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1), + _mm_mullo_epi32(v_b_p, v_c2))); + v_y1 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y1), yuv_shift); + + __m128i v_cr1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? 
v_r_p : v_b_p, v_y1), v_c3); + __m128i v_cb1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y1), v_c4); + v_cr1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr1), yuv_shift); + v_cb1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb1), yuv_shift); + + v_y = _mm_packus_epi32(v_y0, v_y1); + v_cr = _mm_packus_epi32(v_cr0, v_cr1); + v_cb = _mm_packus_epi32(v_cb0, v_cb1); + } + + void operator()(const ushort * src, ushort * dst, int n) const + { + int scn = srccn, bidx = blueIdx, i = 0; + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + int delta = ColorChannel::half()*(1 << yuv_shift); + n *= 3; + + + if (scn == 3) + { + for ( ; i <= n - 48; i += 48, src += scn * 16) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40)); + + _MM_DEINTERLIV_EPI16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + + __m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero; + process(v_r0, v_g0, v_b0, + v_y0, v_cr0, v_cb0); + + __m128i v_y1 = v_zero, v_cr1 = v_zero, v_cb1 = v_zero; + process(v_r1, v_g1, v_b1, + v_y1, v_cr1, v_cb1); + + _MM_INTERLIV_EPI16(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1) + + _mm_storeu_si128((__m128i *)(dst + i), v_y0); + _mm_storeu_si128((__m128i *)(dst + i + 8), v_y1); + _mm_storeu_si128((__m128i *)(dst + i + 16), v_cr0); + _mm_storeu_si128((__m128i *)(dst + i + 24), v_cr1); + _mm_storeu_si128((__m128i *)(dst + i + 32), v_cb0); + _mm_storeu_si128((__m128i *)(dst + i + 40), v_cb1); + } + } + + for ( ; i < n; i += 3, src += scn) + { + int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift); + int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift); + int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift); + dst[i] = saturate_cast(Y); + dst[i+1] = saturate_cast(Cr); + dst[i+2] = saturate_cast(Cb); + } + } + + int srccn, blueIdx, coeffs[5]; + __m128i v_c0, v_c1, v_c2; + __m128i v_c3, v_c4, v_delta, v_delta2; + __m128i v_zero; +}; + +#endif // CV_SSE4_1 + #endif From 97fad1cb539d70db7b84fedd926ef0c4e4588b4f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 22/53] cvtColor RGB 2 YCrCb 32f --- modules/imgproc/src/color.cpp | 88 ++++++++++++++++++++++++++++++++++- 1 file changed, 87 insertions(+), 1 deletion(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 2bf86d0584..d9597bfd7d 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -275,7 +275,7 @@ static IppStatus sts = ippInit(); #define _MM_INTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) \ { \ - const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); \ \ + const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); \ \ __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo); \ __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi); \ @@ -1765,6 +1765,92 @@ struct RGB2YCrCb_f float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta; }; +#elif CV_SSE2 + +template <> +struct RGB2YCrCb_f +{ + typedef float channel_type; + + RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) : + srccn(_srccn), blueIdx(_blueIdx) + { + static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f}; + memcpy(coeffs, _coeffs ? 
_coeffs : coeffs0, 5*sizeof(coeffs[0])); + if (blueIdx==0) + std::swap(coeffs[0], coeffs[2]); + + v_c0 = _mm_set1_ps(coeffs[0]); + v_c1 = _mm_set1_ps(coeffs[1]); + v_c2 = _mm_set1_ps(coeffs[2]); + v_c3 = _mm_set1_ps(coeffs[3]); + v_c4 = _mm_set1_ps(coeffs[4]); + v_delta = _mm_set1_ps(ColorChannel::half()); + } + + void process(__m128 v_r, __m128 v_g, __m128 v_b, + __m128 & v_y, __m128 & v_cr, __m128 & v_cb) const + { + v_y = _mm_mul_ps(v_r, v_c0); + v_y = _mm_add_ps(v_y, _mm_mul_ps(v_g, v_c1)); + v_y = _mm_add_ps(v_y, _mm_mul_ps(v_b, v_c2)); + + v_cr = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(blueIdx == 0 ? v_b : v_r, v_y), v_c3), v_delta); + v_cb = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(blueIdx == 2 ? v_b : v_r, v_y), v_c4), v_delta); + } + + void operator()(const float * src, float * dst, int n) const + { + int scn = srccn, bidx = blueIdx, i = 0; + const float delta = ColorChannel::half(); + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + n *= 3; + + if (scn == 3) + { + for ( ; i <= n - 24; i += 24, src += 24) + { + __m128 v_r0 = _mm_loadu_ps(src); + __m128 v_r1 = _mm_loadu_ps(src + 4); + __m128 v_g0 = _mm_loadu_ps(src + 8); + __m128 v_g1 = _mm_loadu_ps(src + 12); + __m128 v_b0 = _mm_loadu_ps(src + 16); + __m128 v_b1 = _mm_loadu_ps(src + 20); + + _MM_DEINTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + + __m128 v_y0, v_cr0, v_cb0; + process(v_r0, v_g0, v_b0, + v_y0, v_cr0, v_cb0); + + __m128 v_y1, v_cr1, v_cb1; + process(v_r1, v_g1, v_b1, + v_y1, v_cr1, v_cb1); + + _MM_INTERLIV_PS(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1) + + _mm_storeu_ps(dst + i, v_y0); + _mm_storeu_ps(dst + i + 4, v_y1); + _mm_storeu_ps(dst + i + 8, v_cr0); + _mm_storeu_ps(dst + i + 12, v_cr1); + _mm_storeu_ps(dst + i + 16, v_cb0); + _mm_storeu_ps(dst + i + 20, v_cb1); + } + } + + for ( ; i < n; i += 3, src += scn) + { + float Y = src[0]*C0 + src[1]*C1 + src[2]*C2; + float Cr = (src[bidx^2] - Y)*C3 + delta; + float Cb = (src[bidx] - Y)*C4 + delta; + dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb; + } + } + int srccn, blueIdx; + float coeffs[5]; + __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_delta; +}; + #endif template struct RGB2YCrCb_i From 1c9e886a6a635af7fd7c4b4766aae39729ca1cc7 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 23/53] cvtColor YCrCb 2 RGB 32f --- modules/imgproc/src/color.cpp | 97 +++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index d9597bfd7d..51329c6169 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -2475,6 +2475,103 @@ struct YCrCb2RGB_f float32x4_t v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta; }; +#elif CV_SSE2 + +template <> +struct YCrCb2RGB_f +{ + typedef float channel_type; + + YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f}; + memcpy(coeffs, _coeffs ? 
_coeffs : coeffs0, 4*sizeof(coeffs[0])); + + v_c0 = _mm_set1_ps(coeffs[0]); + v_c1 = _mm_set1_ps(coeffs[1]); + v_c2 = _mm_set1_ps(coeffs[2]); + v_c3 = _mm_set1_ps(coeffs[3]); + v_delta = _mm_set1_ps(ColorChannel::half()); + v_alpha = _mm_set1_ps(ColorChannel::max()); + } + + void process(__m128 v_y, __m128 v_cr, __m128 v_cb, + __m128 & v_r, __m128 & v_g, __m128 & v_b) const + { + v_cb = _mm_sub_ps(v_cb, v_delta); + v_cr = _mm_sub_ps(v_cr, v_delta); + + v_b = _mm_mul_ps(v_cb, v_c3); + v_g = _mm_add_ps(_mm_mul_ps(v_cb, v_c2), _mm_mul_ps(v_cr, v_c1)); + v_r = _mm_mul_ps(v_cr, v_c0); + + v_b = _mm_add_ps(v_b, v_y); + v_g = _mm_add_ps(v_g, v_y); + v_r = _mm_add_ps(v_r, v_y); + + if (blueIdx == 0) + std::swap(v_b, v_r); + } + + void operator()(const float* src, float* dst, int n) const + { + int dcn = dstcn, bidx = blueIdx, i = 0; + const float delta = ColorChannel::half(), alpha = ColorChannel::max(); + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; + n *= 3; + + if (dcn == 3) + { + for ( ; i <= n - 24; i += 24, dst += 24) + { + __m128 v_y0 = _mm_loadu_ps(src + i); + __m128 v_y1 = _mm_loadu_ps(src + i + 4); + __m128 v_cr0 = _mm_loadu_ps(src + i + 8); + __m128 v_cr1 = _mm_loadu_ps(src + i + 12); + __m128 v_cb0 = _mm_loadu_ps(src + i + 16); + __m128 v_cb1 = _mm_loadu_ps(src + i + 20); + + _MM_DEINTERLIV_PS(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1) + + __m128 v_r0, v_g0, v_b0; + process(v_y0, v_cr0, v_cb0, + v_r0, v_g0, v_b0); + + __m128 v_r1, v_g1, v_b1; + process(v_y1, v_cr1, v_cb1, + v_r1, v_g1, v_b1); + + _MM_INTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + + _mm_storeu_ps(dst, v_r0); + _mm_storeu_ps(dst + 4, v_r1); + _mm_storeu_ps(dst + 8, v_g0); + _mm_storeu_ps(dst + 12, v_g1); + _mm_storeu_ps(dst + 16, v_b0); + _mm_storeu_ps(dst + 20, v_b1); + } + } + + for ( ; i < n; i += 3, dst += dcn) + { + float Y = src[i], Cr = src[i+1], Cb = src[i+2]; + + float b = Y + (Cb - delta)*C3; + float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1; + float r = Y + (Cr - delta)*C0; + + dst[bidx] = b; dst[1] = g; dst[bidx^2] = r; + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + float coeffs[4]; + + __m128 v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta; +}; + #endif template struct YCrCb2RGB_i From e1773749aee89f73e1e4eeda5d016689434c46b0 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 24/53] cvtColor RGB 2 gray 16s --- modules/imgproc/src/color.cpp | 151 +++++++++++++++++++++++++++++++++- 1 file changed, 150 insertions(+), 1 deletion(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 51329c6169..bf9ef807ca 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -1643,6 +1643,155 @@ struct RGB2Gray float32x4_t v_cb, v_cg, v_cr; }; +#elif CV_SSE2 + +template <> +struct RGB2Gray +{ + typedef ushort channel_type; + + RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : + srccn(_srccn) + { + static const int coeffs0[] = { R2Y, G2Y, B2Y }; + memcpy(coeffs, _coeffs ? 
_coeffs : coeffs0, 3*sizeof(coeffs[0])); + if( blueIdx == 0 ) + std::swap(coeffs[0], coeffs[2]); + + v_cb = _mm_set1_epi16(coeffs[0]); + v_cg = _mm_set1_epi16(coeffs[1]); + v_cr = _mm_set1_epi16(coeffs[2]); + v_delta = _mm_set1_epi32(1 << (yuv_shift - 1)); + } + + // 16s x 8 + void process(__m128i v_b, __m128i v_g, __m128i v_r, + __m128i & v_gray) const + { + __m128i v_mullo_r = _mm_mullo_epi16(v_r, v_cr); + __m128i v_mullo_g = _mm_mullo_epi16(v_g, v_cg); + __m128i v_mullo_b = _mm_mullo_epi16(v_b, v_cb); + __m128i v_mulhi_r = _mm_mulhi_epu16(v_r, v_cr); + __m128i v_mulhi_g = _mm_mulhi_epu16(v_g, v_cg); + __m128i v_mulhi_b = _mm_mulhi_epu16(v_b, v_cb); + + __m128i v_gray0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_r, v_mulhi_r), + _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g)); + v_gray0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b), v_gray0); + v_gray0 = _mm_srli_epi32(_mm_add_epi32(v_gray0, v_delta), yuv_shift); + + __m128i v_gray1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_r, v_mulhi_r), + _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g)); + v_gray1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b), v_gray1); + v_gray1 = _mm_srli_epi32(_mm_add_epi32(v_gray1, v_delta), yuv_shift); + + v_gray = _mm_packus_epi32(v_gray0, v_gray1); + } + + void operator()(const ushort* src, ushort* dst, int n) const + { + int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0; + + if (scn == 3) + { + for ( ; i <= n - 16; i += 16, src += scn * 16) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40)); + + _MM_DEINTERLIV_EPI16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + + __m128i v_gray0; + process(v_r0, v_g0, v_b0, + v_gray0); + + __m128i v_gray1; + process(v_r1, v_g1, v_b1, + v_gray1); + + _mm_storeu_si128((__m128i *)(dst + i), v_gray0); + _mm_storeu_si128((__m128i *)(dst + i + 8), v_gray1); + } + } + + for( ; i < n; i++, src += scn) + dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift); + } + + int srccn, coeffs[3]; + __m128i v_cb, v_cg, v_cr; + __m128i v_delta; +}; + +template <> +struct RGB2Gray +{ + typedef float channel_type; + + RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) + { + static const float coeffs0[] = { 0.299f, 0.587f, 0.114f }; + memcpy( coeffs, _coeffs ? 
_coeffs : coeffs0, 3*sizeof(coeffs[0]) ); + if(blueIdx == 0) + std::swap(coeffs[0], coeffs[2]); + + v_cb = _mm_set1_ps(coeffs[0]); + v_cg = _mm_set1_ps(coeffs[1]); + v_cr = _mm_set1_ps(coeffs[2]); + } + + void process(__m128 v_r, __m128 v_g, __m128 v_b, + __m128 & v_gray) const + { + v_gray = _mm_mul_ps(v_r, v_cb); + v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_g, v_cg)); + v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_b, v_cb)); + } + + void operator()(const float * src, float * dst, int n) const + { + int scn = srccn, i = 0; + float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; + + if (scn == 3) + { + for ( ; i <= n - 8; i += 8, src += scn * 8) + { + __m128 v_r0 = _mm_loadu_ps(src); + __m128 v_r1 = _mm_loadu_ps(src + 4); + __m128 v_g0 = _mm_loadu_ps(src + 8); + __m128 v_g1 = _mm_loadu_ps(src + 12); + __m128 v_b0 = _mm_loadu_ps(src + 16); + __m128 v_b1 = _mm_loadu_ps(src + 20); + + _MM_DEINTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + + __m128 v_gray0; + process(v_r0, v_g0, v_b0, + v_gray0); + + __m128 v_gray1; + process(v_r1, v_g1, v_b1, + v_gray1); + + _mm_storeu_ps(dst + i, v_gray0); + _mm_storeu_ps(dst + i + 4, v_gray1); + } + } + + for ( ; i < n; i++, src += scn) + dst[i] = src[0]*cb + src[1]*cg + src[2]*cr; + } + + int srccn; + float coeffs[3]; + __m128 v_cb, v_cg, v_cr; +}; + #else template<> struct RGB2Gray @@ -3019,7 +3168,7 @@ struct YCrCb2RGB_i __m128i v_delta, v_alpha, v_zero; }; -#endif +#endif // CV_SSE2 ////////////////////////////////////// RGB <-> XYZ /////////////////////////////////////// From 93f880084497b028f5587759c4863892173e0493 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 25/53] cvtColor RGB 2 XYZ f32 --- modules/imgproc/src/color.cpp | 100 ++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index bf9ef807ca..8dd5e2cfee 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -3291,6 +3291,106 @@ struct RGB2XYZ_f float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; }; +#elif CV_SSE2 + +template <> +struct RGB2XYZ_f +{ + typedef float channel_type; + + RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) + { + memcpy(coeffs, _coeffs ? 
_coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0])); + if(blueIdx == 0) + { + std::swap(coeffs[0], coeffs[2]); + std::swap(coeffs[3], coeffs[5]); + std::swap(coeffs[6], coeffs[8]); + } + + v_c0 = _mm_set1_ps(coeffs[0]); + v_c1 = _mm_set1_ps(coeffs[1]); + v_c2 = _mm_set1_ps(coeffs[2]); + v_c3 = _mm_set1_ps(coeffs[3]); + v_c4 = _mm_set1_ps(coeffs[4]); + v_c5 = _mm_set1_ps(coeffs[5]); + v_c6 = _mm_set1_ps(coeffs[6]); + v_c7 = _mm_set1_ps(coeffs[7]); + v_c8 = _mm_set1_ps(coeffs[8]); + } + + void process(__m128 v_r, __m128 v_g, __m128 v_b, + __m128 & v_x, __m128 & v_y, __m128 & v_z) const + { + v_x = _mm_mul_ps(v_r, v_c0); + v_x = _mm_add_ps(v_x, _mm_mul_ps(v_g, v_c1)); + v_x = _mm_add_ps(v_x, _mm_mul_ps(v_b, v_c2)); + + v_y = _mm_mul_ps(v_r, v_c3); + v_y = _mm_add_ps(v_y, _mm_mul_ps(v_g, v_c4)); + v_y = _mm_add_ps(v_y, _mm_mul_ps(v_b, v_c5)); + + v_z = _mm_mul_ps(v_r, v_c6); + v_z = _mm_add_ps(v_z, _mm_mul_ps(v_g, v_c7)); + v_z = _mm_add_ps(v_z, _mm_mul_ps(v_b, v_c8)); + } + + void operator()(const float* src, float* dst, int n) const + { + int scn = srccn, i = 0; + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + + n *= 3; + + if (scn == 3) + { + for ( ; i <= n - 24; i += 24, src += 24) + { + __m128 v_r0 = _mm_loadu_ps(src); + __m128 v_r1 = _mm_loadu_ps(src + 4); + __m128 v_g0 = _mm_loadu_ps(src + 8); + __m128 v_g1 = _mm_loadu_ps(src + 12); + __m128 v_b0 = _mm_loadu_ps(src + 16); + __m128 v_b1 = _mm_loadu_ps(src + 20); + + _MM_DEINTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + + __m128 v_x0, v_y0, v_z0; + process(v_r0, v_g0, v_b0, + v_x0, v_y0, v_z0); + + __m128 v_x1, v_y1, v_z1; + process(v_r1, v_g1, v_b1, + v_x1, v_y1, v_z1); + + _MM_INTERLIV_PS(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1) + + _mm_storeu_ps(dst + i, v_x0); + _mm_storeu_ps(dst + i + 4, v_x1); + _mm_storeu_ps(dst + i + 8, v_y0); + _mm_storeu_ps(dst + i + 12, v_y1); + _mm_storeu_ps(dst + i + 16, v_z0); + _mm_storeu_ps(dst + i + 20, v_z1); + } + } + + for ( ; i < n; i += 3, src += scn) + { + float X = saturate_cast(src[0]*C0 + src[1]*C1 + src[2]*C2); + float Y = saturate_cast(src[0]*C3 + src[1]*C4 + src[2]*C5); + float Z = saturate_cast(src[0]*C6 + src[1]*C7 + src[2]*C8); + dst[i] = X; dst[i+1] = Y; dst[i+2] = Z; + } + } + + int srccn; + float coeffs[9]; + __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; +}; + + #endif template struct RGB2XYZ_i From c4c86a899094116cc23a94dbc70dcb9b46af2575 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 26/53] cvtColor XYZ 2 RGB f32 --- modules/imgproc/perf/perf_cvt_color.cpp | 6 +- modules/imgproc/src/color.cpp | 107 ++++++++++++++++++++++++ 2 files changed, 110 insertions(+), 3 deletions(-) diff --git a/modules/imgproc/perf/perf_cvt_color.cpp b/modules/imgproc/perf/perf_cvt_color.cpp index 9682cfc5ee..e01257eed1 100644 --- a/modules/imgproc/perf/perf_cvt_color.cpp +++ b/modules/imgproc/perf/perf_cvt_color.cpp @@ -204,12 +204,12 @@ PERF_TEST_P(Size_CvtMode, cvtColor8u, ) { Size sz = GetParam(); - int mode = COLOR_YCrCb2RGB; + int mode = COLOR_XYZ2RGB; ChPair ch = getConversionInfo(mode); mode %= COLOR_COLORCVT_MAX; - Mat src(sz, CV_8UC(3)); - Mat dst(sz, CV_8UC(3)); + Mat src(sz, CV_32FC(3)); + Mat dst(sz, CV_32FC(3)); declare.time(100); declare.in(src, WARMUP_RNG).out(dst); diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 8dd5e2cfee..292a6df3a2 100644 --- a/modules/imgproc/src/color.cpp +++ 
b/modules/imgproc/src/color.cpp @@ -3421,6 +3421,7 @@ template struct RGB2XYZ_i C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; n *= 3; + for(int i = 0; i < n; i += 3, src += scn) { int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift); @@ -3714,6 +3715,112 @@ template struct XYZ2RGB_f float coeffs[9]; }; +#if CV_SSE2 + +template <> +struct XYZ2RGB_f +{ + typedef float channel_type; + + XYZ2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + memcpy(coeffs, _coeffs ? _coeffs : XYZ2sRGB_D65, 9*sizeof(coeffs[0])); + if(blueIdx == 0) + { + std::swap(coeffs[0], coeffs[6]); + std::swap(coeffs[1], coeffs[7]); + std::swap(coeffs[2], coeffs[8]); + } + + v_c0 = _mm_set1_ps(coeffs[0]); + v_c1 = _mm_set1_ps(coeffs[1]); + v_c2 = _mm_set1_ps(coeffs[2]); + v_c3 = _mm_set1_ps(coeffs[3]); + v_c4 = _mm_set1_ps(coeffs[4]); + v_c5 = _mm_set1_ps(coeffs[5]); + v_c6 = _mm_set1_ps(coeffs[6]); + v_c7 = _mm_set1_ps(coeffs[7]); + v_c8 = _mm_set1_ps(coeffs[8]); + } + + void process(__m128 v_x, __m128 v_y, __m128 v_z, + __m128 & v_r, __m128 & v_g, __m128 & v_b) const + { + v_b = _mm_mul_ps(v_x, v_c0); + v_b = _mm_add_ps(v_b, _mm_mul_ps(v_y, v_c1)); + v_b = _mm_add_ps(v_b, _mm_mul_ps(v_z, v_c2)); + + v_g = _mm_mul_ps(v_x, v_c3); + v_g = _mm_add_ps(v_g, _mm_mul_ps(v_y, v_c4)); + v_g = _mm_add_ps(v_g, _mm_mul_ps(v_z, v_c5)); + + v_r = _mm_mul_ps(v_x, v_c6); + v_r = _mm_add_ps(v_r, _mm_mul_ps(v_y, v_c7)); + v_r = _mm_add_ps(v_r, _mm_mul_ps(v_z, v_c8)); + } + + void operator()(const float* src, float* dst, int n) const + { + int dcn = dstcn; + float alpha = ColorChannel::max(); + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + n *= 3; + int i = 0; + + if (dcn == 3) + { + for ( ; i <= n - 24; i += 24, dst += 24) + { + __m128 v_x0 = _mm_loadu_ps(src + i); + __m128 v_x1 = _mm_loadu_ps(src + i + 4); + __m128 v_y0 = _mm_loadu_ps(src + i + 8); + __m128 v_y1 = _mm_loadu_ps(src + i + 12); + __m128 v_z0 = _mm_loadu_ps(src + i + 16); + __m128 v_z1 = _mm_loadu_ps(src + i + 20); + + _MM_DEINTERLIV_PS(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1) + + __m128 v_r0, v_g0, v_b0; + process(v_x0, v_y0, v_z0, + v_r0, v_g0, v_b0); + + __m128 v_r1, v_g1, v_b1; + process(v_x1, v_y1, v_z1, + v_r1, v_g1, v_b1); + + _MM_INTERLIV_PS(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1) + + _mm_storeu_ps(dst, v_b0); + _mm_storeu_ps(dst + 4, v_b1); + _mm_storeu_ps(dst + 8, v_g0); + _mm_storeu_ps(dst + 12, v_g1); + _mm_storeu_ps(dst + 16, v_r0); + _mm_storeu_ps(dst + 20, v_r1); + } + + } + + for( ; i < n; i += 3, dst += dcn) + { + float B = src[i]*C0 + src[i+1]*C1 + src[i+2]*C2; + float G = src[i]*C3 + src[i+1]*C4 + src[i+2]*C5; + float R = src[i]*C6 + src[i+1]*C7 + src[i+2]*C8; + dst[0] = B; dst[1] = G; dst[2] = R; + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + float coeffs[9]; + + __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; +}; + +#endif // CV_SSE2 + template struct XYZ2RGB_i { From 51e7fb76b60d40a4b39924f0547d54dcddd4c18a Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 27/53] cvtColor HSV 2 RGB u8 --- modules/imgproc/perf/perf_cvt_color.cpp | 6 +- modules/imgproc/src/color.cpp | 93 ++++++++++++++++++++++++- 2 files changed, 95 insertions(+), 4 deletions(-) diff --git a/modules/imgproc/perf/perf_cvt_color.cpp b/modules/imgproc/perf/perf_cvt_color.cpp index e01257eed1..38540837e6 
100644 --- a/modules/imgproc/perf/perf_cvt_color.cpp +++ b/modules/imgproc/perf/perf_cvt_color.cpp @@ -204,12 +204,12 @@ PERF_TEST_P(Size_CvtMode, cvtColor8u, ) { Size sz = GetParam(); - int mode = COLOR_XYZ2RGB; + int mode = COLOR_RGB2HLS; ChPair ch = getConversionInfo(mode); mode %= COLOR_COLORCVT_MAX; - Mat src(sz, CV_32FC(3)); - Mat dst(sz, CV_32FC(3)); + Mat src(sz, CV_8UC(3)); + Mat dst(sz, CV_8UC(3)); declare.time(100); declare.in(src, WARMUP_RNG).out(dst); diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 292a6df3a2..59c4cbf43a 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -4335,14 +4335,48 @@ struct HSV2RGB_b v_scale_inv = vdupq_n_f32(1.f/255.f); v_scale = vdupq_n_f32(255.f); v_alpha = vdup_n_u8(ColorChannel::max()); + #elif CV_SSE2 + v_scale_inv = _mm_set1_ps(1.f/255.f); + v_scale = _mm_set1_ps(255.0f); + v_zero = _mm_setzero_si128(); #endif } + #if CV_SSE2 + // 16s x 8 + void process(__m128i v_r, __m128i v_g, __m128i v_b, + float * buf) const + { + __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero)); + __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero)); + __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero)); + + __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero)); + __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero)); + __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero)); + + v_g0 = _mm_mul_ps(v_g0, v_scale_inv); + v_b0 = _mm_mul_ps(v_b0, v_scale_inv); + + v_g1 = _mm_mul_ps(v_g1, v_scale_inv); + v_b1 = _mm_mul_ps(v_b1, v_scale_inv); + + _MM_INTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + + _mm_store_ps(buf, v_r0); + _mm_store_ps(buf + 4, v_r1); + _mm_store_ps(buf + 8, v_g0); + _mm_store_ps(buf + 12, v_g1); + _mm_store_ps(buf + 16, v_b0); + _mm_store_ps(buf + 20, v_b1); + } + #endif + void operator()(const uchar* src, uchar* dst, int n) const { int i, j, dcn = dstcn; uchar alpha = ColorChannel::max(); - float buf[3*BLOCK_SIZE]; + CV_DECL_ALIGNED(16) float buf[3*BLOCK_SIZE]; for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) { @@ -4368,6 +4402,38 @@ struct HSV2RGB_b v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); vst3q_f32(buf + j + 12, v_dst); } + #elif CV_SSE2 + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); + + _MM_DEINTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + + process(_mm_unpacklo_epi8(v_r0, v_zero), + _mm_unpacklo_epi8(v_g0, v_zero), + _mm_unpacklo_epi8(v_b0, v_zero), + buf + j); + + process(_mm_unpackhi_epi8(v_r0, v_zero), + _mm_unpackhi_epi8(v_g0, v_zero), + _mm_unpackhi_epi8(v_b0, v_zero), + buf + j + 24); + + process(_mm_unpacklo_epi8(v_r1, v_zero), + _mm_unpacklo_epi8(v_g1, v_zero), + _mm_unpacklo_epi8(v_b1, v_zero), + buf + j + 48); + + process(_mm_unpackhi_epi8(v_r1, v_zero), + _mm_unpackhi_epi8(v_g1, v_zero), + _mm_unpackhi_epi8(v_b1, v_zero), + buf + j + 72); + } #endif for( ; j < dn*3; j += 3 ) @@ -4408,6 +4474,28 @@ struct HSV2RGB_b vst3_u8(dst, v_dst); } } + #elif CV_SSE2 + if (dcn == 3) + { + for ( ; j <= (dn - 16) * 3; j += 16, dst += 16) + { + __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), 
v_scale); + __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale); + __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale); + __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale); + + __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), + _mm_cvtps_epi32(v_src1)); + __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2), + _mm_cvtps_epi32(v_src3)); + + _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1)); + } + + int jr = j % 3; + if (jr) + dst -= jr, j -= jr; + } #endif for( ; j < dn*3; j += 3, dst += dcn ) @@ -4426,6 +4514,9 @@ struct HSV2RGB_b #if CV_NEON float32x4_t v_scale, v_scale_inv; uint8x8_t v_alpha; + #elif CV_SSE2 + __m128 v_scale_inv, v_scale; + __m128i v_zero; #endif }; From 51684c109e234a32a99b927ae59cb1f97a0f9c64 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH 28/53] cvtColor RGB 2 HLS u8 --- modules/imgproc/src/color.cpp | 89 ++++++++++++++++++++++++++++++++++- 1 file changed, 87 insertions(+), 2 deletions(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 59c4cbf43a..7c9dc101e1 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -4477,7 +4477,7 @@ struct HSV2RGB_b #elif CV_SSE2 if (dcn == 3) { - for ( ; j <= (dn - 16) * 3; j += 16, dst += 16) + for ( ; j <= (dn * 3 - 16); j += 16, dst += 16) { __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale); __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale); @@ -4588,13 +4588,41 @@ struct RGB2HLS_b v_scale_inv = vdupq_n_f32(1.f/255.f); v_scale = vdupq_n_f32(255.f); v_alpha = vdup_n_u8(ColorChannel::max()); + #elif CV_SSE2 + v_scale_inv = _mm_set1_ps(1.f/255.f); + v_scale = _mm_set1_ps(255.f); + v_zero = _mm_setzero_si128(); #endif } + #if CV_SSE2 + void process(const float * buf, + __m128i & v_h, __m128i & v_l, __m128i & v_s) const + { + __m128 v_h0f = _mm_load_ps(buf); + __m128 v_h1f = _mm_load_ps(buf + 4); + __m128 v_l0f = _mm_load_ps(buf + 8); + __m128 v_l1f = _mm_load_ps(buf + 12); + __m128 v_s0f = _mm_load_ps(buf + 16); + __m128 v_s1f = _mm_load_ps(buf + 20); + + _MM_DEINTERLIV_PS(v_h0f, v_h1f, v_l0f, v_l1f, v_s0f, v_s1f) + + v_l0f = _mm_mul_ps(v_l0f, v_scale); + v_l1f = _mm_mul_ps(v_l1f, v_scale); + v_s0f = _mm_mul_ps(v_s0f, v_scale); + v_s1f = _mm_mul_ps(v_s1f, v_scale); + + v_h = _mm_packs_epi32(_mm_cvtps_epi32(v_h0f), _mm_cvtps_epi32(v_h1f)); + v_l = _mm_packs_epi32(_mm_cvtps_epi32(v_l0f), _mm_cvtps_epi32(v_l1f)); + v_s = _mm_packs_epi32(_mm_cvtps_epi32(v_s0f), _mm_cvtps_epi32(v_s1f)); + } + #endif + void operator()(const uchar* src, uchar* dst, int n) const { int i, j, scn = srccn; - float buf[3*BLOCK_SIZE]; + CV_DECL_ALIGNED(16) float buf[3*BLOCK_SIZE]; for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 ) { @@ -4632,6 +4660,26 @@ struct RGB2HLS_b v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); vst3q_f32(buf + j + 12, v_dst); } + #elif CV_SSE2 + if (scn == 3) + { + for ( ; j <= (dn * 3 - 16); j += 16, src += 16) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)src); + + __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero); + _mm_store_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv)); + _mm_store_ps(buf + j + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv)); + + v_src_p = _mm_unpackhi_epi8(v_src, v_zero); + _mm_store_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv)); + _mm_store_ps(buf 
+ j + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv)); + } + + int jr = j % 3; + if (jr) + src -= jr, j -= jr; + } #endif for( ; j < dn*3; j += 3, src += scn ) { @@ -4656,6 +4704,40 @@ struct RGB2HLS_b vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale))))); vst3_u8(dst + j, v_dst); } + #elif CV_SSE2 + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_h_0, v_l_0, v_s_0; + process(buf + j, + v_h_0, v_l_0, v_s_0); + + __m128i v_h_1, v_l_1, v_s_1; + process(buf + j + 24, + v_h_1, v_l_1, v_s_1); + + __m128i v_h0 = _mm_packus_epi16(v_h_0, v_h_1); + __m128i v_l0 = _mm_packus_epi16(v_l_0, v_l_1); + __m128i v_s0 = _mm_packus_epi16(v_s_0, v_s_1); + + process(buf + j + 48, + v_h_0, v_l_0, v_s_0); + + process(buf + j + 72, + v_h_1, v_l_1, v_s_1); + + __m128i v_h1 = _mm_packus_epi16(v_h_0, v_h_1); + __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1); + __m128i v_s1 = _mm_packus_epi16(v_s_0, v_s_1); + + _MM_INTERLIV_EPI8(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1) + + _mm_storeu_si128((__m128i *)(dst + j), v_h0); + _mm_storeu_si128((__m128i *)(dst + j + 16), v_h1); + _mm_storeu_si128((__m128i *)(dst + j + 32), v_l0); + _mm_storeu_si128((__m128i *)(dst + j + 48), v_l1); + _mm_storeu_si128((__m128i *)(dst + j + 64), v_s0); + _mm_storeu_si128((__m128i *)(dst + j + 80), v_s1); + } #endif for( ; j < dn*3; j += 3 ) { @@ -4671,6 +4753,9 @@ struct RGB2HLS_b #if CV_NEON float32x4_t v_scale, v_scale_inv; uint8x8_t v_alpha; + #elif CV_SSE2 + __m128 v_scale, v_scale_inv; + __m128i v_zero; #endif }; From 05e21015e92cc4efa02dc45ddde635a5e85ebce5 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:30 +0300 Subject: [PATCH 29/53] cvtColor HLS 2 RGB u8 --- modules/imgproc/perf/perf_cvt_color.cpp | 2 +- modules/imgproc/src/color.cpp | 94 ++++++++++++++++++++++++- 2 files changed, 94 insertions(+), 2 deletions(-) diff --git a/modules/imgproc/perf/perf_cvt_color.cpp b/modules/imgproc/perf/perf_cvt_color.cpp index 38540837e6..7704527cc4 100644 --- a/modules/imgproc/perf/perf_cvt_color.cpp +++ b/modules/imgproc/perf/perf_cvt_color.cpp @@ -204,7 +204,7 @@ PERF_TEST_P(Size_CvtMode, cvtColor8u, ) { Size sz = GetParam(); - int mode = COLOR_RGB2HLS; + int mode = COLOR_HLS2RGB; ChPair ch = getConversionInfo(mode); mode %= COLOR_COLORCVT_MAX; diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 7c9dc101e1..6b072bbba0 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -4835,14 +4835,48 @@ struct HLS2RGB_b v_scale_inv = vdupq_n_f32(1.f/255.f); v_scale = vdupq_n_f32(255.f); v_alpha = vdup_n_u8(ColorChannel::max()); + #elif CV_SSE2 + v_scale_inv = _mm_set1_ps(1.f/255.f); + v_scale = _mm_set1_ps(255.f); + v_zero = _mm_setzero_si128(); #endif } + #if CV_SSE2 + // 16s x 8 + void process(__m128i v_r, __m128i v_g, __m128i v_b, + float * buf) const + { + __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero)); + __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero)); + __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero)); + + __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero)); + __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero)); + __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero)); + + v_g0 = _mm_mul_ps(v_g0, v_scale_inv); + v_b0 = _mm_mul_ps(v_b0, v_scale_inv); + + v_g1 = _mm_mul_ps(v_g1, v_scale_inv); + v_b1 = _mm_mul_ps(v_b1, v_scale_inv); + + _MM_INTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + + _mm_store_ps(buf, v_r0); + _mm_store_ps(buf + 4, v_r1); + 
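// process() widens 8 packed 16-bit values per channel into two float quads,
// passes the hue plane through unscaled (the float converter handles the hue
// range) and normalizes L and S by 1/255; _MM_INTERLIV_PS then re-interleaves
// the six registers so the surrounding stores write eight consecutive H,L,S
// float triplets into the aligned buf consumed by the float HLS2RGB code.
// The same u8 -> f32 widening chain as a standalone sketch (ptr and the outer
// loop are assumptions for illustration, not part of this patch):
//     __m128i zero = _mm_setzero_si128();
//     __m128i v8   = _mm_loadu_si128((const __m128i *)ptr);           // 16 x u8
//     __m128i lo16 = _mm_unpacklo_epi8(v8, zero);                     //  8 x u16
//     __m128  f0   = _mm_cvtepi32_ps(_mm_unpacklo_epi16(lo16, zero)); //  4 x f32
//     __m128  f1   = _mm_cvtepi32_ps(_mm_unpackhi_epi16(lo16, zero)); //  4 x f32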
_mm_store_ps(buf + 8, v_g0); + _mm_store_ps(buf + 12, v_g1); + _mm_store_ps(buf + 16, v_b0); + _mm_store_ps(buf + 20, v_b1); + } + #endif + void operator()(const uchar* src, uchar* dst, int n) const { int i, j, dcn = dstcn; uchar alpha = ColorChannel::max(); - float buf[3*BLOCK_SIZE]; + CV_DECL_ALIGNED(16) float buf[3*BLOCK_SIZE]; for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) { @@ -4868,6 +4902,38 @@ struct HLS2RGB_b v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); vst3q_f32(buf + j + 12, v_dst); } + #elif CV_SSE2 + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); + + _MM_DEINTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + + process(_mm_unpacklo_epi8(v_r0, v_zero), + _mm_unpacklo_epi8(v_g0, v_zero), + _mm_unpacklo_epi8(v_b0, v_zero), + buf + j); + + process(_mm_unpackhi_epi8(v_r0, v_zero), + _mm_unpackhi_epi8(v_g0, v_zero), + _mm_unpackhi_epi8(v_b0, v_zero), + buf + j + 24); + + process(_mm_unpacklo_epi8(v_r1, v_zero), + _mm_unpacklo_epi8(v_g1, v_zero), + _mm_unpacklo_epi8(v_b1, v_zero), + buf + j + 48); + + process(_mm_unpackhi_epi8(v_r1, v_zero), + _mm_unpackhi_epi8(v_g1, v_zero), + _mm_unpackhi_epi8(v_b1, v_zero), + buf + j + 72); + } #endif for( ; j < dn*3; j += 3 ) { @@ -4907,7 +4973,30 @@ struct HLS2RGB_b vst3_u8(dst, v_dst); } } + #elif CV_SSE2 + if (dcn == 3) + { + for ( ; j <= (dn * 3 - 16); j += 16, dst += 16) + { + __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale); + __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale); + __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale); + __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale); + + __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), + _mm_cvtps_epi32(v_src1)); + __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2), + _mm_cvtps_epi32(v_src3)); + + _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1)); + } + + int jr = j % 3; + if (jr) + dst -= jr, j -= jr; + } #endif + for( ; j < dn*3; j += 3, dst += dcn ) { dst[0] = saturate_cast(buf[j]*255.f); @@ -4924,6 +5013,9 @@ struct HLS2RGB_b #if CV_NEON float32x4_t v_scale, v_scale_inv; uint8x8_t v_alpha; + #elif CV_SSE2 + __m128 v_scale, v_scale_inv; + __m128i v_zero; #endif }; From 584eed633e74a9b67d6733fed192b2773df54c23 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:30 +0300 Subject: [PATCH 30/53] cvtColor Lab 2 RGB u8 --- modules/imgproc/perf/perf_cvt_color.cpp | 2 +- modules/imgproc/src/color.cpp | 96 ++++++++++++++++++++++++- 2 files changed, 96 insertions(+), 2 deletions(-) diff --git a/modules/imgproc/perf/perf_cvt_color.cpp b/modules/imgproc/perf/perf_cvt_color.cpp index 7704527cc4..868548c489 100644 --- a/modules/imgproc/perf/perf_cvt_color.cpp +++ b/modules/imgproc/perf/perf_cvt_color.cpp @@ -204,7 +204,7 @@ PERF_TEST_P(Size_CvtMode, cvtColor8u, ) { Size sz = GetParam(); - int mode = COLOR_HLS2RGB; + int mode = COLOR_Lab2RGB; ChPair ch = getConversionInfo(mode); mode %= COLOR_COLORCVT_MAX; diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 6b072bbba0..fdf7c11bee 100644 --- a/modules/imgproc/src/color.cpp +++ 
b/modules/imgproc/src/color.cpp @@ -5331,14 +5331,51 @@ struct Lab2RGB_b v_scale = vdupq_n_f32(255.f); v_alpha = vdup_n_u8(ColorChannel::max()); v_128 = vdupq_n_f32(128.0f); + #elif CV_SSE2 + v_scale_inv = _mm_set1_ps(100.f/255.f); + v_scale = _mm_set1_ps(255.f); + v_128 = _mm_set1_ps(128.0f); + v_zero = _mm_setzero_si128(); #endif } + #if CV_SSE2 + // 16s x 8 + void process(__m128i v_r, __m128i v_g, __m128i v_b, + float * buf) const + { + __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero)); + __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero)); + __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero)); + + __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero)); + __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero)); + __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero)); + + v_r0 = _mm_mul_ps(v_r0, v_scale_inv); + v_r1 = _mm_mul_ps(v_r1, v_scale_inv); + + v_g0 = _mm_sub_ps(v_g0, v_128); + v_g1 = _mm_sub_ps(v_g1, v_128); + v_b0 = _mm_sub_ps(v_b0, v_128); + v_b1 = _mm_sub_ps(v_b1, v_128); + + _MM_INTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + + _mm_store_ps(buf, v_r0); + _mm_store_ps(buf + 4, v_r1); + _mm_store_ps(buf + 8, v_g0); + _mm_store_ps(buf + 12, v_g1); + _mm_store_ps(buf + 16, v_b0); + _mm_store_ps(buf + 20, v_b1); + } + #endif + void operator()(const uchar* src, uchar* dst, int n) const { int i, j, dcn = dstcn; uchar alpha = ColorChannel::max(); - float buf[3*BLOCK_SIZE]; + CV_DECL_ALIGNED(16) float buf[3*BLOCK_SIZE]; for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) { @@ -5364,6 +5401,38 @@ struct Lab2RGB_b v_dst.val[2] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_128); vst3q_f32(buf + j + 12, v_dst); } + #elif CV_SSE2 + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); + + _MM_DEINTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + + process(_mm_unpacklo_epi8(v_r0, v_zero), + _mm_unpacklo_epi8(v_g0, v_zero), + _mm_unpacklo_epi8(v_b0, v_zero), + buf + j); + + process(_mm_unpackhi_epi8(v_r0, v_zero), + _mm_unpackhi_epi8(v_g0, v_zero), + _mm_unpackhi_epi8(v_b0, v_zero), + buf + j + 24); + + process(_mm_unpacklo_epi8(v_r1, v_zero), + _mm_unpacklo_epi8(v_g1, v_zero), + _mm_unpacklo_epi8(v_b1, v_zero), + buf + j + 48); + + process(_mm_unpackhi_epi8(v_r1, v_zero), + _mm_unpackhi_epi8(v_g1, v_zero), + _mm_unpackhi_epi8(v_b1, v_zero), + buf + j + 72); + } #endif for( ; j < dn*3; j += 3 ) @@ -5404,6 +5473,28 @@ struct Lab2RGB_b vst3_u8(dst, v_dst); } } + #elif CV_SSE2 + if (dcn == 3) + { + for ( ; j <= (dn * 3 - 16); j += 16, dst += 16) + { + __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale); + __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale); + __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale); + __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale); + + __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), + _mm_cvtps_epi32(v_src1)); + __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2), + _mm_cvtps_epi32(v_src3)); + + _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1)); + } + + int jr = j % 3; + if (jr) + dst -= jr, j -= jr; + } #endif for( ; j < dn*3; j += 3, 
dst += dcn ) @@ -5423,6 +5514,9 @@ struct Lab2RGB_b #if CV_NEON float32x4_t v_scale, v_scale_inv, v_128; uint8x8_t v_alpha; + #elif CV_SSE2 + __m128 v_scale, v_scale_inv, v_128; + __m128i v_zero; #endif }; From e20613a776d4c74e718df36f4c5ac6c1c935fff2 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:30 +0300 Subject: [PATCH 31/53] cvtColor RGB 2 Luv u8 --- modules/imgproc/perf/perf_cvt_color.cpp | 2 +- modules/imgproc/src/color.cpp | 91 +++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 1 deletion(-) diff --git a/modules/imgproc/perf/perf_cvt_color.cpp b/modules/imgproc/perf/perf_cvt_color.cpp index 868548c489..4c49fa3a85 100644 --- a/modules/imgproc/perf/perf_cvt_color.cpp +++ b/modules/imgproc/perf/perf_cvt_color.cpp @@ -204,7 +204,7 @@ PERF_TEST_P(Size_CvtMode, cvtColor8u, ) { Size sz = GetParam(); - int mode = COLOR_Lab2RGB; + int mode = COLOR_RGB2Luv; ChPair ch = getConversionInfo(mode); mode %= COLOR_COLORCVT_MAX; diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index fdf7c11bee..32dfc85d53 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -5691,9 +5691,43 @@ struct RGB2Luv_b v_coeff3 = vdupq_n_f32(0.9732824427480916f); v_coeff4 = vdupq_n_f32(136.259541984732824f); v_alpha = vdup_n_u8(ColorChannel::max()); + #elif CV_SSE2 + v_zero = _mm_setzero_si128(); + v_scale_inv = _mm_set1_ps(1.f/255.f); + v_scale = _mm_set1_ps(2.55f); + v_coeff1 = _mm_set1_ps(0.72033898305084743f); + v_coeff2 = _mm_set1_ps(96.525423728813564f); + v_coeff3 = _mm_set1_ps(0.9732824427480916f); + v_coeff4 = _mm_set1_ps(136.259541984732824f); #endif } + #if CV_SSE2 + void process(const float * buf, + __m128i & v_l, __m128i & v_u, __m128i & v_v) const + { + __m128 v_l0f = _mm_load_ps(buf); + __m128 v_l1f = _mm_load_ps(buf + 4); + __m128 v_u0f = _mm_load_ps(buf + 8); + __m128 v_u1f = _mm_load_ps(buf + 12); + __m128 v_v0f = _mm_load_ps(buf + 16); + __m128 v_v1f = _mm_load_ps(buf + 20); + + _MM_DEINTERLIV_PS(v_l0f, v_l1f, v_u0f, v_u1f, v_v0f, v_v1f) + + v_l0f = _mm_mul_ps(v_l0f, v_scale); + v_l1f = _mm_mul_ps(v_l1f, v_scale); + v_u0f = _mm_add_ps(_mm_mul_ps(v_u0f, v_coeff1), v_coeff2); + v_u1f = _mm_add_ps(_mm_mul_ps(v_u1f, v_coeff1), v_coeff2); + v_v0f = _mm_add_ps(_mm_mul_ps(v_v0f, v_coeff3), v_coeff4); + v_v1f = _mm_add_ps(_mm_mul_ps(v_v1f, v_coeff3), v_coeff4); + + v_l = _mm_packs_epi32(_mm_cvtps_epi32(v_l0f), _mm_cvtps_epi32(v_l1f)); + v_u = _mm_packs_epi32(_mm_cvtps_epi32(v_u0f), _mm_cvtps_epi32(v_u1f)); + v_v = _mm_packs_epi32(_mm_cvtps_epi32(v_v0f), _mm_cvtps_epi32(v_v1f)); + } + #endif + void operator()(const uchar* src, uchar* dst, int n) const { int i, j, scn = srccn; @@ -5735,6 +5769,26 @@ struct RGB2Luv_b v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); vst3q_f32(buf + j + 12, v_dst); } + #elif CV_SSE2 + if (scn == 3) + { + for ( ; j <= (dn * 3 - 16); j += 16, src += 16) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)src); + + __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero); + _mm_store_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv)); + _mm_store_ps(buf + j + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv)); + + v_src_p = _mm_unpackhi_epi8(v_src, v_zero); + _mm_store_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv)); + _mm_store_ps(buf + j + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv)); + } + + int jr = j % 3; + 
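// The 16-byte vector loads above do not line up with 3-byte pixels, so j can
// stop partway into a triplet; jr = j % 3 rolls src and j back to the last
// complete 3-channel boundary before the scalar tail resumes. For example,
// with dn == 8 the loop exits at j == 16 (five triplets plus one stray byte)
// and the rollback restarts the tail at j == 15. The same idiom is used in the
// other 8-bit converters in this series.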
if (jr) + src -= jr, j -= jr; + } #endif for( ; j < dn*3; j += 3, src += scn ) { @@ -5760,6 +5814,40 @@ struct RGB2Luv_b vst3_u8(dst + j, v_dst); } + #elif CV_SSE2 + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_l_0, v_u_0, v_v_0; + process(buf + j, + v_l_0, v_u_0, v_v_0); + + __m128i v_l_1, v_u_1, v_v_1; + process(buf + j + 24, + v_l_1, v_u_1, v_v_1); + + __m128i v_l0 = _mm_packus_epi16(v_l_0, v_l_1); + __m128i v_u0 = _mm_packus_epi16(v_u_0, v_u_1); + __m128i v_v0 = _mm_packus_epi16(v_v_0, v_v_1); + + process(buf + j + 48, + v_l_0, v_u_0, v_v_0); + + process(buf + j + 72, + v_l_1, v_u_1, v_v_1); + + __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1); + __m128i v_u1 = _mm_packus_epi16(v_u_0, v_u_1); + __m128i v_v1 = _mm_packus_epi16(v_v_0, v_v_1); + + _MM_INTERLIV_EPI8(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1) + + _mm_storeu_si128((__m128i *)(dst + j), v_l0); + _mm_storeu_si128((__m128i *)(dst + j + 16), v_l1); + _mm_storeu_si128((__m128i *)(dst + j + 32), v_u0); + _mm_storeu_si128((__m128i *)(dst + j + 48), v_u1); + _mm_storeu_si128((__m128i *)(dst + j + 64), v_v0); + _mm_storeu_si128((__m128i *)(dst + j + 80), v_v1); + } #endif for( ; j < dn*3; j += 3 ) @@ -5777,6 +5865,9 @@ struct RGB2Luv_b #if CV_NEON float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_coeff3, v_coeff4; uint8x8_t v_alpha; + #elif CV_SSE2 + __m128 v_scale, v_scale_inv, v_coeff1, v_coeff2, v_coeff3, v_coeff4; + __m128i v_zero; #endif }; From a340ea872eae842cf1bea98fbbbc3bd5a15105b9 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:30 +0300 Subject: [PATCH 32/53] cvtColor Luv 2 RGB u8 --- modules/imgproc/perf/perf_cvt_color.cpp | 168 ++++++++++++++++++++++-- modules/imgproc/src/color.cpp | 109 ++++++++++++++- 2 files changed, 262 insertions(+), 15 deletions(-) diff --git a/modules/imgproc/perf/perf_cvt_color.cpp b/modules/imgproc/perf/perf_cvt_color.cpp index 4c49fa3a85..02622ea805 100644 --- a/modules/imgproc/perf/perf_cvt_color.cpp +++ b/modules/imgproc/perf/perf_cvt_color.cpp @@ -56,10 +56,50 @@ enum }; CV_ENUM(CvtMode, + COLOR_BGR2BGR555, COLOR_BGR2BGR565, COLOR_BGR2BGRA, COLOR_BGR2GRAY, + COLOR_BGR2HLS, COLOR_BGR2HLS_FULL, COLOR_BGR2HSV, COLOR_BGR2HSV_FULL, + COLOR_BGR2Lab, COLOR_BGR2Luv, COLOR_BGR2RGB, COLOR_BGR2RGBA, COLOR_BGR2XYZ, + COLOR_BGR2YCrCb, COLOR_BGR2YUV, COLOR_BGR5552BGR, COLOR_BGR5552BGRA, + + COLOR_BGR5552GRAY, COLOR_BGR5552RGB, COLOR_BGR5552RGBA, COLOR_BGR5652BGR, + COLOR_BGR5652BGRA, COLOR_BGR5652GRAY, COLOR_BGR5652RGB, COLOR_BGR5652RGBA, + + COLOR_BGRA2BGR, COLOR_BGRA2BGR555, COLOR_BGRA2BGR565, COLOR_BGRA2GRAY, COLOR_BGRA2RGBA, + CX_BGRA2HLS, CX_BGRA2HLS_FULL, CX_BGRA2HSV, CX_BGRA2HSV_FULL, CX_BGRA2Lab, CX_BGRA2Luv, CX_BGRA2XYZ, CX_BGRA2YCrCb, CX_BGRA2YUV, + + COLOR_GRAY2BGR, COLOR_GRAY2BGR555, COLOR_GRAY2BGR565, COLOR_GRAY2BGRA, + + COLOR_HLS2BGR, COLOR_HLS2BGR_FULL, COLOR_HLS2RGB, COLOR_HLS2RGB_FULL, + CX_HLS2BGRA, CX_HLS2BGRA_FULL, CX_HLS2RGBA, CX_HLS2RGBA_FULL, + + COLOR_HSV2BGR, COLOR_HSV2BGR_FULL, COLOR_HSV2RGB, COLOR_HSV2RGB_FULL, + CX_HSV2BGRA, CX_HSV2BGRA_FULL, CX_HSV2RGBA, CX_HSV2RGBA_FULL, + + COLOR_Lab2BGR, COLOR_Lab2LBGR, COLOR_Lab2LRGB, COLOR_Lab2RGB, + CX_Lab2BGRA, CX_Lab2LBGRA, CX_Lab2LRGBA, CX_Lab2RGBA, + + COLOR_LBGR2Lab, COLOR_LBGR2Luv, COLOR_LRGB2Lab, COLOR_LRGB2Luv, + CX_LBGRA2Lab, CX_LBGRA2Luv, CX_LRGBA2Lab, CX_LRGBA2Luv, + + COLOR_Luv2BGR, COLOR_Luv2LBGR, COLOR_Luv2LRGB, COLOR_Luv2RGB, + CX_Luv2BGRA, CX_Luv2LBGRA, CX_Luv2LRGBA, CX_Luv2RGBA, + + COLOR_RGB2BGR555, COLOR_RGB2BGR565, COLOR_RGB2GRAY, + COLOR_RGB2HLS, COLOR_RGB2HLS_FULL, COLOR_RGB2HSV, 
COLOR_RGB2HSV_FULL, COLOR_RGB2Lab, COLOR_RGB2Luv, COLOR_RGB2XYZ, COLOR_RGB2YCrCb, COLOR_RGB2YUV, - CX_RGBA2YCrCb, CX_RGBA2YUV) + + COLOR_RGBA2BGR, COLOR_RGBA2BGR555, COLOR_RGBA2BGR565, COLOR_RGBA2GRAY, + CX_RGBA2HLS, CX_RGBA2HLS_FULL, CX_RGBA2HSV, CX_RGBA2HSV_FULL, + CX_RGBA2Lab, CX_RGBA2Luv, CX_RGBA2XYZ, + CX_RGBA2YCrCb, CX_RGBA2YUV, + + COLOR_XYZ2BGR, COLOR_XYZ2RGB, CX_XYZ2BGRA, CX_XYZ2RGBA, + + COLOR_YCrCb2BGR, COLOR_YCrCb2RGB, CX_YCrCb2BGRA, CX_YCrCb2RGBA, + COLOR_YUV2BGR, COLOR_YUV2RGB, CX_YUV2BGRA, CX_YUV2RGBA + ) CV_ENUM(CvtModeBayer, @@ -197,25 +237,135 @@ ChPair getConversionInfo(int cvtMode) return ChPair(0,0); } -typedef perf::TestBaseWithParam Size_CvtMode; +typedef std::tr1::tuple Size_CvtMode_t; +typedef perf::TestBaseWithParam Size_CvtMode; PERF_TEST_P(Size_CvtMode, cvtColor8u, - testing::Values(::perf::szODD, ::perf::szVGA, ::perf::sz1080p) + testing::Combine( + testing::Values(::perf::szODD, ::perf::szVGA, ::perf::sz1080p), + CvtMode::all() + ) ) { - Size sz = GetParam(); - int mode = COLOR_RGB2Luv; + Size sz = get<0>(GetParam()); + int _mode = get<1>(GetParam()), mode = _mode; ChPair ch = getConversionInfo(mode); mode %= COLOR_COLORCVT_MAX; - Mat src(sz, CV_8UC(3)); - Mat dst(sz, CV_8UC(3)); + Mat src(sz, CV_8UC(ch.scn)); + Mat dst(sz, CV_8UC(ch.dcn)); declare.time(100); declare.in(src, WARMUP_RNG).out(dst); int runs = sz.width <= 320 ? 100 : 5; - TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, 3); + TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, ch.dcn); - SANITY_CHECK_NOTHING(); +#if defined(__APPLE__) && defined(HAVE_IPP) + SANITY_CHECK(dst, _mode == CX_BGRA2HLS_FULL ? 2 : 1); +#else + SANITY_CHECK(dst, 1); +#endif +} + +typedef std::tr1::tuple Size_CvtMode_Bayer_t; +typedef perf::TestBaseWithParam Size_CvtMode_Bayer; + +PERF_TEST_P(Size_CvtMode_Bayer, cvtColorBayer8u, + testing::Combine( + testing::Values(::perf::szODD, ::perf::szVGA), + CvtModeBayer::all() + ) + ) +{ + Size sz = get<0>(GetParam()); + int mode = get<1>(GetParam()); + ChPair ch = getConversionInfo(mode); + mode %= COLOR_COLORCVT_MAX; + + Mat src(sz, CV_8UC(ch.scn)); + Mat dst(sz, CV_8UC(ch.dcn)); + + declare.time(100); + declare.in(src, WARMUP_RNG).out(dst); + + TEST_CYCLE() cvtColor(src, dst, mode, ch.dcn); + + SANITY_CHECK(dst, 1); +} + +typedef std::tr1::tuple Size_CvtMode2_t; +typedef perf::TestBaseWithParam Size_CvtMode2; + +PERF_TEST_P(Size_CvtMode2, cvtColorYUV420, + testing::Combine( + testing::Values(szVGA, sz1080p, Size(130, 60)), + CvtMode2::all() + ) + ) +{ + Size sz = get<0>(GetParam()); + int mode = get<1>(GetParam()); + ChPair ch = getConversionInfo(mode); + + Mat src(sz.height + sz.height / 2, sz.width, CV_8UC(ch.scn)); + Mat dst(sz, CV_8UC(ch.dcn)); + + declare.in(src, WARMUP_RNG).out(dst); + + int runs = (sz.width <= 640) ? 8 : 1; + TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, ch.dcn); + + SANITY_CHECK(dst, 1); +} + +typedef std::tr1::tuple Size_CvtMode3_t; +typedef perf::TestBaseWithParam Size_CvtMode3; + +PERF_TEST_P(Size_CvtMode3, cvtColorRGB2YUV420p, + testing::Combine( + testing::Values(szVGA, sz720p, sz1080p, Size(130, 60)), + CvtMode3::all() + ) + ) +{ + Size sz = get<0>(GetParam()); + int mode = get<1>(GetParam()); + ChPair ch = getConversionInfo(mode); + + Mat src(sz, CV_8UC(ch.scn)); + Mat dst(sz.height + sz.height / 2, sz.width, CV_8UC(ch.dcn)); + + declare.time(100); + declare.in(src, WARMUP_RNG).out(dst); + + int runs = (sz.width <= 640) ? 
10 : 1; + TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, ch.dcn); + + SANITY_CHECK(dst, 1); +} + +CV_ENUM(EdgeAwareBayerMode, COLOR_BayerBG2BGR_EA, COLOR_BayerGB2BGR_EA, COLOR_BayerRG2BGR_EA, COLOR_BayerGR2BGR_EA) + +typedef std::tr1::tuple EdgeAwareParams; +typedef perf::TestBaseWithParam EdgeAwareDemosaicingTest; + +PERF_TEST_P(EdgeAwareDemosaicingTest, demosaicingEA, + testing::Combine( + testing::Values(szVGA, sz720p, sz1080p, Size(130, 60)), + EdgeAwareBayerMode::all() + ) + ) +{ + Size sz = get<0>(GetParam()); + int mode = get<1>(GetParam()); + + Mat src(sz, CV_8UC1); + Mat dst(sz, CV_8UC3); + + declare.in(src, WARMUP_RNG).out(dst); + + TEST_CYCLE() cvtColor(src, dst, mode, 3); + + SANITY_CHECK(dst, 1); } diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 32dfc85d53..2794da36e0 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -4376,7 +4376,7 @@ struct HSV2RGB_b { int i, j, dcn = dstcn; uchar alpha = ColorChannel::max(); - CV_DECL_ALIGNED(16) float buf[3*BLOCK_SIZE]; + float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE]; for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) { @@ -4622,7 +4622,7 @@ struct RGB2HLS_b void operator()(const uchar* src, uchar* dst, int n) const { int i, j, scn = srccn; - CV_DECL_ALIGNED(16) float buf[3*BLOCK_SIZE]; + float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE]; for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 ) { @@ -4876,7 +4876,7 @@ struct HLS2RGB_b { int i, j, dcn = dstcn; uchar alpha = ColorChannel::max(); - CV_DECL_ALIGNED(16) float buf[3*BLOCK_SIZE]; + float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE]; for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) { @@ -5375,7 +5375,7 @@ struct Lab2RGB_b { int i, j, dcn = dstcn; uchar alpha = ColorChannel::max(); - CV_DECL_ALIGNED(16) float buf[3*BLOCK_SIZE]; + float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE]; for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) { @@ -5731,7 +5731,7 @@ struct RGB2Luv_b void operator()(const uchar* src, uchar* dst, int n) const { int i, j, scn = srccn; - float buf[3*BLOCK_SIZE]; + float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE]; for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 ) { @@ -5888,14 +5888,54 @@ struct Luv2RGB_b v_140 = vdupq_n_f32(140.f); v_scale = vdupq_n_f32(255.f); v_alpha = vdup_n_u8(ColorChannel::max()); + #elif CV_SSE2 + v_scale_inv = _mm_set1_ps(100.f/255.f); + v_coeff1 = _mm_set1_ps(1.388235294117647f); + v_coeff2 = _mm_set1_ps(1.027450980392157f); + v_134 = _mm_set1_ps(134.f); + v_140 = _mm_set1_ps(140.f); + v_scale = _mm_set1_ps(255.f); + v_zero = _mm_setzero_si128(); #endif } + #if CV_SSE2 + // 16s x 8 + void process(__m128i v_l, __m128i v_u, __m128i v_v, + float * buf) const + { + __m128 v_l0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_l, v_zero)); + __m128 v_u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_u, v_zero)); + __m128 v_v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_v, v_zero)); + + __m128 v_l1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_l, v_zero)); + __m128 v_u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_u, v_zero)); + __m128 v_v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_v, v_zero)); + + v_l0 = _mm_mul_ps(v_l0, v_scale_inv); + v_l1 = _mm_mul_ps(v_l1, v_scale_inv); + + v_u0 = _mm_sub_ps(_mm_mul_ps(v_u0, v_coeff1), v_134); + v_u1 = _mm_sub_ps(_mm_mul_ps(v_u1, v_coeff1), v_134); + v_v0 = _mm_sub_ps(_mm_mul_ps(v_v0, v_coeff2), v_140); + v_v1 = _mm_sub_ps(_mm_mul_ps(v_v1, v_coeff2), v_140); + + _MM_INTERLIV_PS(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1) + + _mm_store_ps(buf, v_l0); + _mm_store_ps(buf + 4, v_l1); + 
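// process() undoes the 8-bit Luv packing before handing the data to the float
// converter: L is rescaled from [0,255] back to [0,100] (factor 100/255),
// while u and v are mapped back to roughly [-134,220] and [-140,122] via
// u*1.388235f - 134 and v*1.027451f - 140 (e.g. a stored byte of 96 decodes to
// about -0.7 for u). _MM_INTERLIV_PS then re-interleaves the planes so the
// surrounding stores fill buf with eight consecutive L,u,v float triplets.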
_mm_store_ps(buf + 8, v_u0); + _mm_store_ps(buf + 12, v_u1); + _mm_store_ps(buf + 16, v_v0); + _mm_store_ps(buf + 20, v_v1); + } + #endif + void operator()(const uchar* src, uchar* dst, int n) const { int i, j, dcn = dstcn; uchar alpha = ColorChannel::max(); - float buf[3*BLOCK_SIZE]; + float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE]; for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) { @@ -5921,6 +5961,38 @@ struct Luv2RGB_b v_dst.val[2] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_coeff2), v_140); vst3q_f32(buf + j + 12, v_dst); } + #elif CV_SSE2 + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); + + _MM_DEINTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + + process(_mm_unpacklo_epi8(v_r0, v_zero), + _mm_unpacklo_epi8(v_g0, v_zero), + _mm_unpacklo_epi8(v_b0, v_zero), + buf + j); + + process(_mm_unpackhi_epi8(v_r0, v_zero), + _mm_unpackhi_epi8(v_g0, v_zero), + _mm_unpackhi_epi8(v_b0, v_zero), + buf + j + 24); + + process(_mm_unpacklo_epi8(v_r1, v_zero), + _mm_unpacklo_epi8(v_g1, v_zero), + _mm_unpacklo_epi8(v_b1, v_zero), + buf + j + 48); + + process(_mm_unpackhi_epi8(v_r1, v_zero), + _mm_unpackhi_epi8(v_g1, v_zero), + _mm_unpackhi_epi8(v_b1, v_zero), + buf + j + 72); + } #endif for( ; j < dn*3; j += 3 ) { @@ -5960,6 +6032,28 @@ struct Luv2RGB_b vst3_u8(dst, v_dst); } } + #elif CV_SSE2 + if (dcn == 3) + { + for ( ; j <= (dn * 3 - 16); j += 16, dst += 16) + { + __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale); + __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale); + __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale); + __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale); + + __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), + _mm_cvtps_epi32(v_src1)); + __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2), + _mm_cvtps_epi32(v_src3)); + + _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1)); + } + + int jr = j % 3; + if (jr) + dst -= jr, j -= jr; + } #endif for( ; j < dn*3; j += 3, dst += dcn ) @@ -5979,6 +6073,9 @@ struct Luv2RGB_b #if CV_NEON float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140; uint8x8_t v_alpha; + #elif CV_SSE2 + __m128 v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140; + __m128i v_zero; #endif }; From 3a426660ea1836d9414847bb6d56c83a9cc90168 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:30 +0300 Subject: [PATCH 33/53] sse_utils.hpp --- modules/core/include/opencv2/core/base.hpp | 2 + .../core/include/opencv2/core/sse_utils.hpp | 497 ++++++++++++++++++ modules/imgproc/src/color.cpp | 309 +++-------- 3 files changed, 581 insertions(+), 227 deletions(-) create mode 100644 modules/core/include/opencv2/core/sse_utils.hpp diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp index e43fbbc951..c60eedddcc 100644 --- a/modules/core/include/opencv2/core/base.hpp +++ b/modules/core/include/opencv2/core/base.hpp @@ -813,4 +813,6 @@ inline float32x2_t cv_vsqrt_f32(float32x2_t val) } // cv +#include "sse_utils.hpp" + #endif //__OPENCV_CORE_BASE_HPP__ diff --git a/modules/core/include/opencv2/core/sse_utils.hpp 
b/modules/core/include/opencv2/core/sse_utils.hpp new file mode 100644 index 0000000000..13673f57b3 --- /dev/null +++ b/modules/core/include/opencv2/core/sse_utils.hpp @@ -0,0 +1,497 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2015, OpenCV Foundation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_CORE_SSE_UTILS_HPP__ +#define __OPENCV_CORE_SSE_UTILS_HPP__ + +#ifndef __cplusplus +# error base.hpp header must be compiled as C++ +#endif + +#if CV_SSE2 + +inline void _mm_deinterliv_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, + __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) +{ + __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1); + __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1); + __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0); + __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0); + __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1); + __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1); + + __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3); + __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3); + __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4); + __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4); + __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5); + __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5); + + __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3); + __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3); + __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4); + __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4); + __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5); + __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5); + + __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3); + __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3); + __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4); + __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4); + __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5); + __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5); + + v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3); + v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3); + v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4); + v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4); + v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5); + v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5); +} + +inline void _mm_deinterliv_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, + __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) +{ + __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0); + __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0); + __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1); + __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1); + __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0); + __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0); + __m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1); + __m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1); + + __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4); + __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4); + __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5); + __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5); + __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6); + __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6); + __m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7); + __m128i layer2_chunk7 = 
_mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7); + + __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4); + __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4); + __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5); + __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5); + __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6); + __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6); + __m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7); + __m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7); + + __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4); + __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4); + __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5); + __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5); + __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6); + __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6); + __m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7); + __m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7); + + v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4); + v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4); + v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5); + v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5); + v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6); + v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6); + v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7); + v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7); +} + +inline void _mm_interlive_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, + __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) +{ + __m128i v_mask = _mm_set1_epi16(0x00ff); + + __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); + __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8)); + __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); + __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8)); + __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask)); + __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8)); + + __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask)); + __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8)); + __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask)); + __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8)); + __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask)); + __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8)); + + __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); + __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8)); + __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); + __m128i 
layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8)); + __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); + __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8)); + + __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); + __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8)); + __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); + __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8)); + __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); + __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8)); + + v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); + v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8)); + v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); + v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8)); + v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); + v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8)); +} + +inline void _mm_interlive_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, + __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) +{ + __m128i v_mask = _mm_set1_epi16(0x00ff); + + __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); + __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8)); + __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); + __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8)); + __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask)); + __m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8)); + __m128i layer4_chunk3 = _mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask)); + __m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8)); + + __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask)); + __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8)); + __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask)); + __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8)); + __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask)); + __m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8)); + __m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask)); + __m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 
8)); + + __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); + __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8)); + __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); + __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8)); + __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); + __m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8)); + __m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask)); + __m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), _mm_srli_epi16(layer3_chunk7, 8)); + + __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); + __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8)); + __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); + __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8)); + __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); + __m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8)); + __m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask)); + __m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8)); + + v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); + v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8)); + v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); + v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8)); + v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); + v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8)); + v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask)); + v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8)); +} + +inline void _mm_deinterliv_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, + __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) +{ + __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1); + __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1); + __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0); + __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0); + __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1); + __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1); + + __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3); + __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3); + __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4); + __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4); + __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5); + __m128i layer2_chunk5 = 
_mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5); + + __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3); + __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3); + __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4); + __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4); + __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5); + __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5); + + v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3); + v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3); + v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4); + v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4); + v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5); + v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5); +} + +inline void _mm_deinterliv_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, + __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) +{ + __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0); + __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0); + __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1); + __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1); + __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0); + __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0); + __m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1); + __m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1); + + __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4); + __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4); + __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5); + __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5); + __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6); + __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6); + __m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7); + __m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7); + + __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4); + __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4); + __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5); + __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5); + __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6); + __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6); + __m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7); + __m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7); + + v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4); + v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4); + v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5); + v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5); + v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6); + v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6); + v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7); + v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7); +} + +inline void _mm_interliv_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, + __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) +{ + __m128i v_mask = _mm_set1_epi32(0x0000ffff); + + __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); + __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), 
_mm_srli_epi32(v_r1, 16)); + __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); + __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16)); + __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask)); + __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16)); + + __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); + __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); + __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); + __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); + __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); + __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16)); + + __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); + __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); + __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); + __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); + __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); + __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16)); + + v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); + v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16)); + v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); + v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16)); + v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); + v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16)); +} + +inline void _mm_interliv_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, + __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) +{ + __m128i v_mask = _mm_set1_epi32(0x0000ffff); + + __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); + __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16)); + __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); + __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16)); + __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask)); + __m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16)); + __m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask)); + __m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16)); + + __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), 
_mm_and_si128(layer3_chunk1, v_mask)); + __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); + __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); + __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); + __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); + __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16)); + __m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask)); + __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16)); + + __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); + __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); + __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); + __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); + __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); + __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16)); + __m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask)); + __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16)); + + v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); + v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16)); + v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); + v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16)); + v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); + v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16)); + v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask)); + v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16)); +} + +inline void _mm_deinterliv_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, + __m128 & v_g1, __m128 & v_b0, __m128 & v_b1) +{ + __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1); + __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1); + __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0); + __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0); + __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1); + __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1); + + __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3); + __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3); + __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4); + __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4); + __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5); + __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5); + + v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3); + v_r1 = 
_mm_unpackhi_ps(layer2_chunk0, layer2_chunk3); + v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4); + v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4); + v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5); + v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5); +} + +inline void _mm_deinterliv_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1, + __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1) +{ + __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0); + __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0); + __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1); + __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1); + __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0); + __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0); + __m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1); + __m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1); + + __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4); + __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4); + __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5); + __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5); + __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6); + __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6); + __m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7); + __m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7); + + v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4); + v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4); + v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5); + v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5); + v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6); + v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6); + v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7); + v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7); +} + +inline void _mm_interliv_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, + __m128 & v_g1, __m128 & v_b0, __m128 & v_b1) +{ + const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); + + __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo); + __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi); + __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo); + __m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi); + __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo); + __m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi); + + __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo); + __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi); + __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo); + __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi); + __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo); + __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi); + + v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo); + v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi); + v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo); + v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi); + v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo); + v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi); +} + +inline void _mm_interliv_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1, + __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1) +{ + const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), 
mask_hi = _MM_SHUFFLE(3, 1, 3, 1); + + __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo); + __m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi); + __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo); + __m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi); + __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo); + __m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi); + __m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo); + __m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi); + + __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo); + __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi); + __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo); + __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi); + __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo); + __m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi); + __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo); + __m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi); + + v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo); + v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi); + v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo); + v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi); + v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo); + v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi); + v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo); + v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi); +} + +#endif + +#endif //__OPENCV_CORE_SSE_UTILS_HPP__ diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 2794da36e0..4d74e86dba 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -102,205 +102,6 @@ static IppStatus sts = ippInit(); #endif -#if CV_SSE2 - -#define _MM_DEINTERLIV_EPI8(layer0_chunk0, layer0_chunk1, layer0_chunk2, \ - layer0_chunk3, layer0_chunk4, layer0_chunk5) \ - { \ - __m128i layer1_chunk0 = _mm_unpacklo_epi8(layer0_chunk0, layer0_chunk3); \ - __m128i layer1_chunk1 = _mm_unpackhi_epi8(layer0_chunk0, layer0_chunk3); \ - __m128i layer1_chunk2 = _mm_unpacklo_epi8(layer0_chunk1, layer0_chunk4); \ - __m128i layer1_chunk3 = _mm_unpackhi_epi8(layer0_chunk1, layer0_chunk4); \ - __m128i layer1_chunk4 = _mm_unpacklo_epi8(layer0_chunk2, layer0_chunk5); \ - __m128i layer1_chunk5 = _mm_unpackhi_epi8(layer0_chunk2, layer0_chunk5); \ - \ - __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3); \ - __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3); \ - __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4); \ - __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4); \ - __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5); \ - __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5); \ - \ - __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3); \ - __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3); \ - __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4); \ - __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4); \ - __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5); \ - __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5); \ - \ - __m128i 
layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3); \ - __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3); \ - __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4); \ - __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4); \ - __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5); \ - __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5); \ - \ - layer0_chunk0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3); \ - layer0_chunk1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3); \ - layer0_chunk2 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4); \ - layer0_chunk3 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4); \ - layer0_chunk4 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5); \ - layer0_chunk5 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5); \ - } - -#define _MM_INTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) \ - { \ - __m128i v_mask = _mm_set1_epi16(0x00ff); \ - \ - __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); \ - __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8)); \ - __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); \ - __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8)); \ - __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask)); \ - __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8)); \ - \ - __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask)); \ - __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8)); \ - __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask)); \ - __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8)); \ - __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask)); \ - __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8)); \ - \ - __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); \ - __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8)); \ - __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); \ - __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8)); \ - __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); \ - __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8)); \ - \ - __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); \ - __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8)); \ - __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); \ - __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8)); \ - __m128i layer1_chunk2 = 
_mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); \ - __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8)); \ - \ - v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); \ - v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8)); \ - v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); \ - v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8)); \ - v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); \ - v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8)); \ - } - -#define _MM_DEINTERLIV_EPI16(layer0_chunk0, layer0_chunk1, layer0_chunk2, \ - layer0_chunk3, layer0_chunk4, layer0_chunk5) \ - { \ - __m128i layer1_chunk0 = _mm_unpacklo_epi16(layer0_chunk0, layer0_chunk3); \ - __m128i layer1_chunk1 = _mm_unpackhi_epi16(layer0_chunk0, layer0_chunk3); \ - __m128i layer1_chunk2 = _mm_unpacklo_epi16(layer0_chunk1, layer0_chunk4); \ - __m128i layer1_chunk3 = _mm_unpackhi_epi16(layer0_chunk1, layer0_chunk4); \ - __m128i layer1_chunk4 = _mm_unpacklo_epi16(layer0_chunk2, layer0_chunk5); \ - __m128i layer1_chunk5 = _mm_unpackhi_epi16(layer0_chunk2, layer0_chunk5); \ - \ - __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3); \ - __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3); \ - __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4); \ - __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4); \ - __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5); \ - __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5); \ - \ - __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3); \ - __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3); \ - __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4); \ - __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4); \ - __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5); \ - __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5); \ - \ - layer0_chunk0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3); \ - layer0_chunk1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3); \ - layer0_chunk2 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4); \ - layer0_chunk3 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4); \ - layer0_chunk4 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5); \ - layer0_chunk5 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5); \ - } - -#define _MM_INTERLIV_EPI16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) \ - { \ - __m128i v_mask = _mm_set1_epi32(0x0000ffff); \ - \ - __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); \ - __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16)); \ - __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); \ - __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16)); \ - __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask)); \ - __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16)); \ - \ - __m128i 
layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); \ - __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); \ - __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); \ - __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); \ - __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); \ - __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16)); \ - \ - __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); \ - __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); \ - __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); \ - __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); \ - __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); \ - __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16)); \ - \ - v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); \ - v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16)); \ - v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); \ - v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16)); \ - v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); \ - v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16)); \ - } - -#define _MM_DEINTERLIV_PS(layer0_chunk0, layer0_chunk1, layer0_chunk2, \ - layer0_chunk3, layer0_chunk4, layer0_chunk5) \ - { \ - __m128 layer1_chunk0 = _mm_unpacklo_ps(layer0_chunk0, layer0_chunk3); \ - __m128 layer1_chunk1 = _mm_unpackhi_ps(layer0_chunk0, layer0_chunk3); \ - __m128 layer1_chunk2 = _mm_unpacklo_ps(layer0_chunk1, layer0_chunk4); \ - __m128 layer1_chunk3 = _mm_unpackhi_ps(layer0_chunk1, layer0_chunk4); \ - __m128 layer1_chunk4 = _mm_unpacklo_ps(layer0_chunk2, layer0_chunk5); \ - __m128 layer1_chunk5 = _mm_unpackhi_ps(layer0_chunk2, layer0_chunk5); \ - \ - __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3); \ - __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3); \ - __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4); \ - __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4); \ - __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5); \ - __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5); \ - \ - layer0_chunk0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3); \ - layer0_chunk1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3); \ - layer0_chunk2 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4); \ - layer0_chunk3 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4); \ - layer0_chunk4 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5); \ - layer0_chunk5 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5); \ - } - -#define _MM_INTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) \ - { \ - const 
int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); \ - \ - __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo); \ - __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi); \ - __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo); \ - __m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi); \ - __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo); \ - __m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi); \ - \ - __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo); \ - __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi); \ - __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo); \ - __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi); \ - __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo); \ - __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi); \ - \ - v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo); \ - v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi); \ - v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo); \ - v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi); \ - v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo); \ - v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi); \ - } - -#endif - namespace cv { @@ -1703,7 +1504,34 @@ struct RGB2Gray __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32)); __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40)); - _MM_DEINTERLIV_EPI16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + _mm_deinterliv_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + + __m128i v_gray0; + process(v_r0, v_g0, v_b0, + v_gray0); + + __m128i v_gray1; + process(v_r1, v_g1, v_b1, + v_gray1); + + _mm_storeu_si128((__m128i *)(dst + i), v_gray0); + _mm_storeu_si128((__m128i *)(dst + i + 8), v_gray1); + } + } + else if (scn == 4) + { + for ( ; i <= n - 16; i += 16, src += scn * 16) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40)); + __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 48)); + __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 56)); + + _mm_deinterliv_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1); __m128i v_gray0; process(v_r0, v_g0, v_b0, @@ -1768,7 +1596,34 @@ struct RGB2Gray __m128 v_b0 = _mm_loadu_ps(src + 16); __m128 v_b1 = _mm_loadu_ps(src + 20); - _MM_DEINTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + _mm_deinterliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + + __m128 v_gray0; + process(v_r0, v_g0, v_b0, + v_gray0); + + __m128 v_gray1; + process(v_r1, v_g1, v_b1, + v_gray1); + + _mm_storeu_ps(dst + i, v_gray0); + _mm_storeu_ps(dst + i + 4, v_gray1); + } + } + else if (scn == 4) + { + for ( ; i <= n - 8; i += 8, src += scn * 8) + { + __m128 v_r0 = _mm_loadu_ps(src); + __m128 v_r1 = _mm_loadu_ps(src + 4); + __m128 v_g0 = _mm_loadu_ps(src + 8); + __m128 v_g1 = _mm_loadu_ps(src + 12); + __m128 v_b0 = _mm_loadu_ps(src + 16); + __m128 v_b1 = _mm_loadu_ps(src + 20); + __m128 v_a0 = _mm_loadu_ps(src + 24); + __m128 v_a1 = _mm_loadu_ps(src + 28); + + _mm_deinterliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1); __m128 v_gray0; process(v_r0, v_g0, v_b0, @@ -1966,7 +1821,7 
@@ struct RGB2YCrCb_f __m128 v_b0 = _mm_loadu_ps(src + 16); __m128 v_b1 = _mm_loadu_ps(src + 20); - _MM_DEINTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + _mm_deinterliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); __m128 v_y0, v_cr0, v_cb0; process(v_r0, v_g0, v_b0, @@ -1976,7 +1831,7 @@ struct RGB2YCrCb_f process(v_r1, v_g1, v_b1, v_y1, v_cr1, v_cb1); - _MM_INTERLIV_PS(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1) + _mm_interliv_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1); _mm_storeu_ps(dst + i, v_y0); _mm_storeu_ps(dst + i + 4, v_y1); @@ -2331,7 +2186,7 @@ struct RGB2YCrCb_i __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 64)); __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 80)); - _MM_DEINTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + _mm_deinterliv_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); __m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero; process(_mm_unpacklo_epi8(v_r0, v_zero), @@ -2363,7 +2218,7 @@ struct RGB2YCrCb_i __m128i v_cr_1 = _mm_packus_epi16(v_cr0, v_cr1); __m128i v_cb_1 = _mm_packus_epi16(v_cb0, v_cb1); - _MM_INTERLIV_EPI8(v_y_0, v_y_1, v_cr_0, v_cr_1, v_cb_0, v_cb_1) + _mm_interlive_epi8(v_y_0, v_y_1, v_cr_0, v_cr_1, v_cb_0, v_cb_1); _mm_storeu_si128((__m128i *)(dst + i), v_y_0); _mm_storeu_si128((__m128i *)(dst + i + 16), v_y_1); @@ -2473,7 +2328,7 @@ struct RGB2YCrCb_i __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32)); __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40)); - _MM_DEINTERLIV_EPI16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + _mm_deinterliv_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); __m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero; process(v_r0, v_g0, v_b0, @@ -2483,7 +2338,7 @@ struct RGB2YCrCb_i process(v_r1, v_g1, v_b1, v_y1, v_cr1, v_cb1); - _MM_INTERLIV_EPI16(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1) + _mm_interliv_epi16(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1); _mm_storeu_si128((__m128i *)(dst + i), v_y0); _mm_storeu_si128((__m128i *)(dst + i + 8), v_y1); @@ -2681,7 +2536,7 @@ struct YCrCb2RGB_f __m128 v_cb0 = _mm_loadu_ps(src + i + 16); __m128 v_cb1 = _mm_loadu_ps(src + i + 20); - _MM_DEINTERLIV_PS(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1) + _mm_deinterliv_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1); __m128 v_r0, v_g0, v_b0; process(v_y0, v_cr0, v_cb0, @@ -2691,7 +2546,7 @@ struct YCrCb2RGB_f process(v_y1, v_cr1, v_cb1, v_r1, v_g1, v_b1); - _MM_INTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + _mm_interliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); _mm_storeu_ps(dst, v_r0); _mm_storeu_ps(dst + 4, v_r1); @@ -3094,7 +2949,7 @@ struct YCrCb2RGB_i __m128i v_cb0 = _mm_loadu_si128((__m128i const *)(src + i + 64)); __m128i v_cb1 = _mm_loadu_si128((__m128i const *)(src + i + 80)); - _MM_DEINTERLIV_EPI8(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1) + _mm_deinterliv_epi8(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1); __m128i v_r_0 = v_zero, v_g_0 = v_zero, v_b_0 = v_zero; process(_mm_unpacklo_epi8(v_y0, v_zero), @@ -3132,7 +2987,7 @@ struct YCrCb2RGB_i std::swap(v_r1, v_b1); } - _MM_INTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + _mm_interlive_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); _mm_storeu_si128((__m128i *)(dst), v_r0); _mm_storeu_si128((__m128i *)(dst + 16), v_r1); @@ -3355,7 +3210,7 @@ struct RGB2XYZ_f __m128 v_b0 = _mm_loadu_ps(src + 16); __m128 v_b1 = _mm_loadu_ps(src + 20); - _MM_DEINTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + _mm_deinterliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); __m128 v_x0, v_y0, v_z0; process(v_r0, v_g0, v_b0, @@ -3365,7 +3220,7 @@ struct RGB2XYZ_f process(v_r1, v_g1, v_b1, v_x1, v_y1, v_z1); - 
_MM_INTERLIV_PS(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1) + _mm_interliv_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1); _mm_storeu_ps(dst + i, v_x0); _mm_storeu_ps(dst + i + 4, v_x1); @@ -3781,7 +3636,7 @@ struct XYZ2RGB_f __m128 v_z0 = _mm_loadu_ps(src + i + 16); __m128 v_z1 = _mm_loadu_ps(src + i + 20); - _MM_DEINTERLIV_PS(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1) + _mm_deinterliv_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1); __m128 v_r0, v_g0, v_b0; process(v_x0, v_y0, v_z0, @@ -3791,7 +3646,7 @@ struct XYZ2RGB_f process(v_x1, v_y1, v_z1, v_r1, v_g1, v_b1); - _MM_INTERLIV_PS(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1) + _mm_interliv_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1); _mm_storeu_ps(dst, v_b0); _mm_storeu_ps(dst + 4, v_b1); @@ -4361,7 +4216,7 @@ struct HSV2RGB_b v_g1 = _mm_mul_ps(v_g1, v_scale_inv); v_b1 = _mm_mul_ps(v_b1, v_scale_inv); - _MM_INTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + _mm_interliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); _mm_store_ps(buf, v_r0); _mm_store_ps(buf + 4, v_r1); @@ -4412,7 +4267,7 @@ struct HSV2RGB_b __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); - _MM_DEINTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + _mm_deinterliv_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); process(_mm_unpacklo_epi8(v_r0, v_zero), _mm_unpacklo_epi8(v_g0, v_zero), @@ -4606,7 +4461,7 @@ struct RGB2HLS_b __m128 v_s0f = _mm_load_ps(buf + 16); __m128 v_s1f = _mm_load_ps(buf + 20); - _MM_DEINTERLIV_PS(v_h0f, v_h1f, v_l0f, v_l1f, v_s0f, v_s1f) + _mm_deinterliv_ps(v_h0f, v_h1f, v_l0f, v_l1f, v_s0f, v_s1f); v_l0f = _mm_mul_ps(v_l0f, v_scale); v_l1f = _mm_mul_ps(v_l1f, v_scale); @@ -4729,7 +4584,7 @@ struct RGB2HLS_b __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1); __m128i v_s1 = _mm_packus_epi16(v_s_0, v_s_1); - _MM_INTERLIV_EPI8(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1) + _mm_interlive_epi8(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1); _mm_storeu_si128((__m128i *)(dst + j), v_h0); _mm_storeu_si128((__m128i *)(dst + j + 16), v_h1); @@ -4861,7 +4716,7 @@ struct HLS2RGB_b v_g1 = _mm_mul_ps(v_g1, v_scale_inv); v_b1 = _mm_mul_ps(v_b1, v_scale_inv); - _MM_INTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + _mm_interliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); _mm_store_ps(buf, v_r0); _mm_store_ps(buf + 4, v_r1); @@ -4912,7 +4767,7 @@ struct HLS2RGB_b __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); - _MM_DEINTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + _mm_deinterliv_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); process(_mm_unpacklo_epi8(v_r0, v_zero), _mm_unpacklo_epi8(v_g0, v_zero), @@ -5360,7 +5215,7 @@ struct Lab2RGB_b v_b0 = _mm_sub_ps(v_b0, v_128); v_b1 = _mm_sub_ps(v_b1, v_128); - _MM_INTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + _mm_interliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); _mm_store_ps(buf, v_r0); _mm_store_ps(buf + 4, v_r1); @@ -5411,7 +5266,7 @@ struct Lab2RGB_b __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); - _MM_DEINTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + _mm_deinterliv_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); process(_mm_unpacklo_epi8(v_r0, v_zero), _mm_unpacklo_epi8(v_g0, v_zero), @@ -5713,7 +5568,7 @@ struct RGB2Luv_b __m128 v_v0f = _mm_load_ps(buf + 16); __m128 v_v1f = _mm_load_ps(buf + 20); - _MM_DEINTERLIV_PS(v_l0f, v_l1f, v_u0f, v_u1f, v_v0f, v_v1f) + _mm_deinterliv_ps(v_l0f, v_l1f, v_u0f, v_u1f, v_v0f, v_v1f); v_l0f = _mm_mul_ps(v_l0f, v_scale); v_l1f = 
_mm_mul_ps(v_l1f, v_scale); @@ -5839,7 +5694,7 @@ struct RGB2Luv_b __m128i v_u1 = _mm_packus_epi16(v_u_0, v_u_1); __m128i v_v1 = _mm_packus_epi16(v_v_0, v_v_1); - _MM_INTERLIV_EPI8(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1) + _mm_interlive_epi8(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1); _mm_storeu_si128((__m128i *)(dst + j), v_l0); _mm_storeu_si128((__m128i *)(dst + j + 16), v_l1); @@ -5920,7 +5775,7 @@ struct Luv2RGB_b v_v0 = _mm_sub_ps(_mm_mul_ps(v_v0, v_coeff2), v_140); v_v1 = _mm_sub_ps(_mm_mul_ps(v_v1, v_coeff2), v_140); - _MM_INTERLIV_PS(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1) + _mm_interliv_ps(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1); _mm_store_ps(buf, v_l0); _mm_store_ps(buf + 4, v_l1); @@ -5971,7 +5826,7 @@ struct Luv2RGB_b __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); - _MM_DEINTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + _mm_deinterliv_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); process(_mm_unpacklo_epi8(v_r0, v_zero), _mm_unpacklo_epi8(v_g0, v_zero), From bc394e7516194be88816cbc8cf7603d394f84433 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:30 +0300 Subject: [PATCH 34/53] detection of other CPU features --- CMakeLists.txt | 2 + cmake/OpenCVCompilerOptions.cmake | 11 ++- modules/core/include/opencv2/core/cvdef.h | 91 ++++++++++++++++--- modules/core/include/opencv2/core/utility.hpp | 34 +++++-- modules/core/src/system.cpp | 10 ++ modules/ts/src/ts_func.cpp | 36 ++++++++ 6 files changed, 157 insertions(+), 27 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7b5648efd4..d9bb04081c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,12 +214,14 @@ OCV_OPTION(ENABLE_COVERAGE "Enable coverage collection with GCov" OCV_OPTION(ENABLE_OMIT_FRAME_POINTER "Enable -fomit-frame-pointer for GCC" ON IF CMAKE_COMPILER_IS_GNUCXX AND NOT (APPLE AND CMAKE_COMPILER_IS_CLANGCXX) ) OCV_OPTION(ENABLE_POWERPC "Enable PowerPC for GCC" ON IF (CMAKE_COMPILER_IS_GNUCXX AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) ) OCV_OPTION(ENABLE_FAST_MATH "Enable -ffast-math (not recommended for GCC 4.6.x)" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) +OCV_OPTION(ENABLE_POPCNT "Enable POPCNT instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE "Enable SSE instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE2 "Enable SSE2 instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE3 "Enable SSE3 instructions" ON IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) +OCV_OPTION(ENABLE_FMA3 "Enable FMA3 instructions" ON IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_AVX "Enable AVX instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_AVX2 "Enable AVX2 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_NEON "Enable NEON instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) ) diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index 831026fb50..3d1155c872 100644 --- 
a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -122,16 +122,19 @@ if(CMAKE_COMPILER_IS_GNUCXX) if(ENABLE_POWERPC) add_extra_compiler_option("-mcpu=G3 -mtune=G5") endif() + if(ENABLE_POPCNT) + add_extra_compiler_option(-mpopcnt) + endif() if(ENABLE_SSE) add_extra_compiler_option(-msse) endif() if(ENABLE_SSE2) add_extra_compiler_option(-msse2) endif() - if (ENABLE_NEON) + if(ENABLE_NEON) add_extra_compiler_option("-mfpu=neon") endif() - if (ENABLE_VFPV3 AND NOT ENABLE_NEON) + if(ENABLE_VFPV3 AND NOT ENABLE_NEON) add_extra_compiler_option("-mfpu=vfpv3") endif() @@ -162,6 +165,10 @@ if(CMAKE_COMPILER_IS_GNUCXX) add_extra_compiler_option(-msse4.2) endif() endif() + + if(ENABLE_FMA3) + add_extra_compiler_option(-mfma) + endif() endif(NOT MINGW) if(X86 OR X86_64) diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index c52cb021cb..45146a39c2 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -104,18 +104,32 @@ #endif /* CPU features and intrinsics support */ -#define CV_CPU_NONE 0 -#define CV_CPU_MMX 1 -#define CV_CPU_SSE 2 -#define CV_CPU_SSE2 3 -#define CV_CPU_SSE3 4 -#define CV_CPU_SSSE3 5 -#define CV_CPU_SSE4_1 6 -#define CV_CPU_SSE4_2 7 -#define CV_CPU_POPCNT 8 -#define CV_CPU_AVX 10 -#define CV_CPU_AVX2 11 -#define CV_CPU_NEON 12 +#define CV_CPU_NONE 0 +#define CV_CPU_MMX 1 +#define CV_CPU_SSE 2 +#define CV_CPU_SSE2 3 +#define CV_CPU_SSE3 4 +#define CV_CPU_SSSE3 5 +#define CV_CPU_SSE4_1 6 +#define CV_CPU_SSE4_2 7 +#define CV_CPU_POPCNT 8 + +#define CV_CPU_AVX 10 +#define CV_CPU_AVX2 11 +#define CV_CPU_FMA3 12 + +#define CV_CPU_AVX_512F 13 +#define CV_CPU_AVX_512BW 14 +#define CV_CPU_AVX_512CD 15 +#define CV_CPU_AVX_512DQ 16 +#define CV_CPU_AVX_512ER 17 +#define CV_CPU_AVX_512IFMA512 18 +#define CV_CPU_AVX_512PF 19 +#define CV_CPU_AVX_512VBMI 20 +#define CV_CPU_AVX_512VL 21 + +#define CV_CPU_NEON 100 + // when adding to this list remember to update the enum in core/utility.cpp #define CV_HARDWARE_MAX_FEATURE 255 @@ -124,6 +138,7 @@ #if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) # include +# define CV_MMX # define CV_SSE 1 # define CV_SSE2 1 # if defined __SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500) @@ -142,6 +157,14 @@ # include # define CV_SSE4_2 1 # endif +# if defined __FMA__ || (defined _MSC_VER && _MSC_VER >= 1500) +# include +# define CV_FMA3 1 +# endif +# if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500) +# include +# define CV_POPCNT 1 +# endif # if defined __AVX__ || defined __AVX2__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219) // MS Visual Studio 2010 (2012?) 
has no macro pre-defined to identify the use of /arch:AVX // See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32 @@ -151,11 +174,12 @@ # define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK) # else # define __xgetbv() 0 -# ifdef __AVX2__ -# define CV_AVX2 1 -# endif # endif # endif +# if defined __AVX2__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219) +# include +# define CV_AVX2 1 +# endif #endif #if (defined WIN32 || defined _WIN32) && defined(_M_ARM) @@ -170,6 +194,9 @@ #endif // __CUDACC__ +#ifndef CV_MMX +# define CV_MMX 0 +#endif #ifndef CV_SSE # define CV_SSE 0 #endif @@ -194,6 +221,40 @@ #ifndef CV_AVX2 # define CV_AVX2 0 #endif +#ifndef CV_POPCNT +#define CV_POPCNT 0 +#endif +#ifndef CV_FMA3 +# define CV_FMA3 0 +#endif +#ifndef CV_AVX_512F +# define CV_AVX_512F 0 +#endif +#ifndef CV_AVX_512BW +# define CV_AVX_512BW 0 +#endif +#ifndef CV_AVX_512CD +# define CV_AVX_512CD 0 +#endif +#ifndef CV_AVX_512DQ +# define CV_AVX_512DQ 0 +#endif +#ifndef CV_AVX_512ER +# define CV_AVX_512ER 0 +#endif +#ifndef CV_AVX_512IFMA512 +# define CV_AVX_512IFMA512 0 +#endif +#ifndef CV_AVX_512PF +# define CV_AVX_512PF 0 +#endif +#ifndef CV_AVX_512VBMI +# define CV_AVX_512VBMI 0 +#endif +#ifndef CV_AVX_512VL +# define CV_AVX_512VL 0 +#endif + #ifndef CV_NEON # define CV_NEON 0 #endif diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp index 88989ef5cb..fb8ccd88d7 100644 --- a/modules/core/include/opencv2/core/utility.hpp +++ b/modules/core/include/opencv2/core/utility.hpp @@ -281,16 +281,30 @@ CV_EXPORTS_W int64 getCPUTickCount(); remember to keep this list identical to the one in cvdef.h */ enum CpuFeatures { - CPU_MMX = 1, - CPU_SSE = 2, - CPU_SSE2 = 3, - CPU_SSE3 = 4, - CPU_SSSE3 = 5, - CPU_SSE4_1 = 6, - CPU_SSE4_2 = 7, - CPU_POPCNT = 8, - CPU_AVX = 10, - CPU_NEON = 11 + CPU_MMX = 1, + CPU_SSE = 2, + CPU_SSE2 = 3, + CPU_SSE3 = 4, + CPU_SSSE3 = 5, + CPU_SSE4_1 = 6, + CPU_SSE4_2 = 7, + CPU_POPCNT = 8, + + CPU_AVX = 10, + CPU_AVX2 = 11, + CPU_FMA3 = 12, + + CPU_AVX_512F = 13, + CPU_AVX_512BW = 14, + CPU_AVX_512CD = 15, + CPU_AVX_512DQ = 16, + CPU_AVX_512ER = 17, + CPU_AVX_512IFMA512 = 18, + CPU_AVX_512PF = 19, + CPU_AVX_512VBMI = 20, + CPU_AVX_512VL = 21, + + CPU_NEON = 100 }; /** @brief Returns true if the specified feature is supported by the host hardware. 
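
For reference, the CpuFeatures values added in the hunk above are consumed through the existing cv::checkHardwareSupport() API, which is why the enum comment insists the list stay identical to the CV_CPU_* defines in cvdef.h. A minimal usage sketch follows; it is illustrative only (not part of the patch series) and assumes an OpenCV build that already includes this patch:

    #include <cstdio>
    #include <opencv2/core/utility.hpp>

    int main()
    {
        // Each query returns true only if the runtime detection
        // (HWFeatures in system.cpp, extended below) reports the feature.
        std::printf("AVX2     : %d\n", (int)cv::checkHardwareSupport(cv::CPU_AVX2));
        std::printf("FMA3     : %d\n", (int)cv::checkHardwareSupport(cv::CPU_FMA3));
        std::printf("AVX-512F : %d\n", (int)cv::checkHardwareSupport(cv::CPU_AVX_512F));
        return 0;
    }

Gating an optimized code path on such a check at runtime, rather than at compile time alone, is the pattern the earlier convert.cpp changes rely on as well.
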
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 11bbab3a25..a7a6e98d4d 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -263,6 +263,7 @@ struct HWFeatures f.have[CV_CPU_SSE2] = (cpuid_data[3] & (1<<26)) != 0; f.have[CV_CPU_SSE3] = (cpuid_data[2] & (1<<0)) != 0; f.have[CV_CPU_SSSE3] = (cpuid_data[2] & (1<<9)) != 0; + f.have[CV_CPU_FMA3] = (cpuid_data[2] & (1<<12)) != 0; f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0; f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0; f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0; @@ -301,6 +302,15 @@ struct HWFeatures #endif f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0; + f.have[CV_CPU_AVX_512F] = (cpuid_data[1] & (1<<16)) != 0; + f.have[CV_CPU_AVX_512DQ] = (cpuid_data[1] & (1<<17)) != 0; + f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0; + f.have[CV_CPU_AVX_512PF] = (cpuid_data[1] & (1<<26)) != 0; + f.have[CV_CPU_AVX_512ER] = (cpuid_data[1] & (1<<27)) != 0; + f.have[CV_CPU_AVX_512CD] = (cpuid_data[1] & (1<<28)) != 0; + f.have[CV_CPU_AVX_512BW] = (cpuid_data[1] & (1<<30)) != 0; + f.have[CV_CPU_AVX_512VL] = (cpuid_data[1] & (1<<31)) != 0; + f.have[CV_CPU_AVX_512VBMI] = (cpuid_data[2] & (1<<1)) != 0; } return f; diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp index 53b62e74d7..84a9233dd2 100644 --- a/modules/ts/src/ts_func.cpp +++ b/modules/ts/src/ts_func.cpp @@ -2998,6 +2998,12 @@ void printVersionInfo(bool useStdOut) std::string cpu_features; +#if CV_MMX + if (checkHardwareSupport(CV_CPU_MMX)) cpu_features += " mmx"; +#endif +#if CV_POPCNT + if (checkHardwareSupport(CV_CPU_POPCNT)) cpu_features += " popcnt"; +#endif #if CV_SSE if (checkHardwareSupport(CV_CPU_SSE)) cpu_features += " sse"; #endif @@ -3022,6 +3028,36 @@ void printVersionInfo(bool useStdOut) #if CV_AVX2 if (checkHardwareSupport(CV_CPU_AVX2)) cpu_features += " avx2"; #endif +#if CV_FMA3 + if (checkHardwareSupport(CV_CPU_FMA3)) cpu_features += " fma3"; +#endif +#if CV_AVX_512F + if (checkHardwareSupport(CV_CPU_AVX_512F) cpu_features += " avx-512f"; +#endif +#if CV_AVX_512BW + if (checkHardwareSupport(CV_CPU_AVX_512BW) cpu_features += " avx-512bw"; +#endif +#if CV_AVX_512CD + if (checkHardwareSupport(CV_CPU_AVX_512CD) cpu_features += " avx-512cd"; +#endif +#if CV_AVX_512DQ + if (checkHardwareSupport(CV_CPU_AVX_512DQ) cpu_features += " avx-512dq"; +#endif +#if CV_AVX_512ER + if (checkHardwareSupport(CV_CPU_AVX_512ER) cpu_features += " avx-512er"; +#endif +#if CV_AVX_512IFMA512 + if (checkHardwareSupport(CV_CPU_AVX_512IFMA512) cpu_features += " avx-512ifma512"; +#endif +#if CV_AVX_512PF + if (checkHardwareSupport(CV_CPU_AVX_512PF) cpu_features += " avx-512pf"; +#endif +#if CV_AVX_512VBMI + if (checkHardwareSupport(CV_CPU_AVX_512VBMI) cpu_features += " avx-512vbmi"; +#endif +#if CV_AVX_512VL + if (checkHardwareSupport(CV_CPU_AVX_512VL) cpu_features += " avx-512vl"; +#endif #if CV_NEON cpu_features += " neon"; // NEON is currently not checked at runtime #endif From 31827d8dfeacc4390fdae79cfbb1d39cc5b7e8a9 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:30 +0300 Subject: [PATCH 35/53] fixed typo --- modules/core/include/opencv2/core/cvdef.h | 2 +- .../core/include/opencv2/core/sse_utils.hpp | 48 +++++++-------- modules/imgproc/src/color.cpp | 60 +++++++++---------- 3 files changed, 55 insertions(+), 55 deletions(-) diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 45146a39c2..5fa45a592b 100644 --- 
a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -138,7 +138,7 @@ #if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) # include -# define CV_MMX +# define CV_MMX 1 # define CV_SSE 1 # define CV_SSE2 1 # if defined __SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500) diff --git a/modules/core/include/opencv2/core/sse_utils.hpp b/modules/core/include/opencv2/core/sse_utils.hpp index 13673f57b3..9db8f0ade9 100644 --- a/modules/core/include/opencv2/core/sse_utils.hpp +++ b/modules/core/include/opencv2/core/sse_utils.hpp @@ -48,8 +48,8 @@ #if CV_SSE2 -inline void _mm_deinterliv_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, - __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) +inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, + __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) { __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1); __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1); @@ -87,8 +87,8 @@ inline void _mm_deinterliv_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5); } -inline void _mm_deinterliv_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, - __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) +inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, + __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) { __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0); __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0); @@ -136,8 +136,8 @@ inline void _mm_deinterliv_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7); } -inline void _mm_interlive_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, - __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) +inline void _mm_interleavee_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, + __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) { __m128i v_mask = _mm_set1_epi16(0x00ff); @@ -177,8 +177,8 @@ inline void _mm_interlive_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8)); } -inline void _mm_interlive_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, - __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) +inline void _mm_interleavee_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, + __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) { __m128i v_mask = _mm_set1_epi16(0x00ff); @@ -228,8 +228,8 @@ inline void _mm_interlive_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, _ v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8)); } -inline void _mm_deinterliv_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, - __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) +inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, + __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) { __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1); __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1); @@ -260,8 +260,8 @@ inline void _mm_deinterliv_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5); } -inline void _mm_deinterliv_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, - __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) +inline void 
_mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, + __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) { __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0); __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0); @@ -300,8 +300,8 @@ inline void _mm_deinterliv_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7); } -inline void _mm_interliv_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, - __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) +inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, + __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) { __m128i v_mask = _mm_set1_epi32(0x0000ffff); @@ -334,8 +334,8 @@ inline void _mm_interliv_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16)); } -inline void _mm_interliv_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, - __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) +inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, + __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) { __m128i v_mask = _mm_set1_epi32(0x0000ffff); @@ -376,8 +376,8 @@ inline void _mm_interliv_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, _ v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16)); } -inline void _mm_deinterliv_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, - __m128 & v_g1, __m128 & v_b0, __m128 & v_b1) +inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, + __m128 & v_g1, __m128 & v_b0, __m128 & v_b1) { __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1); __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1); @@ -401,8 +401,8 @@ inline void _mm_deinterliv_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5); } -inline void _mm_deinterliv_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1, - __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1) +inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1, + __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1) { __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0); __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0); @@ -432,8 +432,8 @@ inline void _mm_deinterliv_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m12 v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7); } -inline void _mm_interliv_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, - __m128 & v_g1, __m128 & v_b0, __m128 & v_b1) +inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, + __m128 & v_g1, __m128 & v_b0, __m128 & v_b1) { const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); @@ -459,8 +459,8 @@ inline void _mm_interliv_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi); } -inline void _mm_interliv_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1, - __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1) +inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1, + __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1) { const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 4d74e86dba..6049f9993f 
100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -1504,7 +1504,7 @@ struct RGB2Gray __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32)); __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40)); - _mm_deinterliv_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); __m128i v_gray0; process(v_r0, v_g0, v_b0, @@ -1531,7 +1531,7 @@ struct RGB2Gray __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 48)); __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 56)); - _mm_deinterliv_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1); + _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1); __m128i v_gray0; process(v_r0, v_g0, v_b0, @@ -1596,7 +1596,7 @@ struct RGB2Gray __m128 v_b0 = _mm_loadu_ps(src + 16); __m128 v_b1 = _mm_loadu_ps(src + 20); - _mm_deinterliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); __m128 v_gray0; process(v_r0, v_g0, v_b0, @@ -1623,7 +1623,7 @@ struct RGB2Gray __m128 v_a0 = _mm_loadu_ps(src + 24); __m128 v_a1 = _mm_loadu_ps(src + 28); - _mm_deinterliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1); + _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1); __m128 v_gray0; process(v_r0, v_g0, v_b0, @@ -1821,7 +1821,7 @@ struct RGB2YCrCb_f __m128 v_b0 = _mm_loadu_ps(src + 16); __m128 v_b1 = _mm_loadu_ps(src + 20); - _mm_deinterliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); __m128 v_y0, v_cr0, v_cb0; process(v_r0, v_g0, v_b0, @@ -1831,7 +1831,7 @@ struct RGB2YCrCb_f process(v_r1, v_g1, v_b1, v_y1, v_cr1, v_cb1); - _mm_interliv_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1); + _mm_interleave_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1); _mm_storeu_ps(dst + i, v_y0); _mm_storeu_ps(dst + i + 4, v_y1); @@ -2186,7 +2186,7 @@ struct RGB2YCrCb_i __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 64)); __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 80)); - _mm_deinterliv_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); __m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero; process(_mm_unpacklo_epi8(v_r0, v_zero), @@ -2218,7 +2218,7 @@ struct RGB2YCrCb_i __m128i v_cr_1 = _mm_packus_epi16(v_cr0, v_cr1); __m128i v_cb_1 = _mm_packus_epi16(v_cb0, v_cb1); - _mm_interlive_epi8(v_y_0, v_y_1, v_cr_0, v_cr_1, v_cb_0, v_cb_1); + _mm_interleavee_epi8(v_y_0, v_y_1, v_cr_0, v_cr_1, v_cb_0, v_cb_1); _mm_storeu_si128((__m128i *)(dst + i), v_y_0); _mm_storeu_si128((__m128i *)(dst + i + 16), v_y_1); @@ -2328,7 +2328,7 @@ struct RGB2YCrCb_i __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32)); __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40)); - _mm_deinterliv_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); __m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero; process(v_r0, v_g0, v_b0, @@ -2338,7 +2338,7 @@ struct RGB2YCrCb_i process(v_r1, v_g1, v_b1, v_y1, v_cr1, v_cb1); - _mm_interliv_epi16(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1); + _mm_interleave_epi16(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1); _mm_storeu_si128((__m128i *)(dst + i), v_y0); _mm_storeu_si128((__m128i *)(dst + i + 8), v_y1); @@ -2536,7 +2536,7 @@ struct YCrCb2RGB_f __m128 v_cb0 = _mm_loadu_ps(src + i + 16); __m128 v_cb1 = _mm_loadu_ps(src + i + 20); - _mm_deinterliv_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1); + _mm_deinterleave_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, 
v_cb1); __m128 v_r0, v_g0, v_b0; process(v_y0, v_cr0, v_cb0, @@ -2546,7 +2546,7 @@ struct YCrCb2RGB_f process(v_y1, v_cr1, v_cb1, v_r1, v_g1, v_b1); - _mm_interliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); _mm_storeu_ps(dst, v_r0); _mm_storeu_ps(dst + 4, v_r1); @@ -2949,7 +2949,7 @@ struct YCrCb2RGB_i __m128i v_cb0 = _mm_loadu_si128((__m128i const *)(src + i + 64)); __m128i v_cb1 = _mm_loadu_si128((__m128i const *)(src + i + 80)); - _mm_deinterliv_epi8(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1); + _mm_deinterleave_epi8(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1); __m128i v_r_0 = v_zero, v_g_0 = v_zero, v_b_0 = v_zero; process(_mm_unpacklo_epi8(v_y0, v_zero), @@ -2987,7 +2987,7 @@ struct YCrCb2RGB_i std::swap(v_r1, v_b1); } - _mm_interlive_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + _mm_interleavee_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); _mm_storeu_si128((__m128i *)(dst), v_r0); _mm_storeu_si128((__m128i *)(dst + 16), v_r1); @@ -3210,7 +3210,7 @@ struct RGB2XYZ_f __m128 v_b0 = _mm_loadu_ps(src + 16); __m128 v_b1 = _mm_loadu_ps(src + 20); - _mm_deinterliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); __m128 v_x0, v_y0, v_z0; process(v_r0, v_g0, v_b0, @@ -3220,7 +3220,7 @@ struct RGB2XYZ_f process(v_r1, v_g1, v_b1, v_x1, v_y1, v_z1); - _mm_interliv_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1); + _mm_interleave_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1); _mm_storeu_ps(dst + i, v_x0); _mm_storeu_ps(dst + i + 4, v_x1); @@ -3636,7 +3636,7 @@ struct XYZ2RGB_f __m128 v_z0 = _mm_loadu_ps(src + i + 16); __m128 v_z1 = _mm_loadu_ps(src + i + 20); - _mm_deinterliv_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1); + _mm_deinterleave_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1); __m128 v_r0, v_g0, v_b0; process(v_x0, v_y0, v_z0, @@ -3646,7 +3646,7 @@ struct XYZ2RGB_f process(v_x1, v_y1, v_z1, v_r1, v_g1, v_b1); - _mm_interliv_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1); + _mm_interleave_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1); _mm_storeu_ps(dst, v_b0); _mm_storeu_ps(dst + 4, v_b1); @@ -4216,7 +4216,7 @@ struct HSV2RGB_b v_g1 = _mm_mul_ps(v_g1, v_scale_inv); v_b1 = _mm_mul_ps(v_b1, v_scale_inv); - _mm_interliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); _mm_store_ps(buf, v_r0); _mm_store_ps(buf + 4, v_r1); @@ -4267,7 +4267,7 @@ struct HSV2RGB_b __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); - _mm_deinterliv_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); process(_mm_unpacklo_epi8(v_r0, v_zero), _mm_unpacklo_epi8(v_g0, v_zero), @@ -4461,7 +4461,7 @@ struct RGB2HLS_b __m128 v_s0f = _mm_load_ps(buf + 16); __m128 v_s1f = _mm_load_ps(buf + 20); - _mm_deinterliv_ps(v_h0f, v_h1f, v_l0f, v_l1f, v_s0f, v_s1f); + _mm_deinterleave_ps(v_h0f, v_h1f, v_l0f, v_l1f, v_s0f, v_s1f); v_l0f = _mm_mul_ps(v_l0f, v_scale); v_l1f = _mm_mul_ps(v_l1f, v_scale); @@ -4584,7 +4584,7 @@ struct RGB2HLS_b __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1); __m128i v_s1 = _mm_packus_epi16(v_s_0, v_s_1); - _mm_interlive_epi8(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1); + _mm_interleavee_epi8(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1); _mm_storeu_si128((__m128i *)(dst + j), v_h0); _mm_storeu_si128((__m128i *)(dst + j + 16), v_h1); @@ -4716,7 +4716,7 @@ struct HLS2RGB_b v_g1 = _mm_mul_ps(v_g1, v_scale_inv); v_b1 = _mm_mul_ps(v_b1, v_scale_inv); - _mm_interliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + 
_mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); _mm_store_ps(buf, v_r0); _mm_store_ps(buf + 4, v_r1); @@ -4767,7 +4767,7 @@ struct HLS2RGB_b __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); - _mm_deinterliv_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); process(_mm_unpacklo_epi8(v_r0, v_zero), _mm_unpacklo_epi8(v_g0, v_zero), @@ -5215,7 +5215,7 @@ struct Lab2RGB_b v_b0 = _mm_sub_ps(v_b0, v_128); v_b1 = _mm_sub_ps(v_b1, v_128); - _mm_interliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); _mm_store_ps(buf, v_r0); _mm_store_ps(buf + 4, v_r1); @@ -5266,7 +5266,7 @@ struct Lab2RGB_b __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); - _mm_deinterliv_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); process(_mm_unpacklo_epi8(v_r0, v_zero), _mm_unpacklo_epi8(v_g0, v_zero), @@ -5568,7 +5568,7 @@ struct RGB2Luv_b __m128 v_v0f = _mm_load_ps(buf + 16); __m128 v_v1f = _mm_load_ps(buf + 20); - _mm_deinterliv_ps(v_l0f, v_l1f, v_u0f, v_u1f, v_v0f, v_v1f); + _mm_deinterleave_ps(v_l0f, v_l1f, v_u0f, v_u1f, v_v0f, v_v1f); v_l0f = _mm_mul_ps(v_l0f, v_scale); v_l1f = _mm_mul_ps(v_l1f, v_scale); @@ -5694,7 +5694,7 @@ struct RGB2Luv_b __m128i v_u1 = _mm_packus_epi16(v_u_0, v_u_1); __m128i v_v1 = _mm_packus_epi16(v_v_0, v_v_1); - _mm_interlive_epi8(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1); + _mm_interleavee_epi8(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1); _mm_storeu_si128((__m128i *)(dst + j), v_l0); _mm_storeu_si128((__m128i *)(dst + j + 16), v_l1); @@ -5775,7 +5775,7 @@ struct Luv2RGB_b v_v0 = _mm_sub_ps(_mm_mul_ps(v_v0, v_coeff2), v_140); v_v1 = _mm_sub_ps(_mm_mul_ps(v_v1, v_coeff2), v_140); - _mm_interliv_ps(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1); + _mm_interleave_ps(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1); _mm_store_ps(buf, v_l0); _mm_store_ps(buf + 4, v_l1); @@ -5826,7 +5826,7 @@ struct Luv2RGB_b __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); - _mm_deinterliv_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); process(_mm_unpacklo_epi8(v_r0, v_zero), _mm_unpacklo_epi8(v_g0, v_zero), From fc0869735d742ecd6a83bc8e3989734b6271fda7 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:30 +0300 Subject: [PATCH 36/53] used popcnt --- CMakeLists.txt | 2 +- cmake/OpenCVCompilerOptions.cmake | 8 +- modules/core/include/opencv2/core/base.hpp | 1 + modules/core/include/opencv2/core/cvdef.h | 16 +- .../core/include/opencv2/core/sse_utils.hpp | 77 +++++- modules/core/include/opencv2/core/utility.hpp | 1 + modules/core/src/arithm.cpp | 1 + modules/core/src/convert.cpp | 233 ++++++++++++++++-- modules/core/src/copy.cpp | 1 + modules/core/src/mathfuncs.cpp | 1 + modules/core/src/matmul.cpp | 1 + modules/core/src/stat.cpp | 10 +- modules/core/src/system.cpp | 1 + modules/core/src/umatrix.cpp | 3 +- modules/imgproc/src/accum.cpp | 1 + modules/imgproc/src/canny.cpp | 1 + modules/imgproc/src/clahe.cpp | 1 + modules/imgproc/src/color.cpp | 1 + modules/imgproc/src/corner.cpp | 1 + modules/imgproc/src/demosaicing.cpp | 1 + modules/imgproc/src/imgwarp.cpp | 1 + modules/imgproc/src/pyramids.cpp | 1 + modules/imgproc/src/smooth.cpp | 1 + modules/imgproc/src/sumpixels.cpp | 1 + 
modules/ts/src/ts_func.cpp | 6 +- 25 files changed, 327 insertions(+), 45 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d9bb04081c..da0b42cb1c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -221,9 +221,9 @@ OCV_OPTION(ENABLE_SSE3 "Enable SSE3 instructions" OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) -OCV_OPTION(ENABLE_FMA3 "Enable FMA3 instructions" ON IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_AVX "Enable AVX instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_AVX2 "Enable AVX2 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) +OCV_OPTION(ENABLE_FMA3 "Enable FMA3 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_NEON "Enable NEON instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) ) OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) ) OCV_OPTION(ENABLE_NOISY_WARNINGS "Show all warnings even if they are too noisy" OFF ) diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index 3d1155c872..2f5f13d7bf 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -145,6 +145,10 @@ if(CMAKE_COMPILER_IS_GNUCXX) endif() if(ENABLE_AVX2) add_extra_compiler_option(-mavx2) + + if(ENABLE_FMA3) + add_extra_compiler_option(-mfma) + endif() endif() # GCC depresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalence for all SSEx instructions when needed. @@ -165,10 +169,6 @@ if(CMAKE_COMPILER_IS_GNUCXX) add_extra_compiler_option(-msse4.2) endif() endif() - - if(ENABLE_FMA3) - add_extra_compiler_option(-mfma) - endif() endif(NOT MINGW) if(X86 OR X86_64) diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp index c60eedddcc..f2acaa3fb4 100644 --- a/modules/core/include/opencv2/core/base.hpp +++ b/modules/core/include/opencv2/core/base.hpp @@ -13,6 +13,7 @@ // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2014, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 5fa45a592b..ded58a18c5 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -13,6 +13,7 @@ // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
// // Redistribution and use in source and binary forms, with or without modification, @@ -157,15 +158,11 @@ # include # define CV_SSE4_2 1 # endif -# if defined __FMA__ || (defined _MSC_VER && _MSC_VER >= 1500) -# include -# define CV_FMA3 1 -# endif # if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500) # include # define CV_POPCNT 1 # endif -# if defined __AVX__ || defined __AVX2__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219) +# if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219) // MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX // See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32 # include @@ -179,6 +176,9 @@ # if defined __AVX2__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219) # include # define CV_AVX2 1 +# if defined __FMA__ +# define CV_FMA3 1 +# endif # endif #endif @@ -194,6 +194,9 @@ #endif // __CUDACC__ +#ifndef CV_POPCNT +#define CV_POPCNT 0 +#endif #ifndef CV_MMX # define CV_MMX 0 #endif @@ -221,9 +224,6 @@ #ifndef CV_AVX2 # define CV_AVX2 0 #endif -#ifndef CV_POPCNT -#define CV_POPCNT 0 -#endif #ifndef CV_FMA3 # define CV_FMA3 0 #endif diff --git a/modules/core/include/opencv2/core/sse_utils.hpp b/modules/core/include/opencv2/core/sse_utils.hpp index 9db8f0ade9..0667ae9210 100644 --- a/modules/core/include/opencv2/core/sse_utils.hpp +++ b/modules/core/include/opencv2/core/sse_utils.hpp @@ -10,7 +10,7 @@ // License Agreement // For Open Source Computer Vision Library // -// Copyright (C) 2015, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
// // Redistribution and use in source and binary forms, with or without modification, @@ -48,6 +48,34 @@ #if CV_SSE2 +inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1) +{ + __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0); + __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0); + __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1); + __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1); + + __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2); + __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2); + __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3); + __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3); + + __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2); + __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2); + __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3); + __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3); + + __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2); + __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2); + __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3); + __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3); + + v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2); + v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2); + v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3); + v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3); +} + inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) { @@ -228,6 +256,29 @@ inline void _mm_interleavee_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8)); } +inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1) +{ + __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0); + __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0); + __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1); + __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1); + + __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2); + __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2); + __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3); + __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3); + + __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2); + __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2); + __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3); + __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3); + + v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2); + v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2); + v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3); + v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3); +} + inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) { @@ -300,6 +351,8 @@ inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7); } +#if CV_SSE4_1 + inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) { 
@@ -376,6 +429,26 @@ inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16)); } +#endif // CV_SSE4_1 + +inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1) +{ + __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0); + __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0); + __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1); + __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1); + + __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2); + __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2); + __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3); + __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3); + + v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2); + v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2); + v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3); + v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3); +} + inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1, __m128 & v_b0, __m128 & v_b1) { @@ -492,6 +565,6 @@ inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m12 v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi); } -#endif +#endif // CV_SSE2 #endif //__OPENCV_CORE_SSE_UTILS_HPP__ diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp index fb8ccd88d7..f89560a809 100644 --- a/modules/core/include/opencv2/core/utility.hpp +++ b/modules/core/include/opencv2/core/utility.hpp @@ -13,6 +13,7 @@ // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index a9bf3d7e78..07678f0c6d 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index ef8edee6a6..5c792f379d 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
// // Redistribution and use in source and binary forms, with or without modification, @@ -62,8 +63,11 @@ template struct VSplit4; #define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ template<> \ - struct name{ \ - void operator()(const data_type* src, data_type* dst0, data_type* dst1){ \ + struct name \ + { \ + void operator()(const data_type* src, data_type* dst0, \ + data_type* dst1) const \ + { \ reg_type r = load_func(src); \ store_func(dst0, r.val[0]); \ store_func(dst1, r.val[1]); \ @@ -72,9 +76,11 @@ template struct VSplit4; #define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ template<> \ - struct name{ \ + struct name \ + { \ void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ - data_type* dst2){ \ + data_type* dst2) const \ + { \ reg_type r = load_func(src); \ store_func(dst0, r.val[0]); \ store_func(dst1, r.val[1]); \ @@ -84,9 +90,11 @@ template struct VSplit4; #define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ template<> \ - struct name{ \ + struct name \ + { \ void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ - data_type* dst2, data_type* dst3){ \ + data_type* dst2, data_type* dst3) const \ + { \ reg_type r = load_func(src); \ store_func(dst0, r.val[0]); \ store_func(dst1, r.val[1]); \ @@ -96,28 +104,174 @@ template struct VSplit4; } SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 ); -SPLIT2_KERNEL_TEMPLATE(VSplit2, schar , int8x16x2_t, vld2q_s8 , vst1q_s8 ); SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16); -SPLIT2_KERNEL_TEMPLATE(VSplit2, short , int16x8x2_t, vld2q_s16, vst1q_s16); SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32); -SPLIT2_KERNEL_TEMPLATE(VSplit2, float , float32x4x2_t, vld2q_f32, vst1q_f32); SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 ); SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 ); -SPLIT3_KERNEL_TEMPLATE(VSplit3, schar , int8x16x3_t, vld3q_s8 , vst1q_s8 ); SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16); -SPLIT3_KERNEL_TEMPLATE(VSplit3, short , int16x8x3_t, vld3q_s16, vst1q_s16); SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32); -SPLIT3_KERNEL_TEMPLATE(VSplit3, float , float32x4x3_t, vld3q_f32, vst1q_f32); SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 ); SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 ); -SPLIT4_KERNEL_TEMPLATE(VSplit4, schar , int8x16x4_t, vld4q_s8 , vst1q_s8 ); SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16); -SPLIT4_KERNEL_TEMPLATE(VSplit4, short , int16x8x4_t, vld4q_s16, vst1q_s16); SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32); -SPLIT4_KERNEL_TEMPLATE(VSplit4, float , float32x4x4_t, vld4q_f32, vst1q_f32); SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 ); + +#elif CV_SSE2 + +template +struct VSplit2 +{ + VSplit2() : support(false) { } + void operator()(const T *, T *, T *) const { } + + bool support; +}; + +template +struct VSplit3 +{ + VSplit3() : support(false) { } + void operator()(const T *, T *, T *, T *) const { } + + bool support; +}; + +template +struct VSplit4 +{ + VSplit4() : support(false) { } + void operator()(const T *, T *, T *, T *, T *) const { } + + bool support; +}; + +#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ 
+template <> \ +struct VSplit2 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VSplit2() \ + { \ + support = true; \ + } \ + \ + void operator()(const data_type * src, \ + data_type * dst0, data_type * dst1) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ + reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ + reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ + \ + _mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \ + \ + _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ + } \ + \ + bool support; \ +} + +#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ +template <> \ +struct VSplit3 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VSplit3() \ + { \ + support = true; \ + } \ + \ + void operator()(const data_type * src, \ + data_type * dst0, data_type * dst1, data_type * dst2) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ + reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ + reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ + reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \ + reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \ + \ + _mm_deinterleave(v_src0, v_src1, v_src2, \ + v_src3, v_src4, v_src5); \ + \ + _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ + _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \ + _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \ + } \ + \ + bool support; \ +} + +#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ +template <> \ +struct VSplit4 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VSplit4() \ + { \ + support = true; \ + } \ + \ + void operator()(const data_type * src, data_type * dst0, data_type * dst1, \ + data_type * dst2, data_type * dst3) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ + reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ + reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ + reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \ + reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \ + reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \ + reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \ + \ + _mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \ + v_src4, v_src5, v_src6, v_src7); \ + \ + _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ + 
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ + _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \ + _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \ + _mm_storeu_##flavor((cast_type *)(dst3), v_src6); \ + _mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \ + } \ + \ + bool support; \ +} + +SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); +SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); +SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); + +SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); +SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); +SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); + +SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); +SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); +SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); + #endif template static void @@ -154,6 +308,19 @@ split_( const T* src, T** dst, int len, int cn ) for( ; i < len - inc_i; i += inc_i, j += inc_j) vsplit(src + j, dst0 + i, dst1 + i); } +#elif CV_SSE2 + if (cn == 2) + { + int inc_i = 32/sizeof(T); + int inc_j = 2 * inc_i; + + VSplit2 vsplit; + if (vsplit.support) + { + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i); + } + } #endif for( ; i < len; i++, j += cn ) { @@ -176,6 +343,20 @@ split_( const T* src, T** dst, int len, int cn ) for( ; i <= len - inc_i; i += inc_i, j += inc_j) vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); } +#elif CV_SSE2 + if (cn == 3) + { + int inc_i = 32/sizeof(T); + int inc_j = 3 * inc_i; + + VSplit3 vsplit; + + if (vsplit.support) + { + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); + } + } #endif for( ; i < len; i++, j += cn ) { @@ -199,6 +380,19 @@ split_( const T* src, T** dst, int len, int cn ) for( ; i <= len - inc_i; i += inc_i, j += inc_j) vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); } +#elif CV_SSE2 + if (cn == 4) + { + int inc_i = 32/sizeof(T); + int inc_j = 4 * inc_i; + + VSplit4 vsplit; + if (vsplit.support) + { + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); + } + } #endif for( ; i < len; i++, j += cn ) { @@ -265,27 +459,18 @@ template struct VMerge4; } MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 ); -MERGE2_KERNEL_TEMPLATE(VMerge2, schar , int8x16x2_t, vld1q_s8 , vst2q_s8 ); MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16); -MERGE2_KERNEL_TEMPLATE(VMerge2, short , int16x8x2_t, vld1q_s16, vst2q_s16); MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32); -MERGE2_KERNEL_TEMPLATE(VMerge2, float , float32x4x2_t, vld1q_f32, vst2q_f32); MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 ); MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 ); -MERGE3_KERNEL_TEMPLATE(VMerge3, schar , int8x16x3_t, vld1q_s8 , vst3q_s8 ); MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16); -MERGE3_KERNEL_TEMPLATE(VMerge3, short , int16x8x3_t, vld1q_s16, vst3q_s16); MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32); -MERGE3_KERNEL_TEMPLATE(VMerge3, float , float32x4x3_t, vld1q_f32, vst3q_f32); 
MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 ); MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 ); -MERGE4_KERNEL_TEMPLATE(VMerge4, schar , int8x16x4_t, vld1q_s8 , vst4q_s8 ); MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16); -MERGE4_KERNEL_TEMPLATE(VMerge4, short , int16x8x4_t, vld1q_s16, vst4q_s16); MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32); -MERGE4_KERNEL_TEMPLATE(VMerge4, float , float32x4x4_t, vld1q_f32, vst4q_f32); MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 ); #endif diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp index 301ea80a1f..fe8ffd7718 100644 --- a/modules/core/src/copy.cpp +++ b/modules/core/src/copy.cpp @@ -11,6 +11,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index d3d09c338f..7b27dc3507 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp index b2f36b3292..6c8bad2444 100644 --- a/modules/core/src/matmul.cpp +++ b/modules/core/src/matmul.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 530e3205bd..4eb17d6a14 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, @@ -404,13 +405,20 @@ static const uchar * initPopcountTable() { // we compute inverse popcount table, // since we pass (img[x] == 0) mask as index in the table. 
- for( int j = 0; j < 256; j++ ) + unsigned int j = 0u; +#if CV_POPCNT + if (checkHardwareSupport(CV_CPU_POPCNT)) + for( ; j < 256u; j++ ) + tab[j] = (uchar)(8 - _mm_popcnt_u32(j)); +#else + for( ; j < 256u; j++ ) { int val = 0; for( int mask = 1; mask < 256; mask += mask ) val += (j & mask) == 0; tab[j] = (uchar)val; } +#endif initialized = true; } diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index a7a6e98d4d..6744b33116 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp index 443111f48d..ffc20777b5 100644 --- a/modules/core/src/umatrix.cpp +++ b/modules/core/src/umatrix.cpp @@ -10,8 +10,7 @@ // License Agreement // For Open Source Computer Vision Library // -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/imgproc/src/accum.cpp b/modules/imgproc/src/accum.cpp index d2e8b39aa3..23dc4576ba 100644 --- a/modules/imgproc/src/accum.cpp +++ b/modules/imgproc/src/accum.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. / // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp index 1311d5abb9..233218b3e2 100644 --- a/modules/imgproc/src/canny.cpp +++ b/modules/imgproc/src/canny.cpp @@ -11,6 +11,7 @@ // For Open Source Computer Vision Library // // Copyright (C) 2000, Intel Corporation, all rights reserved. +// Copyright (C) 2014, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/imgproc/src/clahe.cpp b/modules/imgproc/src/clahe.cpp index 18a91d9544..06fc73153f 100644 --- a/modules/imgproc/src/clahe.cpp +++ b/modules/imgproc/src/clahe.cpp @@ -11,6 +11,7 @@ // For Open Source Computer Vision Library // // Copyright (C) 2013, NVIDIA Corporation, all rights reserved. +// Copyright (C) 2014, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 6049f9993f..574ff16279 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. 
// Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/imgproc/src/corner.cpp b/modules/imgproc/src/corner.cpp index 85f2063b28..0d04f6f7e1 100644 --- a/modules/imgproc/src/corner.cpp +++ b/modules/imgproc/src/corner.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/imgproc/src/demosaicing.cpp b/modules/imgproc/src/demosaicing.cpp index 0b7afb8ea6..cec450dc71 100644 --- a/modules/imgproc/src/demosaicing.cpp +++ b/modules/imgproc/src/demosaicing.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 1fa4557cad..d8b5385910 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index 5425de11cd..f489372188 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index b7c3004039..59acdd71b3 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/imgproc/src/sumpixels.cpp b/modules/imgproc/src/sumpixels.cpp index cdef88f6c1..16c7c7ef26 100755 --- a/modules/imgproc/src/sumpixels.cpp +++ b/modules/imgproc/src/sumpixels.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
// // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp index 84a9233dd2..b8de763f10 100644 --- a/modules/ts/src/ts_func.cpp +++ b/modules/ts/src/ts_func.cpp @@ -2998,12 +2998,12 @@ void printVersionInfo(bool useStdOut) std::string cpu_features; -#if CV_MMX - if (checkHardwareSupport(CV_CPU_MMX)) cpu_features += " mmx"; -#endif #if CV_POPCNT if (checkHardwareSupport(CV_CPU_POPCNT)) cpu_features += " popcnt"; #endif +#if CV_MMX + if (checkHardwareSupport(CV_CPU_MMX)) cpu_features += " mmx"; +#endif #if CV_SSE if (checkHardwareSupport(CV_CPU_SSE)) cpu_features += " sse"; #endif From d87457a025adb71c6e922018417844cc731434c9 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:30 +0300 Subject: [PATCH 37/53] split/merge --- .../core/include/opencv2/core/sse_utils.hpp | 83 +++++++- modules/core/src/convert.cpp | 189 ++++++++++++++++++ modules/imgproc/src/color.cpp | 8 +- 3 files changed, 272 insertions(+), 8 deletions(-) diff --git a/modules/core/include/opencv2/core/sse_utils.hpp b/modules/core/include/opencv2/core/sse_utils.hpp index 0667ae9210..7af6d84f2d 100644 --- a/modules/core/include/opencv2/core/sse_utils.hpp +++ b/modules/core/include/opencv2/core/sse_utils.hpp @@ -164,8 +164,38 @@ inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0 v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7); } -inline void _mm_interleavee_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, - __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) +inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1) +{ + __m128i v_mask = _mm_set1_epi16(0x00ff); + + __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); + __m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8)); + __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); + __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8)); + + __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask)); + __m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8)); + __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask)); + __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8)); + + __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); + __m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8)); + __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); + __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8)); + + __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); + __m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8)); + __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); + __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8)); + + v_r0 = 
_mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); + v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8)); + v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); + v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8)); +} + +inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, + __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) { __m128i v_mask = _mm_set1_epi16(0x00ff); @@ -205,8 +235,8 @@ inline void _mm_interleavee_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8)); } -inline void _mm_interleavee_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, - __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) +inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, + __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) { __m128i v_mask = _mm_set1_epi16(0x00ff); @@ -353,6 +383,31 @@ inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g #if CV_SSE4_1 +inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1) +{ + __m128i v_mask = _mm_set1_epi32(0x0000ffff); + + __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); + __m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16)); + __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); + __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16)); + + __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); + __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); + __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); + __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); + + __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); + __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); + __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); + __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); + + v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); + v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16)); + v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); + v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16)); +} + inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) { @@ -505,6 +560,26 @@ inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7); } +inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1) +{ + const int mask_lo = 
_MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); + + __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo); + __m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi); + __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo); + __m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi); + + __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo); + __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi); + __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo); + __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi); + + v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo); + v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi); + v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo); + v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi); +} + inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1, __m128 & v_b0, __m128 & v_b1) { diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 5c792f379d..5cb2a15fd4 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -472,6 +472,162 @@ MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 ); MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16); MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32); MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 ); + +#elif CV_SSE2 + +template +struct VMerge2 +{ + VMerge2() : support(false) { } + void operator()(const T *, const T *, T *) const { } + + bool support; +}; + +template +struct VMerge3 +{ + VMerge3() : support(false) { } + void operator()(const T *, const T *, const T *, T *) const { } + + bool support; +}; + +template +struct VMerge4 +{ + VMerge4() : support(false) { } + void operator()(const T *, const T *, const T *, const T *, T *) const { } + + bool support; +}; + +#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor) \ +template <> \ +struct VMerge2 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VMerge2() \ + { \ + support = true; \ + } \ + \ + void operator()(const data_type * src0, const data_type * src1, \ + data_type * dst) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ + reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ + reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ + \ + _mm_interleave(v_src0, v_src1, v_src2, v_src3); \ + \ + _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ + } \ + \ + bool support; \ +} + +#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor) \ +template <> \ +struct VMerge3 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VMerge3() \ + { \ + support = true; \ + } \ + \ + void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\ + data_type * dst) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ + reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ + reg_type v_src2 = 
_mm_loadu_##flavor((const cast_type *)(src1)); \ + reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ + reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \ + reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \ + \ + _mm_interleave(v_src0, v_src1, v_src2, \ + v_src3, v_src4, v_src5); \ + \ + _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \ + } \ + \ + bool support; \ +} + +#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor) \ +template <> \ +struct VMerge4 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VMerge4() \ + { \ + support = true; \ + } \ + \ + void operator()(const data_type * src0, const data_type * src1, \ + const data_type * src2, const data_type * src3, \ + data_type * dst) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ + reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ + reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ + reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \ + reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \ + reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3)); \ + reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \ + \ + _mm_interleave(v_src0, v_src1, v_src2, v_src3, \ + v_src4, v_src5, v_src6, v_src7); \ + \ + _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7); \ + } \ + \ + bool support; \ +} + +MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); +MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); +MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); + +MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128); +MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128); +MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps); + +MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128); +MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128); +MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps); + #endif template static void @@ -499,6 +655,17 @@ merge_( const T** src, T* dst, int len, int cn ) for( ; i < len - inc_i; i += inc_i, j += inc_j) vmerge(src0 + i, src1 + i, dst + j); } +#elif CV_SSE2 + if(cn == 2) + { + int inc_i = 32/sizeof(T); + int inc_j = 2 * inc_i; + + VMerge2 vmerge; + if (vmerge.support) + for( ; i < len - inc_i; i += inc_i, j += 
inc_j) + vmerge(src0 + i, src1 + i, dst + j); + } #endif for( ; i < len; i++, j += cn ) { @@ -520,6 +687,17 @@ merge_( const T** src, T* dst, int len, int cn ) for( ; i < len - inc_i; i += inc_i, j += inc_j) vmerge(src0 + i, src1 + i, src2 + i, dst + j); } +#elif CV_SSE2 + if(cn == 3) + { + int inc_i = 32/sizeof(T); + int inc_j = 3 * inc_i; + + VMerge3 vmerge; + if (vmerge.support) + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, src2 + i, dst + j); + } #endif for( ; i < len; i++, j += cn ) { @@ -542,6 +720,17 @@ merge_( const T** src, T* dst, int len, int cn ) for( ; i < len - inc_i; i += inc_i, j += inc_j) vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); } +#elif CV_SSE2 + if(cn == 4) + { + int inc_i = 32/sizeof(T); + int inc_j = 4 * inc_i; + + VMerge4 vmerge; + if (vmerge.support) + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); + } #endif for( ; i < len; i++, j += cn ) { diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 574ff16279..19cf1357b6 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -2219,7 +2219,7 @@ struct RGB2YCrCb_i __m128i v_cr_1 = _mm_packus_epi16(v_cr0, v_cr1); __m128i v_cb_1 = _mm_packus_epi16(v_cb0, v_cb1); - _mm_interleavee_epi8(v_y_0, v_y_1, v_cr_0, v_cr_1, v_cb_0, v_cb_1); + _mm_interleave_epi8(v_y_0, v_y_1, v_cr_0, v_cr_1, v_cb_0, v_cb_1); _mm_storeu_si128((__m128i *)(dst + i), v_y_0); _mm_storeu_si128((__m128i *)(dst + i + 16), v_y_1); @@ -2988,7 +2988,7 @@ struct YCrCb2RGB_i std::swap(v_r1, v_b1); } - _mm_interleavee_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); _mm_storeu_si128((__m128i *)(dst), v_r0); _mm_storeu_si128((__m128i *)(dst + 16), v_r1); @@ -4585,7 +4585,7 @@ struct RGB2HLS_b __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1); __m128i v_s1 = _mm_packus_epi16(v_s_0, v_s_1); - _mm_interleavee_epi8(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1); + _mm_interleave_epi8(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1); _mm_storeu_si128((__m128i *)(dst + j), v_h0); _mm_storeu_si128((__m128i *)(dst + j + 16), v_h1); @@ -5695,7 +5695,7 @@ struct RGB2Luv_b __m128i v_u1 = _mm_packus_epi16(v_u_0, v_u_1); __m128i v_v1 = _mm_packus_epi16(v_v_0, v_v_1); - _mm_interleavee_epi8(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1); + _mm_interleave_epi8(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1); _mm_storeu_si128((__m128i *)(dst + j), v_l0); _mm_storeu_si128((__m128i *)(dst + j + 16), v_l1); From b2f851af0619d51d2f77e6c31372d0140156bae4 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:30 +0300 Subject: [PATCH 38/53] cornerMinEigenVal --- modules/imgproc/src/corner.cpp | 31 ++++++++++++++++++++++++++++++- modules/imgproc/src/smooth.cpp | 2 -- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/modules/imgproc/src/corner.cpp b/modules/imgproc/src/corner.cpp index 0d04f6f7e1..358cd5802b 100644 --- a/modules/imgproc/src/corner.cpp +++ b/modules/imgproc/src/corner.cpp @@ -12,7 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Copyright (C) 2014, Itseez Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
// // Redistribution and use in source and binary forms, with or without modification, @@ -271,6 +271,8 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size, #ifdef HAVE_TEGRA_OPTIMIZATION if (tegra::cornerEigenValsVecs(src, eigenv, block_size, aperture_size, op_type, k, borderType)) return; +#elif CV_SSE2 + bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); #endif int depth = src.depth(); @@ -319,6 +321,33 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size, vst3q_f32(cov_data + j * 3, v_dst); } + #elif CV_SSE2 + if (haveSSE2) + { + for( ; j <= size.width - 8; j += 8 ) + { + __m128 v_dx_0 = _mm_loadu_ps(dxdata + j); + __m128 v_dx_1 = _mm_loadu_ps(dxdata + j + 4); + __m128 v_dy_0 = _mm_loadu_ps(dydata + j); + __m128 v_dy_1 = _mm_loadu_ps(dydata + j + 4); + + __m128 v_dx2_0 = _mm_mul_ps(v_dx_0, v_dx_0); + __m128 v_dxy_0 = _mm_mul_ps(v_dx_0, v_dy_0); + __m128 v_dy2_0 = _mm_mul_ps(v_dy_0, v_dy_0); + __m128 v_dx2_1 = _mm_mul_ps(v_dx_1, v_dx_1); + __m128 v_dxy_1 = _mm_mul_ps(v_dx_1, v_dy_1); + __m128 v_dy2_1 = _mm_mul_ps(v_dy_1, v_dy_1); + + _mm_interleave_ps(v_dx2_0, v_dx2_1, v_dxy_0, v_dxy_1, v_dy2_0, v_dy2_1); + + _mm_storeu_ps(cov_data + j * 3, v_dx2_0); + _mm_storeu_ps(cov_data + j * 3 + 4, v_dx2_1); + _mm_storeu_ps(cov_data + j * 3 + 8, v_dxy_0); + _mm_storeu_ps(cov_data + j * 3 + 12, v_dxy_1); + _mm_storeu_ps(cov_data + j * 3 + 16, v_dy2_0); + _mm_storeu_ps(cov_data + j * 3 + 20, v_dy2_1); + } + } #endif for( ; j < size.width; j++ ) diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 59acdd71b3..2dc2fbdf7e 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -736,8 +736,6 @@ struct ColumnSum : bool haveScale = scale != 1; double _scale = scale; - printf("bgfbffbbfg\n"); - #if CV_SSE2 bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); #endif From 612b8ce2cbdd1f878d771b40550549623a53bc01 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:30 +0300 Subject: [PATCH 39/53] fixed compilation issues --- cmake/OpenCVCompilerOptions.cmake | 6 ++++++ modules/core/src/convert.cpp | 17 ++++++++++------- modules/imgproc/src/color.cpp | 9 ++------- modules/imgproc/src/imgwarp.cpp | 2 +- modules/imgproc/src/pyramids.cpp | 13 ++++++++++--- 5 files changed, 29 insertions(+), 18 deletions(-) diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index 2f5f13d7bf..ecd9a8b41e 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -227,6 +227,12 @@ if(MSVC) if(ENABLE_AVX AND NOT MSVC_VERSION LESS 1600) set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX") endif() + if(ENABLE_AVX2 AND NOT MSVC_VERSION LESS 1600) + set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX2") + endif() + if(ENABLE_FMA3 AND NOT MSVC_VERSION LESS 1600) + set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:FMA") + endif() if(ENABLE_SSE4_1 AND CV_ICC AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:") set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:SSE4.1") diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 5cb2a15fd4..7f450e5810 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -617,15 +617,18 @@ struct VMerge4 } MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); -MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); -MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); - MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, 
_mm_interleave_epi8, si128); -MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128); -MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps); - MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128); + +MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); + +#if CV_SSE4_1 +MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128); MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128); +#endif + +MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); +MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps); MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps); #endif @@ -4328,7 +4331,7 @@ cvtScale_( const short* src, size_t sstep, { __m256 scale256 = _mm256_set1_ps(scale); __m256 shift256 = _mm256_set1_ps(shift); - int shuffle = 0xD8; + const int shuffle = 0xD8; for ( ; x <= size.width - 16; x += 16) { diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 19cf1357b6..c1130db402 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -1445,7 +1445,7 @@ struct RGB2Gray float32x4_t v_cb, v_cg, v_cr; }; -#elif CV_SSE2 +#elif CV_SSE4_1 template <> struct RGB2Gray @@ -2106,7 +2106,7 @@ struct RGB2YCrCb_i int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta, v_delta2; }; -#elif CV_SSE2 +#elif CV_SSE4_1 template <> struct RGB2YCrCb_i @@ -2247,8 +2247,6 @@ struct RGB2YCrCb_i __m128i v_zero; }; -#if CV_SSE4_1 - template <> struct RGB2YCrCb_i { @@ -2369,9 +2367,6 @@ struct RGB2YCrCb_i #endif // CV_SSE4_1 - -#endif - template struct YCrCb2RGB_f { typedef _Tp channel_type; diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index d8b5385910..0de7089812 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -2306,7 +2306,7 @@ struct ResizeAreaFastVec_SIMD_32f if (cn == 1) { - int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1); + const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1); for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) { __m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4), diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index f489372188..93b9bfa166 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -236,7 +236,11 @@ struct PyrDownVec_32s16u bool haveSSE; }; -#endif +#else + +typedef PyrDownNoVec PyrDownVec_32s16u; + +#endif // CV_SSE4_1 struct PyrDownVec_32s16s { @@ -288,7 +292,6 @@ struct PyrDownVec_32s16s bool haveSSE; }; - struct PyrUpVec_32s8u { int operator()(int** src, uchar** dst, int, int width) const @@ -471,7 +474,11 @@ struct PyrUpVec_32s16u } }; -#endif +#else + +typedef PyrUpNoVec PyrUpVec_32s16u; + +#endif // CV_SSE4_1 struct PyrUpVec_32f { From 70933ea999d3e29c2a73aa0135bae766170e6de5 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:30 +0300 Subject: [PATCH 40/53] convert from f64 --- modules/core/src/convert.cpp | 154 ++++++++++++++++++++++++++++++++++- 1 file changed, 153 insertions(+), 1 deletion(-) diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 7f450e5810..a48e90e452 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -4394,7 +4394,159 @@ struct Cvt_SIMD } }; -#if CV_NEON +#if CV_SSE2 + +// from double + +template <> +struct Cvt_SIMD +{ + int operator() (const double * 
src, uchar * dst, int width) const + { + int x = 0; + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); + __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); + __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); + __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); + + v_src0 = _mm_movelh_ps(v_src0, v_src1); + v_src1 = _mm_movelh_ps(v_src2, v_src3); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), + _mm_cvtps_epi32(v_src1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_dst)); + } + + return x; + } +}; + +template <> +struct Cvt_SIMD +{ + int operator() (const double * src, schar * dst, int width) const + { + int x = 0; + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); + __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); + __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); + __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); + + v_src0 = _mm_movelh_ps(v_src0, v_src1); + v_src1 = _mm_movelh_ps(v_src2, v_src3); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), + _mm_cvtps_epi32(v_src1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_dst)); + } + + return x; + } +}; + +#if CV_SSE4_1 + +template <> +struct Cvt_SIMD +{ + int operator() (const double * src, ushort * dst, int width) const + { + int x = 0; + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); + __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); + __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); + __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); + + v_src0 = _mm_movelh_ps(v_src0, v_src1); + v_src1 = _mm_movelh_ps(v_src2, v_src3); + + __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_src0), + _mm_cvtps_epi32(v_src1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } +}; + +#endif // CV_SSE4_1 + +template <> +struct Cvt_SIMD +{ + int operator() (const double * src, short * dst, int width) const + { + int x = 0; + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); + __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); + __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); + __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); + + v_src0 = _mm_movelh_ps(v_src0, v_src1); + v_src1 = _mm_movelh_ps(v_src2, v_src3); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), + _mm_cvtps_epi32(v_src1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } +}; + +template <> +struct Cvt_SIMD +{ + int operator() (const double * src, int * dst, int width) const + { + int x = 0; + + for ( ; x <= width - 4; x += 4) + { + __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); + __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); + v_src0 = _mm_movelh_ps(v_src0, v_src1); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_src0)); + } + + return x; + } +}; + +template <> +struct Cvt_SIMD +{ + int operator() (const double * src, float * dst, int width) const + { + int x = 0; + + for ( ; x <= width - 4; x += 4) + { + __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); + __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); + + _mm_storeu_ps(dst + x, _mm_movelh_ps(v_src0, v_src1)); + } + + return x; + } +}; + + +#elif CV_NEON // from uchar From 25e99c453f7bfd0ea673c91627b012c21abaad2c Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:30 +0300 Subject: [PATCH 41/53] avx2 in 
arithm --- cmake/OpenCVCompilerOptions.cmake | 11 +- modules/core/include/opencv2/core/cvdef.h | 4 +- modules/core/src/arithm.cpp | 234 ++++++++++++++++++++-- modules/imgproc/src/color.cpp | 5 +- 4 files changed, 225 insertions(+), 29 deletions(-) diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index ecd9a8b41e..bbe617dd69 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -224,14 +224,11 @@ if(MSVC) set(OPENCV_EXTRA_FLAGS_RELEASE "${OPENCV_EXTRA_FLAGS_RELEASE} /Zi") endif() - if(ENABLE_AVX AND NOT MSVC_VERSION LESS 1600) - set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX") - endif() if(ENABLE_AVX2 AND NOT MSVC_VERSION LESS 1600) set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX2") endif() - if(ENABLE_FMA3 AND NOT MSVC_VERSION LESS 1600) - set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:FMA") + if(ENABLE_AVX AND NOT MSVC_VERSION LESS 1600 AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:") + set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX") endif() if(ENABLE_SSE4_1 AND CV_ICC AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:") @@ -252,7 +249,7 @@ if(MSVC) endif() endif() - if(ENABLE_SSE OR ENABLE_SSE2 OR ENABLE_SSE3 OR ENABLE_SSE4_1 OR ENABLE_AVX) + if(ENABLE_SSE OR ENABLE_SSE2 OR ENABLE_SSE3 OR ENABLE_SSE4_1 OR ENABLE_AVX OR ENABLE_AVX2) set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /Oi") endif() @@ -312,7 +309,7 @@ if(MSVC) string(REPLACE "/W3" "/W4" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") if(NOT ENABLE_NOISY_WARNINGS AND MSVC_VERSION EQUAL 1400) - ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4510 /wd4610 /wd4312 /wd4201 /wd4244 /wd4328 /wd4267) + ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4510 /wd4610 /wd4312 /wd4201 /wd4244 /wd4328 /wd4267 /wd4324) endif() # allow extern "C" functions throw exceptions diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index ded58a18c5..610c3fbad0 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -159,7 +159,9 @@ # define CV_SSE4_2 1 # endif # if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500) -# include +# ifndef _MSC_VER +# include +# endif # define CV_POPCNT 1 # endif # if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 07678f0c6d..8f490a9c95 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -64,6 +64,10 @@ FUNCTOR_TEMPLATE(VLoadStore128); #if CV_SSE2 FUNCTOR_TEMPLATE(VLoadStore64); FUNCTOR_TEMPLATE(VLoadStore128Aligned); +#if CV_AVX +FUNCTOR_TEMPLATE(VLoadStore256); +FUNCTOR_TEMPLATE(VLoadStore256Aligned); +#endif #endif #endif @@ -76,17 +80,28 @@ void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, si #endif Op op; - for( ; sz.height--; src1 += step1/sizeof(src1[0]), - src2 += step2/sizeof(src2[0]), - dst += step/sizeof(dst[0]) ) + for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1), + src2 = (const T *)((const uchar *)src2 + step2), + dst = (T *)((uchar *)dst + step) ) { int x = 0; #if CV_NEON || CV_SSE2 +#if CV_AVX2 + if( USE_AVX2 ) + { + for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) ) + { + typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); + r0 = vop(r0, VLoadStore256::load(src2 + x)); + VLoadStore256::store(dst + x, r0); + } + } +#else #if CV_SSE2 if( USE_SSE2 ) { -#endif +#endif // CV_SSE2 for( ; x <= sz.width - 32/(int)sizeof(T); x += 
32/sizeof(T) ) { typename VLoadStore128::reg_type r0 = VLoadStore128::load(src1 + x ); @@ -98,9 +113,13 @@ void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, si } #if CV_SSE2 } -#endif -#endif -#if CV_SSE2 +#endif // CV_SSE2 +#endif // CV_AVX2 +#endif // CV_NEON || CV_SSE2 + +#if CV_AVX2 + // nothing +#elif CV_SSE2 if( USE_SSE2 ) { for( ; x <= sz.width - 8/(int)sizeof(T); x += 8/sizeof(T) ) @@ -111,6 +130,7 @@ void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, si } } #endif + #if CV_ENABLE_UNROLLED for( ; x <= sz.width - 4; x += 4 ) { @@ -137,13 +157,26 @@ void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2, #endif Op op; - for( ; sz.height--; src1 += step1/sizeof(src1[0]), - src2 += step2/sizeof(src2[0]), - dst += step/sizeof(dst[0]) ) + for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1), + src2 = (const T *)((const uchar *)src2 + step2), + dst = (T *)((uchar *)dst + step) ) { int x = 0; -#if CV_SSE2 +#if CV_AVX2 + if( USE_AVX2 ) + { + if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) + { + for( ; x <= sz.width - 8; x += 8 ) + { + typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); + r0 = op32(r0, VLoadStore256Aligned::load(src2 + x)); + VLoadStore256Aligned::store(dst + x, r0); + } + } + } +#elif CV_SSE2 if( USE_SSE2 ) { if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) @@ -159,12 +192,24 @@ void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2, } } } -#endif +#endif // CV_AVX2 + #if CV_NEON || CV_SSE2 +#if CV_AVX2 + if( USE_AVX2 ) + { + for( ; x <= sz.width - 8; x += 8 ) + { + typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); + r0 = op32(r0, VLoadStore256::load(src2 + x)); + VLoadStore256::store(dst + x, r0); + } + } +#else #if CV_SSE2 if( USE_SSE2 ) { -#endif +#endif // CV_SSE2 for( ; x <= sz.width - 8; x += 8 ) { typename VLoadStore128::reg_type r0 = VLoadStore128::load(src1 + x ); @@ -176,8 +221,10 @@ void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2, } #if CV_SSE2 } -#endif -#endif +#endif // CV_SSE2 +#endif // CV_AVX2 +#endif // CV_NEON || CV_SSE2 + #if CV_ENABLE_UNROLLED for( ; x <= sz.width - 4; x += 4 ) { @@ -205,13 +252,26 @@ void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2, #endif Op op; - for( ; sz.height--; src1 += step1/sizeof(src1[0]), - src2 += step2/sizeof(src2[0]), - dst += step/sizeof(dst[0]) ) + for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1), + src2 = (const T *)((const uchar *)src2 + step2), + dst = (T *)((uchar *)dst + step) ) { int x = 0; -#if CV_SSE2 +#if CV_AVX2 + if( USE_AVX2 ) + { + if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) + { + for( ; x <= sz.width - 4; x += 4 ) + { + typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); + r0 = op64(r0, VLoadStore256Aligned::load(src2 + x)); + VLoadStore256Aligned::store(dst + x, r0); + } + } + } +#elif CV_SSE2 if( USE_SSE2 ) { if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) @@ -244,7 +304,141 @@ void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2, } } -#if CV_SSE2 +#if CV_AVX2 + +#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body) \ + template <> \ + struct name{ \ + typedef register_type reg_type; \ + static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ + static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ + } + +#define 
FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \ + template <> \ + struct name{ \ + typedef register_type reg_type; \ + static reg_type load(const template_arg * p) { return load_body (p); } \ + static void store(template_arg * p, reg_type v) { store_body (p, v); } \ + } + +#define FUNCTOR_CLOSURE_2arg(name, template_arg, body) \ + template<> \ + struct name \ + { \ + VLoadStore256::reg_type operator()( \ + const VLoadStore256::reg_type & a, \ + const VLoadStore256::reg_type & b) const \ + { \ + body; \ + } \ + } + +#define FUNCTOR_CLOSURE_1arg(name, template_arg, body) \ + template<> \ + struct name \ + { \ + VLoadStore256::reg_type operator()( \ + const VLoadStore256::reg_type & a, \ + const VLoadStore256::reg_type & ) const \ + { \ + body; \ + } \ + } + +FUNCTOR_LOADSTORE_CAST(VLoadStore256, uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE_CAST(VLoadStore256, schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE_CAST(VLoadStore256, short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE_CAST(VLoadStore256, int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE( VLoadStore256, float, __m256 , _mm256_loadu_ps , _mm256_storeu_ps ); +FUNCTOR_LOADSTORE( VLoadStore256, double, __m256d, _mm256_loadu_pd , _mm256_storeu_pd ); + +FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned, int, __m256i, _mm256_load_si256, _mm256_store_si256); +FUNCTOR_LOADSTORE( VLoadStore256Aligned, float, __m256 , _mm256_load_ps , _mm256_store_ps ); +FUNCTOR_LOADSTORE( VLoadStore256Aligned, double, __m256d, _mm256_load_pd , _mm256_store_pd ); + +FUNCTOR_TEMPLATE(VAdd); +FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm256_adds_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm256_adds_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm256_adds_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm256_add_epi32 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd (a, b)); + +FUNCTOR_TEMPLATE(VSub); +FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm256_subs_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm256_subs_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b)); +FUNCTOR_CLOSURE_2arg(VSub, short, return _mm256_subs_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VSub, int, return _mm256_sub_epi32 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, float, return _mm256_sub_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd (a, b)); + +FUNCTOR_TEMPLATE(VMin); +FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm256_min_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMin, schar, return _mm256_min_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, short, return _mm256_min_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, int, return _mm256_min_epi32(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, float, return _mm256_min_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd (a, b)); + +FUNCTOR_TEMPLATE(VMax); +FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm256_max_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMax, schar, return _mm256_max_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, short, return _mm256_max_epi16(a, b)); 
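// [Editor's note — annotation, not part of the patch] Each FUNCTOR_CLOSURE_2arg line in
// this hunk specializes one arithmetic functor for one element type on 256-bit AVX2
// registers: the saturating intrinsics (_mm256_adds_*/_mm256_subs_*) are used for the
// 8/16-bit types, the wrapping/IEEE forms for 32-bit integers and floating point. As a
// rough sketch of what the macro shown above expands to (template angle brackets
// reconstructed, since they were lost in this rendering of the patch):
//
//   template<> struct VMax<short>
//   {
//       VLoadStore256<short>::reg_type operator()(
//           const VLoadStore256<short>::reg_type & a,
//           const VLoadStore256<short>::reg_type & b) const
//       { return _mm256_max_epi16(a, b); }
//   };
//
// so the generic vBinOp()/vBinOp32() loops can reach the AVX2 kernels through the same
// template machinery already used for the SSE2 path.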
+FUNCTOR_CLOSURE_2arg(VMax, int, return _mm256_max_epi32(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, float, return _mm256_max_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd (a, b)); + + +static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, + 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; +static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff, + 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; + +FUNCTOR_TEMPLATE(VAbsDiff); +FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, + return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a)); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, + __m256i d = _mm256_subs_epi8(a, b); + __m256i m = _mm256_cmpgt_epi8(b, a); + return _mm256_subs_epi8(_mm256_xor_si256(d, m), m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, + return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a)); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, short, + __m256i M = _mm256_max_epi16(a, b); + __m256i m = _mm256_min_epi16(a, b); + return _mm256_subs_epi16(M, m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, int, + __m256i d = _mm256_sub_epi32(a, b); + __m256i m = _mm256_cmpgt_epi32(b, a); + return _mm256_sub_epi32(_mm256_xor_si256(d, m), m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, float, + return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, double, + return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask); + ); + +FUNCTOR_TEMPLATE(VAnd); +FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b)); +FUNCTOR_TEMPLATE(VOr); +FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b)); +FUNCTOR_TEMPLATE(VXor); +FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b)); +FUNCTOR_TEMPLATE(VNot); +FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a)); + +#elif CV_SSE2 #define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\ template <> \ diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index c1130db402..7197627b20 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -2879,6 +2879,8 @@ struct YCrCb2RGB_i v_delta = _mm_set1_epi16(ColorChannel::half()); v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1)); v_zero = _mm_setzero_si128(); + + useSSE = coeffs[0] <= std::numeric_limits::max(); } // 16s x 8 @@ -2934,7 +2936,7 @@ struct YCrCb2RGB_i int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; n *= 3; - if (dcn == 3) + if (dcn == 3 && useSSE) { for ( ; i <= n - 96; i += 96, dst += dcn * 32) { @@ -3014,6 +3016,7 @@ struct YCrCb2RGB_i } int dstcn, blueIdx; int coeffs[4]; + bool useSSE; __m128i v_c0, v_c1, v_c2, v_c3, v_delta2; __m128i v_delta, v_alpha, v_zero; From f2cd65cf1e888ff7b48e67578fba59c2d533bc85 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:30 +0300 Subject: [PATCH 42/53] fixes --- cmake/OpenCVCompilerOptions.cmake | 7 +- modules/core/include/opencv2/core/cvdef.h | 8 +- .../core/include/opencv2/core/sse_utils.hpp | 82 +++++++++---------- modules/core/src/arithm.cpp | 2 +- modules/core/src/convert.cpp | 7 +- modules/imgproc/src/color.cpp | 6 +- modules/photo/test/test_cloning.cpp | 28 +++++-- 7 files changed, 78 insertions(+), 62 deletions(-) diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index bbe617dd69..66e16e7863 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ 
b/cmake/OpenCVCompilerOptions.cmake @@ -224,7 +224,7 @@ if(MSVC) set(OPENCV_EXTRA_FLAGS_RELEASE "${OPENCV_EXTRA_FLAGS_RELEASE} /Zi") endif() - if(ENABLE_AVX2 AND NOT MSVC_VERSION LESS 1600) + if(ENABLE_AVX2 AND NOT MSVC_VERSION LESS 1800) set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX2") endif() if(ENABLE_AVX AND NOT MSVC_VERSION LESS 1600 AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:") @@ -309,7 +309,7 @@ if(MSVC) string(REPLACE "/W3" "/W4" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") if(NOT ENABLE_NOISY_WARNINGS AND MSVC_VERSION EQUAL 1400) - ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4510 /wd4610 /wd4312 /wd4201 /wd4244 /wd4328 /wd4267 /wd4324) + ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4510 /wd4610 /wd4312 /wd4201 /wd4244 /wd4328 /wd4267) endif() # allow extern "C" functions throw exceptions @@ -321,6 +321,7 @@ if(MSVC) endforeach() if(NOT ENABLE_NOISY_WARNINGS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4251") #class 'std::XXX' needs to have dll-interface to be used by clients of YYY + ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4251) # class 'std::XXX' needs to have dll-interface to be used by clients of YYY + ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4324) # 'struct_name' : structure was padded due to __declspec(align()) endif() endif() diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 610c3fbad0..a9d59c7693 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -159,12 +159,14 @@ # define CV_SSE4_2 1 # endif # if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500) -# ifndef _MSC_VER +# ifdef _MSC_VER +# include +# else # include # endif # define CV_POPCNT 1 # endif -# if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219) +# if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600) // MS Visual Studio 2010 (2012?) 
has no macro pre-defined to identify the use of /arch:AVX // See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32 # include @@ -175,7 +177,7 @@ # define __xgetbv() 0 # endif # endif -# if defined __AVX2__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219) +# if defined __AVX2__ || (defined _MSC_VER && _MSC_VER >= 1800) # include # define CV_AVX2 1 # if defined __FMA__ diff --git a/modules/core/include/opencv2/core/sse_utils.hpp b/modules/core/include/opencv2/core/sse_utils.hpp index 7af6d84f2d..e0283eb3f3 100644 --- a/modules/core/include/opencv2/core/sse_utils.hpp +++ b/modules/core/include/opencv2/core/sse_utils.hpp @@ -43,7 +43,7 @@ #define __OPENCV_CORE_SSE_UTILS_HPP__ #ifndef __cplusplus -# error base.hpp header must be compiled as C++ +# error sse_utils.hpp header must be compiled as C++ #endif #if CV_SSE2 @@ -117,7 +117,7 @@ inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0 inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) -{ +{ __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0); __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0); __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1); @@ -165,9 +165,9 @@ inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0 } inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1) -{ +{ __m128i v_mask = _mm_set1_epi16(0x00ff); - + __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); __m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8)); __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); @@ -177,28 +177,28 @@ inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8)); __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask)); __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8)); - + __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); __m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8)); __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8)); - + __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); __m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8)); __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8)); - + v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8)); v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), 
_mm_and_si128(layer1_chunk3, v_mask)); v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8)); } -inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, +inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) -{ +{ __m128i v_mask = _mm_set1_epi16(0x00ff); - + __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8)); __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); @@ -237,7 +237,7 @@ inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) -{ +{ __m128i v_mask = _mm_set1_epi16(0x00ff); __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); @@ -286,8 +286,8 @@ inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8)); } -inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1) -{ +inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1) +{ __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0); __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0); __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1); @@ -310,8 +310,8 @@ inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g } inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, - __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) -{ + __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) +{ __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1); __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1); __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0); @@ -342,7 +342,7 @@ inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g } inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, - __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) + __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) { __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0); __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0); @@ -352,7 +352,7 @@ inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0); __m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1); __m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1); - + __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4); __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4); __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5); @@ -393,14 +393,14 @@ inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16)); __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); - __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); + __m128i layer2_chunk2 = 
_mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); - __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); + __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); - __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); + __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); - __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); + __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16)); @@ -421,18 +421,18 @@ inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16)); __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); - __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); + __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); - __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); + __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); - __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16)); + __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16)); __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); - __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); + __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); - __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); + __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); - __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16)); + __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 
16), _mm_srli_epi32(layer2_chunk5, 16)); v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16)); @@ -457,26 +457,26 @@ inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16)); __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); - __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); + __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); - __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); + __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); - __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16)); + __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16)); __m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask)); - __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16)); + __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16)); __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); - __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); + __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); - __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); + __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); - __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16)); + __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16)); __m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask)); - __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16)); + __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16)); v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16)); - v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); + v_r1 = 
_mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16)); v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16)); @@ -487,12 +487,12 @@ inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, #endif // CV_SSE4_1 inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1) -{ +{ __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0); __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0); __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1); __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1); - + __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2); __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2); __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3); @@ -506,14 +506,14 @@ inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1, __m128 & v_b0, __m128 & v_b1) -{ +{ __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1); __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1); __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0); __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0); __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1); __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1); - + __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3); __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3); __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4); @@ -531,7 +531,7 @@ inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1, __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1) -{ +{ __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0); __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0); __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1); diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 8f490a9c95..4c14732e79 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -3476,7 +3476,7 @@ struct Cmp_SIMD haveSSE = checkHardwareSupport(CV_CPU_SSE2); - v_mask = _mm_set1_epi8(0xff); + v_mask = _mm_set1_epi8(-1); } int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index a48e90e452..626a666a95 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -616,18 +616,17 @@ struct VMerge4 bool support; \ } -MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); +MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128); MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128); MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128); -MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); - #if CV_SSE4_1 +MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128); MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128); MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128); #endif -MERGE2_KERNEL_TEMPLATE( int, __m128, float, 
_mm_deinterleave_ps, ps); +MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps); MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps); MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps); diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 7197627b20..675d6b9089 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -1460,9 +1460,9 @@ struct RGB2Gray if( blueIdx == 0 ) std::swap(coeffs[0], coeffs[2]); - v_cb = _mm_set1_epi16(coeffs[0]); - v_cg = _mm_set1_epi16(coeffs[1]); - v_cr = _mm_set1_epi16(coeffs[2]); + v_cb = _mm_set1_epi16((short)coeffs[0]); + v_cg = _mm_set1_epi16((short)coeffs[1]); + v_cr = _mm_set1_epi16((short)coeffs[2]); v_delta = _mm_set1_epi32(1 << (yuv_shift - 1)); } diff --git a/modules/photo/test/test_cloning.cpp b/modules/photo/test/test_cloning.cpp index 56d166205c..1f86612a4a 100644 --- a/modules/photo/test/test_cloning.cpp +++ b/modules/photo/test/test_cloning.cpp @@ -64,6 +64,7 @@ TEST(Photo_SeamlessClone_normal, regression) string original_path1 = folder + "source1.png"; string original_path2 = folder + "destination1.png"; string original_path3 = folder + "mask.png"; + string reference_path = folder + "reference.png"; Mat source = imread(original_path1, IMREAD_COLOR); Mat destination = imread(original_path2, IMREAD_COLOR); @@ -79,8 +80,8 @@ TEST(Photo_SeamlessClone_normal, regression) p.y = destination.size().height/2; seamlessClone(source, destination, mask, p, result, 1); - - Mat reference = imread(folder + "reference.png"); + Mat reference = imread(reference_path); + ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path; SAVE(result); @@ -94,6 +95,7 @@ TEST(Photo_SeamlessClone_mixed, regression) string original_path1 = folder + "source1.png"; string original_path2 = folder + "destination1.png"; string original_path3 = folder + "mask.png"; + string reference_path = folder + "reference.png"; Mat source = imread(original_path1, IMREAD_COLOR); Mat destination = imread(original_path2, IMREAD_COLOR); @@ -111,7 +113,9 @@ TEST(Photo_SeamlessClone_mixed, regression) SAVE(result); - Mat reference = imread(folder + "reference.png"); + Mat reference = imread(reference_path); + ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path; + double error = cvtest::norm(reference, result, NORM_L1); EXPECT_LE(error, numerical_precision); @@ -123,6 +127,7 @@ TEST(Photo_SeamlessClone_featureExchange, regression) string original_path1 = folder + "source1.png"; string original_path2 = folder + "destination1.png"; string original_path3 = folder + "mask.png"; + string reference_path = folder + "reference.png"; Mat source = imread(original_path1, IMREAD_COLOR); Mat destination = imread(original_path2, IMREAD_COLOR); @@ -140,7 +145,9 @@ TEST(Photo_SeamlessClone_featureExchange, regression) SAVE(result); - Mat reference = imread(folder + "reference.png"); + Mat reference = imread(reference_path); + ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path; + double error = cvtest::norm(reference, result, NORM_L1); EXPECT_LE(error, numerical_precision); @@ -151,6 +158,7 @@ TEST(Photo_SeamlessClone_colorChange, regression) string folder = string(cvtest::TS::ptr()->get_data_path()) + "cloning/color_change/"; string original_path1 = folder + "source1.png"; string original_path2 = folder + "mask.png"; + string reference_path = folder + "reference.png"; Mat source = imread(original_path1, IMREAD_COLOR); Mat 
mask = imread(original_path2, IMREAD_COLOR); @@ -163,7 +171,9 @@ TEST(Photo_SeamlessClone_colorChange, regression) SAVE(result); - Mat reference = imread(folder + "reference.png"); + Mat reference = imread(reference_path); + ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path; + double error = cvtest::norm(reference, result, NORM_L1); EXPECT_LE(error, numerical_precision); @@ -174,6 +184,7 @@ TEST(Photo_SeamlessClone_illuminationChange, regression) string folder = string(cvtest::TS::ptr()->get_data_path()) + "cloning/Illumination_Change/"; string original_path1 = folder + "source1.png"; string original_path2 = folder + "mask.png"; + string reference_path = folder + "reference.png"; Mat source = imread(original_path1, IMREAD_COLOR); Mat mask = imread(original_path2, IMREAD_COLOR); @@ -186,7 +197,7 @@ TEST(Photo_SeamlessClone_illuminationChange, regression) SAVE(result); - Mat reference = imread(folder + "reference.png"); + Mat reference = imread(reference_path); double error = cvtest::norm(reference, result, NORM_L1); EXPECT_LE(error, numerical_precision); @@ -197,6 +208,7 @@ TEST(Photo_SeamlessClone_textureFlattening, regression) string folder = string(cvtest::TS::ptr()->get_data_path()) + "cloning/Texture_Flattening/"; string original_path1 = folder + "source1.png"; string original_path2 = folder + "mask.png"; + string reference_path = folder + "reference.png"; Mat source = imread(original_path1, IMREAD_COLOR); Mat mask = imread(original_path2, IMREAD_COLOR); @@ -209,7 +221,9 @@ TEST(Photo_SeamlessClone_textureFlattening, regression) SAVE(result); - Mat reference = imread(folder + "reference.png"); + Mat reference = imread(reference_path); + ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path; + double error = cvtest::norm(reference, result, NORM_L1); EXPECT_LE(error, numerical_precision); From 7b060d91224ef65d680a74eb57bccdd0ea93a35d Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:30 +0300 Subject: [PATCH 43/53] cvtColor 4 cn --- modules/imgproc/src/color.cpp | 130 ++++++++++++++++++++++++++-------- 1 file changed, 102 insertions(+), 28 deletions(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 675d6b9089..4efbcc5f8b 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -1811,9 +1811,9 @@ struct RGB2YCrCb_f float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; n *= 3; - if (scn == 3) + if (scn == 3 || scn == 4) { - for ( ; i <= n - 24; i += 24, src += 24) + for ( ; i <= n - 24; i += 24, src += 8 * scn) { __m128 v_r0 = _mm_loadu_ps(src); __m128 v_r1 = _mm_loadu_ps(src + 4); @@ -1822,7 +1822,15 @@ struct RGB2YCrCb_f __m128 v_b0 = _mm_loadu_ps(src + 16); __m128 v_b1 = _mm_loadu_ps(src + 20); - _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + if (scn == 4) + { + __m128 v_a0 = _mm_loadu_ps(src + 24); + __m128 v_a1 = _mm_loadu_ps(src + 28); + _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, + v_b0, v_b1, v_a0, v_a1); + } + else + _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); __m128 v_y0, v_cr0, v_cb0; process(v_r0, v_g0, v_b0, @@ -2141,7 +2149,7 @@ struct RGB2YCrCb_i __m128i v_b_p = _mm_unpacklo_epi16(v_b, v_zero); __m128i v_y0 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0), - _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1), + _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1), _mm_mullo_epi32(v_b_p, v_c2))); v_y0 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y0), yuv_shift); @@ -2176,7 +2184,7 @@ struct 
RGB2YCrCb_i int delta = ColorChannel::half()*(1 << yuv_shift); n *= 3; - if (scn == 3) + if (scn == 3 || scn == 4) { for ( ; i <= n - 96; i += 96, src += scn * 32) { @@ -2187,7 +2195,15 @@ struct RGB2YCrCb_i __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 64)); __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 80)); - _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + if (scn == 4) + { + __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 96)); + __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 112)); + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, + v_b0, v_b1, v_a0, v_a1); + } + else + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); __m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero; process(_mm_unpacklo_epi8(v_r0, v_zero), @@ -2280,7 +2296,7 @@ struct RGB2YCrCb_i __m128i v_b_p = _mm_unpacklo_epi16(v_b, v_zero); __m128i v_y0 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0), - _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1), + _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1), _mm_mullo_epi32(v_b_p, v_c2))); v_y0 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y0), yuv_shift); @@ -2315,8 +2331,7 @@ struct RGB2YCrCb_i int delta = ColorChannel::half()*(1 << yuv_shift); n *= 3; - - if (scn == 3) + if (scn == 3 || scn == 4) { for ( ; i <= n - 48; i += 48, src += scn * 16) { @@ -2327,7 +2342,16 @@ struct RGB2YCrCb_i __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32)); __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40)); - _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + if (scn == 4) + { + __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 48)); + __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 56)); + + _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, + v_b0, v_b1, v_a0, v_a1); + } + else + _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); __m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero; process(v_r0, v_g0, v_b0, @@ -2521,9 +2545,9 @@ struct YCrCb2RGB_f float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; n *= 3; - if (dcn == 3) + if (dcn == 3 || dcn == 4) { - for ( ; i <= n - 24; i += 24, dst += 24) + for ( ; i <= n - 24; i += 24, dst += 8 * dcn) { __m128 v_y0 = _mm_loadu_ps(src + i); __m128 v_y1 = _mm_loadu_ps(src + i + 4); @@ -2542,7 +2566,13 @@ struct YCrCb2RGB_f process(v_y1, v_cr1, v_cb1, v_r1, v_g1, v_b1); - _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + __m128 v_a0 = v_alpha, v_a1 = v_alpha; + + if (dcn == 3) + _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + else + _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, + v_b0, v_b1, v_a0, v_a1); _mm_storeu_ps(dst, v_r0); _mm_storeu_ps(dst + 4, v_r1); @@ -2550,6 +2580,12 @@ struct YCrCb2RGB_f _mm_storeu_ps(dst + 12, v_g1); _mm_storeu_ps(dst + 16, v_b0); _mm_storeu_ps(dst + 20, v_b1); + + if (dcn == 4) + { + _mm_storeu_ps(dst + 24, v_a0); + _mm_storeu_ps(dst + 28, v_a1); + } } } @@ -2568,7 +2604,7 @@ struct YCrCb2RGB_f } int dstcn, blueIdx; float coeffs[4]; - + __m128 v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta; }; @@ -2880,11 +2916,14 @@ struct YCrCb2RGB_i v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1)); v_zero = _mm_setzero_si128(); + uchar alpha = ColorChannel::max(); + v_alpha = _mm_set1_epi8(*(char *)&alpha); + useSSE = coeffs[0] <= std::numeric_limits::max(); } // 16s x 8 - void process(__m128i v_y, __m128i v_cr, __m128i v_cb, + void process(__m128i v_y, __m128i v_cr, __m128i v_cb, __m128i & v_r, __m128i & v_g, __m128i & v_b) const { v_cr = _mm_sub_epi16(v_cr, v_delta); @@ -2903,11 +2942,11 @@ struct YCrCb2RGB_i __m128i v_mulhi_0 = 
_mm_mulhi_epi16(v_cr, v_c0); __m128i v_b0 = _mm_srai_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_3, v_mulhi_3), v_delta2), yuv_shift); - __m128i v_g0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_2, v_mulhi_2), + __m128i v_g0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_2, v_mulhi_2), _mm_unpacklo_epi16(v_mullo_1, v_mulhi_1)), v_delta2), yuv_shift); __m128i v_r0 = _mm_srai_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_0, v_mulhi_0), v_delta2), yuv_shift); - + v_r0 = _mm_add_epi32(v_r0, v_y_p); v_g0 = _mm_add_epi32(v_g0, v_y_p); v_b0 = _mm_add_epi32(v_b0, v_y_p); @@ -2915,7 +2954,7 @@ struct YCrCb2RGB_i v_y_p = _mm_unpackhi_epi16(v_y, v_zero); __m128i v_b1 = _mm_srai_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_3, v_mulhi_3), v_delta2), yuv_shift); - __m128i v_g1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_2, v_mulhi_2), + __m128i v_g1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_2, v_mulhi_2), _mm_unpackhi_epi16(v_mullo_1, v_mulhi_1)), v_delta2), yuv_shift); __m128i v_r1 = _mm_srai_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_0, v_mulhi_0), v_delta2), yuv_shift); @@ -2936,7 +2975,7 @@ struct YCrCb2RGB_i int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; n *= 3; - if (dcn == 3 && useSSE) + if ((dcn == 3 || dcn == 4) && useSSE) { for ( ; i <= n - 96; i += 96, dst += dcn * 32) { @@ -2985,7 +3024,13 @@ struct YCrCb2RGB_i std::swap(v_r1, v_b1); } - _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + __m128i v_a0 = v_alpha, v_a1 = v_alpha; + + if (dcn == 3) + _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + else + _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1, + v_b0, v_b1, v_a0, v_a1); _mm_storeu_si128((__m128i *)(dst), v_r0); _mm_storeu_si128((__m128i *)(dst + 16), v_r1); @@ -2993,10 +3038,15 @@ struct YCrCb2RGB_i _mm_storeu_si128((__m128i *)(dst + 48), v_g1); _mm_storeu_si128((__m128i *)(dst + 64), v_b0); _mm_storeu_si128((__m128i *)(dst + 80), v_b1); + + if (dcn == 4) + { + _mm_storeu_si128((__m128i *)(dst + 96), v_a0); + _mm_storeu_si128((__m128i *)(dst + 112), v_a1); + } } } - for ( ; i < n; i += 3, dst += dcn) { uchar Y = src[i]; @@ -3198,9 +3248,9 @@ struct RGB2XYZ_f n *= 3; - if (scn == 3) + if (scn == 3 || scn == 4) { - for ( ; i <= n - 24; i += 24, src += 24) + for ( ; i <= n - 24; i += 24, src += 8 * scn) { __m128 v_r0 = _mm_loadu_ps(src); __m128 v_r1 = _mm_loadu_ps(src + 4); @@ -3209,7 +3259,16 @@ struct RGB2XYZ_f __m128 v_b0 = _mm_loadu_ps(src + 16); __m128 v_b1 = _mm_loadu_ps(src + 20); - _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + if (scn == 4) + { + __m128 v_a0 = _mm_loadu_ps(src + 24); + __m128 v_a1 = _mm_loadu_ps(src + 28); + + _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, + v_b0, v_b1, v_a0, v_a1); + } + else + _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); __m128 v_x0, v_y0, v_z0; process(v_r0, v_g0, v_b0, @@ -3596,6 +3655,8 @@ struct XYZ2RGB_f v_c6 = _mm_set1_ps(coeffs[6]); v_c7 = _mm_set1_ps(coeffs[7]); v_c8 = _mm_set1_ps(coeffs[8]); + + v_alpha = _mm_set1_ps(ColorChannel::max()); } void process(__m128 v_x, __m128 v_y, __m128 v_z, @@ -3624,9 +3685,9 @@ struct XYZ2RGB_f n *= 3; int i = 0; - if (dcn == 3) + if (dcn == 3 || dcn == 4) { - for ( ; i <= n - 24; i += 24, dst += 24) + for ( ; i <= n - 24; i += 24, dst += 8 * dcn) { __m128 v_x0 = _mm_loadu_ps(src + i); __m128 v_x1 = _mm_loadu_ps(src + i + 4); @@ -3645,7 +3706,13 @@ struct XYZ2RGB_f process(v_x1, v_y1, v_z1, v_r1, v_g1, v_b1); - 
_mm_interleave_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1); + __m128 v_a0 = v_alpha, v_a1 = v_alpha; + + if (dcn == 4) + _mm_interleave_ps(v_b0, v_b1, v_g0, v_g1, + v_r0, v_r1, v_a0, v_a1); + else + _mm_interleave_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1); _mm_storeu_ps(dst, v_b0); _mm_storeu_ps(dst + 4, v_b1); @@ -3653,6 +3720,12 @@ struct XYZ2RGB_f _mm_storeu_ps(dst + 12, v_g1); _mm_storeu_ps(dst + 16, v_r0); _mm_storeu_ps(dst + 20, v_r1); + + if (dcn == 4) + { + _mm_storeu_ps(dst + 24, v_a0); + _mm_storeu_ps(dst + 28, v_a1); + } } } @@ -3671,6 +3744,7 @@ struct XYZ2RGB_f float coeffs[9]; __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; + __m128 v_alpha; }; #endif // CV_SSE2 @@ -5213,7 +5287,7 @@ struct Lab2RGB_b v_g1 = _mm_sub_ps(v_g1, v_128); v_b0 = _mm_sub_ps(v_b0, v_128); v_b1 = _mm_sub_ps(v_b1, v_128); - + _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); _mm_store_ps(buf, v_r0); From 44d89638fd6cb074e87a4b3fd1bcff3f8ecc6c89 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:30 +0300 Subject: [PATCH 44/53] divide --- modules/core/perf/perf_arithm.cpp | 14 ++ modules/core/src/arithm.cpp | 231 +++++++++++++++++++++++++++++- 2 files changed, 244 insertions(+), 1 deletion(-) diff --git a/modules/core/perf/perf_arithm.cpp b/modules/core/perf/perf_arithm.cpp index 3598c8639f..c6e4c40db2 100644 --- a/modules/core/perf/perf_arithm.cpp +++ b/modules/core/perf/perf_arithm.cpp @@ -242,3 +242,17 @@ PERF_TEST_P(Size_MatType, multiplyScale, TYPICAL_MATS_CORE_ARITHM) SANITY_CHECK(c, 1e-8); } + +PERF_TEST_P(Size_MatType, divide, TYPICAL_MATS_CORE_ARITHM) +{ + Size sz = get<0>(GetParam()); + int type = get<1>(GetParam()); + cv::Mat a(sz, type), b(sz, type), c(sz, type); + double scale = 0.5; + + declare.in(a, b, WARMUP_RNG).out(c); + + TEST_CYCLE() divide(a, b, c, scale); + + SANITY_CHECK_NOTHING(); +} diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 4c14732e79..5875d61cca 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -2610,6 +2610,233 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2, } } +template +struct Div_SIMD +{ + int operator() (const T *, const T *, T *, int, double) const + { + return 0; + } +}; + +#if CV_SSE2 + +#if CV_SSE4_1 + +template <> +struct Div_SIMD +{ + int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const + { + int x = 0; + + __m128d v_scale = _mm_set1_pd(scale); + __m128i v_zero = _mm_setzero_si128(); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(src1 + x)), v_zero); + __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); + __m128i v_src2 = _mm_unpacklo_epi8(_v_src2, v_zero); + + __m128i v_src1i = _mm_unpacklo_epi16(v_src1, v_zero); + __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero); + __m128d v_src1d = _mm_cvtepi32_pd(v_src1i); + __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + v_src1i = _mm_unpackhi_epi16(v_src1, v_zero); + v_src2i = _mm_unpackhi_epi16(v_src2, v_zero); + v_src1d = _mm_cvtepi32_pd(v_src1i); + v_src2d = _mm_cvtepi32_pd(v_src2i); + v_dst0 = 
_mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero); + _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero))); + } + + return x; + } +}; + +#endif // CV_SSE4_1 + +template <> +struct Div_SIMD +{ + int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const + { + int x = 0; + + __m128d v_scale = _mm_set1_pd(scale); + __m128i v_zero = _mm_setzero_si128(); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((const __m128i *)(src1 + x))), 8); + __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); + __m128i v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _v_src2), 8); + + __m128i v_src1i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16); + __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16); + __m128d v_src1d = _mm_cvtepi32_pd(v_src1i); + __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + v_src1i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16); + v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16); + v_src1d = _mm_cvtepi32_pd(v_src1i); + v_src2d = _mm_cvtepi32_pd(v_src2i); + v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero); + _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero))); + } + + return x; + } +}; + +#if CV_SSE4_1 + +template <> +struct Div_SIMD +{ + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const + { + int x = 0; + + __m128d v_scale = _mm_set1_pd(scale); + __m128i v_zero = _mm_setzero_si128(); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); + __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + + __m128i v_src1i = _mm_unpacklo_epi16(v_src1, v_zero); + __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero); + __m128d v_src1d = _mm_cvtepi32_pd(v_src1i); + __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + __m128i v_dst_0 = 
_mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + v_src1i = _mm_unpackhi_epi16(v_src1, v_zero); + v_src2i = _mm_unpackhi_epi16(v_src2, v_zero); + v_src1d = _mm_cvtepi32_pd(v_src1i); + v_src2d = _mm_cvtepi32_pd(v_src2i); + v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero); + _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi32(v_dst_0, v_dst_1))); + } + + return x; + } +}; + +#endif // CV_SSE4_1 + +template <> +struct Div_SIMD +{ + int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const + { + int x = 0; + + __m128d v_scale = _mm_set1_pd(scale); + __m128i v_zero = _mm_setzero_si128(); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); + __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + + __m128i v_src1i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16); + __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16); + __m128d v_src1d = _mm_cvtepi32_pd(v_src1i); + __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + v_src1i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16); + v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16); + v_src1d = _mm_cvtepi32_pd(v_src1i); + v_src2d = _mm_cvtepi32_pd(v_src2i); + v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero); + _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi32(v_dst_0, v_dst_1))); + } + + return x; + } +}; + +template <> +struct Div_SIMD +{ + int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const + { + int x = 0; + + __m128d v_scale = _mm_set1_pd(scale); + __m128i v_zero = _mm_setzero_si128(); + + for ( ; x <= width - 4; x += 4) + { + __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); + __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + + __m128d v_src1d = _mm_cvtepi32_pd(v_src1); + __m128d v_src2d = _mm_cvtepi32_pd(v_src2); + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + + v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1, 8)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2, 8)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + + __m128i v_dst = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + 
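The zero-divisor mask applied in the next statements is what lets the vector path keep OpenCV's convention that a zero divisor produces zero. As a rough scalar sketch of the per-element semantics these Div_SIMD specializations are assumed to reproduce (double-precision arithmetic, saturating store, zero for a zero divisor, mirroring the scalar fallback loop in div_):

    // hypothetical per-element reference for the Div_SIMD kernels
    template <typename T>
    static inline T div_ref(T a, T b, double scale)
    {
        return b != 0 ? cv::saturate_cast<T>(a * scale / b) : T(0);
    }
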
__m128i v_mask = _mm_cmpeq_epi32(v_src2, v_zero); + _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, v_dst)); + } + + return x; + } +}; + +#endif + template static void div_( const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size size, double scale ) @@ -2618,9 +2845,11 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2, step2 /= sizeof(src2[0]); step /= sizeof(dst[0]); + Div_SIMD vop; + for( ; size.height--; src1 += step1, src2 += step2, dst += step ) { - int i = 0; + int i = vop(src1, src2, dst, size.width, scale); #if CV_ENABLE_UNROLLED for( ; i <= size.width - 4; i += 4 ) { From ef29b15c9a360fcbd4c3fe48d3ce2574a8ff37f1 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:30 +0300 Subject: [PATCH 45/53] reciprocal --- modules/core/perf/perf_arithm.cpp | 14 +++ modules/core/src/arithm.cpp | 200 +++++++++++++++++++++++++++++- 2 files changed, 213 insertions(+), 1 deletion(-) diff --git a/modules/core/perf/perf_arithm.cpp b/modules/core/perf/perf_arithm.cpp index c6e4c40db2..c6c2a1b29f 100644 --- a/modules/core/perf/perf_arithm.cpp +++ b/modules/core/perf/perf_arithm.cpp @@ -256,3 +256,17 @@ PERF_TEST_P(Size_MatType, divide, TYPICAL_MATS_CORE_ARITHM) SANITY_CHECK_NOTHING(); } + +PERF_TEST_P(Size_MatType, reciprocal, TYPICAL_MATS_CORE_ARITHM) +{ + Size sz = get<0>(GetParam()); + int type = get<1>(GetParam()); + cv::Mat b(sz, type), c(sz, type); + double scale = 0.5; + + declare.in(b, WARMUP_RNG).out(c); + + TEST_CYCLE() divide(scale, b, c); + + SANITY_CHECK_NOTHING(); +} diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 5875d61cca..49a9cceaec 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -2886,6 +2886,202 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2, } } +template +struct Recip_SIMD +{ + int operator() (const T *, T *, int, double) const + { + return 0; + } +}; + +#if CV_SSE2 + +#if CV_SSE4_1 + +template <> +struct Recip_SIMD +{ + int operator() (const uchar * src2, uchar * dst, int width, double scale) const + { + int x = 0; + + __m128d v_scale = _mm_set1_pd(scale); + __m128i v_zero = _mm_setzero_si128(); + + for ( ; x <= width - 8; x += 8) + { + __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); + __m128i v_src2 = _mm_unpacklo_epi8(_v_src2, v_zero); + + __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero); + __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + v_src2i = _mm_unpackhi_epi16(v_src2, v_zero); + v_src2d = _mm_cvtepi32_pd(v_src2i); + v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero); + _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero))); + } + + return x; + } +}; + +#endif // CV_SSE4_1 + +template <> +struct Recip_SIMD +{ + int operator() (const schar * src2, schar * dst, int width, double scale) const + { + int x = 0; + + __m128d v_scale = _mm_set1_pd(scale); + __m128i v_zero = 
_mm_setzero_si128(); + + for ( ; x <= width - 8; x += 8) + { + __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); + __m128i v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _v_src2), 8); + + __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16); + __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16); + v_src2d = _mm_cvtepi32_pd(v_src2i); + v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero); + _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero))); + } + + return x; + } +}; + +#if CV_SSE4_1 + +template <> +struct Recip_SIMD +{ + int operator() (const ushort * src2, ushort * dst, int width, double scale) const + { + int x = 0; + + __m128d v_scale = _mm_set1_pd(scale); + __m128i v_zero = _mm_setzero_si128(); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + + __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero); + __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + v_src2i = _mm_unpackhi_epi16(v_src2, v_zero); + v_src2d = _mm_cvtepi32_pd(v_src2i); + v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero); + _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi32(v_dst_0, v_dst_1))); + } + + return x; + } +}; + +#endif // CV_SSE4_1 + +template <> +struct Recip_SIMD +{ + int operator() (const short * src2, short * dst, int width, double scale) const + { + int x = 0; + + __m128d v_scale = _mm_set1_pd(scale); + __m128i v_zero = _mm_setzero_si128(); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + + __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16); + __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16); + v_src2d = _mm_cvtepi32_pd(v_src2i); + v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + 
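The Recip_SIMD specializations follow the same layout as Div_SIMD, only computing scale/src2 instead of src1*scale/src2. A minimal scalar sketch of the assumed per-element behaviour (a zero input lane maps to zero, matching the masked store at the end of each iteration):

    // hypothetical per-element reference for the Recip_SIMD kernels
    template <typename T>
    static inline T recip_ref(T b, double scale)
    {
        return b != 0 ? cv::saturate_cast<T>(scale / b) : T(0);
    }
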
__m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero); + _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi32(v_dst_0, v_dst_1))); + } + + return x; + } +}; + +template <> +struct Recip_SIMD +{ + int operator() (const int * src2, int * dst, int width, double scale) const + { + int x = 0; + + __m128d v_scale = _mm_set1_pd(scale); + __m128i v_zero = _mm_setzero_si128(); + + for ( ; x <= width - 4; x += 4) + { + __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + + __m128d v_src2d = _mm_cvtepi32_pd(v_src2); + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2, 8)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + + __m128i v_dst = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + __m128i v_mask = _mm_cmpeq_epi32(v_src2, v_zero); + _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, v_dst)); + } + + return x; + } +}; + +#endif + template static void recip_( const T*, size_t, const T* src2, size_t step2, T* dst, size_t step, Size size, double scale ) @@ -2893,9 +3089,11 @@ recip_( const T*, size_t, const T* src2, size_t step2, step2 /= sizeof(src2[0]); step /= sizeof(dst[0]); + Recip_SIMD vop; + for( ; size.height--; src2 += step2, dst += step ) { - int i = 0; + int i = vop(src2, dst, size.width, scale); #if CV_ENABLE_UNROLLED for( ; i <= size.width - 4; i += 4 ) { From d92f67ee2ca97a65da92d0461508b1a1a2d3be24 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:31 +0300 Subject: [PATCH 46/53] warpPerspective --- modules/imgproc/src/imgwarp.cpp | 250 +++++++++++++++++++++++++++++++- 1 file changed, 248 insertions(+), 2 deletions(-) diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 0de7089812..dc254fd7d4 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -5503,6 +5503,19 @@ public: int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width); bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height); + #if CV_SSE2 + __m128d v_M0 = _mm_set1_pd(M[0]); + __m128d v_M3 = _mm_set1_pd(M[3]); + __m128d v_M6 = _mm_set1_pd(M[6]); + __m128d v_intmax = _mm_set1_pd((double)INT_MAX); + __m128d v_intmin = _mm_set1_pd((double)INT_MIN); + __m128d v_2 = _mm_set1_pd(2), + v_zero = _mm_setzero_pd(), + v_1 = _mm_set1_pd(1), + v_its = _mm_set1_pd(INTER_TAB_SIZE); + __m128i v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1); + #endif + for( y = range.start; y < range.end; y += bh0 ) { for( x = 0; x < width; x += bw0 ) @@ -5521,7 +5534,117 @@ public: double W0 = M[6]*x + M[7]*(y + y1) + M[8]; if( interpolation == INTER_NEAREST ) - for( x1 = 0; x1 < bw; x1++ ) + { + x1 = 0; + + #if CV_SSE2 + __m128d v_X0d = _mm_set1_pd(X0); + __m128d v_Y0d = _mm_set1_pd(Y0); + __m128d v_W0 = _mm_set1_pd(W0); + __m128d v_x1 = _mm_set_pd(1, 0); + + for( ; x1 <= bw - 16; x1 += 16 ) + { + // 0-3 + __m128i v_X0, v_Y0; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, 
v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 4-8 + __m128i v_X1, v_Y1; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 8-11 + __m128i v_X2, v_Y2; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 12-15 + __m128i v_X3, v_Y3; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, 
_mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // convert to 16s + v_X0 = _mm_packs_epi32(v_X0, v_X1); + v_X1 = _mm_packs_epi32(v_X2, v_X3); + v_Y0 = _mm_packs_epi32(v_Y0, v_Y1); + v_Y1 = _mm_packs_epi32(v_Y2, v_Y3); + + _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1); + + _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1); + } + #endif + + for( ; x1 < bw; x1++ ) { double W = W0 + M[6]*x1; W = W ? 1./W : 0; @@ -5533,10 +5656,133 @@ public: xy[x1*2] = saturate_cast(X); xy[x1*2+1] = saturate_cast(Y); } + } else { short* alpha = A + y1*bw; - for( x1 = 0; x1 < bw; x1++ ) + x1 = 0; + + #if CV_SSE2 + __m128d v_X0d = _mm_set1_pd(X0); + __m128d v_Y0d = _mm_set1_pd(Y0); + __m128d v_W0 = _mm_set1_pd(W0); + __m128d v_x1 = _mm_set_pd(1, 0); + + for( ; x1 <= bw - 16; x1 += 16 ) + { + // 0-3 + __m128i v_X0, v_Y0; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 4-8 + __m128i v_X1, v_Y1; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 8-11 + __m128i v_X2, v_Y2; + { + __m128d v_W = 
_mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 12-15 + __m128i v_X3, v_Y3; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // store alpha + __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS), + _mm_and_si128(v_X0, v_itsi1)); + __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS), + _mm_and_si128(v_X1, v_itsi1)); + _mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1)); + + v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS), + _mm_and_si128(v_X2, v_itsi1)); + v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS), + _mm_and_si128(v_X3, v_itsi1)); + _mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1)); + + // convert to 16s + v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS)); + v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS)); + v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS)); + v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS)); + + _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1); + + _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1); + } + #endif + + for( ; x1 < bw; x1++ 
) { double W = W0 + M[6]*x1; W = W ? INTER_TAB_SIZE/W : 0; From 0fd8f5052273c422b2acc7b37bbade01f8bb0d91 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:31 +0300 Subject: [PATCH 47/53] warpAffine INTER_NEAREST --- modules/imgproc/src/imgwarp.cpp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index dc254fd7d4..a2f6070397 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -5096,6 +5096,28 @@ public: vst2q_s16(xy + (x1 << 1), v_dst); } + #elif CV_SSE2 + __m128i v_X0 = _mm_set1_epi32(X0); + __m128i v_Y0 = _mm_set1_epi32(Y0); + for ( ; x1 <= bw - 16; x1 += 16) + { + __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1))), AB_BITS), + _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 4))), AB_BITS)); + __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 8))), AB_BITS), + _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 12))), AB_BITS)); + + __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1))), AB_BITS), + _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 4))), AB_BITS)); + __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 8))), AB_BITS), + _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 12))), AB_BITS)); + + _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1); + + _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1); + } #endif for( ; x1 < bw; x1++ ) { From 3b23e57925a951c892219e59e38458235a138cdc Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:31 +0300 Subject: [PATCH 48/53] convertMaps --- modules/imgproc/src/imgwarp.cpp | 154 +++++++++++++++++++++++++++++++- 1 file changed, 150 insertions(+), 4 deletions(-) diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index a2f6070397..4880819b9e 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -4847,6 +4847,26 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, vst2q_s16(dst1 + (x << 1), v_dst); } + #elif CV_SSE4_1 + for( ; x <= size.width - 16; x += 16 ) + { + __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)), + _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4))); + __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)), + _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12))); + + __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)), + _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4))); + __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)), + _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12))); + + _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3); + + _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3); + } #endif for( ; x < size.width; x++ ) { @@ -4881,6 +4901,49 @@ void cv::convertMaps( 
InputArray _map1, InputArray _map2, vandq_s32(v_ix1, v_mask))); vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); } + #elif CV_SSE4_1 + __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE); + __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1); + + for( ; x <= size.width - 16; x += 16 ) + { + __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its)); + __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its)); + __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its)); + __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its)); + + __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), + _mm_srai_epi32(v_ix1, INTER_BITS)); + __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), + _mm_srai_epi32(v_iy1, INTER_BITS)); + __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), + _mm_and_si128(v_ix0, v_its1)); + __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), + _mm_and_si128(v_ix1, v_its1)); + _mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21)); + + v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its)); + v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its)); + v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its)); + v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its)); + + __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), + _mm_srai_epi32(v_ix1, INTER_BITS)); + __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), + _mm_srai_epi32(v_iy1, INTER_BITS)); + v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), + _mm_and_si128(v_ix0, v_its1)); + v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), + _mm_and_si128(v_ix1, v_its1)); + _mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21)); + + _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13); + + _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13); + } #endif for( ; x < size.width; x++ ) { @@ -4900,6 +4963,12 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, for( ; x <= (size.width << 1) - 8; x += 8 ) vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))), vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4))))); + #elif CV_SSE2 + for( ; x <= (size.width << 1) - 8; x += 8 ) + { + _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)), + _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)))); + } #endif for( ; x < size.width; x++ ) { @@ -4935,6 +5004,27 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, vandq_s32(v_ix1, v_mask))); vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); } + #elif CV_SSE2 + __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE); + __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1); + __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE-1) << 16); + + for( ; x <= size.width - 4; x += 4 ) + { + __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its)); + __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its)); + + __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS), + _mm_srai_epi32(v_src1, INTER_BITS)); + 
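The packing performed in the following statements mirrors the scalar fixed-point conversion used by convertMaps: each coordinate is scaled by INTER_TAB_SIZE, the integer part goes into the CV_16SC2 map and the fractional bits into the CV_16UC1 map. A per-pixel sketch of that step for the interleaved CV_32FC2 case, assuming the usual INTER_BITS/INTER_TAB_SIZE definitions from imgproc:

    int ix = saturate_cast<int>(src1f[x*2]   * INTER_TAB_SIZE);
    int iy = saturate_cast<int>(src1f[x*2+1] * INTER_TAB_SIZE);
    dst1[x*2]   = saturate_cast<short>(ix >> INTER_BITS);   // integer part of x
    dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);   // integer part of y
    dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
                       (ix & (INTER_TAB_SIZE-1)));          // packed fractional bits
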
_mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1); + + // x0 y0 x1 y1 . . . + v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1), + _mm_and_si128(v_src1, v_its1)); + __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . . + _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . . + _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2)); + } #endif for( ; x < size.width; x++ ) { @@ -4980,6 +5070,44 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, vst1q_f32(dst1f + x + 4, v_dst1); vst1q_f32(dst2f + x + 4, v_dst2); } + #elif CV_SSE2 + __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1); + __m128i v_zero = _mm_setzero_si128(), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1); + __m128 v_scale = _mm_set1_ps(scale); + + for( ; x <= size.width - 16; x += 16) + { + __m128i v_src10 = _mm_loadu_si128((__m128i const *)(src1 + x * 2)); + __m128i v_src11 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 8)); + __m128i v_src20 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 16)); + __m128i v_src21 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 24)); + + _mm_deinterleave_epi16(v_src10, v_src11, v_src20, v_src21); + + __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero; + __m128i v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero); + _mm_storeu_ps(dst1f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src10), 16)), + _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); + _mm_storeu_ps(dst2f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src20), 16)), + _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); + v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero); + _mm_storeu_ps(dst1f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src10), 16)), + _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); + _mm_storeu_ps(dst2f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src20), 16)), + _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); + + v_fxy = src2 ? 
_mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x + 8)), v_mask2) : v_zero; + v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero); + _mm_storeu_ps(dst1f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src11), 16)), + _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); + _mm_storeu_ps(dst2f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src21), 16)), + _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); + v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero); + _mm_storeu_ps(dst1f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src11), 16)), + _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); + _mm_storeu_ps(dst2f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src21), 16)), + _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); + } #endif for( ; x < size.width; x++ ) { @@ -5021,6 +5149,24 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy2, INTER_BITS))); vst2q_f32(dst1f + (x << 1) + 8, v_dst); } + #elif CV_SSE2 + __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1); + __m128i v_zero = _mm_set1_epi32(0), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1); + __m128 v_scale = _mm_set1_ps(scale); + + for ( ; x <= size.width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src1 + x * 2)); + __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero; + __m128i v_fxy1 = _mm_and_si128(v_fxy, v_mask); + __m128i v_fxy2 = _mm_srli_epi16(v_fxy, INTER_BITS); + + __m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_fxy1, v_fxy2)), v_scale); + _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)), v_add)); + + v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_fxy1, v_fxy2)), v_scale); + _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)), v_add)); + } #endif for( ; x < size.width; x++ ) { @@ -5096,7 +5242,7 @@ public: vst2q_s16(xy + (x1 << 1), v_dst); } - #elif CV_SSE2 + #elif CV_SSE4_1 __m128i v_X0 = _mm_set1_epi32(X0); __m128i v_Y0 = _mm_set1_epi32(Y0); for ( ; x1 <= bw - 16; x1 += 16) @@ -5525,7 +5671,7 @@ public: int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width); bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height); - #if CV_SSE2 + #if CV_SSE4_1 __m128d v_M0 = _mm_set1_pd(M[0]); __m128d v_M3 = _mm_set1_pd(M[3]); __m128d v_M6 = _mm_set1_pd(M[6]); @@ -5559,7 +5705,7 @@ public: { x1 = 0; - #if CV_SSE2 + #if CV_SSE4_1 __m128d v_X0d = _mm_set1_pd(X0); __m128d v_Y0d = _mm_set1_pd(Y0); __m128d v_W0 = _mm_set1_pd(W0); @@ -5684,7 +5830,7 @@ public: short* alpha = A + y1*bw; x1 = 0; - #if CV_SSE2 + #if CV_SSE4_1 __m128d v_X0d = _mm_set1_pd(X0); __m128d v_Y0d = _mm_set1_pd(Y0); __m128d v_W0 = _mm_set1_pd(W0); From 1d3c86041105d8f5971eb5ceb27eba0979dd0af1 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:31 +0300 Subject: [PATCH 49/53] SinCos_32f --- modules/core/src/mathfuncs.cpp | 68 +++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index 7b27dc3507..d7f9dc5379 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -824,14 +824,78 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval, /*static const double cos_a2 = 1;*/ double k1; - int 
i; + int i = 0; if( !angle_in_degrees ) k1 = N/(2*CV_PI); else k1 = N/360.; - for( i = 0; i < len; i++ ) +#if CV_AVX2 + __m128d v_i = _mm_set_pd(1, 0); + __m128d v_k1 = _mm_set1_pd(k1); + __m128d v_1 = _mm_set1_pd(1); + __m128i v_N1 = _mm_set1_epi32(N - 1); + __m128i v_N4 = _mm_set1_epi32(N >> 2); + __m128d v_sin_a0 = _mm_set1_pd(sin_a0); + __m128d v_sin_a2 = _mm_set1_pd(sin_a2); + __m128d v_cos_a0 = _mm_set1_pd(cos_a0); + + if (USE_AVX2) + { + for ( ; i <= len - 4; i += 4) + { + __m128 v_angle = _mm_loadu_ps(angle + i); + + // 0-1 + __m128d v_t = _mm_mul_pd(_mm_cvtps_pd(v_angle), v_k1); + __m128i v_it = _mm_cvtpd_epi32(v_t); + v_t = _mm_sub_pd(v_t, _mm_cvtepi32_pd(v_it)); + + __m128i v_sin_idx = _mm_and_si128(v_it, v_N1); + __m128i v_cos_idx = _mm_and_si128(_mm_sub_epi32(v_N4, v_sin_idx), v_N1); + + __m128d v_t2 = _mm_mul_pd(v_t, v_t); + __m128d v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t); + __m128d v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1); + + __m128d v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 1); + __m128d v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 1); + + __m128d v_sin_val_0 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b), + _mm_mul_pd(v_cos_a, v_sin_b)); + __m128d v_cos_val_0 = _mm_sub_pd(_mm_mul_pd(v_cos_a, v_cos_b), + _mm_mul_pd(v_sin_a, v_sin_b)); + + // 2-3 + v_t = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(v_angle), 8))), v_k1); + v_it = _mm_cvtpd_epi32(v_t); + v_t = _mm_sub_pd(v_t, _mm_cvtepi32_pd(v_it)); + + v_sin_idx = _mm_and_si128(v_it, v_N1); + v_cos_idx = _mm_and_si128(_mm_sub_epi32(v_N4, v_sin_idx), v_N1); + + v_t2 = _mm_mul_pd(v_t, v_t); + v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t); + v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1); + + v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 1); + v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 1); + + __m128d v_sin_val_1 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b), + _mm_mul_pd(v_cos_a, v_sin_b)); + __m128d v_cos_val_1 = _mm_sub_pd(_mm_mul_pd(v_cos_a, v_cos_b), + _mm_mul_pd(v_sin_a, v_sin_b)); + + _mm_storeu_ps(sinval + i, _mm_movelh_ps(_mm_cvtpd_ps(v_sin_val_0), + _mm_cvtpd_ps(v_sin_val_1))); + _mm_storeu_ps(cosval + i, _mm_movelh_ps(_mm_cvtpd_ps(v_cos_val_0), + _mm_cvtpd_ps(v_cos_val_1))); + } + } +#endif + + for( ; i < len; i++ ) { double t = angle[i]*k1; int it = cvRound(t); From 6bce6ee34a7bf23d19d2df37bc44a47072a8f6e0 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:31 +0300 Subject: [PATCH 50/53] checks --- modules/core/src/arithm.cpp | 89 ++++- modules/core/src/convert.cpp | 62 ++- modules/core/src/mathfuncs.cpp | 102 ++--- modules/core/src/stat.cpp | 6 +- modules/imgproc/src/color.cpp | 551 +++++++++++++++----------- modules/imgproc/src/imgwarp.cpp | 653 ++++++++++++++++--------------- modules/imgproc/src/pyramids.cpp | 10 +- 7 files changed, 841 insertions(+), 632 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 49a9cceaec..fb69cd201a 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -64,7 +64,7 @@ FUNCTOR_TEMPLATE(VLoadStore128); #if CV_SSE2 FUNCTOR_TEMPLATE(VLoadStore64); FUNCTOR_TEMPLATE(VLoadStore128Aligned); -#if CV_AVX +#if CV_AVX2 FUNCTOR_TEMPLATE(VLoadStore256); FUNCTOR_TEMPLATE(VLoadStore256Aligned); #endif @@ -2626,10 +2626,16 @@ struct Div_SIMD template <> struct Div_SIMD { + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + int operator() (const uchar * src1, const 
uchar * src2, uchar * dst, int width, double scale) const { int x = 0; + if (!haveSIMD) + return x; + __m128d v_scale = _mm_set1_pd(scale); __m128i v_zero = _mm_setzero_si128(); @@ -2672,10 +2678,16 @@ struct Div_SIMD template <> struct Div_SIMD { + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const { int x = 0; + if (!haveSIMD) + return x; + __m128d v_scale = _mm_set1_pd(scale); __m128i v_zero = _mm_setzero_si128(); @@ -2718,10 +2730,16 @@ struct Div_SIMD template <> struct Div_SIMD { + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const { int x = 0; + if (!haveSIMD) + return x; + __m128d v_scale = _mm_set1_pd(scale); __m128i v_zero = _mm_setzero_si128(); @@ -2763,10 +2781,16 @@ struct Div_SIMD template <> struct Div_SIMD { + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const { int x = 0; + if (!haveSIMD) + return x; + __m128d v_scale = _mm_set1_pd(scale); __m128i v_zero = _mm_setzero_si128(); @@ -2806,10 +2830,16 @@ struct Div_SIMD template <> struct Div_SIMD { + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const { int x = 0; + if (!haveSIMD) + return x; + __m128d v_scale = _mm_set1_pd(scale); __m128i v_zero = _mm_setzero_si128(); @@ -2902,10 +2932,16 @@ struct Recip_SIMD template <> struct Recip_SIMD { + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + int operator() (const uchar * src2, uchar * dst, int width, double scale) const { int x = 0; + if (!haveSIMD) + return x; + __m128d v_scale = _mm_set1_pd(scale); __m128i v_zero = _mm_setzero_si128(); @@ -2941,10 +2977,16 @@ struct Recip_SIMD template <> struct Recip_SIMD { + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + int operator() (const schar * src2, schar * dst, int width, double scale) const { int x = 0; + if (!haveSIMD) + return x; + __m128d v_scale = _mm_set1_pd(scale); __m128i v_zero = _mm_setzero_si128(); @@ -2980,10 +3022,16 @@ struct Recip_SIMD template <> struct Recip_SIMD { + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + int operator() (const ushort * src2, ushort * dst, int width, double scale) const { int x = 0; + if (!haveSIMD) + return x; + __m128d v_scale = _mm_set1_pd(scale); __m128i v_zero = _mm_setzero_si128(); @@ -3018,10 +3066,16 @@ struct Recip_SIMD template <> struct Recip_SIMD { + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + int operator() (const short * src2, short * dst, int width, double scale) const { int x = 0; + if (!haveSIMD) + return x; + __m128d v_scale = _mm_set1_pd(scale); __m128i v_zero = _mm_setzero_si128(); @@ -3054,10 +3108,16 @@ struct Recip_SIMD template <> struct Recip_SIMD { + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + int operator() (const int * src2, int * dst, int width, double scale) const { int x = 0; + if (!haveSIMD) + return x; + __m128d v_scale = _mm_set1_pd(scale); __m128i v_zero = _mm_setzero_si128(); @@ -4126,7 +4186,8 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, 
size_t ste { int x =0; #if CV_SSE2 - if( USE_SSE2 ){ + if( USE_SSE2 ) + { __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (-1); __m128i c128 = _mm_set1_epi8 (-128); for( ; x <= size.width - 16; x += 16 ) @@ -4142,7 +4203,7 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste } } - #elif CV_NEON + #elif CV_NEON uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255); for( ; x <= size.width - 16; x += 16 ) @@ -4164,7 +4225,8 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste { int x = 0; #if CV_SSE2 - if( USE_SSE2 ){ + if( USE_SSE2 ) + { __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8 (-1); for( ; x <= size.width - 16; x += 16 ) { @@ -4174,7 +4236,7 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste _mm_storeu_si128((__m128i*)(dst + x), r00); } } - #elif CV_NEON + #elif CV_NEON uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255); for( ; x <= size.width - 16; x += 16 ) @@ -4254,7 +4316,8 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st { int x =0; #if CV_SSE2 - if( USE_SSE2){// + if( USE_SSE2) + { __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (-1); for( ; x <= size.width - 16; x += 16 ) { @@ -4278,7 +4341,7 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st x += 8; } } - #elif CV_NEON + #elif CV_NEON uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255); for( ; x <= size.width - 16; x += 16 ) @@ -4293,8 +4356,7 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask)); } - - #endif + #endif for( ; x < size.width; x++ ){ dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); @@ -4308,7 +4370,8 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st { int x = 0; #if CV_SSE2 - if( USE_SSE2 ){ + if( USE_SSE2 ) + { __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi16 (-1); for( ; x <= size.width - 16; x += 16 ) { @@ -4332,7 +4395,7 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st x += 8; } } - #elif CV_NEON + #elif CV_NEON uint8x16_t mask = code == CMP_EQ ? 
vdupq_n_u8(0) : vdupq_n_u8(255); for( ; x <= size.width - 16; x += 16 ) @@ -4347,8 +4410,8 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask)); } - #endif - for( ; x < size.width; x++ ) + #endif + for( ; x < size.width; x++ ) dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); } } diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 626a666a95..090acf5508 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -158,7 +158,7 @@ struct VSplit2 \ VSplit2() \ { \ - support = true; \ + support = checkHardwareSupport(CV_CPU_SSE2); \ } \ \ void operator()(const data_type * src, \ @@ -191,7 +191,7 @@ struct VSplit3 \ VSplit3() \ { \ - support = true; \ + support = checkHardwareSupport(CV_CPU_SSE2); \ } \ \ void operator()(const data_type * src, \ @@ -229,7 +229,7 @@ struct VSplit4 \ VSplit4() \ { \ - support = true; \ + support = checkHardwareSupport(CV_CPU_SSE2); \ } \ \ void operator()(const data_type * src, data_type * dst0, data_type * dst1, \ @@ -502,7 +502,7 @@ struct VMerge4 bool support; }; -#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor) \ +#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ template <> \ struct VMerge2 \ { \ @@ -513,7 +513,7 @@ struct VMerge2 \ VMerge2() \ { \ - support = true; \ + support = checkHardwareSupport(se); \ } \ \ void operator()(const data_type * src0, const data_type * src1, \ @@ -535,7 +535,7 @@ struct VMerge2 bool support; \ } -#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor) \ +#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ template <> \ struct VMerge3 \ { \ @@ -546,7 +546,7 @@ struct VMerge3 \ VMerge3() \ { \ - support = true; \ + support = checkHardwareSupport(se); \ } \ \ void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\ @@ -573,7 +573,7 @@ struct VMerge3 bool support; \ } -#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor) \ +#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ template <> \ struct VMerge4 \ { \ @@ -584,7 +584,7 @@ struct VMerge4 \ VMerge4() \ { \ - support = true; \ + support = checkHardwareSupport(se); \ } \ \ void operator()(const data_type * src0, const data_type * src1, \ @@ -616,19 +616,19 @@ struct VMerge4 bool support; \ } -MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128); -MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128); -MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128); +MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); +MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); +MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); #if CV_SSE4_1 -MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128); -MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128); -MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128); +MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); +MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); +MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, 
_mm_interleave_epi16, si128, CV_CPU_SSE4_1); #endif -MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps); -MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps); -MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps); +MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); +MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); +MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); #endif @@ -4404,6 +4404,9 @@ struct Cvt_SIMD { int x = 0; + if (!USE_SSE2) + return x; + for ( ; x <= width - 8; x += 8) { __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); @@ -4430,6 +4433,9 @@ struct Cvt_SIMD { int x = 0; + if (!USE_SSE2) + return x; + for ( ; x <= width - 8; x += 8) { __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); @@ -4454,10 +4460,16 @@ struct Cvt_SIMD template <> struct Cvt_SIMD { + bool haveSIMD; + Cvt_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + int operator() (const double * src, ushort * dst, int width) const { int x = 0; + if (!haveSIMD) + return x; + for ( ; x <= width - 8; x += 8) { __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); @@ -4486,6 +4498,9 @@ struct Cvt_SIMD { int x = 0; + if (!USE_SSE2) + return x; + for ( ; x <= width - 8; x += 8) { __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); @@ -4512,6 +4527,9 @@ struct Cvt_SIMD { int x = 0; + if (!USE_SSE2) + return x; + for ( ; x <= width - 4; x += 4) { __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); @@ -4532,6 +4550,9 @@ struct Cvt_SIMD { int x = 0; + if (!USE_SSE2) + return x; + for ( ; x <= width - 4; x += 4) { __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); @@ -5114,8 +5135,9 @@ cvt_( const float* src, size_t sstep, { int x = 0; #if CV_SSE2 - if(USE_SSE2){ - for( ; x <= size.width - 8; x += 8 ) + if(USE_SSE2) + { + for( ; x <= size.width - 8; x += 8 ) { __m128 src128 = _mm_loadu_ps (src + x); __m128i src_int128 = _mm_cvtps_epi32 (src128); diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index d7f9dc5379..13ada1d1d6 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -597,15 +597,18 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre k = 0; #if CV_SSE2 - for ( ; k <= len - 4; k += 4) + if (USE_SSE2) { - __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)), - _mm_cvtpd_ps(_mm_loadu_pd(x + k + 2))); - __m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)), - _mm_cvtpd_ps(_mm_loadu_pd(y + k + 2))); + for ( ; k <= len - 4; k += 4) + { + __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)), + _mm_cvtpd_ps(_mm_loadu_pd(x + k + 2))); + __m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)), + _mm_cvtpd_ps(_mm_loadu_pd(y + k + 2))); - _mm_storeu_ps(buf[0] + k, v_dst0); - _mm_storeu_ps(buf[1] + k, v_dst1); + _mm_storeu_ps(buf[0] + k, v_dst0); + _mm_storeu_ps(buf[1] + k, v_dst1); + } } #endif @@ -619,11 +622,14 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre k = 0; #if CV_SSE2 - for ( ; k <= len - 4; k += 4) + if (USE_SSE2) { - __m128 v_src = _mm_loadu_ps(buf[0] + k); - _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src)); - _mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)))); + for ( ; k <= len - 4; k += 4) + { + __m128 v_src = _mm_loadu_ps(buf[0] + k); + _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src)); + _mm_storeu_pd(angle + k + 2, 
_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)))); + } } #endif @@ -728,15 +734,18 @@ void cartToPolar( InputArray src1, InputArray src2, k = 0; #if CV_SSE2 - for ( ; k <= len - 4; k += 4) + if (USE_SSE2) { - __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)), - _mm_cvtpd_ps(_mm_loadu_pd(x + k + 2))); - __m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)), - _mm_cvtpd_ps(_mm_loadu_pd(y + k + 2))); + for ( ; k <= len - 4; k += 4) + { + __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)), + _mm_cvtpd_ps(_mm_loadu_pd(x + k + 2))); + __m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)), + _mm_cvtpd_ps(_mm_loadu_pd(y + k + 2))); - _mm_storeu_ps(buf[0] + k, v_dst0); - _mm_storeu_ps(buf[1] + k, v_dst1); + _mm_storeu_ps(buf[0] + k, v_dst0); + _mm_storeu_ps(buf[1] + k, v_dst1); + } } #endif @@ -750,11 +759,14 @@ void cartToPolar( InputArray src1, InputArray src2, k = 0; #if CV_SSE2 - for ( ; k <= len - 4; k += 4) + if (USE_SSE2) { - __m128 v_src = _mm_loadu_ps(buf[0] + k); - _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src)); - _mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)))); + for ( ; k <= len - 4; k += 4) + { + __m128 v_src = _mm_loadu_ps(buf[0] + k); + _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src)); + _mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)))); + } } #endif @@ -832,17 +844,16 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval, k1 = N/360.; #if CV_AVX2 - __m128d v_i = _mm_set_pd(1, 0); - __m128d v_k1 = _mm_set1_pd(k1); - __m128d v_1 = _mm_set1_pd(1); - __m128i v_N1 = _mm_set1_epi32(N - 1); - __m128i v_N4 = _mm_set1_epi32(N >> 2); - __m128d v_sin_a0 = _mm_set1_pd(sin_a0); - __m128d v_sin_a2 = _mm_set1_pd(sin_a2); - __m128d v_cos_a0 = _mm_set1_pd(cos_a0); - if (USE_AVX2) { + __m128d v_k1 = _mm_set1_pd(k1); + __m128d v_1 = _mm_set1_pd(1); + __m128i v_N1 = _mm_set1_epi32(N - 1); + __m128i v_N4 = _mm_set1_epi32(N >> 2); + __m128d v_sin_a0 = _mm_set1_pd(sin_a0); + __m128d v_sin_a2 = _mm_set1_pd(sin_a2); + __m128d v_cos_a0 = _mm_set1_pd(cos_a0); + for ( ; i <= len - 4; i += 4) { __m128 v_angle = _mm_loadu_ps(angle + i); @@ -859,8 +870,8 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval, __m128d v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t); __m128d v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1); - __m128d v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 1); - __m128d v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 1); + __m128d v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 8); + __m128d v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 8); __m128d v_sin_val_0 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b), _mm_mul_pd(v_cos_a, v_sin_b)); @@ -868,7 +879,7 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval, _mm_mul_pd(v_sin_a, v_sin_b)); // 2-3 - v_t = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(v_angle), 8))), v_k1); + v_t = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_angle), 8))), v_k1); v_it = _mm_cvtpd_epi32(v_t); v_t = _mm_sub_pd(v_t, _mm_cvtepi32_pd(v_it)); @@ -879,8 +890,8 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval, v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t); v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1); - v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 1); - v_cos_a = 
_mm_i32gather_pd(sin_table, v_cos_idx, 1); + v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 8); + v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 8); __m128d v_sin_val_1 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b), _mm_mul_pd(v_cos_a, v_sin_b)); @@ -1032,11 +1043,14 @@ void polarToCart( InputArray src1, InputArray src2, vst1q_f32(y + k, vmulq_f32(vld1q_f32(y + k), v_m)); } #elif CV_SSE2 - for( ; k <= len - 4; k += 4 ) + if (USE_SSE2) { - __m128 v_m = _mm_loadu_ps(mag + k); - _mm_storeu_ps(x + k, _mm_mul_ps(_mm_loadu_ps(x + k), v_m)); - _mm_storeu_ps(y + k, _mm_mul_ps(_mm_loadu_ps(y + k), v_m)); + for( ; k <= len - 4; k += 4 ) + { + __m128 v_m = _mm_loadu_ps(mag + k); + _mm_storeu_ps(x + k, _mm_mul_ps(_mm_loadu_ps(x + k), v_m)); + _mm_storeu_ps(y + k, _mm_mul_ps(_mm_loadu_ps(y + k), v_m)); + } } #endif @@ -1063,10 +1077,10 @@ void polarToCart( InputArray src1, InputArray src2, x[k] = buf[0][k]*m; y[k] = buf[1][k]*m; } else - for( k = 0; k < len; k++ ) - { - x[k] = buf[0][k]; y[k] = buf[1][k]; - } + { + std::memcpy(x, buf[0], sizeof(float) * len); + std::memcpy(y, buf[1], sizeof(float) * len); + } } if( ptrs[0] ) diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 4eb17d6a14..1fcb9b54d1 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -397,6 +397,8 @@ static int countNonZero_(const T* src, int len ) return nz; } +#if CV_SSE2 + static const uchar * initPopcountTable() { static uchar tab[256]; @@ -425,6 +427,8 @@ static const uchar * initPopcountTable() return tab; } +#endif + static int countNonZero8u( const uchar* src, int len ) { int i=0, nz = 0; @@ -645,7 +649,7 @@ static int countNonZero32f( const float* src, int len ) } static int countNonZero64f( const double* src, int len ) -{ +{ int i = 0, nz = 0; #if CV_SSE2 if (USE_SSE2) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 4efbcc5f8b..5ae1170b43 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -967,6 +967,7 @@ struct Gray2RGB5x5 v_n7 = vdup_n_u8(~7); v_n3 = vdup_n_u8(~3); #elif CV_SSE2 + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); v_n7 = _mm_set1_epi16(~7); v_n3 = _mm_set1_epi16(~3); v_zero = _mm_setzero_si128(); @@ -988,21 +989,24 @@ struct Gray2RGB5x5 vst1q_u16((ushort *)dst + i, v_dst); } #elif CV_SSE2 - for ( ; i <= n - 16; i += 16 ) + if (haveSIMD) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i)); + for ( ; i <= n - 16; i += 16 ) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i)); - __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero); - __m128i v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3), - _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3), - _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8))); - _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst); + __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero); + __m128i v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3), + _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3), + _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8))); + _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst); - v_src_p = _mm_unpackhi_epi8(v_src, v_zero); - v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3), - _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3), - _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8))); - _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst); + v_src_p = _mm_unpackhi_epi8(v_src, v_zero); + v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3), + _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3), + 
_mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8))); + _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst); + } } #endif for ( ; i < n; i++ ) @@ -1021,21 +1025,24 @@ struct Gray2RGB5x5 vst1q_u16((ushort *)dst + i, v_dst); } #elif CV_SSE2 - for ( ; i <= n - 16; i += 8 ) + if (haveSIMD) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i)); + for ( ; i <= n - 16; i += 8 ) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i)); - __m128i v_src_p = _mm_srli_epi16(_mm_unpacklo_epi8(v_src, v_zero), 3); - __m128i v_dst = _mm_or_si128(v_src_p, - _mm_or_si128(_mm_slli_epi32(v_src_p, 5), - _mm_slli_epi16(v_src_p, 10))); - _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst); + __m128i v_src_p = _mm_srli_epi16(_mm_unpacklo_epi8(v_src, v_zero), 3); + __m128i v_dst = _mm_or_si128(v_src_p, + _mm_or_si128(_mm_slli_epi32(v_src_p, 5), + _mm_slli_epi16(v_src_p, 10))); + _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst); - v_src_p = _mm_srli_epi16(_mm_unpackhi_epi8(v_src, v_zero), 3); - v_dst = _mm_or_si128(v_src_p, - _mm_or_si128(_mm_slli_epi16(v_src_p, 5), - _mm_slli_epi16(v_src_p, 10))); - _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst); + v_src_p = _mm_srli_epi16(_mm_unpackhi_epi8(v_src, v_zero), 3); + v_dst = _mm_or_si128(v_src_p, + _mm_or_si128(_mm_slli_epi16(v_src_p, 5), + _mm_slli_epi16(v_src_p, 10))); + _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst); + } } #endif for( ; i < n; i++ ) @@ -1051,6 +1058,7 @@ struct Gray2RGB5x5 uint8x8_t v_n7, v_n3; #elif CV_SSE2 __m128i v_n7, v_n3, v_zero; + bool haveSIMD; #endif }; @@ -1084,6 +1092,7 @@ struct RGB5x52Gray v_f8 = vdupq_n_u16(0xf8); v_fc = vdupq_n_u16(0xfc); #elif CV_SSE2 + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); v_b2y = _mm_set1_epi16(B2Y); v_g2y = _mm_set1_epi16(G2Y); v_r2y = _mm_set1_epi16(R2Y); @@ -1116,37 +1125,40 @@ struct RGB5x52Gray vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)))); } #elif CV_SSE2 - __m128i v_zero = _mm_setzero_si128(); - - for ( ; i <= n - 8; i += 8) + if (haveSIMD) { - __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i)); - __m128i v_t0 = _mm_and_si128(_mm_slli_epi16(v_src, 3), v_f8), - v_t1 = _mm_and_si128(_mm_srli_epi16(v_src, 3), v_fc), - v_t2 = _mm_and_si128(_mm_srli_epi16(v_src, 8), v_f8); + __m128i v_zero = _mm_setzero_si128(); - __m128i v_mullo_b = _mm_mullo_epi16(v_t0, v_b2y); - __m128i v_mullo_g = _mm_mullo_epi16(v_t1, v_g2y); - __m128i v_mullo_r = _mm_mullo_epi16(v_t2, v_r2y); - __m128i v_mulhi_b = _mm_mulhi_epi16(v_t0, v_b2y); - __m128i v_mulhi_g = _mm_mulhi_epi16(v_t1, v_g2y); - __m128i v_mulhi_r = _mm_mulhi_epi16(v_t2, v_r2y); + for ( ; i <= n - 8; i += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i)); + __m128i v_t0 = _mm_and_si128(_mm_slli_epi16(v_src, 3), v_f8), + v_t1 = _mm_and_si128(_mm_srli_epi16(v_src, 3), v_fc), + v_t2 = _mm_and_si128(_mm_srli_epi16(v_src, 8), v_f8); - __m128i v_dst0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b), - _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g)); - v_dst0 = _mm_add_epi32(_mm_add_epi32(v_dst0, v_delta), - _mm_unpacklo_epi16(v_mullo_r, v_mulhi_r)); + __m128i v_mullo_b = _mm_mullo_epi16(v_t0, v_b2y); + __m128i v_mullo_g = _mm_mullo_epi16(v_t1, v_g2y); + __m128i v_mullo_r = _mm_mullo_epi16(v_t2, v_r2y); + __m128i v_mulhi_b = _mm_mulhi_epi16(v_t0, v_b2y); + __m128i v_mulhi_g = _mm_mulhi_epi16(v_t1, v_g2y); + __m128i v_mulhi_r = _mm_mulhi_epi16(v_t2, v_r2y); - __m128i v_dst1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b), 
- _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g)); - v_dst1 = _mm_add_epi32(_mm_add_epi32(v_dst1, v_delta), - _mm_unpackhi_epi16(v_mullo_r, v_mulhi_r)); + __m128i v_dst0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b), + _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g)); + v_dst0 = _mm_add_epi32(_mm_add_epi32(v_dst0, v_delta), + _mm_unpacklo_epi16(v_mullo_r, v_mulhi_r)); - v_dst0 = _mm_srli_epi32(v_dst0, yuv_shift); - v_dst1 = _mm_srli_epi32(v_dst1, yuv_shift); + __m128i v_dst1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b), + _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g)); + v_dst1 = _mm_add_epi32(_mm_add_epi32(v_dst1, v_delta), + _mm_unpackhi_epi16(v_mullo_r, v_mulhi_r)); - __m128i v_dst = _mm_packs_epi32(v_dst0, v_dst1); - _mm_storel_epi64((__m128i *)(dst + i), _mm_packus_epi16(v_dst, v_zero)); + v_dst0 = _mm_srli_epi32(v_dst0, yuv_shift); + v_dst1 = _mm_srli_epi32(v_dst1, yuv_shift); + + __m128i v_dst = _mm_packs_epi32(v_dst0, v_dst1); + _mm_storel_epi64((__m128i *)(dst + i), _mm_packus_epi16(v_dst, v_zero)); + } } #endif for ( ; i < n; i++) @@ -1177,37 +1189,40 @@ struct RGB5x52Gray vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)))); } #elif CV_SSE2 - __m128i v_zero = _mm_setzero_si128(); - - for ( ; i <= n - 8; i += 8) + if (haveSIMD) { - __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i)); - __m128i v_t0 = _mm_and_si128(_mm_slli_epi16(v_src, 3), v_f8), - v_t1 = _mm_and_si128(_mm_srli_epi16(v_src, 2), v_f8), - v_t2 = _mm_and_si128(_mm_srli_epi16(v_src, 7), v_f8); + __m128i v_zero = _mm_setzero_si128(); - __m128i v_mullo_b = _mm_mullo_epi16(v_t0, v_b2y); - __m128i v_mullo_g = _mm_mullo_epi16(v_t1, v_g2y); - __m128i v_mullo_r = _mm_mullo_epi16(v_t2, v_r2y); - __m128i v_mulhi_b = _mm_mulhi_epi16(v_t0, v_b2y); - __m128i v_mulhi_g = _mm_mulhi_epi16(v_t1, v_g2y); - __m128i v_mulhi_r = _mm_mulhi_epi16(v_t2, v_r2y); + for ( ; i <= n - 8; i += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i)); + __m128i v_t0 = _mm_and_si128(_mm_slli_epi16(v_src, 3), v_f8), + v_t1 = _mm_and_si128(_mm_srli_epi16(v_src, 2), v_f8), + v_t2 = _mm_and_si128(_mm_srli_epi16(v_src, 7), v_f8); - __m128i v_dst0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b), - _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g)); - v_dst0 = _mm_add_epi32(_mm_add_epi32(v_dst0, v_delta), - _mm_unpacklo_epi16(v_mullo_r, v_mulhi_r)); + __m128i v_mullo_b = _mm_mullo_epi16(v_t0, v_b2y); + __m128i v_mullo_g = _mm_mullo_epi16(v_t1, v_g2y); + __m128i v_mullo_r = _mm_mullo_epi16(v_t2, v_r2y); + __m128i v_mulhi_b = _mm_mulhi_epi16(v_t0, v_b2y); + __m128i v_mulhi_g = _mm_mulhi_epi16(v_t1, v_g2y); + __m128i v_mulhi_r = _mm_mulhi_epi16(v_t2, v_r2y); - __m128i v_dst1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b), - _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g)); - v_dst1 = _mm_add_epi32(_mm_add_epi32(v_dst1, v_delta), - _mm_unpackhi_epi16(v_mullo_r, v_mulhi_r)); + __m128i v_dst0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b), + _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g)); + v_dst0 = _mm_add_epi32(_mm_add_epi32(v_dst0, v_delta), + _mm_unpacklo_epi16(v_mullo_r, v_mulhi_r)); - v_dst0 = _mm_srli_epi32(v_dst0, yuv_shift); - v_dst1 = _mm_srli_epi32(v_dst1, yuv_shift); + __m128i v_dst1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b), + _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g)); + v_dst1 = _mm_add_epi32(_mm_add_epi32(v_dst1, v_delta), + _mm_unpackhi_epi16(v_mullo_r, v_mulhi_r)); - __m128i v_dst = _mm_packs_epi32(v_dst0, v_dst1); - _mm_storel_epi64((__m128i *)(dst 
+ i), _mm_packus_epi16(v_dst, v_zero)); + v_dst0 = _mm_srli_epi32(v_dst0, yuv_shift); + v_dst1 = _mm_srli_epi32(v_dst1, yuv_shift); + + __m128i v_dst = _mm_packs_epi32(v_dst0, v_dst1); + _mm_storel_epi64((__m128i *)(dst + i), _mm_packus_epi16(v_dst, v_zero)); + } } #endif for ( ; i < n; i++) @@ -1226,6 +1241,7 @@ struct RGB5x52Gray uint32x4_t v_delta; uint16x8_t v_f8, v_fc; #elif CV_SSE2 + bool haveSIMD; __m128i v_b2y, v_g2y, v_r2y; __m128i v_delta; __m128i v_f8, v_fc; @@ -1445,7 +1461,9 @@ struct RGB2Gray float32x4_t v_cb, v_cg, v_cr; }; -#elif CV_SSE4_1 +#elif CV_SSE2 + +#if CV_SSE4_1 template <> struct RGB2Gray @@ -1464,6 +1482,8 @@ struct RGB2Gray v_cg = _mm_set1_epi16((short)coeffs[1]); v_cr = _mm_set1_epi16((short)coeffs[2]); v_delta = _mm_set1_epi32(1 << (yuv_shift - 1)); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } // 16s x 8 @@ -1494,7 +1514,7 @@ struct RGB2Gray { int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0; - if (scn == 3) + if (scn == 3 && haveSIMD) { for ( ; i <= n - 16; i += 16, src += scn * 16) { @@ -1519,7 +1539,7 @@ struct RGB2Gray _mm_storeu_si128((__m128i *)(dst + i + 8), v_gray1); } } - else if (scn == 4) + else if (scn == 4 && haveSIMD) { for ( ; i <= n - 16; i += 16, src += scn * 16) { @@ -1554,8 +1574,11 @@ struct RGB2Gray int srccn, coeffs[3]; __m128i v_cb, v_cg, v_cr; __m128i v_delta; + bool haveSIMD; }; +#endif // CV_SSE4_1 + template <> struct RGB2Gray { @@ -1571,6 +1594,8 @@ struct RGB2Gray v_cb = _mm_set1_ps(coeffs[0]); v_cg = _mm_set1_ps(coeffs[1]); v_cr = _mm_set1_ps(coeffs[2]); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } void process(__m128 v_r, __m128 v_g, __m128 v_b, @@ -1586,7 +1611,7 @@ struct RGB2Gray int scn = srccn, i = 0; float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; - if (scn == 3) + if (scn == 3 && haveSIMD) { for ( ; i <= n - 8; i += 8, src += scn * 8) { @@ -1611,7 +1636,7 @@ struct RGB2Gray _mm_storeu_ps(dst + i + 4, v_gray1); } } - else if (scn == 4) + else if (scn == 4 && haveSIMD) { for ( ; i <= n - 8; i += 8, src += scn * 8) { @@ -1646,6 +1671,7 @@ struct RGB2Gray int srccn; float coeffs[3]; __m128 v_cb, v_cg, v_cr; + bool haveSIMD; }; #else @@ -1791,6 +1817,8 @@ struct RGB2YCrCb_f v_c3 = _mm_set1_ps(coeffs[3]); v_c4 = _mm_set1_ps(coeffs[4]); v_delta = _mm_set1_ps(ColorChannel::half()); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } void process(__m128 v_r, __m128 v_g, __m128 v_b, @@ -1811,7 +1839,7 @@ struct RGB2YCrCb_f float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; n *= 3; - if (scn == 3 || scn == 4) + if (haveSIMD) { for ( ; i <= n - 24; i += 24, src += 8 * scn) { @@ -1862,6 +1890,7 @@ struct RGB2YCrCb_f int srccn, blueIdx; float coeffs[5]; __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_delta; + bool haveSIMD; }; #endif @@ -2138,6 +2167,8 @@ struct RGB2YCrCb_i v_delta = _mm_set1_epi32(ColorChannel::half()*(1 << yuv_shift)); v_delta = _mm_add_epi32(v_delta, v_delta2); v_zero = _mm_setzero_si128(); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } // 16u x 8 @@ -2184,7 +2215,7 @@ struct RGB2YCrCb_i int delta = ColorChannel::half()*(1 << yuv_shift); n *= 3; - if (scn == 3 || scn == 4) + if (haveSIMD) { for ( ; i <= n - 96; i += 96, src += scn * 32) { @@ -2261,6 +2292,7 @@ struct RGB2YCrCb_i __m128i v_c0, v_c1, v_c2; __m128i v_c3, v_c4, v_delta, v_delta2; __m128i v_zero; + bool haveSIMD; }; template <> @@ -2285,6 +2317,8 @@ struct RGB2YCrCb_i v_delta = _mm_set1_epi32(ColorChannel::half()*(1 << yuv_shift)); v_delta = _mm_add_epi32(v_delta, v_delta2); 
v_zero = _mm_setzero_si128(); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } // 16u x 8 @@ -2331,7 +2365,7 @@ struct RGB2YCrCb_i int delta = ColorChannel::half()*(1 << yuv_shift); n *= 3; - if (scn == 3 || scn == 4) + if (haveSIMD) { for ( ; i <= n - 48; i += 48, src += scn * 16) { @@ -2387,6 +2421,7 @@ struct RGB2YCrCb_i __m128i v_c0, v_c1, v_c2; __m128i v_c3, v_c4, v_delta, v_delta2; __m128i v_zero; + bool haveSIMD; }; #endif // CV_SSE4_1 @@ -2518,6 +2553,8 @@ struct YCrCb2RGB_f v_c3 = _mm_set1_ps(coeffs[3]); v_delta = _mm_set1_ps(ColorChannel::half()); v_alpha = _mm_set1_ps(ColorChannel::max()); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } void process(__m128 v_y, __m128 v_cr, __m128 v_cb, @@ -2545,7 +2582,7 @@ struct YCrCb2RGB_f float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; n *= 3; - if (dcn == 3 || dcn == 4) + if (haveSIMD) { for ( ; i <= n - 24; i += 24, dst += 8 * dcn) { @@ -2606,6 +2643,7 @@ struct YCrCb2RGB_f float coeffs[4]; __m128 v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta; + bool haveSIMD; }; #endif @@ -2920,6 +2958,7 @@ struct YCrCb2RGB_i v_alpha = _mm_set1_epi8(*(char *)&alpha); useSSE = coeffs[0] <= std::numeric_limits::max(); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } // 16s x 8 @@ -2975,7 +3014,7 @@ struct YCrCb2RGB_i int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; n *= 3; - if ((dcn == 3 || dcn == 4) && useSSE) + if (haveSIMD && useSSE) { for ( ; i <= n - 96; i += 96, dst += dcn * 32) { @@ -3066,7 +3105,7 @@ struct YCrCb2RGB_i } int dstcn, blueIdx; int coeffs[4]; - bool useSSE; + bool useSSE, haveSIMD; __m128i v_c0, v_c1, v_c2, v_c3, v_delta2; __m128i v_delta, v_alpha, v_zero; @@ -3221,6 +3260,8 @@ struct RGB2XYZ_f v_c6 = _mm_set1_ps(coeffs[6]); v_c7 = _mm_set1_ps(coeffs[7]); v_c8 = _mm_set1_ps(coeffs[8]); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } void process(__m128 v_r, __m128 v_g, __m128 v_b, @@ -3248,7 +3289,7 @@ struct RGB2XYZ_f n *= 3; - if (scn == 3 || scn == 4) + if (haveSIMD) { for ( ; i <= n - 24; i += 24, src += 8 * scn) { @@ -3301,6 +3342,7 @@ struct RGB2XYZ_f int srccn; float coeffs[9]; __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; + bool haveSIMD; }; @@ -3657,6 +3699,8 @@ struct XYZ2RGB_f v_c8 = _mm_set1_ps(coeffs[8]); v_alpha = _mm_set1_ps(ColorChannel::max()); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } void process(__m128 v_x, __m128 v_y, __m128 v_z, @@ -3685,7 +3729,7 @@ struct XYZ2RGB_f n *= 3; int i = 0; - if (dcn == 3 || dcn == 4) + if (haveSIMD) { for ( ; i <= n - 24; i += 24, dst += 8 * dcn) { @@ -3745,6 +3789,7 @@ struct XYZ2RGB_f __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; __m128 v_alpha; + bool haveSIMD; }; #endif // CV_SSE2 @@ -4267,6 +4312,7 @@ struct HSV2RGB_b v_scale_inv = _mm_set1_ps(1.f/255.f); v_scale = _mm_set1_ps(255.0f); v_zero = _mm_setzero_si128(); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif } @@ -4331,36 +4377,39 @@ struct HSV2RGB_b vst3q_f32(buf + j + 12, v_dst); } #elif CV_SSE2 - for ( ; j <= (dn - 32) * 3; j += 96) + if (haveSIMD) { - __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); - __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); - __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); - __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); - __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); - __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_r0 = _mm_loadu_si128((__m128i 
const *)(src + j)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); - _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); - process(_mm_unpacklo_epi8(v_r0, v_zero), - _mm_unpacklo_epi8(v_g0, v_zero), - _mm_unpacklo_epi8(v_b0, v_zero), - buf + j); + process(_mm_unpacklo_epi8(v_r0, v_zero), + _mm_unpacklo_epi8(v_g0, v_zero), + _mm_unpacklo_epi8(v_b0, v_zero), + buf + j); - process(_mm_unpackhi_epi8(v_r0, v_zero), - _mm_unpackhi_epi8(v_g0, v_zero), - _mm_unpackhi_epi8(v_b0, v_zero), - buf + j + 24); + process(_mm_unpackhi_epi8(v_r0, v_zero), + _mm_unpackhi_epi8(v_g0, v_zero), + _mm_unpackhi_epi8(v_b0, v_zero), + buf + j + 24); - process(_mm_unpacklo_epi8(v_r1, v_zero), - _mm_unpacklo_epi8(v_g1, v_zero), - _mm_unpacklo_epi8(v_b1, v_zero), - buf + j + 48); + process(_mm_unpacklo_epi8(v_r1, v_zero), + _mm_unpacklo_epi8(v_g1, v_zero), + _mm_unpacklo_epi8(v_b1, v_zero), + buf + j + 48); - process(_mm_unpackhi_epi8(v_r1, v_zero), - _mm_unpackhi_epi8(v_g1, v_zero), - _mm_unpackhi_epi8(v_b1, v_zero), - buf + j + 72); + process(_mm_unpackhi_epi8(v_r1, v_zero), + _mm_unpackhi_epi8(v_g1, v_zero), + _mm_unpackhi_epi8(v_b1, v_zero), + buf + j + 72); + } } #endif @@ -4403,7 +4452,7 @@ struct HSV2RGB_b } } #elif CV_SSE2 - if (dcn == 3) + if (dcn == 3 && haveSIMD) { for ( ; j <= (dn * 3 - 16); j += 16, dst += 16) { @@ -4445,6 +4494,7 @@ struct HSV2RGB_b #elif CV_SSE2 __m128 v_scale_inv, v_scale; __m128i v_zero; + bool haveSIMD; #endif }; @@ -4520,6 +4570,7 @@ struct RGB2HLS_b v_scale_inv = _mm_set1_ps(1.f/255.f); v_scale = _mm_set1_ps(255.f); v_zero = _mm_setzero_si128(); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif } @@ -4589,7 +4640,7 @@ struct RGB2HLS_b vst3q_f32(buf + j + 12, v_dst); } #elif CV_SSE2 - if (scn == 3) + if (scn == 3 && haveSIMD) { for ( ; j <= (dn * 3 - 16); j += 16, src += 16) { @@ -4633,38 +4684,41 @@ struct RGB2HLS_b vst3_u8(dst + j, v_dst); } #elif CV_SSE2 - for ( ; j <= (dn - 32) * 3; j += 96) + if (haveSIMD) { - __m128i v_h_0, v_l_0, v_s_0; - process(buf + j, - v_h_0, v_l_0, v_s_0); + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_h_0, v_l_0, v_s_0; + process(buf + j, + v_h_0, v_l_0, v_s_0); - __m128i v_h_1, v_l_1, v_s_1; - process(buf + j + 24, - v_h_1, v_l_1, v_s_1); + __m128i v_h_1, v_l_1, v_s_1; + process(buf + j + 24, + v_h_1, v_l_1, v_s_1); - __m128i v_h0 = _mm_packus_epi16(v_h_0, v_h_1); - __m128i v_l0 = _mm_packus_epi16(v_l_0, v_l_1); - __m128i v_s0 = _mm_packus_epi16(v_s_0, v_s_1); + __m128i v_h0 = _mm_packus_epi16(v_h_0, v_h_1); + __m128i v_l0 = _mm_packus_epi16(v_l_0, v_l_1); + __m128i v_s0 = _mm_packus_epi16(v_s_0, v_s_1); - process(buf + j + 48, - v_h_0, v_l_0, v_s_0); + process(buf + j + 48, + v_h_0, v_l_0, v_s_0); - process(buf + j + 72, - v_h_1, v_l_1, v_s_1); + process(buf + j + 72, + v_h_1, v_l_1, v_s_1); - __m128i v_h1 = _mm_packus_epi16(v_h_0, v_h_1); - __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1); - __m128i v_s1 = _mm_packus_epi16(v_s_0, v_s_1); + __m128i v_h1 = _mm_packus_epi16(v_h_0, v_h_1); + __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1); + __m128i v_s1 = _mm_packus_epi16(v_s_0, v_s_1); - _mm_interleave_epi8(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1); + _mm_interleave_epi8(v_h0, v_h1, v_l0, v_l1, v_s0, 
v_s1); - _mm_storeu_si128((__m128i *)(dst + j), v_h0); - _mm_storeu_si128((__m128i *)(dst + j + 16), v_h1); - _mm_storeu_si128((__m128i *)(dst + j + 32), v_l0); - _mm_storeu_si128((__m128i *)(dst + j + 48), v_l1); - _mm_storeu_si128((__m128i *)(dst + j + 64), v_s0); - _mm_storeu_si128((__m128i *)(dst + j + 80), v_s1); + _mm_storeu_si128((__m128i *)(dst + j), v_h0); + _mm_storeu_si128((__m128i *)(dst + j + 16), v_h1); + _mm_storeu_si128((__m128i *)(dst + j + 32), v_l0); + _mm_storeu_si128((__m128i *)(dst + j + 48), v_l1); + _mm_storeu_si128((__m128i *)(dst + j + 64), v_s0); + _mm_storeu_si128((__m128i *)(dst + j + 80), v_s1); + } } #endif for( ; j < dn*3; j += 3 ) @@ -4684,6 +4738,7 @@ struct RGB2HLS_b #elif CV_SSE2 __m128 v_scale, v_scale_inv; __m128i v_zero; + bool haveSIMD; #endif }; @@ -4767,6 +4822,7 @@ struct HLS2RGB_b v_scale_inv = _mm_set1_ps(1.f/255.f); v_scale = _mm_set1_ps(255.f); v_zero = _mm_setzero_si128(); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif } @@ -4831,36 +4887,39 @@ struct HLS2RGB_b vst3q_f32(buf + j + 12, v_dst); } #elif CV_SSE2 - for ( ; j <= (dn - 32) * 3; j += 96) + if (haveSIMD) { - __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); - __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); - __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); - __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); - __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); - __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); - _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); - process(_mm_unpacklo_epi8(v_r0, v_zero), - _mm_unpacklo_epi8(v_g0, v_zero), - _mm_unpacklo_epi8(v_b0, v_zero), - buf + j); + process(_mm_unpacklo_epi8(v_r0, v_zero), + _mm_unpacklo_epi8(v_g0, v_zero), + _mm_unpacklo_epi8(v_b0, v_zero), + buf + j); - process(_mm_unpackhi_epi8(v_r0, v_zero), - _mm_unpackhi_epi8(v_g0, v_zero), - _mm_unpackhi_epi8(v_b0, v_zero), - buf + j + 24); + process(_mm_unpackhi_epi8(v_r0, v_zero), + _mm_unpackhi_epi8(v_g0, v_zero), + _mm_unpackhi_epi8(v_b0, v_zero), + buf + j + 24); - process(_mm_unpacklo_epi8(v_r1, v_zero), - _mm_unpacklo_epi8(v_g1, v_zero), - _mm_unpacklo_epi8(v_b1, v_zero), - buf + j + 48); + process(_mm_unpacklo_epi8(v_r1, v_zero), + _mm_unpacklo_epi8(v_g1, v_zero), + _mm_unpacklo_epi8(v_b1, v_zero), + buf + j + 48); - process(_mm_unpackhi_epi8(v_r1, v_zero), - _mm_unpackhi_epi8(v_g1, v_zero), - _mm_unpackhi_epi8(v_b1, v_zero), - buf + j + 72); + process(_mm_unpackhi_epi8(v_r1, v_zero), + _mm_unpackhi_epi8(v_g1, v_zero), + _mm_unpackhi_epi8(v_b1, v_zero), + buf + j + 72); + } } #endif for( ; j < dn*3; j += 3 ) @@ -4902,7 +4961,7 @@ struct HLS2RGB_b } } #elif CV_SSE2 - if (dcn == 3) + if (dcn == 3 && haveSIMD) { for ( ; j <= (dn * 3 - 16); j += 16, dst += 16) { @@ -4944,6 +5003,7 @@ struct HLS2RGB_b #elif CV_SSE2 __m128 v_scale, v_scale_inv; __m128i v_zero; + bool haveSIMD; #endif }; @@ -5264,6 +5324,7 @@ struct Lab2RGB_b v_scale = _mm_set1_ps(255.f); v_128 = _mm_set1_ps(128.0f); v_zero = 
_mm_setzero_si128(); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif } @@ -5330,36 +5391,39 @@ struct Lab2RGB_b vst3q_f32(buf + j + 12, v_dst); } #elif CV_SSE2 - for ( ; j <= (dn - 32) * 3; j += 96) + if (haveSIMD) { - __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); - __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); - __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); - __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); - __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); - __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); - _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); - process(_mm_unpacklo_epi8(v_r0, v_zero), - _mm_unpacklo_epi8(v_g0, v_zero), - _mm_unpacklo_epi8(v_b0, v_zero), - buf + j); + process(_mm_unpacklo_epi8(v_r0, v_zero), + _mm_unpacklo_epi8(v_g0, v_zero), + _mm_unpacklo_epi8(v_b0, v_zero), + buf + j); - process(_mm_unpackhi_epi8(v_r0, v_zero), - _mm_unpackhi_epi8(v_g0, v_zero), - _mm_unpackhi_epi8(v_b0, v_zero), - buf + j + 24); + process(_mm_unpackhi_epi8(v_r0, v_zero), + _mm_unpackhi_epi8(v_g0, v_zero), + _mm_unpackhi_epi8(v_b0, v_zero), + buf + j + 24); - process(_mm_unpacklo_epi8(v_r1, v_zero), - _mm_unpacklo_epi8(v_g1, v_zero), - _mm_unpacklo_epi8(v_b1, v_zero), - buf + j + 48); + process(_mm_unpacklo_epi8(v_r1, v_zero), + _mm_unpacklo_epi8(v_g1, v_zero), + _mm_unpacklo_epi8(v_b1, v_zero), + buf + j + 48); - process(_mm_unpackhi_epi8(v_r1, v_zero), - _mm_unpackhi_epi8(v_g1, v_zero), - _mm_unpackhi_epi8(v_b1, v_zero), - buf + j + 72); + process(_mm_unpackhi_epi8(v_r1, v_zero), + _mm_unpackhi_epi8(v_g1, v_zero), + _mm_unpackhi_epi8(v_b1, v_zero), + buf + j + 72); + } } #endif @@ -5402,7 +5466,7 @@ struct Lab2RGB_b } } #elif CV_SSE2 - if (dcn == 3) + if (dcn == 3 && haveSIMD) { for ( ; j <= (dn * 3 - 16); j += 16, dst += 16) { @@ -5445,6 +5509,7 @@ struct Lab2RGB_b #elif CV_SSE2 __m128 v_scale, v_scale_inv, v_128; __m128i v_zero; + bool haveSIMD; #endif }; @@ -5627,6 +5692,7 @@ struct RGB2Luv_b v_coeff2 = _mm_set1_ps(96.525423728813564f); v_coeff3 = _mm_set1_ps(0.9732824427480916f); v_coeff4 = _mm_set1_ps(136.259541984732824f); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif } @@ -5698,7 +5764,7 @@ struct RGB2Luv_b vst3q_f32(buf + j + 12, v_dst); } #elif CV_SSE2 - if (scn == 3) + if (scn == 3 && haveSIMD) { for ( ; j <= (dn * 3 - 16); j += 16, src += 16) { @@ -5743,38 +5809,41 @@ struct RGB2Luv_b vst3_u8(dst + j, v_dst); } #elif CV_SSE2 - for ( ; j <= (dn - 32) * 3; j += 96) + if (haveSIMD) { - __m128i v_l_0, v_u_0, v_v_0; - process(buf + j, - v_l_0, v_u_0, v_v_0); + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_l_0, v_u_0, v_v_0; + process(buf + j, + v_l_0, v_u_0, v_v_0); - __m128i v_l_1, v_u_1, v_v_1; - process(buf + j + 24, - v_l_1, v_u_1, v_v_1); + __m128i v_l_1, v_u_1, v_v_1; + process(buf + j + 24, + v_l_1, v_u_1, v_v_1); - __m128i v_l0 = _mm_packus_epi16(v_l_0, v_l_1); - __m128i v_u0 = _mm_packus_epi16(v_u_0, v_u_1); - __m128i v_v0 = _mm_packus_epi16(v_v_0, 
v_v_1); + __m128i v_l0 = _mm_packus_epi16(v_l_0, v_l_1); + __m128i v_u0 = _mm_packus_epi16(v_u_0, v_u_1); + __m128i v_v0 = _mm_packus_epi16(v_v_0, v_v_1); - process(buf + j + 48, - v_l_0, v_u_0, v_v_0); + process(buf + j + 48, + v_l_0, v_u_0, v_v_0); - process(buf + j + 72, - v_l_1, v_u_1, v_v_1); + process(buf + j + 72, + v_l_1, v_u_1, v_v_1); - __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1); - __m128i v_u1 = _mm_packus_epi16(v_u_0, v_u_1); - __m128i v_v1 = _mm_packus_epi16(v_v_0, v_v_1); + __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1); + __m128i v_u1 = _mm_packus_epi16(v_u_0, v_u_1); + __m128i v_v1 = _mm_packus_epi16(v_v_0, v_v_1); - _mm_interleave_epi8(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1); + _mm_interleave_epi8(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1); - _mm_storeu_si128((__m128i *)(dst + j), v_l0); - _mm_storeu_si128((__m128i *)(dst + j + 16), v_l1); - _mm_storeu_si128((__m128i *)(dst + j + 32), v_u0); - _mm_storeu_si128((__m128i *)(dst + j + 48), v_u1); - _mm_storeu_si128((__m128i *)(dst + j + 64), v_v0); - _mm_storeu_si128((__m128i *)(dst + j + 80), v_v1); + _mm_storeu_si128((__m128i *)(dst + j), v_l0); + _mm_storeu_si128((__m128i *)(dst + j + 16), v_l1); + _mm_storeu_si128((__m128i *)(dst + j + 32), v_u0); + _mm_storeu_si128((__m128i *)(dst + j + 48), v_u1); + _mm_storeu_si128((__m128i *)(dst + j + 64), v_v0); + _mm_storeu_si128((__m128i *)(dst + j + 80), v_v1); + } } #endif @@ -5796,6 +5865,7 @@ struct RGB2Luv_b #elif CV_SSE2 __m128 v_scale, v_scale_inv, v_coeff1, v_coeff2, v_coeff3, v_coeff4; __m128i v_zero; + bool haveSIMD; #endif }; @@ -5824,6 +5894,7 @@ struct Luv2RGB_b v_140 = _mm_set1_ps(140.f); v_scale = _mm_set1_ps(255.f); v_zero = _mm_setzero_si128(); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif } @@ -5847,7 +5918,7 @@ struct Luv2RGB_b v_u1 = _mm_sub_ps(_mm_mul_ps(v_u1, v_coeff1), v_134); v_v0 = _mm_sub_ps(_mm_mul_ps(v_v0, v_coeff2), v_140); v_v1 = _mm_sub_ps(_mm_mul_ps(v_v1, v_coeff2), v_140); - + _mm_interleave_ps(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1); _mm_store_ps(buf, v_l0); @@ -5890,36 +5961,39 @@ struct Luv2RGB_b vst3q_f32(buf + j + 12, v_dst); } #elif CV_SSE2 - for ( ; j <= (dn - 32) * 3; j += 96) + if (haveSIMD) { - __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); - __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); - __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); - __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); - __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); - __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); - _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); - process(_mm_unpacklo_epi8(v_r0, v_zero), - _mm_unpacklo_epi8(v_g0, v_zero), - _mm_unpacklo_epi8(v_b0, v_zero), - buf + j); + process(_mm_unpacklo_epi8(v_r0, v_zero), + _mm_unpacklo_epi8(v_g0, v_zero), + _mm_unpacklo_epi8(v_b0, v_zero), + buf + j); - process(_mm_unpackhi_epi8(v_r0, v_zero), - _mm_unpackhi_epi8(v_g0, v_zero), - _mm_unpackhi_epi8(v_b0, v_zero), - buf + j + 24); + process(_mm_unpackhi_epi8(v_r0, 
v_zero), + _mm_unpackhi_epi8(v_g0, v_zero), + _mm_unpackhi_epi8(v_b0, v_zero), + buf + j + 24); - process(_mm_unpacklo_epi8(v_r1, v_zero), - _mm_unpacklo_epi8(v_g1, v_zero), - _mm_unpacklo_epi8(v_b1, v_zero), - buf + j + 48); + process(_mm_unpacklo_epi8(v_r1, v_zero), + _mm_unpacklo_epi8(v_g1, v_zero), + _mm_unpacklo_epi8(v_b1, v_zero), + buf + j + 48); - process(_mm_unpackhi_epi8(v_r1, v_zero), - _mm_unpackhi_epi8(v_g1, v_zero), - _mm_unpackhi_epi8(v_b1, v_zero), - buf + j + 72); + process(_mm_unpackhi_epi8(v_r1, v_zero), + _mm_unpackhi_epi8(v_g1, v_zero), + _mm_unpackhi_epi8(v_b1, v_zero), + buf + j + 72); + } } #endif for( ; j < dn*3; j += 3 ) @@ -5961,7 +6035,7 @@ struct Luv2RGB_b } } #elif CV_SSE2 - if (dcn == 3) + if (dcn == 3 && haveSIMD) { for ( ; j <= (dn * 3 - 16); j += 16, dst += 16) { @@ -6004,6 +6078,7 @@ struct Luv2RGB_b #elif CV_SSE2 __m128 v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140; __m128i v_zero; + bool haveSIMD; #endif }; diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 4880819b9e..304210f84e 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -1963,9 +1963,9 @@ private: struct ResizeAreaFastVec_SIMD_32f { ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : - scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step) + cn(_cn), step(_step) { - fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4); + fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); } int operator() (const float * S, float * D, int w) const @@ -2005,7 +2005,6 @@ struct ResizeAreaFastVec_SIMD_32f } private: - int scale_x, scale_y; int cn; bool fast_mode; int step; @@ -2289,9 +2288,10 @@ private: struct ResizeAreaFastVec_SIMD_32f { ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : - scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step) + cn(_cn), step(_step) { - fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4); + fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); + fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2); } int operator() (const float * S, float * D, int w) const @@ -2335,7 +2335,6 @@ struct ResizeAreaFastVec_SIMD_32f } private: - int scale_x, scale_y; int cn; bool fast_mode; int step; @@ -4817,6 +4816,13 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, size.height = 1; } +#if CV_SSE2 + bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2); +#endif +#if CV_SSE4_1 + bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); +#endif + const float scale = 1.f/INTER_TAB_SIZE; int x, y; for( y = 0; y < size.height; y++ ) @@ -4848,24 +4854,27 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, vst2q_s16(dst1 + (x << 1), v_dst); } #elif CV_SSE4_1 - for( ; x <= size.width - 16; x += 16 ) + if (useSSE4_1) { - __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)), - _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4))); - __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)), - _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12))); + for( ; x <= size.width - 16; x += 16 ) + { + __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)), + _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4))); + __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)), + _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12))); - __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)), - _mm_cvtps_epi32(_mm_loadu_ps(src2f 
+ x + 4))); - __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)), - _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12))); + __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)), + _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4))); + __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)), + _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12))); - _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3); + _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3); - _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0); - _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1); - _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2); - _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3); + _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3); + } } #endif for( ; x < size.width; x++ ) @@ -4902,47 +4911,50 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); } #elif CV_SSE4_1 - __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE); - __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1); - - for( ; x <= size.width - 16; x += 16 ) + if (useSSE4_1) { - __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its)); - __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its)); - __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its)); - __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its)); + __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE); + __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1); - __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), - _mm_srai_epi32(v_ix1, INTER_BITS)); - __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), - _mm_srai_epi32(v_iy1, INTER_BITS)); - __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), - _mm_and_si128(v_ix0, v_its1)); - __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), - _mm_and_si128(v_ix1, v_its1)); - _mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21)); + for( ; x <= size.width - 16; x += 16 ) + { + __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its)); + __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its)); + __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its)); + __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its)); - v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its)); - v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its)); - v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its)); - v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its)); + __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), + _mm_srai_epi32(v_ix1, INTER_BITS)); + __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), + _mm_srai_epi32(v_iy1, INTER_BITS)); + __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), + _mm_and_si128(v_ix0, v_its1)); + __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), + _mm_and_si128(v_ix1, v_its1)); + _mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21)); - __m128i v_dst11 = 
_mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), - _mm_srai_epi32(v_ix1, INTER_BITS)); - __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), - _mm_srai_epi32(v_iy1, INTER_BITS)); - v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), - _mm_and_si128(v_ix0, v_its1)); - v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), - _mm_and_si128(v_ix1, v_its1)); - _mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21)); + v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its)); + v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its)); + v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its)); + v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its)); - _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13); + __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), + _mm_srai_epi32(v_ix1, INTER_BITS)); + __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), + _mm_srai_epi32(v_iy1, INTER_BITS)); + v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), + _mm_and_si128(v_ix0, v_its1)); + v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), + _mm_and_si128(v_ix1, v_its1)); + _mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21)); - _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10); - _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11); - _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12); - _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13); + _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13); + + _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13); + } } #endif for( ; x < size.width; x++ ) @@ -5005,25 +5017,28 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); } #elif CV_SSE2 - __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE); - __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1); - __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE-1) << 16); - - for( ; x <= size.width - 4; x += 4 ) + if (useSSE2) { - __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its)); - __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its)); + __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE); + __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1); + __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE-1) << 16); - __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS), - _mm_srai_epi32(v_src1, INTER_BITS)); - _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1); + for( ; x <= size.width - 4; x += 4 ) + { + __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its)); + __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its)); - // x0 y0 x1 y1 . . . - v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1), - _mm_and_si128(v_src1, v_its1)); - __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . . - _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . . 
- _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2)); + __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS), + _mm_srai_epi32(v_src1, INTER_BITS)); + _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1); + + // x0 y0 x1 y1 . . . + v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1), + _mm_and_si128(v_src1, v_its1)); + __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . . + _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . . + _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2)); + } } #endif for( ; x < size.width; x++ ) @@ -5150,22 +5165,25 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, vst2q_f32(dst1f + (x << 1) + 8, v_dst); } #elif CV_SSE2 - __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1); - __m128i v_zero = _mm_set1_epi32(0), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1); - __m128 v_scale = _mm_set1_ps(scale); - - for ( ; x <= size.width - 8; x += 8) + if (useSSE2) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src1 + x * 2)); - __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero; - __m128i v_fxy1 = _mm_and_si128(v_fxy, v_mask); - __m128i v_fxy2 = _mm_srli_epi16(v_fxy, INTER_BITS); + __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1); + __m128i v_zero = _mm_set1_epi32(0), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1); + __m128 v_scale = _mm_set1_ps(scale); - __m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_fxy1, v_fxy2)), v_scale); - _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)), v_add)); + for ( ; x <= size.width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src1 + x * 2)); + __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero; + __m128i v_fxy1 = _mm_and_si128(v_fxy, v_mask); + __m128i v_fxy2 = _mm_srli_epi16(v_fxy, INTER_BITS); - v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_fxy1, v_fxy2)), v_scale); - _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)), v_add)); + __m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_fxy1, v_fxy2)), v_scale); + _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)), v_add)); + + v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_fxy1, v_fxy2)), v_scale); + _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)), v_add)); + } } #endif for( ; x < size.width; x++ ) @@ -5204,7 +5222,10 @@ public: const int AB_SCALE = 1 << AB_BITS; int round_delta = interpolation == INTER_NEAREST ? 
AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1; #if CV_SSE2 - bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); + bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2); + #endif + #if CV_SSE4_1 + bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); #endif int bh0 = std::min(BLOCK_SZ/2, dst.rows); @@ -5243,26 +5264,29 @@ public: vst2q_s16(xy + (x1 << 1), v_dst); } #elif CV_SSE4_1 - __m128i v_X0 = _mm_set1_epi32(X0); - __m128i v_Y0 = _mm_set1_epi32(Y0); - for ( ; x1 <= bw - 16; x1 += 16) + if (useSSE4_1) { - __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1))), AB_BITS), - _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 4))), AB_BITS)); - __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 8))), AB_BITS), - _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 12))), AB_BITS)); + __m128i v_X0 = _mm_set1_epi32(X0); + __m128i v_Y0 = _mm_set1_epi32(Y0); + for ( ; x1 <= bw - 16; x1 += 16) + { + __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1))), AB_BITS), + _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 4))), AB_BITS)); + __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 8))), AB_BITS), + _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 12))), AB_BITS)); - __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1))), AB_BITS), - _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 4))), AB_BITS)); - __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 8))), AB_BITS), - _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 12))), AB_BITS)); + __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1))), AB_BITS), + _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 4))), AB_BITS)); + __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 8))), AB_BITS), + _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 12))), AB_BITS)); - _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1); + _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1); - _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1); + _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1); + } } #endif for( ; x1 < bw; x1++ ) @@ -5278,7 +5302,7 @@ public: short* alpha = A + y1*bw; x1 = 0; #if CV_SSE2 - if( useSIMD ) + if( useSSE2 ) { __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1); __m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0); @@ -5672,6 +5696,7 @@ public: bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height); #if CV_SSE4_1 + bool haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); __m128d v_M0 = _mm_set1_pd(M[0]); __m128d v_M3 = 
_mm_set1_pd(M[3]); __m128d v_M6 = _mm_set1_pd(M[6]); @@ -5706,109 +5731,112 @@ public: x1 = 0; #if CV_SSE4_1 - __m128d v_X0d = _mm_set1_pd(X0); - __m128d v_Y0d = _mm_set1_pd(Y0); - __m128d v_W0 = _mm_set1_pd(W0); - __m128d v_x1 = _mm_set_pd(1, 0); - - for( ; x1 <= bw - 16; x1 += 16 ) + if (haveSSE4_1) { - // 0-3 - __m128i v_X0, v_Y0; + __m128d v_X0d = _mm_set1_pd(X0); + __m128d v_Y0d = _mm_set1_pd(Y0); + __m128d v_W0 = _mm_set1_pd(W0); + __m128d v_x1 = _mm_set_pd(1, 0); + + for( ; x1 <= bw - 16; x1 += 16 ) { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); + // 0-3 + __m128i v_X0, v_Y0; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); - v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 4-8 + __m128i v_X1, v_Y1; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X1 = 
_mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 8-11 + __m128i v_X2, v_Y2; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 12-15 + __m128i v_X3, v_Y3; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // convert to 16s + v_X0 = _mm_packs_epi32(v_X0, v_X1); + v_X1 = _mm_packs_epi32(v_X2, v_X3); + v_Y0 = _mm_packs_epi32(v_Y0, v_Y1); + v_Y1 = _mm_packs_epi32(v_Y2, v_Y3); + + _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1); + + _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1); } - - // 4-8 - __m128i v_X1, v_Y1; - { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, 
_mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } - - // 8-11 - __m128i v_X2, v_Y2; - { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } - - // 12-15 - __m128i v_X3, v_Y3; - { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } - - // convert to 16s - v_X0 = _mm_packs_epi32(v_X0, v_X1); - v_X1 = _mm_packs_epi32(v_X2, v_X3); - v_Y0 = _mm_packs_epi32(v_Y0, v_Y1); - v_Y1 = _mm_packs_epi32(v_Y2, v_Y3); - - _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1); - - _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1); } #endif @@ -5831,122 +5859,125 @@ public: x1 = 0; #if CV_SSE4_1 - __m128d v_X0d = _mm_set1_pd(X0); - __m128d v_Y0d = _mm_set1_pd(Y0); - __m128d v_W0 = _mm_set1_pd(W0); - __m128d v_x1 = _mm_set_pd(1, 0); - - for( ; x1 <= bw - 16; x1 += 16 ) + if (haveSSE4_1) { - // 0-3 - __m128i v_X0, v_Y0; + __m128d v_X0d = _mm_set1_pd(X0); + __m128d v_Y0d = _mm_set1_pd(Y0); + __m128d v_W0 = _mm_set1_pd(W0); + __m128d v_x1 = 
_mm_set_pd(1, 0); + + for( ; x1 <= bw - 16; x1 += 16 ) { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); + // 0-3 + __m128i v_X0, v_Y0; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); - v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 4-8 + __m128i v_X1, v_Y1; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 8-11 + __m128i v_X2, v_Y2; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, 
_mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 12-15 + __m128i v_X3, v_Y3; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // store alpha + __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS), + _mm_and_si128(v_X0, v_itsi1)); + __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS), + _mm_and_si128(v_X1, v_itsi1)); + _mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1)); + + v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS), + _mm_and_si128(v_X2, v_itsi1)); + v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS), + _mm_and_si128(v_X3, v_itsi1)); + _mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1)); + + // convert to 16s + v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS)); + v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS)); + v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS)); + v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS)); + + _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1); + + _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1); } - - // 4-8 - __m128i v_X1, v_Y1; - { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX0 = 
_mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } - - // 8-11 - __m128i v_X2, v_Y2; - { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } - - // 12-15 - __m128i v_X3, v_Y3; - { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } - - // store alpha - __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS), - _mm_and_si128(v_X0, v_itsi1)); - __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS), - _mm_and_si128(v_X1, v_itsi1)); - _mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1)); - - v_alpha0 = 
_mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS), - _mm_and_si128(v_X2, v_itsi1)); - v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS), - _mm_and_si128(v_X3, v_itsi1)); - _mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1)); - - // convert to 16s - v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS)); - v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS)); - v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS)); - v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS)); - - _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1); - - _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1); } #endif diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index 93b9bfa166..4271b942ae 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -386,10 +386,10 @@ struct PyrUpVec_32s16s __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); - _mm_storeu_si128((__m128i *)(dst0 + x), + _mm_storeu_si128((__m128i *)(dst0 + x), _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst00, v_delta), 6), _mm_srai_epi32(_mm_add_epi32(v_dst01, v_delta), 6))); - _mm_storeu_si128((__m128i *)(dst1 + x), + _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst10, v_delta), 6), _mm_srai_epi32(_mm_add_epi32(v_dst11, v_delta), 6))); } @@ -446,10 +446,10 @@ struct PyrUpVec_32s16u __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); - _mm_storeu_si128((__m128i *)(dst0 + x), + _mm_storeu_si128((__m128i *)(dst0 + x), _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst00, v_delta), 6), _mm_srli_epi32(_mm_add_epi32(v_dst01, v_delta), 6))); - _mm_storeu_si128((__m128i *)(dst1 + x), + _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst10, v_delta), 6), _mm_srli_epi32(_mm_add_epi32(v_dst11, v_delta), 6))); } @@ -491,7 +491,7 @@ struct PyrUpVec_32f const float *row0 = src[0], *row1 = src[1], *row2 = src[2]; float *dst0 = dst[0], *dst1 = dst[1]; - __m128 v_6 = _mm_set1_ps(6.0f), v_scale = _mm_set1_ps(1.f/64.0f), + __m128 v_6 = _mm_set1_ps(6.0f), v_scale = _mm_set1_ps(1.f/64.0f), v_scale4 = _mm_mul_ps(v_scale, _mm_set1_ps(4.0f)); for( ; x <= width - 8; x += 8 ) From 33176db5dcea9479580219196b29c533d6a2d563 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:31 +0300 Subject: [PATCH 51/53] compareHist --- modules/imgproc/src/histogram.cpp | 106 ++++++++++++++++++++++++++++-- 1 file changed, 101 insertions(+), 5 deletions(-) diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp index 9acdc11415..ec8de4d815 100644 --- a/modules/imgproc/src/histogram.cpp +++ b/modules/imgproc/src/histogram.cpp @@ -2284,15 +2284,20 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) CV_Assert( it.planes[0].isContinuous() && it.planes[1].isContinuous() ); +#if CV_SSE2 + bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2); +#endif + for( size_t i = 0; i < it.nplanes; i++, ++it ) { const 
float* h1 = it.planes[0].ptr(); const float* h2 = it.planes[1].ptr(); len = it.planes[0].rows*it.planes[0].cols*H1.channels(); + j = 0; if( (method == CV_COMP_CHISQR) || (method == CV_COMP_CHISQR_ALT)) { - for( j = 0; j < len; j++ ) + for( ; j < len; j++ ) { double a = h1[j] - h2[j]; double b = (method == CV_COMP_CHISQR) ? h1[j] : h1[j] + h2[j]; @@ -2302,7 +2307,51 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) } else if( method == CV_COMP_CORREL ) { - for( j = 0; j < len; j++ ) + #if CV_SSE2 + if (haveSIMD) + { + __m128d v_s1 = _mm_setzero_pd(), v_s2 = v_s1; + __m128d v_s11 = v_s1, v_s22 = v_s1, v_s12 = v_s1; + + for ( ; j <= len - 4; j += 4) + { + __m128 v_a = _mm_loadu_ps(h1 + j); + __m128 v_b = _mm_loadu_ps(h2 + j); + + // 0-1 + __m128d v_ad = _mm_cvtps_pd(v_a); + __m128d v_bd = _mm_cvtps_pd(v_b); + v_s12 = _mm_add_pd(v_s12, _mm_mul_pd(v_ad, v_bd)); + v_s11 = _mm_add_pd(v_s11, _mm_mul_pd(v_ad, v_ad)); + v_s22 = _mm_add_pd(v_s22, _mm_mul_pd(v_bd, v_bd)); + v_s1 = _mm_add_pd(v_s1, v_ad); + v_s2 = _mm_add_pd(v_s2, v_bd); + + // 2-3 + v_ad = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_a), 8))); + v_bd = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_b), 8))); + v_s12 = _mm_add_pd(v_s12, _mm_mul_pd(v_ad, v_bd)); + v_s11 = _mm_add_pd(v_s11, _mm_mul_pd(v_ad, v_ad)); + v_s22 = _mm_add_pd(v_s22, _mm_mul_pd(v_bd, v_bd)); + v_s1 = _mm_add_pd(v_s1, v_ad); + v_s2 = _mm_add_pd(v_s2, v_bd); + } + + double CV_DECL_ALIGNED(16) ar[10]; + _mm_store_pd(ar, v_s12); + _mm_store_pd(ar + 2, v_s11); + _mm_store_pd(ar + 4, v_s22); + _mm_store_pd(ar + 6, v_s1); + _mm_store_pd(ar + 8, v_s2); + + s12 += ar[0] + ar[1]; + s11 += ar[2] + ar[3]; + s22 += ar[4] + ar[5]; + s1 += ar[6] + ar[7]; + s2 += ar[8] + ar[9]; + } + #endif + for( ; j < len; j++ ) { double a = h1[j]; double b = h2[j]; @@ -2316,7 +2365,6 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) } else if( method == CV_COMP_INTERSECT ) { - j = 0; #if CV_NEON float32x4_t v_result = vdupq_n_f32(0.0f); for( ; j <= len - 4; j += 4 ) @@ -2324,13 +2372,61 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) float CV_DECL_ALIGNED(16) ar[4]; vst1q_f32(ar, v_result); result += ar[0] + ar[1] + ar[2] + ar[3]; + #elif CV_SSE2 + if (haveSIMD) + { + __m128d v_result = _mm_setzero_pd(); + for ( ; j <= len - 4; j += 4) + { + __m128 v_src = _mm_min_ps(_mm_loadu_ps(h1 + j), + _mm_loadu_ps(h2 + j)); + v_result = _mm_add_pd(v_result, _mm_cvtps_pd(v_src)); + v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)); + v_result = _mm_add_pd(v_result, _mm_cvtps_pd(v_src)); + } + + double CV_DECL_ALIGNED(16) ar[2]; + _mm_store_pd(ar, v_result); + result += ar[0] + ar[1]; + } #endif for( ; j < len; j++ ) result += std::min(h1[j], h2[j]); } else if( method == CV_COMP_BHATTACHARYYA ) { - for( j = 0; j < len; j++ ) + #if CV_SSE2 + if (haveSIMD) + { + __m128d v_s1 = _mm_setzero_pd(), v_s2 = v_s1, v_result = v_s1; + for ( ; j <= len - 4; j += 4) + { + __m128 v_a = _mm_loadu_ps(h1 + j); + __m128 v_b = _mm_loadu_ps(h2 + j); + + __m128d v_ad = _mm_cvtps_pd(v_a); + __m128d v_bd = _mm_cvtps_pd(v_b); + v_s1 = _mm_add_pd(v_s1, v_ad); + v_s2 = _mm_add_pd(v_s2, v_bd); + v_result = _mm_add_pd(v_result, _mm_sqrt_pd(_mm_mul_pd(v_ad, v_bd))); + + v_ad = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_a), 8))); + v_bd = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_b), 8))); + v_s1 = _mm_add_pd(v_s1, v_ad); + v_s2 = _mm_add_pd(v_s2, v_bd); + v_result = 
_mm_add_pd(v_result, _mm_sqrt_pd(_mm_mul_pd(v_ad, v_bd))); + } + + double CV_DECL_ALIGNED(16) ar[6]; + _mm_store_pd(ar, v_s1); + _mm_store_pd(ar + 2, v_s2); + _mm_store_pd(ar + 4, v_result); + s1 += ar[0] + ar[1]; + s2 += ar[2] + ar[3]; + result += ar[4] + ar[5]; + } + #endif + for( ; j < len; j++ ) { double a = h1[j]; double b = h2[j]; @@ -2341,7 +2437,7 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) } else if( method == CV_COMP_KL_DIV ) { - for( j = 0; j < len; j++ ) + for( ; j < len; j++ ) { double p = h1[j]; double q = h2[j]; From 8c94568cc3f58abdf4991f00994559defcdb8050 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:31 +0300 Subject: [PATCH 52/53] cv::sum --- modules/core/src/stat.cpp | 110 +++++++++++++++++++++++++++++++++++++- 1 file changed, 108 insertions(+), 2 deletions(-) diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 1fcb9b54d1..b26308051b 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -73,7 +73,114 @@ struct Sum_SIMD } }; -#if CV_NEON +#if CV_SSE2 + +template <> +struct Sum_SIMD +{ + int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const + { + if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2) + return 0; + + int x = 0; + __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero; + + for ( ; x <= len - 16; x += 16) + { + __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x)); + __m128i v_half = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8); + + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16)); + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16)); + + v_half = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8); + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16)); + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16)); + } + + for ( ; x <= len - 8; x += 8) + { + __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8); + + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + } + + int CV_DECL_ALIGNED(16) ar[4]; + _mm_store_si128((__m128i*)ar, v_sum); + + for (int i = 0; i < 4; i += cn) + for (int j = 0; j < cn; ++j) + dst[j] += ar[j + i]; + + return x / cn; + } +}; + +template <> +struct Sum_SIMD +{ + int operator () (const int * src0, const uchar * mask, double * dst, int len, int cn) const + { + if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2) + return 0; + + int x = 0; + __m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero; + + for ( ; x <= len - 4; x += 4) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src0 + x)); + v_sum0 = _mm_add_pd(v_sum0, _mm_cvtepi32_pd(v_src)); + v_sum1 = _mm_add_pd(v_sum1, _mm_cvtepi32_pd(_mm_srli_si128(v_src, 8))); + } + + double CV_DECL_ALIGNED(16) ar[4]; + _mm_store_pd(ar, v_sum0); + _mm_store_pd(ar + 2, v_sum1); + + for (int i = 0; i < 4; i += cn) + for (int j = 0; j < cn; ++j) + dst[j] += ar[j + i]; + + return x / cn; + } +}; + +template <> +struct Sum_SIMD +{ + int operator () (const float * src0, const uchar * mask, double * dst, int len, int cn) const + { + if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2) + return 0; + + int x = 0; + __m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero; + + for ( ; x <= len - 4; x += 4) + { + 
__m128 v_src = _mm_loadu_ps(src0 + x); + v_sum0 = _mm_add_pd(v_sum0, _mm_cvtps_pd(v_src)); + v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)); + v_sum1 = _mm_add_pd(v_sum1, _mm_cvtps_pd(v_src)); + } + + double CV_DECL_ALIGNED(16) ar[4]; + _mm_store_pd(ar, v_sum0); + _mm_store_pd(ar + 2, v_sum1); + + for (int i = 0; i < 4; i += cn) + for (int j = 0; j < cn; ++j) + dst[j] += ar[j + i]; + + return x / cn; + } +}; + + +#elif CV_NEON template <> struct Sum_SIMD @@ -1023,7 +1130,6 @@ cv::Scalar cv::sum( InputArray _src ) } } #endif - SumFunc func = getSumFunc(depth); CV_Assert( cn <= 4 && func != 0 ); From e9a6c5db219e250792c90cdf16414b33f633964f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:31 +0300 Subject: [PATCH 53/53] sqsum --- CMakeLists.txt | 10 +- cmake/OpenCVCompilerOptions.cmake | 7 +- modules/core/include/opencv2/core/cvdef.h | 4 +- modules/core/src/stat.cpp | 144 +++++++++++++++++++++- modules/imgproc/src/color.cpp | 4 +- modules/imgproc/src/imgwarp.cpp | 4 +- modules/imgproc/src/smooth.cpp | 2 +- 7 files changed, 154 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index da0b42cb1c..2f4fd3323d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,13 +214,13 @@ OCV_OPTION(ENABLE_COVERAGE "Enable coverage collection with GCov" OCV_OPTION(ENABLE_OMIT_FRAME_POINTER "Enable -fomit-frame-pointer for GCC" ON IF CMAKE_COMPILER_IS_GNUCXX AND NOT (APPLE AND CMAKE_COMPILER_IS_CLANGCXX) ) OCV_OPTION(ENABLE_POWERPC "Enable PowerPC for GCC" ON IF (CMAKE_COMPILER_IS_GNUCXX AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) ) OCV_OPTION(ENABLE_FAST_MATH "Enable -ffast-math (not recommended for GCC 4.6.x)" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) -OCV_OPTION(ENABLE_POPCNT "Enable POPCNT instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE "Enable SSE instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE2 "Enable SSE2 instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) -OCV_OPTION(ENABLE_SSE3 "Enable SSE3 instructions" ON IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) -OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) -OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) -OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) +OCV_OPTION(ENABLE_SSE3 "Enable SSE3 instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) AND (X86 OR X86_64)) ) +OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) +OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) AND (X86 OR X86_64)) ) +OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) +OCV_OPTION(ENABLE_POPCNT "Enable POPCNT instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_AVX "Enable AVX instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_AVX2 "Enable AVX2 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_FMA3 "Enable FMA3 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) diff --git 
a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index 66e16e7863..13559b5c8a 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -122,9 +122,6 @@ if(CMAKE_COMPILER_IS_GNUCXX) if(ENABLE_POWERPC) add_extra_compiler_option("-mcpu=G3 -mtune=G5") endif() - if(ENABLE_POPCNT) - add_extra_compiler_option(-mpopcnt) - endif() if(ENABLE_SSE) add_extra_compiler_option(-msse) endif() @@ -168,6 +165,10 @@ if(CMAKE_COMPILER_IS_GNUCXX) if(ENABLE_SSE42) add_extra_compiler_option(-msse4.2) endif() + + if(ENABLE_POPCNT) + add_extra_compiler_option(-mpopcnt) + endif() endif() endif(NOT MINGW) diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index a9d59c7693..3fdaa6954d 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -166,7 +166,7 @@ # endif # define CV_POPCNT 1 # endif -# if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600) +# if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600 && 0) // MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX // See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32 # include @@ -177,7 +177,7 @@ # define __xgetbv() 0 # endif # endif -# if defined __AVX2__ || (defined _MSC_VER && _MSC_VER >= 1800) +# if defined __AVX2__ || (defined _MSC_VER && _MSC_VER >= 1800 && 0) # include # define CV_AVX2 1 # if defined __FMA__ diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index b26308051b..87c423dc3b 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -800,6 +800,137 @@ static CountNonZeroFunc getCountNonZeroTab(int depth) return countNonZeroTab[depth]; } +template +struct SumSqr_SIMD +{ + int operator () (const T *, const uchar *, ST *, SQT *, int, int) const + { + return 0; + } +}; + +#if CV_SSE2 + +template <> +struct SumSqr_SIMD +{ + int operator () (const uchar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const + { + if (mask || (cn != 1 && cn != 2) || !USE_SSE2) + return 0; + + int x = 0; + __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero; + + for ( ; x <= len - 16; x += 16) + { + __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x)); + __m128i v_half = _mm_unpacklo_epi8(v_src, v_zero); + + __m128i v_mullo = _mm_mullo_epi16(v_half, v_half); + __m128i v_mulhi = _mm_mulhi_epi16(v_half, v_half); + v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_half, v_zero)); + v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_half, v_zero)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi)); + + v_half = _mm_unpackhi_epi8(v_src, v_zero); + v_mullo = _mm_mullo_epi16(v_half, v_half); + v_mulhi = _mm_mulhi_epi16(v_half, v_half); + v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_half, v_zero)); + v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_half, v_zero)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi)); + } + + for ( ; x <= len - 8; x += 8) + { + __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src0 + x)), v_zero); + + __m128i v_mullo = _mm_mullo_epi16(v_src, v_src); + __m128i v_mulhi = _mm_mulhi_epi16(v_src, v_src); + v_sum = _mm_add_epi32(v_sum, 
_mm_unpacklo_epi16(v_src, v_zero)); + v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_src, v_zero)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi)); + } + + int CV_DECL_ALIGNED(16) ar[8]; + _mm_store_si128((__m128i*)ar, v_sum); + _mm_store_si128((__m128i*)(ar + 4), v_sqsum); + + for (int i = 0; i < 4; i += cn) + for (int j = 0; j < cn; ++j) + { + sum[j] += ar[j + i]; + sqsum[j] += ar[4 + j + i]; + } + + return x / cn; + } +}; + +template <> +struct SumSqr_SIMD +{ + int operator () (const schar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const + { + if (mask || (cn != 1 && cn != 2) || !USE_SSE2) + return 0; + + int x = 0; + __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero; + + for ( ; x <= len - 16; x += 16) + { + __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x)); + __m128i v_half = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8); + + __m128i v_mullo = _mm_mullo_epi16(v_half, v_half); + __m128i v_mulhi = _mm_mulhi_epi16(v_half, v_half); + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16)); + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi)); + + v_half = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8); + v_mullo = _mm_mullo_epi16(v_half, v_half); + v_mulhi = _mm_mulhi_epi16(v_half, v_half); + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16)); + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi)); + } + + for ( ; x <= len - 8; x += 8) + { + __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8); + + __m128i v_mullo = _mm_mullo_epi16(v_src, v_src); + __m128i v_mulhi = _mm_mulhi_epi16(v_src, v_src); + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi)); + } + + int CV_DECL_ALIGNED(16) ar[8]; + _mm_store_si128((__m128i*)ar, v_sum); + _mm_store_si128((__m128i*)(ar + 4), v_sqsum); + + for (int i = 0; i < 4; i += cn) + for (int j = 0; j < cn; ++j) + { + sum[j] += ar[j + i]; + sqsum[j] += ar[4 + j + i]; + } + + return x / cn; + } +}; + +#endif + template static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int len, int cn ) { @@ -807,14 +938,15 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le if( !mask ) { - int i; - int k = cn % 4; + SumSqr_SIMD vop; + int i = vop(src0, mask, sum, sqsum, len, cn), k = cn % 4; + src += i * cn; if( k == 1 ) { ST s0 = sum[0]; SQT sq0 = sqsum[0]; - for( i = 0; i < len; i++, src += cn ) + for( ; i < len; i++, src += cn ) { T v = src[0]; s0 += v; sq0 += (SQT)v*v; @@ -826,7 +958,7 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le { ST s0 = sum[0], s1 = sum[1]; SQT sq0 = sqsum[0], sq1 = sqsum[1]; - for( i = 0; i < len; i++, src += cn ) + for( ; i < len; i++, src += cn ) { T v0 = 
src[0], v1 = src[1]; s0 += v0; sq0 += (SQT)v0*v0; @@ -839,7 +971,7 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le { ST s0 = sum[0], s1 = sum[1], s2 = sum[2]; SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2]; - for( i = 0; i < len; i++, src += cn ) + for( ; i < len; i++, src += cn ) { T v0 = src[0], v1 = src[1], v2 = src[2]; s0 += v0; sq0 += (SQT)v0*v0; @@ -855,7 +987,7 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le src = src0 + k; ST s0 = sum[k], s1 = sum[k+1], s2 = sum[k+2], s3 = sum[k+3]; SQT sq0 = sqsum[k], sq1 = sqsum[k+1], sq2 = sqsum[k+2], sq3 = sqsum[k+3]; - for( i = 0; i < len; i++, src += cn ) + for( ; i < len; i++, src += cn ) { T v0, v1; v0 = src[0], v1 = src[1]; diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 5ae1170b43..b900cf1845 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -1598,10 +1598,10 @@ struct RGB2Gray haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } - void process(__m128 v_r, __m128 v_g, __m128 v_b, + void process(__m128 v_b, __m128 v_g, __m128 v_r, __m128 & v_gray) const { - v_gray = _mm_mul_ps(v_r, v_cb); + v_gray = _mm_mul_ps(v_r, v_cr); v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_g, v_cg)); v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_b, v_cb)); } diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 304210f84e..fe126fbbd1 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -5016,8 +5016,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, vandq_s32(v_ix1, v_mask))); vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); } - #elif CV_SSE2 - if (useSSE2) + #elif CV_SSE4_1 + if (useSSE4_1) { __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE); __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1); diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 2dc2fbdf7e..ec274259e1 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -842,7 +842,7 @@ struct ColumnSum : { int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); - vst1q_s32(D + i, v_s01); + vst1q_s32(D + i, v_s0); vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); } #endif
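
Note: the following is an illustrative, self-contained sketch, not taken from the patches above. The compareHist, cv::sum and sqsum kernels in this series all use the same reduction pattern: accumulate in __m128d registers inside the vector loop, spill the lanes to a 16-byte-aligned buffer with _mm_store_pd, and let a scalar tail loop finish the remaining elements. The helper name sum_f32_sse2 below is hypothetical and does not exist in OpenCV; alignas(16) stands in for CV_DECL_ALIGNED(16).

#include <emmintrin.h>   // SSE2 intrinsics

// Sum a float array the way the kernels above do it: two __m128d accumulators,
// low/high float pairs widened to double, horizontal add at the end.
static double sum_f32_sse2(const float* src, int len)
{
    int x = 0;
    __m128d v_sum0 = _mm_setzero_pd(), v_sum1 = _mm_setzero_pd();

    for ( ; x <= len - 4; x += 4)
    {
        __m128 v_src = _mm_loadu_ps(src + x);               // load 4 floats
        v_sum0 = _mm_add_pd(v_sum0, _mm_cvtps_pd(v_src));   // lanes 0-1 -> double
        v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
        v_sum1 = _mm_add_pd(v_sum1, _mm_cvtps_pd(v_src));   // lanes 2-3 -> double
    }

    alignas(16) double buf[4];                               // aligned spill buffer
    _mm_store_pd(buf, v_sum0);
    _mm_store_pd(buf + 2, v_sum1);
    double result = buf[0] + buf[1] + buf[2] + buf[3];       // horizontal add

    for ( ; x < len; x++ )                                   // scalar tail, as in the patches
        result += src[x];
    return result;
}

Keeping the accumulators in double, as the patches do, avoids the precision drift a float accumulator would show on long rows or histograms; the aligned store followed by a scalar horizontal add is the usual SSE2 substitute for a dedicated reduction instruction.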