diff --git a/CMakeLists.txt b/CMakeLists.txt
index d9bb04081c..da0b42cb1c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -221,9 +221,9 @@ OCV_OPTION(ENABLE_SSE3 "Enable SSE3 instructions"
 OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_FMA3 "Enable FMA3 instructions" ON IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_AVX "Enable AVX instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_AVX2 "Enable AVX2 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_FMA3 "Enable FMA3 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_NEON "Enable NEON instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
 OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
 OCV_OPTION(ENABLE_NOISY_WARNINGS "Show all warnings even if they are too noisy" OFF )
diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index 3d1155c872..2f5f13d7bf 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -145,6 +145,10 @@ if(CMAKE_COMPILER_IS_GNUCXX)
     endif()
     if(ENABLE_AVX2)
       add_extra_compiler_option(-mavx2)
+
+      if(ENABLE_FMA3)
+        add_extra_compiler_option(-mfma)
+      endif()
     endif()

     # GCC depresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalence for all SSEx instructions when needed.
@@ -165,10 +169,6 @@ if(CMAKE_COMPILER_IS_GNUCXX)
         add_extra_compiler_option(-msse4.2)
       endif()
     endif()
-
-    if(ENABLE_FMA3)
-      add_extra_compiler_option(-mfma)
-    endif()
   endif(NOT MINGW)

   if(X86 OR X86_64)
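The build-system half of the FMA3 change: -mfma is now emitted only inside the ENABLE_AVX2 branch, and the ENABLE_FMA3 option itself defaults to OFF and uses the same MSVC-or-GCC condition as AVX/AVX2. A minimal probe, a sketch that is not part of the patch and only assumes GCC-style predefined macros, shows how the combined flags surface in C++:

// Sketch: with `g++ -mavx2 -mfma`, both __AVX2__ and __FMA__ are
// predefined; that is the combination the cvdef.h change below keys on.
#include <cstdio>

int main()
{
#if defined __AVX2__ && defined __FMA__
    std::printf("AVX2 + FMA3 code paths compiled in\n");
#else
    std::printf("no FMA3: scalar/SSE/AVX paths only\n");
#endif
    return 0;
}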
diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp
index c60eedddcc..f2acaa3fb4 100644
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -13,6 +13,7 @@
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index 5fa45a592b..ded58a18c5 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -13,6 +13,7 @@
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -157,15 +158,11 @@
 #    include <nmmintrin.h>
 #    define CV_SSE4_2 1
 #  endif
-#  if defined __FMA__ || (defined _MSC_VER && _MSC_VER >= 1500)
-#    include <immintrin.h>
-#    define CV_FMA3 1
-#  endif
 #  if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
 #    include <popcntintrin.h>
 #    define CV_POPCNT 1
 #  endif
-#  if defined __AVX__ || defined __AVX2__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
+#  if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
 // MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
 // See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
 #    include <immintrin.h>
@@ -179,6 +176,9 @@
 #  if defined __AVX2__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
 #    include <immintrin.h>
 #    define CV_AVX2 1
+#    if defined __FMA__
+#      define CV_FMA3 1
+#    endif
 #  endif
 #endif
@@ -194,6 +194,9 @@

 #endif // __CUDACC__

+#ifndef CV_POPCNT
+#define CV_POPCNT 0
+#endif
 #ifndef CV_MMX
 # define CV_MMX 0
 #endif
@@ -221,9 +224,6 @@
 #ifndef CV_AVX2
 # define CV_AVX2 0
 #endif
-#ifndef CV_POPCNT
-#define CV_POPCNT 0
-#endif
 #ifndef CV_FMA3
 # define CV_FMA3 0
 #endif
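After this reshuffle the __FMA__ test sits inside the AVX2 block, so CV_FMA3 can only be 1 when CV_AVX2 is 1 and FMA intrinsics always compile together with AVX2 ones. A hedged sketch of a kernel guarded the new way; it assumes cvdef.h is on the include path and the translation unit is built with -mavx2 -mfma, and muladd_f32 is an illustrative name, not an OpenCV function:

#include "opencv2/core/cvdef.h"

#if CV_AVX2 && CV_FMA3
#include <immintrin.h>

// Computes acc[i] += a[i] * b[i] with one rounding per element (FMA3).
static void muladd_f32(const float* a, const float* b, float* acc, int n)
{
    int i = 0;
    for ( ; i <= n - 8; i += 8)
    {
        __m256 v_a = _mm256_loadu_ps(a + i);
        __m256 v_b = _mm256_loadu_ps(b + i);
        __m256 v_acc = _mm256_loadu_ps(acc + i);
        _mm256_storeu_ps(acc + i, _mm256_fmadd_ps(v_a, v_b, v_acc));
    }
    for ( ; i < n; i++)   // scalar tail
        acc[i] += a[i] * b[i];
}
#endif // CV_AVX2 && CV_FMA3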
diff --git a/modules/core/include/opencv2/core/sse_utils.hpp b/modules/core/include/opencv2/core/sse_utils.hpp
index 9db8f0ade9..0667ae9210 100644
--- a/modules/core/include/opencv2/core/sse_utils.hpp
+++ b/modules/core/include/opencv2/core/sse_utils.hpp
@@ -10,7 +10,7 @@
 //                          License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2015, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -48,6 +48,34 @@

 #if CV_SSE2

+inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
+{
+    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1);
+
+    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3);
+
+    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3);
+
+    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2);
+    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2);
+    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3);
+    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3);
+
+    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2);
+    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2);
+    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3);
+    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3);
+}
+
 inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                   __m128i & v_b0, __m128i & v_b1)
 {
@@ -228,6 +256,29 @@ inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
     v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
 }

+inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
+{
+    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1);
+
+    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3);
+
+    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3);
+
+    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2);
+    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2);
+    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3);
+    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3);
+}
+
 inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                    __m128i & v_b0, __m128i & v_b1)
 {
@@ -300,6 +351,8 @@ inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g
     v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
 }

+#if CV_SSE4_1
+
 inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                  __m128i & v_b0, __m128i & v_b1)
 {
@@ -376,6 +429,26 @@ inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
     v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
 }

+#endif // CV_SSE4_1
+
+inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
+{
+    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
+    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
+    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
+    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);
+
+    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
+    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
+    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
+    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3);
+
+    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2);
+    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2);
+    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3);
+    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3);
+}
+
 inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                                 __m128 & v_b0, __m128 & v_b1)
 {
@@ -492,6 +565,6 @@ inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m12
     v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
 }

-#endif
+#endif // CV_SSE2

 #endif //__OPENCV_CORE_SSE_UTILS_HPP__
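The new two-channel overloads above take two registers per channel: for epi8, the four inputs hold 64 interleaved bytes on entry and the two 32-byte planes on exit. A usage sketch, assuming a CV_SSE2 build; split_xy_64 is an illustrative name that mirrors what the VSplit2 specialization for uchar in convert.cpp does with this helper:

#include <emmintrin.h>
#include "opencv2/core/sse_utils.hpp"

// Splits 64 interleaved bytes (x0 y0 x1 y1 ...) into 32 x's and 32 y's.
static void split_xy_64(const unsigned char* src,
                        unsigned char* xs, unsigned char* ys)
{
    __m128i v0 = _mm_loadu_si128((const __m128i*)(src));
    __m128i v1 = _mm_loadu_si128((const __m128i*)(src + 16));
    __m128i v2 = _mm_loadu_si128((const __m128i*)(src + 32));
    __m128i v3 = _mm_loadu_si128((const __m128i*)(src + 48));

    _mm_deinterleave_epi8(v0, v1, v2, v3);   // v0,v1 = x plane; v2,v3 = y plane

    _mm_storeu_si128((__m128i*)(xs),      v0);
    _mm_storeu_si128((__m128i*)(xs + 16), v1);
    _mm_storeu_si128((__m128i*)(ys),      v2);
    _mm_storeu_si128((__m128i*)(ys + 16), v3);
}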
diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp
index fb8ccd88d7..f89560a809 100644
--- a/modules/core/include/opencv2/core/utility.hpp
+++ b/modules/core/include/opencv2/core/utility.hpp
@@ -13,6 +13,7 @@
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index a9bf3d7e78..07678f0c6d 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index ef8edee6a6..5c792f379d 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -62,8 +63,11 @@ template<typename T> struct VSplit4;

 #define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
     template<>                                                                    \
-    struct name<data_type>{                                                       \
-        void operator()(const data_type* src, data_type* dst0, data_type* dst1){  \
+    struct name<data_type>                                                        \
+    {                                                                             \
+        void operator()(const data_type* src, data_type* dst0,                    \
+                        data_type* dst1) const                                    \
+        {                                                                         \
             reg_type r = load_func(src);                                          \
             store_func(dst0, r.val[0]);                                           \
             store_func(dst1, r.val[1]);                                           \
@@ -72,9 +76,11 @@ template<typename T> struct VSplit4;

 #define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
     template<>                                                                    \
-    struct name<data_type>{                                                       \
+    struct name<data_type>                                                        \
+    {                                                                             \
         void operator()(const data_type* src, data_type* dst0, data_type* dst1,   \
-                        data_type* dst2){                                         \
+                        data_type* dst2) const                                    \
+        {                                                                         \
             reg_type r = load_func(src);                                          \
             store_func(dst0, r.val[0]);                                           \
             store_func(dst1, r.val[1]);                                           \
@@ -84,9 +90,11 @@ template<typename T> struct VSplit4;

 #define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
     template<>                                                                    \
-    struct name<data_type>{                                                       \
+    struct name<data_type>                                                        \
+    {                                                                             \
         void operator()(const data_type* src, data_type* dst0, data_type* dst1,   \
-                        data_type* dst2, data_type* dst3){                        \
+                        data_type* dst2, data_type* dst3) const                   \
+        {                                                                         \
             reg_type r = load_func(src);                                          \
             store_func(dst0, r.val[0]);                                           \
             store_func(dst1, r.val[1]);                                           \
@@ -96,28 +104,174 @@ template<typename T> struct VSplit4;
 }

 SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar ,  uint8x16x2_t, vld2q_u8 , vst1q_u8 );
-SPLIT2_KERNEL_TEMPLATE(VSplit2, schar ,   int8x16x2_t, vld2q_s8 , vst1q_s8 );
 SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort,  uint16x8x2_t, vld2q_u16, vst1q_u16);
-SPLIT2_KERNEL_TEMPLATE(VSplit2, short ,   int16x8x2_t, vld2q_s16, vst1q_s16);
 SPLIT2_KERNEL_TEMPLATE(VSplit2, int   ,   int32x4x2_t, vld2q_s32, vst1q_s32);
-SPLIT2_KERNEL_TEMPLATE(VSplit2, float , float32x4x2_t, vld2q_f32, vst1q_f32);
 SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 ,   int64x1x2_t, vld2_s64 , vst1_s64 );

 SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar ,  uint8x16x3_t, vld3q_u8 , vst1q_u8 );
-SPLIT3_KERNEL_TEMPLATE(VSplit3, schar ,   int8x16x3_t, vld3q_s8 , vst1q_s8 );
 SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort,  uint16x8x3_t, vld3q_u16, vst1q_u16);
-SPLIT3_KERNEL_TEMPLATE(VSplit3, short ,   int16x8x3_t, vld3q_s16, vst1q_s16);
 SPLIT3_KERNEL_TEMPLATE(VSplit3, int   ,   int32x4x3_t, vld3q_s32, vst1q_s32);
-SPLIT3_KERNEL_TEMPLATE(VSplit3, float , float32x4x3_t, vld3q_f32, vst1q_f32);
 SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 ,   int64x1x3_t, vld3_s64 , vst1_s64 );

 SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar ,  uint8x16x4_t, vld4q_u8 , vst1q_u8 );
-SPLIT4_KERNEL_TEMPLATE(VSplit4, schar ,   int8x16x4_t, vld4q_s8 , vst1q_s8 );
 SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort,  uint16x8x4_t, vld4q_u16, vst1q_u16);
-SPLIT4_KERNEL_TEMPLATE(VSplit4, short ,   int16x8x4_t, vld4q_s16, vst1q_s16);
 SPLIT4_KERNEL_TEMPLATE(VSplit4, int   ,   int32x4x4_t, vld4q_s32, vst1q_s32);
-SPLIT4_KERNEL_TEMPLATE(VSplit4, float , float32x4x4_t, vld4q_f32, vst1q_f32);
 SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 ,   int64x1x4_t, vld4_s64 , vst1_s64 );
+
+#elif CV_SSE2
+
+template <typename T>
+struct VSplit2
+{
+    VSplit2() : support(false) { }
+    void operator()(const T *, T *, T *) const { }
+
+    bool support;
+};
+
+template <typename T>
+struct VSplit3
+{
+    VSplit3() : support(false) { }
+    void operator()(const T *, T *, T *, T *) const { }
+
+    bool support;
+};
+
+template <typename T>
+struct VSplit4
+{
+    VSplit4() : support(false) { }
+    void operator()(const T *, T *, T *, T *, T *) const { }
+
+    bool support;
+};
+
+#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor)   \
+template <>                                                                                \
+struct VSplit2<data_type>                                                                  \
+{                                                                                          \
+    enum                                                                                   \
+    {                                                                                      \
+        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
+    };                                                                                     \
+                                                                                           \
+    VSplit2()                                                                              \
+    {                                                                                      \
+        support = true;                                                                    \
+    }                                                                                      \
+                                                                                           \
+    void operator()(const data_type * src,                                                 \
+                    data_type * dst0, data_type * dst1) const                              \
+    {                                                                                      \
+        reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src));                    \
+        reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC));     \
+        reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
+        reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
+                                                                                           \
+        _mm_deinterleave(v_src0, v_src1, v_src2, v_src3);                                  \
+                                                                                           \
+        _mm_storeu_##flavor((cast_type *)(dst0), v_src0);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1);                   \
+        _mm_storeu_##flavor((cast_type *)(dst1), v_src2);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3);                   \
+    }                                                                                      \
+                                                                                           \
+    bool support;                                                                          \
+}
+
+#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor)   \
+template <>                                                                                \
+struct VSplit3<data_type>                                                                  \
+{                                                                                          \
+    enum                                                                                   \
+    {                                                                                      \
+        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
+    };                                                                                     \
+                                                                                           \
+    VSplit3()                                                                              \
+    {                                                                                      \
+        support = true;                                                                    \
+    }                                                                                      \
+                                                                                           \
+    void operator()(const data_type * src,                                                 \
+                    data_type * dst0, data_type * dst1, data_type * dst2) const            \
+    {                                                                                      \
+        reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src));                    \
+        reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC));     \
+        reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
+        reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
+        reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
+        reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
+                                                                                           \
+        _mm_deinterleave(v_src0, v_src1, v_src2,                                           \
+                         v_src3, v_src4, v_src5);                                          \
+                                                                                           \
+        _mm_storeu_##flavor((cast_type *)(dst0), v_src0);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1);                   \
+        _mm_storeu_##flavor((cast_type *)(dst1), v_src2);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3);                   \
+        _mm_storeu_##flavor((cast_type *)(dst2), v_src4);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5);                   \
+    }                                                                                      \
+                                                                                           \
+    bool support;                                                                          \
+}
+
+#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor)   \
+template <>                                                                                \
+struct VSplit4<data_type>                                                                  \
+{                                                                                          \
+    enum                                                                                   \
+    {                                                                                      \
+        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
+    };                                                                                     \
+                                                                                           \
+    VSplit4()                                                                              \
+    {                                                                                      \
+        support = true;                                                                    \
+    }                                                                                      \
+                                                                                           \
+    void operator()(const data_type * src, data_type * dst0, data_type * dst1,             \
+                    data_type * dst2, data_type * dst3) const                              \
+    {                                                                                      \
+        reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src));                    \
+        reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC));     \
+        reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
+        reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
+        reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
+        reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
+        reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \
+        reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \
+                                                                                           \
+        _mm_deinterleave(v_src0, v_src1, v_src2, v_src3,                                   \
+                         v_src4, v_src5, v_src6, v_src7);                                  \
+                                                                                           \
+        _mm_storeu_##flavor((cast_type *)(dst0), v_src0);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1);                   \
+        _mm_storeu_##flavor((cast_type *)(dst1), v_src2);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3);                   \
+        _mm_storeu_##flavor((cast_type *)(dst2), v_src4);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5);                   \
+        _mm_storeu_##flavor((cast_type *)(dst3), v_src6);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7);                   \
+    }                                                                                      \
+                                                                                           \
+    bool support;                                                                          \
+}
+
+SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
+SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
+SPLIT2_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);
+
+SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
+SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
+SPLIT3_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);
+
+SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
+SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
+SPLIT4_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);
+
 #endif

 template<typename T> static void
@@ -154,6 +308,19 @@ split_( const T* src, T** dst, int len, int cn )
         for( ; i < len - inc_i; i += inc_i, j += inc_j)
             vsplit(src + j, dst0 + i, dst1 + i);
     }
+#elif CV_SSE2
+    if (cn == 2)
+    {
+        int inc_i = 32/sizeof(T);
+        int inc_j = 2 * inc_i;
+
+        VSplit2<T> vsplit;
+        if (vsplit.support)
+        {
+            for( ; i <= len - inc_i; i += inc_i, j += inc_j)
+                vsplit(src + j, dst0 + i, dst1 + i);
+        }
+    }
 #endif
     for( ; i < len; i++, j += cn )
     {
@@ -176,6 +343,20 @@ split_( const T* src, T** dst, int len, int cn )
         for( ; i <= len - inc_i; i += inc_i, j += inc_j)
             vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
     }
+#elif CV_SSE2
+    if (cn == 3)
+    {
+        int inc_i = 32/sizeof(T);
+        int inc_j = 3 * inc_i;
+
+        VSplit3<T> vsplit;
+
+        if (vsplit.support)
+        {
+            for( ; i <= len - inc_i; i += inc_i, j += inc_j)
+                vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
+        }
+    }
 #endif
     for( ; i < len; i++, j += cn )
     {
@@ -199,6 +380,19 @@ split_( const T* src, T** dst, int len, int cn )
         for( ; i <= len - inc_i; i += inc_i, j += inc_j)
             vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
     }
+#elif CV_SSE2
+    if (cn == 4)
+    {
+        int inc_i = 32/sizeof(T);
+        int inc_j = 4 * inc_i;
+
+        VSplit4<T> vsplit;
+        if (vsplit.support)
+        {
+            for( ; i <= len - inc_i; i += inc_i, j += inc_j)
+                vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
+        }
+    }
 #endif
     for( ; i < len; i++, j += cn )
     {
@@ -265,27 +459,18 @@ template<typename T> struct VMerge4;
 }

 MERGE2_KERNEL_TEMPLATE(VMerge2, uchar ,  uint8x16x2_t, vld1q_u8 , vst2q_u8 );
-MERGE2_KERNEL_TEMPLATE(VMerge2, schar ,   int8x16x2_t, vld1q_s8 , vst2q_s8 );
 MERGE2_KERNEL_TEMPLATE(VMerge2, ushort,  uint16x8x2_t, vld1q_u16, vst2q_u16);
-MERGE2_KERNEL_TEMPLATE(VMerge2, short ,   int16x8x2_t, vld1q_s16, vst2q_s16);
 MERGE2_KERNEL_TEMPLATE(VMerge2, int   ,   int32x4x2_t, vld1q_s32, vst2q_s32);
-MERGE2_KERNEL_TEMPLATE(VMerge2, float , float32x4x2_t, vld1q_f32, vst2q_f32);
 MERGE2_KERNEL_TEMPLATE(VMerge2, int64 ,   int64x1x2_t, vld1_s64 , vst2_s64 );

 MERGE3_KERNEL_TEMPLATE(VMerge3, uchar ,  uint8x16x3_t, vld1q_u8 , vst3q_u8 );
-MERGE3_KERNEL_TEMPLATE(VMerge3, schar ,   int8x16x3_t, vld1q_s8 , vst3q_s8 );
 MERGE3_KERNEL_TEMPLATE(VMerge3, ushort,  uint16x8x3_t, vld1q_u16, vst3q_u16);
-MERGE3_KERNEL_TEMPLATE(VMerge3, short ,   int16x8x3_t, vld1q_s16, vst3q_s16);
 MERGE3_KERNEL_TEMPLATE(VMerge3, int   ,   int32x4x3_t, vld1q_s32, vst3q_s32);
-MERGE3_KERNEL_TEMPLATE(VMerge3, float , float32x4x3_t, vld1q_f32, vst3q_f32);
 MERGE3_KERNEL_TEMPLATE(VMerge3, int64 ,   int64x1x3_t, vld1_s64 , vst3_s64 );

 MERGE4_KERNEL_TEMPLATE(VMerge4, uchar ,  uint8x16x4_t, vld1q_u8 , vst4q_u8 );
-MERGE4_KERNEL_TEMPLATE(VMerge4, schar ,   int8x16x4_t, vld1q_s8 , vst4q_s8 );
 MERGE4_KERNEL_TEMPLATE(VMerge4, ushort,  uint16x8x4_t, vld1q_u16, vst4q_u16);
-MERGE4_KERNEL_TEMPLATE(VMerge4, short ,   int16x8x4_t, vld1q_s16, vst4q_s16);
 MERGE4_KERNEL_TEMPLATE(VMerge4, int   ,   int32x4x4_t, vld1q_s32, vst4q_s32);
-MERGE4_KERNEL_TEMPLATE(VMerge4, float , float32x4x4_t, vld1q_f32, vst4q_f32);
 MERGE4_KERNEL_TEMPLATE(VMerge4, int64 ,   int64x1x4_t, vld1_s64 , vst4_s64 );

 #endif
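That completes the convert.cpp changes: the schar/short/float NEON kernels are dropped, presumably because the split/merge dispatch works by element size, so the remaining uchar/ushort/int/int64 kernels already cover same-width types. A condensed sketch of the dispatch pattern split_() now uses on the SSE2 path; it assumes the VSplit2 templates from the hunks above, and split2_demo is an illustrative name:

// Generic VSplit2<T> reports support = false; each SSE2 specialization
// flips it to true; the scalar tail finishes whatever the vector loop
// did not cover.
template <typename T>
static void split2_demo(const T* src, T* dst0, T* dst1, int len)
{
    int i = 0, j = 0;
    VSplit2<T> vsplit;
    if (vsplit.support)
    {
        int inc_i = 32/sizeof(T);   // elements written per plane per step
        int inc_j = 2 * inc_i;      // interleaved elements consumed per step
        for ( ; i <= len - inc_i; i += inc_i, j += inc_j)
            vsplit(src + j, dst0 + i, dst1 + i);
    }
    for ( ; i < len; i++, j += 2)   // scalar tail
    {
        dst0[i] = src[j];
        dst1[i] = src[j + 1];
    }
}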
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 301ea80a1f..fe8ffd7718 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -11,6 +11,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp
index d3d09c338f..7b27dc3507 100644
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp
index b2f36b3292..6c8bad2444 100644
--- a/modules/core/src/matmul.cpp
+++ b/modules/core/src/matmul.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index 530e3205bd..4eb17d6a14 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -404,13 +405,20 @@ static const uchar * initPopcountTable()
     {
         // we compute inverse popcount table,
         // since we pass (img[x] == 0) mask as index in the table.
-        for( int j = 0; j < 256; j++ )
+        unsigned int j = 0u;
+#if CV_POPCNT
+        if (checkHardwareSupport(CV_CPU_POPCNT))
+            for( ; j < 256u; j++ )
+                tab[j] = (uchar)(8 - _mm_popcnt_u32(j));
+#else
+        for( ; j < 256u; j++ )
         {
             int val = 0;
             for( int mask = 1; mask < 256; mask += mask )
                 val += (j & mask) == 0;
             tab[j] = (uchar)val;
        }
+#endif
        initialized = true;
    }
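The table stores the number of zero bits of each byte value, since callers index it with an (img[x] == 0) mask, so the POPCNT path computes 8 - popcount(j). A hedged self-check, not part of the patch, that the two formulas agree:

#include <cassert>

// Scalar path from initPopcountTable(): count the zero bits of j.
static int zero_bits(unsigned int j)
{
    int val = 0;
    for (int mask = 1; mask < 256; mask += mask)
        val += (j & mask) == 0;
    return val;
}

int main()
{
    for (unsigned int j = 0; j < 256u; j++)
    {
        int pop = 0;                     // portable popcount of the low byte
        for (unsigned int v = j; v != 0; v &= v - 1)
            pop++;
        assert(zero_bits(j) == 8 - pop); // matches 8 - _mm_popcnt_u32(j)
    }
    return 0;
}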
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index a7a6e98d4d..6744b33116 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp
index 443111f48d..ffc20777b5 100644
--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
@@ -10,8 +10,7 @@
 //                          License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/imgproc/src/accum.cpp b/modules/imgproc/src/accum.cpp
index d2e8b39aa3..23dc4576ba 100644
--- a/modules/imgproc/src/accum.cpp
+++ b/modules/imgproc/src/accum.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp
index 1311d5abb9..233218b3e2 100644
--- a/modules/imgproc/src/canny.cpp
+++ b/modules/imgproc/src/canny.cpp
@@ -11,6 +11,7 @@
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/imgproc/src/clahe.cpp b/modules/imgproc/src/clahe.cpp
index 18a91d9544..06fc73153f 100644
--- a/modules/imgproc/src/clahe.cpp
+++ b/modules/imgproc/src/clahe.cpp
@@ -11,6 +11,7 @@
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2013, NVIDIA Corporation, all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp
index 6049f9993f..574ff16279 100644
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/imgproc/src/corner.cpp b/modules/imgproc/src/corner.cpp
index 85f2063b28..0d04f6f7e1 100644
--- a/modules/imgproc/src/corner.cpp
+++ b/modules/imgproc/src/corner.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/imgproc/src/demosaicing.cpp b/modules/imgproc/src/demosaicing.cpp
index 0b7afb8ea6..cec450dc71 100644
--- a/modules/imgproc/src/demosaicing.cpp
+++ b/modules/imgproc/src/demosaicing.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index 1fa4557cad..d8b5385910 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp
index 5425de11cd..f489372188 100644
--- a/modules/imgproc/src/pyramids.cpp
+++ b/modules/imgproc/src/pyramids.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp
index b7c3004039..59acdd71b3 100644
--- a/modules/imgproc/src/smooth.cpp
+++ b/modules/imgproc/src/smooth.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/imgproc/src/sumpixels.cpp b/modules/imgproc/src/sumpixels.cpp
index cdef88f6c1..16c7c7ef26 100755
--- a/modules/imgproc/src/sumpixels.cpp
+++ b/modules/imgproc/src/sumpixels.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp
index 84a9233dd2..b8de763f10 100644
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@@ -2998,12 +2998,12 @@ void printVersionInfo(bool useStdOut)

     std::string cpu_features;

-#if CV_MMX
-    if (checkHardwareSupport(CV_CPU_MMX)) cpu_features += " mmx";
-#endif
 #if CV_POPCNT
     if (checkHardwareSupport(CV_CPU_POPCNT)) cpu_features += " popcnt";
 #endif
+#if CV_MMX
+    if (checkHardwareSupport(CV_CPU_MMX)) cpu_features += " mmx";
+#endif
 #if CV_SSE
     if (checkHardwareSupport(CV_CPU_SSE)) cpu_features += " sse";
 #endif
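The ts_func.cpp hunk only reorders the reported feature list so popcnt is printed first. The same runtime query is available to user code; a minimal sketch, assuming an OpenCV 3.x build:

#include "opencv2/core/utility.hpp"
#include <cstdio>

int main()
{
    // cv::checkHardwareSupport() reads the cached CPUID feature bits.
    std::printf("popcnt: %d\n", (int)cv::checkHardwareSupport(CV_CPU_POPCNT));
    std::printf("sse2:   %d\n", (int)cv::checkHardwareSupport(CV_CPU_SSE2));
    std::printf("avx2:   %d\n", (int)cv::checkHardwareSupport(CV_CPU_AVX2));
    return 0;
}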