From c38023f4e78b68d0fba9fb5ba682cb91ee0d42b5 Mon Sep 17 00:00:00 2001
From: Richard Yoo
Date: Wed, 9 Jul 2014 16:55:39 -0700
Subject: [PATCH] Modifications to support dynamic vector dispatch.

---
 cmake/OpenCVCompilerOptions.cmake              |   8 +-
 cmake/OpenCVModule.cmake                       |  14 +
 ...tility_and_system_functions_and_macros.rst  |   1 +
 modules/core/include/opencv2/core/core.hpp     |   1 +
 modules/core/src/system.cpp                    |   2 -
 modules/imgproc/src/avx/imgwarp_avx.cpp        | 176 ++++
 modules/imgproc/src/avx/imgwarp_avx.hpp        |  51 ++
 modules/imgproc/src/avx2/imgwarp_avx2.cpp      | 431 +++++++++++++
 modules/imgproc/src/avx2/imgwarp_avx2.hpp      |  57 ++
 modules/imgproc/src/imgwarp.cpp                | 598 ++----------------
 modules/ts/src/ts_func.cpp                     |   4 -
 11 files changed, 786 insertions(+), 557 deletions(-)
 create mode 100644 modules/imgproc/src/avx/imgwarp_avx.cpp
 create mode 100644 modules/imgproc/src/avx/imgwarp_avx.hpp
 create mode 100644 modules/imgproc/src/avx2/imgwarp_avx2.cpp
 create mode 100644 modules/imgproc/src/avx2/imgwarp_avx2.hpp

diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index f28aaeed50..7de23a66bc 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -140,15 +140,15 @@ if(CMAKE_COMPILER_IS_GNUCXX)
     # SSE3 and further should be disabled under MingW because it generates compiler errors
     if(NOT MINGW)
       if(ENABLE_AVX)
-        add_extra_compiler_option(-mavx)
+        ocv_check_flag_support(CXX "-mavx" _varname)
       endif()
       if(ENABLE_AVX2)
-        add_extra_compiler_option(-mavx2)
+        ocv_check_flag_support(CXX "-mavx2" _varname)
       endif()

       # GCC depresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalence for all SSEx instructions when needed.
-      if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-m(avx|avx2)")
+      if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx")
         if(ENABLE_SSE3)
           add_extra_compiler_option(-msse3)
         endif()
@@ -169,7 +169,7 @@ if(CMAKE_COMPILER_IS_GNUCXX)
   if(X86 OR X86_64)
     if(NOT APPLE AND CMAKE_SIZEOF_VOID_P EQUAL 4)
-      if(OPENCV_EXTRA_CXX_FLAGS MATCHES "-m(sse2|avx|avx2)")
+      if(OPENCV_EXTRA_CXX_FLAGS MATCHES "-m(sse2|avx)")
         add_extra_compiler_option(-mfpmath=sse)# !! important - be on the same wave with x64 compilers
       else()
         add_extra_compiler_option(-mfpmath=387)
diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake
index 79e508609e..f9d333f8d9 100644
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -526,6 +526,20 @@ macro(ocv_glob_module_sources)
     list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
   endif()

+  if(ENABLE_AVX)
+    file(GLOB avx_srcs "src/avx/*.cpp")
+    foreach(src ${avx_srcs})
+      set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS -mavx)
+    endforeach()
+  endif()
+
+  if(ENABLE_AVX2)
+    file(GLOB avx2_srcs "src/avx2/*.cpp")
+    foreach(src ${avx2_srcs})
+      set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS -mavx2)
+    endforeach()
+  endif()
+
   source_group("Include" FILES ${lib_hdrs})
   source_group("Include\\detail" FILES ${lib_hdrs_detail})
diff --git a/modules/core/doc/utility_and_system_functions_and_macros.rst b/modules/core/doc/utility_and_system_functions_and_macros.rst
index 41cf7e1b72..73a6b65386 100644
--- a/modules/core/doc/utility_and_system_functions_and_macros.rst
+++ b/modules/core/doc/utility_and_system_functions_and_macros.rst
@@ -317,6 +317,7 @@ Returns true if the specified feature is supported by the host hardware.
* ``CV_CPU_SSE4_2`` - SSE 4.2 * ``CV_CPU_POPCNT`` - POPCOUNT * ``CV_CPU_AVX`` - AVX + * ``CV_CPU_AVX2`` - AVX2 The function returns true if the host hardware supports the specified feature. When user calls ``setUseOptimized(false)``, the subsequent calls to ``checkHardwareSupport()`` will return false until ``setUseOptimized(true)`` is called. This way user can dynamically switch on and off the optimized code in OpenCV. diff --git a/modules/core/include/opencv2/core/core.hpp b/modules/core/include/opencv2/core/core.hpp index 5667e9e50f..76b9e68133 100644 --- a/modules/core/include/opencv2/core/core.hpp +++ b/modules/core/include/opencv2/core/core.hpp @@ -284,6 +284,7 @@ CV_EXPORTS_W int64 getCPUTickCount(); - CV_CPU_SSE4_2 - SSE 4.2 - CV_CPU_POPCNT - POPCOUNT - CV_CPU_AVX - AVX + - CV_CPU_AVX2 - AVX2 \note {Note that the function output is not static. Once you called cv::useOptimized(false), most of the hardware acceleration is disabled and thus the function will returns false, diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 40d64ffe1b..5a970d511e 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -253,7 +253,6 @@ struct HWFeatures f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX } -#if CV_AVX2 #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64) __cpuidex(cpuid_data, 7, 0); #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__) @@ -286,7 +285,6 @@ struct HWFeatures { f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0; } -#endif return f; } diff --git a/modules/imgproc/src/avx/imgwarp_avx.cpp b/modules/imgproc/src/avx/imgwarp_avx.cpp new file mode 100644 index 0000000000..b7ab44a189 --- /dev/null +++ b/modules/imgproc/src/avx/imgwarp_avx.cpp @@ -0,0 +1,176 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "../precomp.hpp" +#include "imgwarp_avx.hpp" + +#if CV_AVX +int VResizeLinearVec_32f_avx(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) +{ + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + float* dst = (float*)_dst; + int x = 0; + + __m256 b0 = _mm256_set1_ps(beta[0]), b1 = _mm256_set1_ps(beta[1]); + + if( (((size_t)S0|(size_t)S1)&31) == 0 ) + for( ; x <= width - 16; x += 16 ) + { + __m256 x0, x1, y0, y1; + x0 = _mm256_load_ps(S0 + x); + x1 = _mm256_load_ps(S0 + x + 8); + y0 = _mm256_load_ps(S1 + x); + y1 = _mm256_load_ps(S1 + x + 8); + + x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); + x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1)); + + _mm256_storeu_ps( dst + x, x0); + _mm256_storeu_ps( dst + x + 8, x1); + } + else + for( ; x <= width - 16; x += 16 ) + { + __m256 x0, x1, y0, y1; + x0 = _mm256_loadu_ps(S0 + x); + x1 = _mm256_loadu_ps(S0 + x + 8); + y0 = _mm256_loadu_ps(S1 + x); + y1 = _mm256_loadu_ps(S1 + x + 8); + + x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); + x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1)); + + _mm256_storeu_ps( dst + x, x0); + _mm256_storeu_ps( dst + x + 8, x1); + } + + return x; +} + +int VResizeCubicVec_32f_avx(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) +{ + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + float* dst = (float*)_dst; + int x = 0; + __m256 b0 = _mm256_set1_ps(beta[0]), b1 = _mm256_set1_ps(beta[1]), + b2 = _mm256_set1_ps(beta[2]), b3 = _mm256_set1_ps(beta[3]); + + if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&31) == 0 ) + for( ; x <= width - 16; x += 16 ) + { + __m256 x0, x1, y0, y1, s0, s1; + x0 = _mm256_load_ps(S0 + x); + x1 = _mm256_load_ps(S0 + x + 8); + y0 = _mm256_load_ps(S1 + x); + y1 = _mm256_load_ps(S1 + x + 8); + + s0 = _mm256_mul_ps(x0, b0); + s1 = _mm256_mul_ps(x1, b0); + y0 = _mm256_mul_ps(y0, b1); + y1 = _mm256_mul_ps(y1, b1); + s0 = _mm256_add_ps(s0, y0); + s1 = _mm256_add_ps(s1, y1); + + x0 = _mm256_load_ps(S2 + x); + x1 = _mm256_load_ps(S2 + x + 8); + y0 = _mm256_load_ps(S3 + x); + y1 = _mm256_load_ps(S3 + x + 8); + + x0 = _mm256_mul_ps(x0, b2); + x1 = _mm256_mul_ps(x1, b2); + y0 = _mm256_mul_ps(y0, b3); + y1 = _mm256_mul_ps(y1, b3); + s0 = _mm256_add_ps(s0, x0); + s1 = _mm256_add_ps(s1, x1); + s0 = _mm256_add_ps(s0, y0); + s1 = _mm256_add_ps(s1, y1); + + _mm256_storeu_ps( dst + x, s0); + _mm256_storeu_ps( dst + x + 8, s1); + } + else + for( ; x <= width - 16; x += 16 ) + { + __m256 x0, x1, y0, y1, s0, s1; + x0 = _mm256_loadu_ps(S0 + x); + x1 = _mm256_loadu_ps(S0 + x + 8); + y0 = _mm256_loadu_ps(S1 + x); + y1 = _mm256_loadu_ps(S1 + x + 8); + + s0 = _mm256_mul_ps(x0, b0); + s1 = _mm256_mul_ps(x1, b0); + y0 = _mm256_mul_ps(y0, b1); + y1 = _mm256_mul_ps(y1, b1); + s0 = 
_mm256_add_ps(s0, y0); + s1 = _mm256_add_ps(s1, y1); + + x0 = _mm256_loadu_ps(S2 + x); + x1 = _mm256_loadu_ps(S2 + x + 8); + y0 = _mm256_loadu_ps(S3 + x); + y1 = _mm256_loadu_ps(S3 + x + 8); + + x0 = _mm256_mul_ps(x0, b2); + x1 = _mm256_mul_ps(x1, b2); + y0 = _mm256_mul_ps(y0, b3); + y1 = _mm256_mul_ps(y1, b3); + s0 = _mm256_add_ps(s0, x0); + s1 = _mm256_add_ps(s1, x1); + s0 = _mm256_add_ps(s0, y0); + s1 = _mm256_add_ps(s1, y1); + + _mm256_storeu_ps( dst + x, s0); + _mm256_storeu_ps( dst + x + 8, s1); + } + + return x; +} +#else +int VResizeLinearVec_32f_avx(const uchar**, uchar*, const uchar*, int ) { return 0; } + +int VResizeCubicVec_32f_avx(const uchar**, uchar*, const uchar*, int ) { return 0; } +#endif + +/* End of file. */ diff --git a/modules/imgproc/src/avx/imgwarp_avx.hpp b/modules/imgproc/src/avx/imgwarp_avx.hpp new file mode 100644 index 0000000000..d3e3a2b342 --- /dev/null +++ b/modules/imgproc/src/avx/imgwarp_avx.hpp @@ -0,0 +1,51 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// Intel License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of Intel Corporation may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef _CV_IMGWARP_AVX_H_ +#define _CV_IMGWARP_AVX_H_ + +int VResizeLinearVec_32f_avx(const uchar** _src, uchar* _dst, const uchar* _beta, int width ); + +int VResizeCubicVec_32f_avx(const uchar** _src, uchar* _dst, const uchar* _beta, int width ); + +#endif + +/* End of file. 
*/ diff --git a/modules/imgproc/src/avx2/imgwarp_avx2.cpp b/modules/imgproc/src/avx2/imgwarp_avx2.cpp new file mode 100644 index 0000000000..6e4f1fc6e8 --- /dev/null +++ b/modules/imgproc/src/avx2/imgwarp_avx2.cpp @@ -0,0 +1,431 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "../precomp.hpp" +#include "imgwarp_avx2.hpp" + +const int INTER_RESIZE_COEF_BITS=11; +const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS; + +#if CV_AVX2 +int VResizeLinearVec_32s8u_avx2(const uchar** _src, uchar* dst, const uchar* _beta, int width ) +{ + const int** src = (const int**)_src; + const short* beta = (const short*)_beta; + const int *S0 = src[0], *S1 = src[1]; + int x = 0; + __m256i b0 = _mm256_set1_epi16(beta[0]), b1 = _mm256_set1_epi16(beta[1]); + __m256i delta = _mm256_set1_epi16(2); + const int index[8] = { 0, 4, 1, 5, 2, 6, 3, 7 }; + __m256i shuffle = _mm256_load_si256((const __m256i*)index); + + if( (((size_t)S0|(size_t)S1)&31) == 0 ) + for( ; x <= width - 32; x += 32 ) + { + __m256i x0, x1, x2, y0, y1, y2; + x0 = _mm256_load_si256((const __m256i*)(S0 + x)); + x1 = _mm256_load_si256((const __m256i*)(S0 + x + 8)); + y0 = _mm256_load_si256((const __m256i*)(S1 + x)); + y1 = _mm256_load_si256((const __m256i*)(S1 + x + 8)); + x0 = _mm256_packs_epi32(_mm256_srai_epi32(x0, 4), _mm256_srai_epi32(x1, 4)); + y0 = _mm256_packs_epi32(_mm256_srai_epi32(y0, 4), _mm256_srai_epi32(y1, 4)); + + x1 = _mm256_load_si256((const __m256i*)(S0 + x + 16)); + x2 = _mm256_load_si256((const __m256i*)(S0 + x + 24)); + y1 = _mm256_load_si256((const __m256i*)(S1 + x + 16)); + y2 = _mm256_load_si256((const __m256i*)(S1 + x + 24)); + x1 = _mm256_packs_epi32(_mm256_srai_epi32(x1, 4), _mm256_srai_epi32(x2, 4)); + y1 = _mm256_packs_epi32(_mm256_srai_epi32(y1, 4), _mm256_srai_epi32(y2, 4)); + + x0 = _mm256_adds_epi16(_mm256_mulhi_epi16(x0, b0), _mm256_mulhi_epi16(y0, b1)); + x1 = _mm256_adds_epi16(_mm256_mulhi_epi16(x1, b0), _mm256_mulhi_epi16(y1, b1)); + + x0 = _mm256_srai_epi16(_mm256_adds_epi16(x0, delta), 2); + x1 = _mm256_srai_epi16(_mm256_adds_epi16(x1, delta), 2); + x0 = _mm256_packus_epi16(x0, x1); + x0 = _mm256_permutevar8x32_epi32(x0, shuffle); + _mm256_storeu_si256( (__m256i*)(dst + x), x0); + } + else + for( ; x <= width - 32; x += 32 ) + { + __m256i x0, x1, x2, y0, y1, y2; + x0 = _mm256_loadu_si256((const __m256i*)(S0 + x)); + x1 = _mm256_loadu_si256((const __m256i*)(S0 + x + 8)); + y0 = _mm256_loadu_si256((const __m256i*)(S1 + x)); + y1 = _mm256_loadu_si256((const __m256i*)(S1 + x + 8)); + x0 = _mm256_packs_epi32(_mm256_srai_epi32(x0, 4), _mm256_srai_epi32(x1, 4)); + y0 = _mm256_packs_epi32(_mm256_srai_epi32(y0, 4), _mm256_srai_epi32(y1, 4)); + + x1 = _mm256_loadu_si256((const __m256i*)(S0 + x + 16)); + x2 = _mm256_loadu_si256((const __m256i*)(S0 + x + 24)); + y1 = _mm256_loadu_si256((const __m256i*)(S1 + x + 16)); + y2 = _mm256_loadu_si256((const __m256i*)(S1 + x + 24)); + x1 = _mm256_packs_epi32(_mm256_srai_epi32(x1, 4), _mm256_srai_epi32(x2, 4)); + y1 = _mm256_packs_epi32(_mm256_srai_epi32(y1, 4), _mm256_srai_epi32(y2, 4)); + + x0 = _mm256_adds_epi16(_mm256_mulhi_epi16(x0, b0), _mm256_mulhi_epi16(y0, b1)); + x1 = _mm256_adds_epi16(_mm256_mulhi_epi16(x1, b0), _mm256_mulhi_epi16(y1, b1)); + + x0 = _mm256_srai_epi16(_mm256_adds_epi16(x0, delta), 2); + x1 = _mm256_srai_epi16(_mm256_adds_epi16(x1, delta), 2); + x0 = _mm256_packus_epi16(x0, x1); + x0 = _mm256_permutevar8x32_epi32(x0, shuffle); + _mm256_storeu_si256( (__m256i*)(dst + x), x0); + } + + for( ; x < width - 8; x += 8 ) + { + __m256i x0, y0; + x0 = _mm256_srai_epi32(_mm256_loadu_si256((const __m256i*)(S0 + x)), 4); + y0 = _mm256_srai_epi32(_mm256_loadu_si256((const __m256i*)(S1 + x)), 4); + x0 = _mm256_packs_epi32(x0, x0); + y0 = _mm256_packs_epi32(y0, y0); + x0 = 
_mm256_adds_epi16(_mm256_mulhi_epi16(x0, b0), _mm256_mulhi_epi16(y0, b1)); + x0 = _mm256_srai_epi16(_mm256_adds_epi16(x0, delta), 2); + x0 = _mm256_packus_epi16(x0, x0); + *(int*)(dst + x) = _mm_cvtsi128_si32(_mm256_extracti128_si256(x0, 0)); + *(int*)(dst + x + 4) = _mm_cvtsi128_si32(_mm256_extracti128_si256(x0, 1)); + } + + return x; +} + +template +int VResizeLinearVec_32f16_avx2(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) +{ + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + ushort* dst = (ushort*)_dst; + int x = 0; + + __m256 b0 = _mm256_set1_ps(beta[0]), b1 = _mm256_set1_ps(beta[1]); + __m256i preshift = _mm256_set1_epi32(shiftval); + __m256i postshift = _mm256_set1_epi16((short)shiftval); + + if( (((size_t)S0|(size_t)S1)&31) == 0 ) + for( ; x <= width - 32; x += 32 ) + { + __m256 x0, x1, y0, y1; + __m256i t0, t1, t2; + x0 = _mm256_load_ps(S0 + x); + x1 = _mm256_load_ps(S0 + x + 8); + y0 = _mm256_load_ps(S1 + x); + y1 = _mm256_load_ps(S1 + x + 8); + + x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); + x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1)); + t0 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift); + t2 = _mm256_add_epi32(_mm256_cvtps_epi32(x1), preshift); + t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t2), postshift); + + x0 = _mm256_load_ps(S0 + x + 16); + x1 = _mm256_load_ps(S0 + x + 24); + y0 = _mm256_load_ps(S1 + x + 16); + y1 = _mm256_load_ps(S1 + x + 24); + + x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); + x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1)); + t1 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift); + t2 = _mm256_add_epi32(_mm256_cvtps_epi32(x1), preshift); + t1 = _mm256_add_epi16(_mm256_packs_epi32(t1, t2), postshift); + + _mm256_storeu_si256( (__m256i*)(dst + x), t0); + _mm256_storeu_si256( (__m256i*)(dst + x + 16), t1); + } + else + for( ; x <= width - 32; x += 32 ) + { + __m256 x0, x1, y0, y1; + __m256i t0, t1, t2; + x0 = _mm256_loadu_ps(S0 + x); + x1 = _mm256_loadu_ps(S0 + x + 8); + y0 = _mm256_loadu_ps(S1 + x); + y1 = _mm256_loadu_ps(S1 + x + 8); + + x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); + x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1)); + t0 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift); + t2 = _mm256_add_epi32(_mm256_cvtps_epi32(x1), preshift); + t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t2), postshift); + + x0 = _mm256_loadu_ps(S0 + x + 16); + x1 = _mm256_loadu_ps(S0 + x + 24); + y0 = _mm256_loadu_ps(S1 + x + 16); + y1 = _mm256_loadu_ps(S1 + x + 24); + + x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); + x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1)); + t1 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift); + t2 = _mm256_add_epi32(_mm256_cvtps_epi32(x1), preshift); + t1 = _mm256_add_epi16(_mm256_packs_epi32(t1, t2), postshift); + + _mm256_storeu_si256( (__m256i*)(dst + x), t0); + _mm256_storeu_si256( (__m256i*)(dst + x + 16), t1); + } + + for( ; x < width - 8; x += 8 ) + { + __m256 x0, y0; + __m256i t0; + x0 = _mm256_loadu_ps(S0 + x); + y0 = _mm256_loadu_ps(S1 + x); + + x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); + t0 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift); + t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t0), postshift); + _mm_storel_epi64( (__m128i*)(dst + x), _mm256_extracti128_si256(t0, 0)); + _mm_storel_epi64( (__m128i*)(dst + x + 4), 
_mm256_extracti128_si256(t0, 1)); + } + + return x; +} + +int VResizeCubicVec_32s8u_avx2(const uchar** _src, uchar* dst, const uchar* _beta, int width ) +{ + const int** src = (const int**)_src; + const short* beta = (const short*)_beta; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + int x = 0; + float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE); + __m256 b0 = _mm256_set1_ps(beta[0]*scale), b1 = _mm256_set1_ps(beta[1]*scale), + b2 = _mm256_set1_ps(beta[2]*scale), b3 = _mm256_set1_ps(beta[3]*scale); + const int shuffle = 0xd8; // 11 | 01 | 10 | 00 + + if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&31) == 0 ) + for( ; x <= width - 16; x += 16 ) + { + __m256i x0, x1, y0, y1; + __m256 s0, s1, f0, f1; + x0 = _mm256_load_si256((const __m256i*)(S0 + x)); + x1 = _mm256_load_si256((const __m256i*)(S0 + x + 8)); + y0 = _mm256_load_si256((const __m256i*)(S1 + x)); + y1 = _mm256_load_si256((const __m256i*)(S1 + x + 8)); + + s0 = _mm256_mul_ps(_mm256_cvtepi32_ps(x0), b0); + s1 = _mm256_mul_ps(_mm256_cvtepi32_ps(x1), b0); + f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(y0), b1); + f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(y1), b1); + s0 = _mm256_add_ps(s0, f0); + s1 = _mm256_add_ps(s1, f1); + + x0 = _mm256_load_si256((const __m256i*)(S2 + x)); + x1 = _mm256_load_si256((const __m256i*)(S2 + x + 8)); + y0 = _mm256_load_si256((const __m256i*)(S3 + x)); + y1 = _mm256_load_si256((const __m256i*)(S3 + x + 8)); + + f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(x0), b2); + f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(x1), b2); + s0 = _mm256_add_ps(s0, f0); + s1 = _mm256_add_ps(s1, f1); + f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(y0), b3); + f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(y1), b3); + s0 = _mm256_add_ps(s0, f0); + s1 = _mm256_add_ps(s1, f1); + + x0 = _mm256_cvtps_epi32(s0); + x1 = _mm256_cvtps_epi32(s1); + + x0 = _mm256_packs_epi32(x0, x1); + x0 = _mm256_permute4x64_epi64(x0, shuffle); + x0 = _mm256_packus_epi16(x0, x0); + _mm_storel_epi64( (__m128i*)(dst + x), _mm256_extracti128_si256(x0, 0)); + _mm_storel_epi64( (__m128i*)(dst + x + 8), _mm256_extracti128_si256(x0, 1)); + } + else + for( ; x <= width - 16; x += 16 ) + { + __m256i x0, x1, y0, y1; + __m256 s0, s1, f0, f1; + x0 = _mm256_loadu_si256((const __m256i*)(S0 + x)); + x1 = _mm256_loadu_si256((const __m256i*)(S0 + x + 8)); + y0 = _mm256_loadu_si256((const __m256i*)(S1 + x)); + y1 = _mm256_loadu_si256((const __m256i*)(S1 + x + 8)); + + s0 = _mm256_mul_ps(_mm256_cvtepi32_ps(x0), b0); + s1 = _mm256_mul_ps(_mm256_cvtepi32_ps(x1), b0); + f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(y0), b1); + f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(y1), b1); + s0 = _mm256_add_ps(s0, f0); + s1 = _mm256_add_ps(s1, f1); + + x0 = _mm256_loadu_si256((const __m256i*)(S2 + x)); + x1 = _mm256_loadu_si256((const __m256i*)(S2 + x + 8)); + y0 = _mm256_loadu_si256((const __m256i*)(S3 + x)); + y1 = _mm256_loadu_si256((const __m256i*)(S3 + x + 8)); + + f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(x0), b2); + f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(x1), b2); + s0 = _mm256_add_ps(s0, f0); + s1 = _mm256_add_ps(s1, f1); + f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(y0), b3); + f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(y1), b3); + s0 = _mm256_add_ps(s0, f0); + s1 = _mm256_add_ps(s1, f1); + + x0 = _mm256_cvtps_epi32(s0); + x1 = _mm256_cvtps_epi32(s1); + + x0 = _mm256_packs_epi32(x0, x1); + x0 = _mm256_permute4x64_epi64(x0, shuffle); + x0 = _mm256_packus_epi16(x0, x0); + _mm_storel_epi64( (__m128i*)(dst + x), _mm256_extracti128_si256(x0, 0)); + _mm_storel_epi64( (__m128i*)(dst + x + 8), 
_mm256_extracti128_si256(x0, 1)); + } + + return x; +} + +template +int VResizeCubicVec_32f16_avx2(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) +{ + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + ushort* dst = (ushort*)_dst; + int x = 0; + __m256 b0 = _mm256_set1_ps(beta[0]), b1 = _mm256_set1_ps(beta[1]), + b2 = _mm256_set1_ps(beta[2]), b3 = _mm256_set1_ps(beta[3]); + __m256i preshift = _mm256_set1_epi32(shiftval); + __m256i postshift = _mm256_set1_epi16((short)shiftval); + const int shuffle = 0xd8; // 11 | 01 | 10 | 00 + + if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&31) == 0 ) + for( ; x <= width - 16; x += 16 ) + { + __m256 x0, x1, y0, y1, s0, s1; + __m256i t0, t1; + x0 = _mm256_load_ps(S0 + x); + x1 = _mm256_load_ps(S0 + x + 8); + y0 = _mm256_load_ps(S1 + x); + y1 = _mm256_load_ps(S1 + x + 8); + + s0 = _mm256_mul_ps(x0, b0); + s1 = _mm256_mul_ps(x1, b0); + y0 = _mm256_mul_ps(y0, b1); + y1 = _mm256_mul_ps(y1, b1); + s0 = _mm256_add_ps(s0, y0); + s1 = _mm256_add_ps(s1, y1); + + x0 = _mm256_load_ps(S2 + x); + x1 = _mm256_load_ps(S2 + x + 8); + y0 = _mm256_load_ps(S3 + x); + y1 = _mm256_load_ps(S3 + x + 8); + + x0 = _mm256_mul_ps(x0, b2); + x1 = _mm256_mul_ps(x1, b2); + y0 = _mm256_mul_ps(y0, b3); + y1 = _mm256_mul_ps(y1, b3); + s0 = _mm256_add_ps(s0, x0); + s1 = _mm256_add_ps(s1, x1); + s0 = _mm256_add_ps(s0, y0); + s1 = _mm256_add_ps(s1, y1); + + t0 = _mm256_add_epi32(_mm256_cvtps_epi32(s0), preshift); + t1 = _mm256_add_epi32(_mm256_cvtps_epi32(s1), preshift); + + t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t1), postshift); + t0 = _mm256_permute4x64_epi64(t0, shuffle); + _mm256_storeu_si256( (__m256i*)(dst + x), t0); + } + else + for( ; x <= width - 16; x += 16 ) + { + __m256 x0, x1, y0, y1, s0, s1; + __m256i t0, t1; + x0 = _mm256_loadu_ps(S0 + x); + x1 = _mm256_loadu_ps(S0 + x + 8); + y0 = _mm256_loadu_ps(S1 + x); + y1 = _mm256_loadu_ps(S1 + x + 8); + + s0 = _mm256_mul_ps(x0, b0); + s1 = _mm256_mul_ps(x1, b0); + y0 = _mm256_mul_ps(y0, b1); + y1 = _mm256_mul_ps(y1, b1); + s0 = _mm256_add_ps(s0, y0); + s1 = _mm256_add_ps(s1, y1); + + x0 = _mm256_loadu_ps(S2 + x); + x1 = _mm256_loadu_ps(S2 + x + 8); + y0 = _mm256_loadu_ps(S3 + x); + y1 = _mm256_loadu_ps(S3 + x + 8); + + x0 = _mm256_mul_ps(x0, b2); + x1 = _mm256_mul_ps(x1, b2); + y0 = _mm256_mul_ps(y0, b3); + y1 = _mm256_mul_ps(y1, b3); + s0 = _mm256_add_ps(s0, x0); + s1 = _mm256_add_ps(s1, x1); + s0 = _mm256_add_ps(s0, y0); + s1 = _mm256_add_ps(s1, y1); + + t0 = _mm256_add_epi32(_mm256_cvtps_epi32(s0), preshift); + t1 = _mm256_add_epi32(_mm256_cvtps_epi32(s1), preshift); + + t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t1), postshift); + t0 = _mm256_permute4x64_epi64(t0, shuffle); + _mm256_storeu_si256( (__m256i*)(dst + x), t0); + } + + return x; +} +#else +int VResizeLinearVec_32s8u_avx2(const uchar**, uchar*, const uchar*, int ) { return 0; } + +template +int VResizeLinearVec_32f16_avx2(const uchar**, uchar*, const uchar*, int ) { return 0; } + +int VResizeCubicVec_32s8u_avx2(const uchar**, uchar*, const uchar*, int ) { return 0; } + +template +int VResizeCubicVec_32f16_avx2(const uchar**, uchar*, const uchar*, int ) { return 0; } +#endif + +// Template instantiations. 
+template int VResizeLinearVec_32f16_avx2<SHRT_MIN>(const uchar** _src, uchar* _dst, const uchar* _beta, int width );
+template int VResizeLinearVec_32f16_avx2<0>(const uchar** _src, uchar* _dst, const uchar* _beta, int width );
+
+template int VResizeCubicVec_32f16_avx2<SHRT_MIN>(const uchar** _src, uchar* _dst, const uchar* _beta, int width );
+template int VResizeCubicVec_32f16_avx2<0>(const uchar** _src, uchar* _dst, const uchar* _beta, int width );
+
+/* End of file. */
diff --git a/modules/imgproc/src/avx2/imgwarp_avx2.hpp b/modules/imgproc/src/avx2/imgwarp_avx2.hpp
new file mode 100644
index 0000000000..f4d4e63c73
--- /dev/null
+++ b/modules/imgproc/src/avx2/imgwarp_avx2.hpp
@@ -0,0 +1,57 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef _CV_IMGWARP_AVX2_H_
+#define _CV_IMGWARP_AVX2_H_
+
+int VResizeLinearVec_32s8u_avx2(const uchar** _src, uchar* dst, const uchar* _beta, int width );
+
+template<int shiftval>
+int VResizeLinearVec_32f16_avx2(const uchar** _src, uchar* _dst, const uchar* _beta, int width );
+
+int VResizeCubicVec_32s8u_avx2(const uchar** _src, uchar* dst, const uchar* _beta, int width );
+
+template<int shiftval>
+int VResizeCubicVec_32f16_avx2(const uchar** _src, uchar* _dst, const uchar* _beta, int width );
+
+#endif
+
+/* End of file.
*/ diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 88b278710d..b77526044c 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -47,6 +47,8 @@ // */ #include "precomp.hpp" +#include "avx/imgwarp_avx.hpp" +#include "avx2/imgwarp_avx2.hpp" #include #include @@ -54,10 +56,6 @@ static IppStatus sts = ippInit(); #endif -#ifdef _MSC_VER -# pragma warning(disable:4752) // Disable warning for mixing SSE and AVX -#endif - namespace cv { @@ -455,7 +453,7 @@ struct HResizeNoVec #if CV_SSE2 -static int VResizeLinearVec_32s8u_sse2(const uchar** _src, uchar* dst, const uchar* _beta, int width) +static int VResizeLinearVec_32s8u_sse2(const uchar** _src, uchar* dst, const uchar* _beta, int width ) { const int** src = (const int**)_src; const short* beta = (const short*)_beta; @@ -531,103 +529,19 @@ static int VResizeLinearVec_32s8u_sse2(const uchar** _src, uchar* dst, const uch return x; } -#if CV_AVX2 -int VResizeLinearVec_32s8u_avx2(const uchar** _src, uchar* dst, const uchar* _beta, int width ) -{ - const int** src = (const int**)_src; - const short* beta = (const short*)_beta; - const int *S0 = src[0], *S1 = src[1]; - int x = 0; - __m256i b0 = _mm256_set1_epi16(beta[0]), b1 = _mm256_set1_epi16(beta[1]); - __m256i delta = _mm256_set1_epi16(2); - const int index[8] = { 0, 4, 1, 5, 2, 6, 3, 7 }; - __m256i shuffle = _mm256_load_si256((const __m256i*)index); - - if( (((size_t)S0|(size_t)S1)&31) == 0 ) - for( ; x <= width - 32; x += 32 ) - { - __m256i x0, x1, x2, y0, y1, y2; - x0 = _mm256_load_si256((const __m256i*)(S0 + x)); - x1 = _mm256_load_si256((const __m256i*)(S0 + x + 8)); - y0 = _mm256_load_si256((const __m256i*)(S1 + x)); - y1 = _mm256_load_si256((const __m256i*)(S1 + x + 8)); - x0 = _mm256_packs_epi32(_mm256_srai_epi32(x0, 4), _mm256_srai_epi32(x1, 4)); - y0 = _mm256_packs_epi32(_mm256_srai_epi32(y0, 4), _mm256_srai_epi32(y1, 4)); - - x1 = _mm256_load_si256((const __m256i*)(S0 + x + 16)); - x2 = _mm256_load_si256((const __m256i*)(S0 + x + 24)); - y1 = _mm256_load_si256((const __m256i*)(S1 + x + 16)); - y2 = _mm256_load_si256((const __m256i*)(S1 + x + 24)); - x1 = _mm256_packs_epi32(_mm256_srai_epi32(x1, 4), _mm256_srai_epi32(x2, 4)); - y1 = _mm256_packs_epi32(_mm256_srai_epi32(y1, 4), _mm256_srai_epi32(y2, 4)); - - x0 = _mm256_adds_epi16(_mm256_mulhi_epi16(x0, b0), _mm256_mulhi_epi16(y0, b1)); - x1 = _mm256_adds_epi16(_mm256_mulhi_epi16(x1, b0), _mm256_mulhi_epi16(y1, b1)); - - x0 = _mm256_srai_epi16(_mm256_adds_epi16(x0, delta), 2); - x1 = _mm256_srai_epi16(_mm256_adds_epi16(x1, delta), 2); - x0 = _mm256_packus_epi16(x0, x1); - x0 = _mm256_permutevar8x32_epi32(x0, shuffle); - _mm256_storeu_si256( (__m256i*)(dst + x), x0); - } - else - for( ; x <= width - 32; x += 32 ) - { - __m256i x0, x1, x2, y0, y1, y2; - x0 = _mm256_loadu_si256((const __m256i*)(S0 + x)); - x1 = _mm256_loadu_si256((const __m256i*)(S0 + x + 8)); - y0 = _mm256_loadu_si256((const __m256i*)(S1 + x)); - y1 = _mm256_loadu_si256((const __m256i*)(S1 + x + 8)); - x0 = _mm256_packs_epi32(_mm256_srai_epi32(x0, 4), _mm256_srai_epi32(x1, 4)); - y0 = _mm256_packs_epi32(_mm256_srai_epi32(y0, 4), _mm256_srai_epi32(y1, 4)); - - x1 = _mm256_loadu_si256((const __m256i*)(S0 + x + 16)); - x2 = _mm256_loadu_si256((const __m256i*)(S0 + x + 24)); - y1 = _mm256_loadu_si256((const __m256i*)(S1 + x + 16)); - y2 = _mm256_loadu_si256((const __m256i*)(S1 + x + 24)); - x1 = _mm256_packs_epi32(_mm256_srai_epi32(x1, 4), _mm256_srai_epi32(x2, 4)); - y1 = 
_mm256_packs_epi32(_mm256_srai_epi32(y1, 4), _mm256_srai_epi32(y2, 4)); - - x0 = _mm256_adds_epi16(_mm256_mulhi_epi16(x0, b0), _mm256_mulhi_epi16(y0, b1)); - x1 = _mm256_adds_epi16(_mm256_mulhi_epi16(x1, b0), _mm256_mulhi_epi16(y1, b1)); - - x0 = _mm256_srai_epi16(_mm256_adds_epi16(x0, delta), 2); - x1 = _mm256_srai_epi16(_mm256_adds_epi16(x1, delta), 2); - x0 = _mm256_packus_epi16(x0, x1); - x0 = _mm256_permutevar8x32_epi32(x0, shuffle); - _mm256_storeu_si256( (__m256i*)(dst + x), x0); - } - - for( ; x < width - 8; x += 8 ) - { - __m256i x0, y0; - x0 = _mm256_srai_epi32(_mm256_loadu_si256((const __m256i*)(S0 + x)), 4); - y0 = _mm256_srai_epi32(_mm256_loadu_si256((const __m256i*)(S1 + x)), 4); - x0 = _mm256_packs_epi32(x0, x0); - y0 = _mm256_packs_epi32(y0, y0); - x0 = _mm256_adds_epi16(_mm256_mulhi_epi16(x0, b0), _mm256_mulhi_epi16(y0, b1)); - x0 = _mm256_srai_epi16(_mm256_adds_epi16(x0, delta), 2); - x0 = _mm256_packus_epi16(x0, x0); - *(int*)(dst + x) = _mm_cvtsi128_si32(_mm256_extracti128_si256(x0, 0)); - *(int*)(dst + x + 4) = _mm_cvtsi128_si32(_mm256_extracti128_si256(x0, 1)); - } - - return x; -} -#endif - struct VResizeLinearVec_32s8u { int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const { -#if CV_AVX2 - if( checkHardwareSupport(CV_CPU_AVX2) ) - return VResizeLinearVec_32s8u_avx2(_src, dst, _beta, width); -#endif - if( checkHardwareSupport(CV_CPU_SSE2) ) - return VResizeLinearVec_32s8u_sse2(_src, dst, _beta, width); + int processed = 0; - return 0; + if( checkHardwareSupport(CV_CPU_AVX2) ) + processed += VResizeLinearVec_32s8u_avx2(_src, dst, _beta, width); + + if( !processed && checkHardwareSupport(CV_CPU_SSE2) ) + processed += VResizeLinearVec_32s8u_sse2(_src, dst, _beta, width); + + return processed; } }; @@ -721,111 +635,19 @@ int VResizeLinearVec_32f16_sse2(const uchar** _src, uchar* _dst, const uchar* _b return x; } -#if CV_AVX2 -template -int VResizeLinearVec_32f16_avx2(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) -{ - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1]; - ushort* dst = (ushort*)_dst; - int x = 0; - - __m256 b0 = _mm256_set1_ps(beta[0]), b1 = _mm256_set1_ps(beta[1]); - __m256i preshift = _mm256_set1_epi32(shiftval); - __m256i postshift = _mm256_set1_epi16((short)shiftval); - - if( (((size_t)S0|(size_t)S1)&31) == 0 ) - for( ; x <= width - 32; x += 32 ) - { - __m256 x0, x1, y0, y1; - __m256i t0, t1, t2; - x0 = _mm256_load_ps(S0 + x); - x1 = _mm256_load_ps(S0 + x + 8); - y0 = _mm256_load_ps(S1 + x); - y1 = _mm256_load_ps(S1 + x + 8); - - x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); - x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1)); - t0 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift); - t2 = _mm256_add_epi32(_mm256_cvtps_epi32(x1), preshift); - t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t2), postshift); - - x0 = _mm256_load_ps(S0 + x + 16); - x1 = _mm256_load_ps(S0 + x + 24); - y0 = _mm256_load_ps(S1 + x + 16); - y1 = _mm256_load_ps(S1 + x + 24); - - x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); - x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1)); - t1 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift); - t2 = _mm256_add_epi32(_mm256_cvtps_epi32(x1), preshift); - t1 = _mm256_add_epi16(_mm256_packs_epi32(t1, t2), postshift); - - _mm256_storeu_si256( (__m256i*)(dst + x), t0); - _mm256_storeu_si256( (__m256i*)(dst + x + 16), t1); - } - else - for( ; x <= width 
- 32; x += 32 ) - { - __m256 x0, x1, y0, y1; - __m256i t0, t1, t2; - x0 = _mm256_loadu_ps(S0 + x); - x1 = _mm256_loadu_ps(S0 + x + 8); - y0 = _mm256_loadu_ps(S1 + x); - y1 = _mm256_loadu_ps(S1 + x + 8); - - x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); - x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1)); - t0 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift); - t2 = _mm256_add_epi32(_mm256_cvtps_epi32(x1), preshift); - t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t2), postshift); - - x0 = _mm256_loadu_ps(S0 + x + 16); - x1 = _mm256_loadu_ps(S0 + x + 24); - y0 = _mm256_loadu_ps(S1 + x + 16); - y1 = _mm256_loadu_ps(S1 + x + 24); - - x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); - x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1)); - t1 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift); - t2 = _mm256_add_epi32(_mm256_cvtps_epi32(x1), preshift); - t1 = _mm256_add_epi16(_mm256_packs_epi32(t1, t2), postshift); - - _mm256_storeu_si256( (__m256i*)(dst + x), t0); - _mm256_storeu_si256( (__m256i*)(dst + x + 16), t1); - } - - for( ; x < width - 8; x += 8 ) - { - __m256 x0, y0; - __m256i t0; - x0 = _mm256_loadu_ps(S0 + x); - y0 = _mm256_loadu_ps(S1 + x); - - x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); - t0 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift); - t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t0), postshift); - _mm_storel_epi64( (__m128i*)(dst + x), _mm256_extracti128_si256(t0, 0)); - _mm_storel_epi64( (__m128i*)(dst + x + 4), _mm256_extracti128_si256(t0, 1)); - } - - return x; -} -#endif - template struct VResizeLinearVec_32f16 { int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const { -#if CV_AVX2 - if( checkHardwareSupport(CV_CPU_AVX2) ) - return VResizeLinearVec_32f16_avx2(_src, _dst, _beta, width); -#endif - if( checkHardwareSupport(CV_CPU_SSE2) ) - return VResizeLinearVec_32f16_sse2(_src, _dst, _beta, width); + int processed = 0; - return 0; + if( checkHardwareSupport(CV_CPU_AVX2) ) + processed += VResizeLinearVec_32f16_avx2(_src, _dst, _beta, width); + + if( !processed && checkHardwareSupport(CV_CPU_SSE2) ) + processed += VResizeLinearVec_32f16_sse2(_src, _dst, _beta, width); + + return processed; } }; @@ -876,68 +698,22 @@ static int VResizeLinearVec_32f_sse(const uchar** _src, uchar* _dst, const uchar return x; } -#if CV_AVX -int VResizeLinearVec_32f_avx(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) -{ - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1]; - float* dst = (float*)_dst; - int x = 0; - - __m256 b0 = _mm256_set1_ps(beta[0]), b1 = _mm256_set1_ps(beta[1]); - - if( (((size_t)S0|(size_t)S1)&31) == 0 ) - for( ; x <= width - 16; x += 16 ) - { - __m256 x0, x1, y0, y1; - x0 = _mm256_load_ps(S0 + x); - x1 = _mm256_load_ps(S0 + x + 8); - y0 = _mm256_load_ps(S1 + x); - y1 = _mm256_load_ps(S1 + x + 8); - - x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); - x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1)); - - _mm256_storeu_ps( dst + x, x0); - _mm256_storeu_ps( dst + x + 8, x1); - } - else - for( ; x <= width - 16; x += 16 ) - { - __m256 x0, x1, y0, y1; - x0 = _mm256_loadu_ps(S0 + x); - x1 = _mm256_loadu_ps(S0 + x + 8); - y0 = _mm256_loadu_ps(S1 + x); - y1 = _mm256_loadu_ps(S1 + x + 8); - - x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); - x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1)); - - 
_mm256_storeu_ps( dst + x, x0); - _mm256_storeu_ps( dst + x + 8, x1); - } - - return x; -} -#endif - struct VResizeLinearVec_32f { int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const { -#if CV_AVX - if( checkHardwareSupport(CV_CPU_AVX) ) - return VResizeLinearVec_32f_avx(_src, _dst, _beta, width); -#endif - if( checkHardwareSupport(CV_CPU_SSE) ) - return VResizeLinearVec_32f_sse(_src, _dst, _beta, width); + int processed = 0; - return 0; + if( checkHardwareSupport(CV_CPU_AVX) ) + processed += VResizeLinearVec_32f_avx(_src, _dst, _beta, width); + + if( !processed && checkHardwareSupport(CV_CPU_SSE) ) + processed += VResizeLinearVec_32f_sse(_src, _dst, _beta, width); + + return processed; } }; - static int VResizeCubicVec_32s8u_sse2(const uchar** _src, uchar* dst, const uchar* _beta, int width ) { const int** src = (const int**)_src; @@ -1026,115 +802,19 @@ static int VResizeCubicVec_32s8u_sse2(const uchar** _src, uchar* dst, const ucha return x; } -#if CV_AVX2 -int VResizeCubicVec_32s8u_avx2(const uchar** _src, uchar* dst, const uchar* _beta, int width ) -{ - const int** src = (const int**)_src; - const short* beta = (const short*)_beta; - const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - int x = 0; - float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE); - __m256 b0 = _mm256_set1_ps(beta[0]*scale), b1 = _mm256_set1_ps(beta[1]*scale), - b2 = _mm256_set1_ps(beta[2]*scale), b3 = _mm256_set1_ps(beta[3]*scale); - const int shuffle = 0xd8; // 11 | 01 | 10 | 00 - - if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&31) == 0 ) - for( ; x <= width - 16; x += 16 ) - { - __m256i x0, x1, y0, y1; - __m256 s0, s1, f0, f1; - x0 = _mm256_load_si256((const __m256i*)(S0 + x)); - x1 = _mm256_load_si256((const __m256i*)(S0 + x + 8)); - y0 = _mm256_load_si256((const __m256i*)(S1 + x)); - y1 = _mm256_load_si256((const __m256i*)(S1 + x + 8)); - - s0 = _mm256_mul_ps(_mm256_cvtepi32_ps(x0), b0); - s1 = _mm256_mul_ps(_mm256_cvtepi32_ps(x1), b0); - f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(y0), b1); - f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(y1), b1); - s0 = _mm256_add_ps(s0, f0); - s1 = _mm256_add_ps(s1, f1); - - x0 = _mm256_load_si256((const __m256i*)(S2 + x)); - x1 = _mm256_load_si256((const __m256i*)(S2 + x + 8)); - y0 = _mm256_load_si256((const __m256i*)(S3 + x)); - y1 = _mm256_load_si256((const __m256i*)(S3 + x + 8)); - - f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(x0), b2); - f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(x1), b2); - s0 = _mm256_add_ps(s0, f0); - s1 = _mm256_add_ps(s1, f1); - f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(y0), b3); - f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(y1), b3); - s0 = _mm256_add_ps(s0, f0); - s1 = _mm256_add_ps(s1, f1); - - x0 = _mm256_cvtps_epi32(s0); - x1 = _mm256_cvtps_epi32(s1); - - x0 = _mm256_packs_epi32(x0, x1); - x0 = _mm256_permute4x64_epi64(x0, shuffle); - x0 = _mm256_packus_epi16(x0, x0); - _mm_storel_epi64( (__m128i*)(dst + x), _mm256_extracti128_si256(x0, 0)); - _mm_storel_epi64( (__m128i*)(dst + x + 8), _mm256_extracti128_si256(x0, 1)); - } - else - for( ; x <= width - 16; x += 16 ) - { - __m256i x0, x1, y0, y1; - __m256 s0, s1, f0, f1; - x0 = _mm256_loadu_si256((const __m256i*)(S0 + x)); - x1 = _mm256_loadu_si256((const __m256i*)(S0 + x + 8)); - y0 = _mm256_loadu_si256((const __m256i*)(S1 + x)); - y1 = _mm256_loadu_si256((const __m256i*)(S1 + x + 8)); - - s0 = _mm256_mul_ps(_mm256_cvtepi32_ps(x0), b0); - s1 = _mm256_mul_ps(_mm256_cvtepi32_ps(x1), b0); - f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(y0), b1); - f1 = 
_mm256_mul_ps(_mm256_cvtepi32_ps(y1), b1); - s0 = _mm256_add_ps(s0, f0); - s1 = _mm256_add_ps(s1, f1); - - x0 = _mm256_loadu_si256((const __m256i*)(S2 + x)); - x1 = _mm256_loadu_si256((const __m256i*)(S2 + x + 8)); - y0 = _mm256_loadu_si256((const __m256i*)(S3 + x)); - y1 = _mm256_loadu_si256((const __m256i*)(S3 + x + 8)); - - f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(x0), b2); - f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(x1), b2); - s0 = _mm256_add_ps(s0, f0); - s1 = _mm256_add_ps(s1, f1); - f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(y0), b3); - f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(y1), b3); - s0 = _mm256_add_ps(s0, f0); - s1 = _mm256_add_ps(s1, f1); - - x0 = _mm256_cvtps_epi32(s0); - x1 = _mm256_cvtps_epi32(s1); - - x0 = _mm256_packs_epi32(x0, x1); - x0 = _mm256_permute4x64_epi64(x0, shuffle); - x0 = _mm256_packus_epi16(x0, x0); - _mm_storel_epi64( (__m128i*)(dst + x), _mm256_extracti128_si256(x0, 0)); - _mm_storel_epi64( (__m128i*)(dst + x + 8), _mm256_extracti128_si256(x0, 1)); - } - - return x; -} -#endif - struct VResizeCubicVec_32s8u { int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const { -#if CV_AVX2 - if( checkHardwareSupport(CV_CPU_AVX2) ) - return VResizeCubicVec_32s8u_avx2(_src, dst, _beta, width); -#endif - if( checkHardwareSupport(CV_CPU_SSE2) ) - return VResizeCubicVec_32s8u_sse2(_src, dst, _beta, width); + int processed = 0; - return 0; + if( checkHardwareSupport(CV_CPU_AVX2) ) + processed += VResizeCubicVec_32s8u_avx2(_src, dst, _beta, width); + + if( !processed && checkHardwareSupport(CV_CPU_SSE2) ) + processed += VResizeCubicVec_32s8u_sse2(_src, dst, _beta, width); + + return processed; } }; @@ -1230,114 +910,19 @@ int VResizeCubicVec_32f16_sse2(const uchar** _src, uchar* _dst, const uchar* _be return x; } -#if CV_AVX2 -template -int VResizeCubicVec_32f16_avx2(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) -{ - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - ushort* dst = (ushort*)_dst; - int x = 0; - __m256 b0 = _mm256_set1_ps(beta[0]), b1 = _mm256_set1_ps(beta[1]), - b2 = _mm256_set1_ps(beta[2]), b3 = _mm256_set1_ps(beta[3]); - __m256i preshift = _mm256_set1_epi32(shiftval); - __m256i postshift = _mm256_set1_epi16((short)shiftval); - const int shuffle = 0xd8; // 11 | 01 | 10 | 00 - - if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&31) == 0 ) - for( ; x <= width - 16; x += 16 ) - { - __m256 x0, x1, y0, y1, s0, s1; - __m256i t0, t1; - x0 = _mm256_load_ps(S0 + x); - x1 = _mm256_load_ps(S0 + x + 8); - y0 = _mm256_load_ps(S1 + x); - y1 = _mm256_load_ps(S1 + x + 8); - - s0 = _mm256_mul_ps(x0, b0); - s1 = _mm256_mul_ps(x1, b0); - y0 = _mm256_mul_ps(y0, b1); - y1 = _mm256_mul_ps(y1, b1); - s0 = _mm256_add_ps(s0, y0); - s1 = _mm256_add_ps(s1, y1); - - x0 = _mm256_load_ps(S2 + x); - x1 = _mm256_load_ps(S2 + x + 8); - y0 = _mm256_load_ps(S3 + x); - y1 = _mm256_load_ps(S3 + x + 8); - - x0 = _mm256_mul_ps(x0, b2); - x1 = _mm256_mul_ps(x1, b2); - y0 = _mm256_mul_ps(y0, b3); - y1 = _mm256_mul_ps(y1, b3); - s0 = _mm256_add_ps(s0, x0); - s1 = _mm256_add_ps(s1, x1); - s0 = _mm256_add_ps(s0, y0); - s1 = _mm256_add_ps(s1, y1); - - t0 = _mm256_add_epi32(_mm256_cvtps_epi32(s0), preshift); - t1 = _mm256_add_epi32(_mm256_cvtps_epi32(s1), preshift); - - t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t1), postshift); - t0 = _mm256_permute4x64_epi64(t0, shuffle); - _mm256_storeu_si256( (__m256i*)(dst + x), t0); - } - else - for( ; x <= width - 16; x 
+= 16 ) - { - __m256 x0, x1, y0, y1, s0, s1; - __m256i t0, t1; - x0 = _mm256_loadu_ps(S0 + x); - x1 = _mm256_loadu_ps(S0 + x + 8); - y0 = _mm256_loadu_ps(S1 + x); - y1 = _mm256_loadu_ps(S1 + x + 8); - - s0 = _mm256_mul_ps(x0, b0); - s1 = _mm256_mul_ps(x1, b0); - y0 = _mm256_mul_ps(y0, b1); - y1 = _mm256_mul_ps(y1, b1); - s0 = _mm256_add_ps(s0, y0); - s1 = _mm256_add_ps(s1, y1); - - x0 = _mm256_loadu_ps(S2 + x); - x1 = _mm256_loadu_ps(S2 + x + 8); - y0 = _mm256_loadu_ps(S3 + x); - y1 = _mm256_loadu_ps(S3 + x + 8); - - x0 = _mm256_mul_ps(x0, b2); - x1 = _mm256_mul_ps(x1, b2); - y0 = _mm256_mul_ps(y0, b3); - y1 = _mm256_mul_ps(y1, b3); - s0 = _mm256_add_ps(s0, x0); - s1 = _mm256_add_ps(s1, x1); - s0 = _mm256_add_ps(s0, y0); - s1 = _mm256_add_ps(s1, y1); - - t0 = _mm256_add_epi32(_mm256_cvtps_epi32(s0), preshift); - t1 = _mm256_add_epi32(_mm256_cvtps_epi32(s1), preshift); - - t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t1), postshift); - t0 = _mm256_permute4x64_epi64(t0, shuffle); - _mm256_storeu_si256( (__m256i*)(dst + x), t0); - } - - return x; -} -#endif - template struct VResizeCubicVec_32f16 { int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const { -#if CV_AVX2 - if( checkHardwareSupport(CV_CPU_AVX2) ) - return VResizeCubicVec_32f16_avx2(_src, _dst, _beta, width); -#endif - if( checkHardwareSupport(CV_CPU_SSE2) ) - return VResizeCubicVec_32f16_sse2(_src, _dst, _beta, width); + int processed = 0; - return 0; + if( checkHardwareSupport(CV_CPU_AVX2) ) + processed += VResizeCubicVec_32f16_avx2(_src, _dst, _beta, width); + + if( !processed && checkHardwareSupport(CV_CPU_SSE2) ) + processed += VResizeCubicVec_32f16_sse2(_src, _dst, _beta, width); + + return processed; } }; @@ -1424,100 +1009,19 @@ static int VResizeCubicVec_32f_sse(const uchar** _src, uchar* _dst, const uchar* return x; } -#if CV_AVX -int VResizeCubicVec_32f_avx(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) -{ - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - float* dst = (float*)_dst; - int x = 0; - __m256 b0 = _mm256_set1_ps(beta[0]), b1 = _mm256_set1_ps(beta[1]), - b2 = _mm256_set1_ps(beta[2]), b3 = _mm256_set1_ps(beta[3]); - - if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&31) == 0 ) - for( ; x <= width - 16; x += 16 ) - { - __m256 x0, x1, y0, y1, s0, s1; - x0 = _mm256_load_ps(S0 + x); - x1 = _mm256_load_ps(S0 + x + 8); - y0 = _mm256_load_ps(S1 + x); - y1 = _mm256_load_ps(S1 + x + 8); - - s0 = _mm256_mul_ps(x0, b0); - s1 = _mm256_mul_ps(x1, b0); - y0 = _mm256_mul_ps(y0, b1); - y1 = _mm256_mul_ps(y1, b1); - s0 = _mm256_add_ps(s0, y0); - s1 = _mm256_add_ps(s1, y1); - - x0 = _mm256_load_ps(S2 + x); - x1 = _mm256_load_ps(S2 + x + 8); - y0 = _mm256_load_ps(S3 + x); - y1 = _mm256_load_ps(S3 + x + 8); - - x0 = _mm256_mul_ps(x0, b2); - x1 = _mm256_mul_ps(x1, b2); - y0 = _mm256_mul_ps(y0, b3); - y1 = _mm256_mul_ps(y1, b3); - s0 = _mm256_add_ps(s0, x0); - s1 = _mm256_add_ps(s1, x1); - s0 = _mm256_add_ps(s0, y0); - s1 = _mm256_add_ps(s1, y1); - - _mm256_storeu_ps( dst + x, s0); - _mm256_storeu_ps( dst + x + 8, s1); - } - else - for( ; x <= width - 16; x += 16 ) - { - __m256 x0, x1, y0, y1, s0, s1; - x0 = _mm256_loadu_ps(S0 + x); - x1 = _mm256_loadu_ps(S0 + x + 8); - y0 = _mm256_loadu_ps(S1 + x); - y1 = _mm256_loadu_ps(S1 + x + 8); - - s0 = _mm256_mul_ps(x0, b0); - s1 = _mm256_mul_ps(x1, b0); - y0 = _mm256_mul_ps(y0, b1); - y1 = _mm256_mul_ps(y1, b1); - s0 = 
_mm256_add_ps(s0, y0); - s1 = _mm256_add_ps(s1, y1); - - x0 = _mm256_loadu_ps(S2 + x); - x1 = _mm256_loadu_ps(S2 + x + 8); - y0 = _mm256_loadu_ps(S3 + x); - y1 = _mm256_loadu_ps(S3 + x + 8); - - x0 = _mm256_mul_ps(x0, b2); - x1 = _mm256_mul_ps(x1, b2); - y0 = _mm256_mul_ps(y0, b3); - y1 = _mm256_mul_ps(y1, b3); - s0 = _mm256_add_ps(s0, x0); - s1 = _mm256_add_ps(s1, x1); - s0 = _mm256_add_ps(s0, y0); - s1 = _mm256_add_ps(s1, y1); - - _mm256_storeu_ps( dst + x, s0); - _mm256_storeu_ps( dst + x + 8, s1); - } - - return x; -} -#endif - struct VResizeCubicVec_32f { int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const { -#if CV_AVX - if( checkHardwareSupport(CV_CPU_AVX) ) - return VResizeCubicVec_32f_avx(_src, _dst, _beta, width); -#endif - if( checkHardwareSupport(CV_CPU_SSE) ) - return VResizeCubicVec_32f_sse(_src, _dst, _beta, width); + int processed = 0; - return 0; + if( checkHardwareSupport(CV_CPU_AVX) ) + processed += VResizeCubicVec_32f_avx(_src, _dst, _beta, width); + + if( !processed && checkHardwareSupport(CV_CPU_SSE) ) + processed += VResizeCubicVec_32f_sse(_src, _dst, _beta, width); + + return processed; } }; diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp index 39907edac4..428a5f8128 100644 --- a/modules/ts/src/ts_func.cpp +++ b/modules/ts/src/ts_func.cpp @@ -3002,12 +3002,8 @@ void printVersionInfo(bool useStdOut) #if CV_SSE4_2 if (checkHardwareSupport(CV_CPU_SSE4_2)) cpu_features += " sse4.2"; #endif -#if CV_AVX if (checkHardwareSupport(CV_CPU_AVX)) cpu_features += " avx"; -#endif -#if CV_AVX2 if (checkHardwareSupport(CV_CPU_AVX2)) cpu_features += " avx2"; -#endif #if CV_NEON cpu_features += " neon"; // NEON is currently not checked at runtime #endif
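
Note on the dispatch pattern used throughout the imgwarp.cpp changes above: each VResize* functor (for example VResizeLinearVec_32s8u) now queries cv::checkHardwareSupport() at runtime for the widest instruction set the host offers, lets the ISA-specific kernel report how many elements it actually handled, and falls back to the narrower kernel only when nothing was processed. The sketch below is illustrative only, not part of the patch: resize_row_avx2/resize_row_sse2 and resize_row_dispatch are hypothetical stand-ins for the _avx2/_sse2 kernels that were moved into the new avx/ and avx2/ translation units, and the stub bodies model a build where those files were compiled without the corresponding instruction set.

// Illustrative sketch (not part of the patch): the "processed" accumulator
// idiom used by the VResize* functors after this change.
#include <opencv2/core/core.hpp>

// Hypothetical stand-ins for the ISA-specific kernels; like the real
// VResize*_avx2/_sse2 helpers they return the number of elements handled
// (0 when the translation unit was built without that instruction set).
static int resize_row_avx2(const uchar**, uchar*, const uchar*, int) { return 0; }
static int resize_row_sse2(const uchar**, uchar*, const uchar*, int) { return 0; }

static int resize_row_dispatch(const uchar** src, uchar* dst, const uchar* beta, int width)
{
    int processed = 0;

    // Try the widest vector unit the host CPU reports at runtime
    // (CV_CPU_AVX2 is available in this tree once the system.cpp change applies).
    if( cv::checkHardwareSupport(CV_CPU_AVX2) )
        processed += resize_row_avx2(src, dst, beta, width);

    // Fall back to SSE2 only if the AVX2 kernel did no work.
    if( !processed && cv::checkHardwareSupport(CV_CPU_SSE2) )
        processed += resize_row_sse2(src, dst, beta, width);

    // The caller finishes the remaining (width - processed) elements with scalar code.
    return processed;
}

int main()
{
    const uchar* rows[2] = { 0, 0 };
    int done = resize_row_dispatch(rows, 0, 0, 0);
    return done;  // 0 here, since the stand-in kernels do no work
}

The point of the accumulator form, as opposed to the previous early-return chain guarded by #if CV_AVX2/#if CV_AVX, is that every kernel can be linked in unconditionally (the avx/ and avx2/ files are always built, just with per-directory -mavx/-mavx2 flags from OpenCVModule.cmake), so the choice of code path is made purely at run time.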