From b4bcdd10a160dc19788efeca80f0db1a21bacc36 Mon Sep 17 00:00:00 2001 From: Maksim Shabunin Date: Thu, 3 Dec 2015 14:43:37 +0300 Subject: [PATCH 1/5] HAL: improvements - added new functions from core module: split, merge, add, sub, mul, div, ... - added function replacement mechanism - added example of HAL replacement library --- CMakeLists.txt | 5 + cmake/templates/custom_hal.hpp.in | 6 + modules/core/include/opencv2/core/base.hpp | 2 - modules/core/include/opencv2/core/utility.hpp | 31 - modules/core/src/arithm.cpp | 3942 +---------------- modules/core/src/convert.cpp | 752 +--- modules/core/src/precomp.hpp | 50 +- modules/core/src/system.cpp | 194 +- modules/hal/CMakeLists.txt | 12 +- modules/hal/include/opencv2/hal.hpp | 200 +- modules/hal/include/opencv2/hal/defs.h | 88 +- modules/hal/include/opencv2/hal/interface.hpp | 91 + .../include/opencv2/hal}/sse_utils.hpp | 2 + modules/hal/samples/simple_hal/CMakeLists.txt | 12 + modules/hal/samples/simple_hal/simple.cpp | 34 + modules/hal/samples/simple_hal/simple.hpp | 20 + modules/hal/src/arithm.cpp | 1090 ++++- modules/hal/src/arithm_core.hpp | 657 +++ modules/hal/src/arithm_simd.hpp | 2025 +++++++++ modules/hal/src/hardware.cpp | 221 + modules/hal/src/merge.cpp | 408 ++ modules/hal/src/precomp.hpp | 10 + modules/hal/src/replacement.hpp | 208 + modules/hal/src/split.cpp | 424 ++ modules/imgproc/src/precomp.hpp | 2 + 25 files changed, 5552 insertions(+), 4934 deletions(-) create mode 100644 cmake/templates/custom_hal.hpp.in create mode 100644 modules/hal/include/opencv2/hal/interface.hpp rename modules/{core/include/opencv2/core => hal/include/opencv2/hal}/sse_utils.hpp (99%) create mode 100644 modules/hal/samples/simple_hal/CMakeLists.txt create mode 100644 modules/hal/samples/simple_hal/simple.cpp create mode 100644 modules/hal/samples/simple_hal/simple.hpp create mode 100644 modules/hal/src/arithm_core.hpp create mode 100644 modules/hal/src/arithm_simd.hpp create mode 100644 modules/hal/src/hardware.cpp create mode 100644 modules/hal/src/merge.cpp create mode 100644 modules/hal/src/replacement.hpp create mode 100644 modules/hal/src/split.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 05696527d5..deed0a6485 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -587,6 +587,11 @@ include(cmake/OpenCVFindMatlab.cmake) include(cmake/OpenCVDetectVTK.cmake) +if (OPENCV_HAL_HEADERS AND OPENCV_HAL_LIBS) + get_filename_component(OPENCV_HAL_HEADERS "${OPENCV_HAL_HEADERS}" ABSOLUTE) + get_filename_component(OPENCV_HAL_LIBS "${OPENCV_HAL_LIBS}" ABSOLUTE) +endif() + # ---------------------------------------------------------------------------- # Add CUDA libraries (needed for apps/tools, samples) # ---------------------------------------------------------------------------- diff --git a/cmake/templates/custom_hal.hpp.in b/cmake/templates/custom_hal.hpp.in new file mode 100644 index 0000000000..b298a033ec --- /dev/null +++ b/cmake/templates/custom_hal.hpp.in @@ -0,0 +1,6 @@ +#ifndef _CUSTOM_HAL_INCLUDED_ +#define _CUSTOM_HAL_INCLUDED_ + +@OPENCV_HAL_HEADERS_INCLUDES@ + +#endif diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp index a3c40f5601..9a0d4989b9 100644 --- a/modules/core/include/opencv2/core/base.hpp +++ b/modules/core/include/opencv2/core/base.hpp @@ -762,6 +762,4 @@ inline float32x2_t cv_vsqrt_f32(float32x2_t val) } // cv -#include "sse_utils.hpp" - #endif //__OPENCV_CORE_BASE_HPP__ diff --git a/modules/core/include/opencv2/core/utility.hpp 
b/modules/core/include/opencv2/core/utility.hpp index 4d7d7df668..b66ade5c17 100644 --- a/modules/core/include/opencv2/core/utility.hpp +++ b/modules/core/include/opencv2/core/utility.hpp @@ -277,37 +277,6 @@ execution time. */ CV_EXPORTS_W int64 getCPUTickCount(); -/** @brief Available CPU features. - -remember to keep this list identical to the one in cvdef.h -*/ -enum CpuFeatures { - CPU_MMX = 1, - CPU_SSE = 2, - CPU_SSE2 = 3, - CPU_SSE3 = 4, - CPU_SSSE3 = 5, - CPU_SSE4_1 = 6, - CPU_SSE4_2 = 7, - CPU_POPCNT = 8, - - CPU_AVX = 10, - CPU_AVX2 = 11, - CPU_FMA3 = 12, - - CPU_AVX_512F = 13, - CPU_AVX_512BW = 14, - CPU_AVX_512CD = 15, - CPU_AVX_512DQ = 16, - CPU_AVX_512ER = 17, - CPU_AVX_512IFMA512 = 18, - CPU_AVX_512PF = 19, - CPU_AVX_512VBMI = 20, - CPU_AVX_512VL = 21, - - CPU_NEON = 100 -}; - /** @brief Returns true if the specified feature is supported by the host hardware. The function returns true if the host hardware supports the specified feature. When user calls diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 6ad72461db..06cd7916e2 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -53,1354 +53,6 @@ namespace cv { -struct NOP {}; - -#if CV_SSE2 || CV_NEON - -#define FUNCTOR_TEMPLATE(name) \ - template struct name {} - -FUNCTOR_TEMPLATE(VLoadStore128); -#if CV_SSE2 -FUNCTOR_TEMPLATE(VLoadStore64); -FUNCTOR_TEMPLATE(VLoadStore128Aligned); -#if CV_AVX2 -FUNCTOR_TEMPLATE(VLoadStore256); -FUNCTOR_TEMPLATE(VLoadStore256Aligned); -#endif -#endif - -#endif - -template -void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size sz) -{ -#if CV_SSE2 || CV_NEON - VOp vop; -#endif - Op op; - - for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1), - src2 = (const T *)((const uchar *)src2 + step2), - dst = (T *)((uchar *)dst + step) ) - { - int x = 0; - -#if CV_NEON || CV_SSE2 -#if CV_AVX2 - if( USE_AVX2 ) - { - for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) ) - { - typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); - r0 = vop(r0, VLoadStore256::load(src2 + x)); - VLoadStore256::store(dst + x, r0); - } - } -#else -#if CV_SSE2 - if( USE_SSE2 ) - { -#endif // CV_SSE2 - for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) ) - { - typename VLoadStore128::reg_type r0 = VLoadStore128::load(src1 + x ); - typename VLoadStore128::reg_type r1 = VLoadStore128::load(src1 + x + 16/sizeof(T)); - r0 = vop(r0, VLoadStore128::load(src2 + x )); - r1 = vop(r1, VLoadStore128::load(src2 + x + 16/sizeof(T))); - VLoadStore128::store(dst + x , r0); - VLoadStore128::store(dst + x + 16/sizeof(T), r1); - } -#if CV_SSE2 - } -#endif // CV_SSE2 -#endif // CV_AVX2 -#endif // CV_NEON || CV_SSE2 - -#if CV_AVX2 - // nothing -#elif CV_SSE2 - if( USE_SSE2 ) - { - for( ; x <= sz.width - 8/(int)sizeof(T); x += 8/sizeof(T) ) - { - typename VLoadStore64::reg_type r = VLoadStore64::load(src1 + x); - r = vop(r, VLoadStore64::load(src2 + x)); - VLoadStore64::store(dst + x, r); - } - } -#endif - -#if CV_ENABLE_UNROLLED - for( ; x <= sz.width - 4; x += 4 ) - { - T v0 = op(src1[x], src2[x]); - T v1 = op(src1[x+1], src2[x+1]); - dst[x] = v0; dst[x+1] = v1; - v0 = op(src1[x+2], src2[x+2]); - v1 = op(src1[x+3], src2[x+3]); - dst[x+2] = v0; dst[x+3] = v1; - } -#endif - - for( ; x < sz.width; x++ ) - dst[x] = op(src1[x], src2[x]); - } -} - -template -void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size sz) -{ -#if CV_SSE2 || CV_NEON - Op32 op32; -#endif - Op 
op; - - for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1), - src2 = (const T *)((const uchar *)src2 + step2), - dst = (T *)((uchar *)dst + step) ) - { - int x = 0; - -#if CV_AVX2 - if( USE_AVX2 ) - { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) - { - for( ; x <= sz.width - 8; x += 8 ) - { - typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); - r0 = op32(r0, VLoadStore256Aligned::load(src2 + x)); - VLoadStore256Aligned::store(dst + x, r0); - } - } - } -#elif CV_SSE2 - if( USE_SSE2 ) - { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) - { - for( ; x <= sz.width - 8; x += 8 ) - { - typename VLoadStore128Aligned::reg_type r0 = VLoadStore128Aligned::load(src1 + x ); - typename VLoadStore128Aligned::reg_type r1 = VLoadStore128Aligned::load(src1 + x + 4); - r0 = op32(r0, VLoadStore128Aligned::load(src2 + x )); - r1 = op32(r1, VLoadStore128Aligned::load(src2 + x + 4)); - VLoadStore128Aligned::store(dst + x , r0); - VLoadStore128Aligned::store(dst + x + 4, r1); - } - } - } -#endif // CV_AVX2 - -#if CV_NEON || CV_SSE2 -#if CV_AVX2 - if( USE_AVX2 ) - { - for( ; x <= sz.width - 8; x += 8 ) - { - typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); - r0 = op32(r0, VLoadStore256::load(src2 + x)); - VLoadStore256::store(dst + x, r0); - } - } -#else -#if CV_SSE2 - if( USE_SSE2 ) - { -#endif // CV_SSE2 - for( ; x <= sz.width - 8; x += 8 ) - { - typename VLoadStore128::reg_type r0 = VLoadStore128::load(src1 + x ); - typename VLoadStore128::reg_type r1 = VLoadStore128::load(src1 + x + 4); - r0 = op32(r0, VLoadStore128::load(src2 + x )); - r1 = op32(r1, VLoadStore128::load(src2 + x + 4)); - VLoadStore128::store(dst + x , r0); - VLoadStore128::store(dst + x + 4, r1); - } -#if CV_SSE2 - } -#endif // CV_SSE2 -#endif // CV_AVX2 -#endif // CV_NEON || CV_SSE2 - -#if CV_ENABLE_UNROLLED - for( ; x <= sz.width - 4; x += 4 ) - { - T v0 = op(src1[x], src2[x]); - T v1 = op(src1[x+1], src2[x+1]); - dst[x] = v0; dst[x+1] = v1; - v0 = op(src1[x+2], src2[x+2]); - v1 = op(src1[x+3], src2[x+3]); - dst[x+2] = v0; dst[x+3] = v1; - } -#endif - - for( ; x < sz.width; x++ ) - dst[x] = op(src1[x], src2[x]); - } -} - - -template -void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size sz) -{ -#if CV_SSE2 - Op64 op64; -#endif - Op op; - - for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1), - src2 = (const T *)((const uchar *)src2 + step2), - dst = (T *)((uchar *)dst + step) ) - { - int x = 0; - -#if CV_AVX2 - if( USE_AVX2 ) - { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) - { - for( ; x <= sz.width - 4; x += 4 ) - { - typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); - r0 = op64(r0, VLoadStore256Aligned::load(src2 + x)); - VLoadStore256Aligned::store(dst + x, r0); - } - } - } -#elif CV_SSE2 - if( USE_SSE2 ) - { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) - { - for( ; x <= sz.width - 4; x += 4 ) - { - typename VLoadStore128Aligned::reg_type r0 = VLoadStore128Aligned::load(src1 + x ); - typename VLoadStore128Aligned::reg_type r1 = VLoadStore128Aligned::load(src1 + x + 2); - r0 = op64(r0, VLoadStore128Aligned::load(src2 + x )); - r1 = op64(r1, VLoadStore128Aligned::load(src2 + x + 2)); - VLoadStore128Aligned::store(dst + x , r0); - VLoadStore128Aligned::store(dst + x + 2, r1); - } - } - } -#endif - - for( ; x <= sz.width - 4; x += 4 ) - { - T v0 = op(src1[x], src2[x]); - T v1 = op(src1[x+1], src2[x+1]); - dst[x] = v0; dst[x+1] = v1; 
- v0 = op(src1[x+2], src2[x+2]); - v1 = op(src1[x+3], src2[x+3]); - dst[x+2] = v0; dst[x+3] = v1; - } - - for( ; x < sz.width; x++ ) - dst[x] = op(src1[x], src2[x]); - } -} - -#if CV_AVX2 - -#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body) \ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ - static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ - } - -#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body (p); } \ - static void store(template_arg * p, reg_type v) { store_body (p, v); } \ - } - -#define FUNCTOR_CLOSURE_2arg(name, template_arg, body) \ - template<> \ - struct name \ - { \ - VLoadStore256::reg_type operator()( \ - const VLoadStore256::reg_type & a, \ - const VLoadStore256::reg_type & b) const \ - { \ - body; \ - } \ - } - -#define FUNCTOR_CLOSURE_1arg(name, template_arg, body) \ - template<> \ - struct name \ - { \ - VLoadStore256::reg_type operator()( \ - const VLoadStore256::reg_type & a, \ - const VLoadStore256::reg_type & ) const \ - { \ - body; \ - } \ - } - -FUNCTOR_LOADSTORE_CAST(VLoadStore256, uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE( VLoadStore256, float, __m256 , _mm256_loadu_ps , _mm256_storeu_ps ); -FUNCTOR_LOADSTORE( VLoadStore256, double, __m256d, _mm256_loadu_pd , _mm256_storeu_pd ); - -FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned, int, __m256i, _mm256_load_si256, _mm256_store_si256); -FUNCTOR_LOADSTORE( VLoadStore256Aligned, float, __m256 , _mm256_load_ps , _mm256_store_ps ); -FUNCTOR_LOADSTORE( VLoadStore256Aligned, double, __m256d, _mm256_load_pd , _mm256_store_pd ); - -FUNCTOR_TEMPLATE(VAdd); -FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm256_adds_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm256_adds_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm256_adds_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm256_add_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd (a, b)); - -FUNCTOR_TEMPLATE(VSub); -FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm256_subs_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm256_subs_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, short, return _mm256_subs_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, int, return _mm256_sub_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, float, return _mm256_sub_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd (a, b)); - -FUNCTOR_TEMPLATE(VMin); -FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm256_min_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, schar, return _mm256_min_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, short, 
return _mm256_min_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, int, return _mm256_min_epi32(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, float, return _mm256_min_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd (a, b)); - -FUNCTOR_TEMPLATE(VMax); -FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm256_max_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, schar, return _mm256_max_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, short, return _mm256_max_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, int, return _mm256_max_epi32(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, float, return _mm256_max_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd (a, b)); - - -static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, - 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; -static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff, - 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; - -FUNCTOR_TEMPLATE(VAbsDiff); -FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, - return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, - __m256i d = _mm256_subs_epi8(a, b); - __m256i m = _mm256_cmpgt_epi8(b, a); - return _mm256_subs_epi8(_mm256_xor_si256(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, - return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, short, - __m256i M = _mm256_max_epi16(a, b); - __m256i m = _mm256_min_epi16(a, b); - return _mm256_subs_epi16(M, m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, int, - __m256i d = _mm256_sub_epi32(a, b); - __m256i m = _mm256_cmpgt_epi32(b, a); - return _mm256_sub_epi32(_mm256_xor_si256(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, float, - return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, double, - return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask); - ); - -FUNCTOR_TEMPLATE(VAnd); -FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b)); -FUNCTOR_TEMPLATE(VOr); -FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b)); -FUNCTOR_TEMPLATE(VXor); -FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b)); -FUNCTOR_TEMPLATE(VNot); -FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a)); - -#elif CV_SSE2 - -#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ - static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ - } - -#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body (p); } \ - static void store(template_arg * p, reg_type v) { store_body (p, v); } \ - } - -#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - const VLoadStore128::reg_type & a, \ - const VLoadStore128::reg_type & b) const \ - { \ - body; \ - } \ - } - -#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - const VLoadStore128::reg_type & a, \ - const 
VLoadStore128::reg_type & ) const \ - { \ - body; \ - } \ - } - -FUNCTOR_LOADSTORE_CAST(VLoadStore128, uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, schar, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, short, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, int, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE( VLoadStore128, float, __m128 , _mm_loadu_ps , _mm_storeu_ps ); -FUNCTOR_LOADSTORE( VLoadStore128, double, __m128d, _mm_loadu_pd , _mm_storeu_pd ); - -FUNCTOR_LOADSTORE_CAST(VLoadStore64, uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); -FUNCTOR_LOADSTORE_CAST(VLoadStore64, schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); -FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64); -FUNCTOR_LOADSTORE_CAST(VLoadStore64, short, __m128i, _mm_loadl_epi64, _mm_storel_epi64); - -FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned, int, __m128i, _mm_load_si128, _mm_store_si128); -FUNCTOR_LOADSTORE( VLoadStore128Aligned, float, __m128 , _mm_load_ps , _mm_store_ps ); -FUNCTOR_LOADSTORE( VLoadStore128Aligned, double, __m128d, _mm_load_pd , _mm_store_pd ); - -FUNCTOR_TEMPLATE(VAdd); -FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm_adds_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm_adds_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm_adds_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm_add_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm_add_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd (a, b)); - -FUNCTOR_TEMPLATE(VSub); -FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm_subs_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm_subs_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, short, return _mm_subs_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, int, return _mm_sub_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, float, return _mm_sub_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd (a, b)); - -FUNCTOR_TEMPLATE(VMin); -FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, schar, - __m128i m = _mm_cmpgt_epi8(a, b); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b))); -FUNCTOR_CLOSURE_2arg(VMin, short, return _mm_min_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, int, - __m128i m = _mm_cmpgt_epi32(a, b); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMin, float, return _mm_min_ps(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b)); - -FUNCTOR_TEMPLATE(VMax); -FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, schar, - __m128i m = _mm_cmpgt_epi8(b, a); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b)); -FUNCTOR_CLOSURE_2arg(VMax, short, return _mm_max_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, int, - __m128i m = _mm_cmpgt_epi32(b, a); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMax, float, return _mm_max_ps(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, double, return 
_mm_max_pd(a, b)); - - -static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; -static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; - -FUNCTOR_TEMPLATE(VAbsDiff); -FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, - return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, - __m128i d = _mm_subs_epi8(a, b); - __m128i m = _mm_cmpgt_epi8(b, a); - return _mm_subs_epi8(_mm_xor_si128(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, - return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, short, - __m128i M = _mm_max_epi16(a, b); - __m128i m = _mm_min_epi16(a, b); - return _mm_subs_epi16(M, m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, int, - __m128i d = _mm_sub_epi32(a, b); - __m128i m = _mm_cmpgt_epi32(b, a); - return _mm_sub_epi32(_mm_xor_si128(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, float, - return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, double, - return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask); - ); - -FUNCTOR_TEMPLATE(VAnd); -FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b)); -FUNCTOR_TEMPLATE(VOr); -FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b)); -FUNCTOR_TEMPLATE(VXor); -FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b)); -FUNCTOR_TEMPLATE(VNot); -FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a)); -#endif - -#if CV_NEON - -#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body (p);}; \ - static void store(template_arg * p, reg_type v) { store_body (p, v);}; \ - } - -#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - VLoadStore128::reg_type a, \ - VLoadStore128::reg_type b) const \ - { \ - return body; \ - }; \ - } - -#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - VLoadStore128::reg_type a, \ - VLoadStore128::reg_type ) const \ - { \ - return body; \ - }; \ - } - -FUNCTOR_LOADSTORE(VLoadStore128, uchar, uint8x16_t, vld1q_u8 , vst1q_u8 ); -FUNCTOR_LOADSTORE(VLoadStore128, schar, int8x16_t, vld1q_s8 , vst1q_s8 ); -FUNCTOR_LOADSTORE(VLoadStore128, ushort, uint16x8_t, vld1q_u16, vst1q_u16); -FUNCTOR_LOADSTORE(VLoadStore128, short, int16x8_t, vld1q_s16, vst1q_s16); -FUNCTOR_LOADSTORE(VLoadStore128, int, int32x4_t, vld1q_s32, vst1q_s32); -FUNCTOR_LOADSTORE(VLoadStore128, float, float32x4_t, vld1q_f32, vst1q_f32); - -FUNCTOR_TEMPLATE(VAdd); -FUNCTOR_CLOSURE_2arg(VAdd, uchar, vqaddq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, schar, vqaddq_s8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, short, vqaddq_s16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, int, vaddq_s32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, float, vaddq_f32 (a, b)); - -FUNCTOR_TEMPLATE(VSub); -FUNCTOR_CLOSURE_2arg(VSub, uchar, vqsubq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, schar, vqsubq_s8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, short, vqsubq_s16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, int, vsubq_s32 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, float, vsubq_f32 (a, b)); - 
-FUNCTOR_TEMPLATE(VMin); -FUNCTOR_CLOSURE_2arg(VMin, uchar, vminq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, schar, vminq_s8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, short, vminq_s16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, int, vminq_s32(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, float, vminq_f32(a, b)); - -FUNCTOR_TEMPLATE(VMax); -FUNCTOR_CLOSURE_2arg(VMax, uchar, vmaxq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, schar, vmaxq_s8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, short, vmaxq_s16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, int, vmaxq_s32(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, float, vmaxq_f32(a, b)); - -FUNCTOR_TEMPLATE(VAbsDiff); -FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, vabdq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, vqabsq_s8 (vqsubq_s8(a, b))); -FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b)); -FUNCTOR_CLOSURE_2arg(VAbsDiff, short, vqabsq_s16(vqsubq_s16(a, b))); -FUNCTOR_CLOSURE_2arg(VAbsDiff, int, vabdq_s32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAbsDiff, float, vabdq_f32 (a, b)); - -FUNCTOR_TEMPLATE(VAnd); -FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b)); -FUNCTOR_TEMPLATE(VOr); -FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b)); -FUNCTOR_TEMPLATE(VXor); -FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b)); -FUNCTOR_TEMPLATE(VNot); -FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a )); -#endif - -#if CV_SSE2 || CV_NEON -#define IF_SIMD(op) op -#else -#define IF_SIMD(op) NOP -#endif - -template<> inline uchar OpAdd::operator ()(uchar a, uchar b) const -{ return CV_FAST_CAST_8U(a + b); } -template<> inline uchar OpSub::operator ()(uchar a, uchar b) const -{ return CV_FAST_CAST_8U(a - b); } - -template struct OpAbsDiff -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()(T a, T b) const { return (T)std::abs(a - b); } -}; - -template<> inline short OpAbsDiff::operator ()(short a, short b) const -{ return saturate_cast(std::abs(a - b)); } - -template<> inline schar OpAbsDiff::operator ()(schar a, schar b) const -{ return saturate_cast(std::abs(a - b)); } - -template struct OpAbsDiffS -{ - typedef T type1; - typedef WT type2; - typedef T rtype; - T operator()(T a, WT b) const { return saturate_cast(std::abs(a - b)); } -}; - -template struct OpAnd -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T b ) const { return a & b; } -}; - -template struct OpOr -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T b ) const { return a | b; } -}; - -template struct OpXor -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T b ) const { return a ^ b; } -}; - -template struct OpNot -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T ) const { return ~a; } -}; - -#if (ARITHM_USE_IPP == 1) -static inline void fixSteps(Size sz, size_t elemSize, size_t& step1, size_t& step2, size_t& step) -{ - if( sz.height == 1 ) - step1 = step2 = step = sz.width*elemSize; -} -#endif - -static void add8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAdd_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz)); -} - -static void add8s( const schar* src1, 
size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* ) -{ - vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz); -} - -static void add16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAdd_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz)); -} - -static void add16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAdd_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz)); -} - -static void add32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* ) -{ - vBinOp32, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz); -} - -static void add32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAdd_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp32, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz)); -} - -static void add64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* ) -{ - vBinOp64, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz); -} - -static void sub8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiSub_8u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, sz)); -} - -static void sub8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* ) -{ - vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, sz); -} - -static void sub16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiSub_16u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, sz)); -} - -static void sub16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiSub_16s_C1RSfs(src2, (int)step2, src1, (int)step1, dst, 
(int)step, ippiSize(sz), 0)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, sz)); -} - -static void sub32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* ) -{ - vBinOp32, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, sz); -} - -static void sub32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiSub_32f_C1R(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp32, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, sz)); -} - -static void sub64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* ) -{ - vBinOp64, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, sz); -} - -template<> inline uchar OpMin::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); } -template<> inline uchar OpMax::operator ()(uchar a, uchar b) const { return CV_MAX_8U(a, b); } - -static void max8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - uchar* s1 = (uchar*)src1; - uchar* s2 = (uchar*)src2; - uchar* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMaxEvery_8u(s1, s2, d, sz.width)) - break; - s1 += step1; - s2 += step2; - d += step; - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); -} - -static void max8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* ) -{ - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); -} - -static void max16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - ushort* s1 = (ushort*)src1; - ushort* s2 = (ushort*)src2; - ushort* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMaxEvery_16u(s1, s2, d, sz.width)) - break; - s1 = (ushort*)((uchar*)s1 + step1); - s2 = (ushort*)((uchar*)s2 + step2); - d = (ushort*)((uchar*)d + step); - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); -} - -static void max16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* ) -{ - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); -} - -static void max32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* ) -{ - vBinOp32, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); -} - -static void max32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - float* s1 = (float*)src1; - float* s2 = (float*)src2; - float* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); 
- int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMaxEvery_32f(s1, s2, d, sz.width)) - break; - s1 = (float*)((uchar*)s1 + step1); - s2 = (float*)((uchar*)s2 + step2); - d = (float*)((uchar*)d + step); - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp32, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); -} - -static void max64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* ) -{ -#if ARITHM_USE_IPP == 1 - CV_IPP_CHECK() - { - double* s1 = (double*)src1; - double* s2 = (double*)src2; - double* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMaxEvery_64f(s1, s2, d, sz.width)) - break; - s1 = (double*)((uchar*)s1 + step1); - s2 = (double*)((uchar*)s2 + step2); - d = (double*)((uchar*)d + step); - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp64, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); -} - -static void min8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - uchar* s1 = (uchar*)src1; - uchar* s2 = (uchar*)src2; - uchar* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMinEvery_8u(s1, s2, d, sz.width)) - break; - s1 += step1; - s2 += step2; - d += step; - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); -} - -static void min8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* ) -{ - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); -} - -static void min16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - ushort* s1 = (ushort*)src1; - ushort* s2 = (ushort*)src2; - ushort* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMinEvery_16u(s1, s2, d, sz.width)) - break; - s1 = (ushort*)((uchar*)s1 + step1); - s2 = (ushort*)((uchar*)s2 + step2); - d = (ushort*)((uchar*)d + step); - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); -} - -static void min16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* ) -{ - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); -} - -static void min32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* ) -{ - vBinOp32, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); -} - -static void min32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - float* s1 = (float*)src1; - float* s2 = (float*)src2; - float* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMinEvery_32f(s1, s2, d, sz.width)) - break; - s1 = (float*)((uchar*)s1 + step1); - s2 = (float*)((uchar*)s2 + step2); - 
d = (float*)((uchar*)d + step); - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp32, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); -} - -static void min64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* ) -{ -#if ARITHM_USE_IPP == 1 - CV_IPP_CHECK() - { - double* s1 = (double*)src1; - double* s2 = (double*)src2; - double* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMinEvery_64f(s1, s2, d, sz.width)) - break; - s1 = (double*)((uchar*)s1 + step1); - s2 = (double*)((uchar*)s2 + step2); - d = (double*)((uchar*)d + step); - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp64, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); -} - -static void absdiff8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAbsDiff_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz)); -} - -static void absdiff8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* ) -{ - vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz); -} - -static void absdiff16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAbsDiff_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz)); -} - -static void absdiff16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* ) -{ - vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz); -} - -static void absdiff32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* ) -{ - vBinOp32, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz); -} - -static void absdiff32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAbsDiff_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp32, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz)); -} - -static void absdiff64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* ) -{ - vBinOp64, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz); -} - - -static void and8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAnd_8u_C1R(src1, 
(int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VAnd)>(src1, step1, src2, step2, dst, step, sz)); -} - -static void or8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiOr_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VOr)>(src1, step1, src2, step2, dst, step, sz)); -} - -static void xor8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiXor_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VXor)>(src1, step1, src2, step2, dst, step, sz)); -} - -static void not8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); (void)src2; - if (0 <= ippiNot_8u_C1R(src1, (int)step1, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VNot)>(src1, step1, src2, step2, dst, step, sz)); -} - /****************************************************************************************\ * logical operations * \****************************************************************************************/ @@ -1511,7 +163,7 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, #endif static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst, - InputArray _mask, const BinaryFunc* tab, + InputArray _mask, const BinaryFuncC* tab, bool bitwise, int oclop ) { const _InputArray *psrc1 = &_src1, *psrc2 = &_src2; @@ -1526,7 +178,7 @@ static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst, dims1 <= 2 && dims2 <= 2; #endif bool haveMask = !_mask.empty(), haveScalar = false; - BinaryFunc func; + BinaryFuncC func; if( dims1 <= 2 && dims2 <= 2 && kind1 == kind2 && sz1 == sz2 && type1 == type2 && !haveMask ) { @@ -1548,7 +200,7 @@ static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst, if( len == (size_t)(int)len ) { sz.width = (int)len; - func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, 0); + func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, 0); return; } } @@ -1639,7 +291,7 @@ static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst, { int bsz = (int)MIN(total - j, blocksize); - func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*cn, 1), 0 ); + func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, bsz*cn, 1, 0 ); if( haveMask ) { copymask( maskbuf, 0, ptrs[3], 0, ptrs[2], 0, Size(bsz, 1), &esz ); @@ -1671,7 +323,7 @@ static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst, { int bsz = (int)MIN(total - j, blocksize); - func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, Size(bsz*cn, 1), 0 ); + func( ptrs[0], 0, scbuf, 0, haveMask ? 
maskbuf : ptrs[1], 0, bsz*cn, 1, 0 ); if( haveMask ) { copymask( maskbuf, 0, ptrs[2], 0, ptrs[1], 0, Size(bsz, 1), &esz ); @@ -1685,28 +337,28 @@ static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst, } } -static BinaryFunc* getMaxTab() +static BinaryFuncC* getMaxTab() { - static BinaryFunc maxTab[] = + static BinaryFuncC maxTab[] = { - (BinaryFunc)GET_OPTIMIZED(max8u), (BinaryFunc)GET_OPTIMIZED(max8s), - (BinaryFunc)GET_OPTIMIZED(max16u), (BinaryFunc)GET_OPTIMIZED(max16s), - (BinaryFunc)GET_OPTIMIZED(max32s), - (BinaryFunc)GET_OPTIMIZED(max32f), (BinaryFunc)max64f, + (BinaryFuncC)GET_OPTIMIZED(cv::hal::max8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::max8s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::max16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::max16s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::max32s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::max32f), (BinaryFuncC)cv::hal::max64f, 0 }; return maxTab; } -static BinaryFunc* getMinTab() +static BinaryFuncC* getMinTab() { - static BinaryFunc minTab[] = + static BinaryFuncC minTab[] = { - (BinaryFunc)GET_OPTIMIZED(min8u), (BinaryFunc)GET_OPTIMIZED(min8s), - (BinaryFunc)GET_OPTIMIZED(min16u), (BinaryFunc)GET_OPTIMIZED(min16s), - (BinaryFunc)GET_OPTIMIZED(min32s), - (BinaryFunc)GET_OPTIMIZED(min32f), (BinaryFunc)min64f, + (BinaryFuncC)GET_OPTIMIZED(cv::hal::min8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::min8s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::min16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::min16s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::min32s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::min32f), (BinaryFuncC)cv::hal::min64f, 0 }; @@ -1717,25 +369,25 @@ static BinaryFunc* getMinTab() void cv::bitwise_and(InputArray a, InputArray b, OutputArray c, InputArray mask) { - BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(and8u); + BinaryFuncC f = (BinaryFuncC)GET_OPTIMIZED(cv::hal::and8u); binary_op(a, b, c, mask, &f, true, OCL_OP_AND); } void cv::bitwise_or(InputArray a, InputArray b, OutputArray c, InputArray mask) { - BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(or8u); + BinaryFuncC f = (BinaryFuncC)GET_OPTIMIZED(cv::hal::or8u); binary_op(a, b, c, mask, &f, true, OCL_OP_OR); } void cv::bitwise_xor(InputArray a, InputArray b, OutputArray c, InputArray mask) { - BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(xor8u); + BinaryFuncC f = (BinaryFuncC)GET_OPTIMIZED(cv::hal::xor8u); binary_op(a, b, c, mask, &f, true, OCL_OP_XOR); } void cv::bitwise_not(InputArray a, OutputArray c, InputArray mask) { - BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(not8u); + BinaryFuncC f = (BinaryFuncC)GET_OPTIMIZED(cv::hal::not8u); binary_op(a, a, c, mask, &f, true, OCL_OP_NOT); } @@ -1924,7 +576,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, #endif static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, - InputArray _mask, int dtype, BinaryFunc* tab, bool muldiv=false, + InputArray _mask, int dtype, BinaryFuncC* tab, bool muldiv=false, void* usrdata=0, int oclop=-1 ) { const _InputArray *psrc1 = &_src1, *psrc2 = &_src2; @@ -1955,7 +607,7 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(); Size sz = getContinuousSize(src1, src2, dst, src1.channels()); - tab[depth1](src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, usrdata); + tab[depth1](src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, usrdata); return; } @@ -2068,7 +720,7 @@ static void arithm_op(InputArray _src1, InputArray 
_src2, OutputArray _dst, (cvtsrc2 || haveScalar ? wsz : 0) + (cvtdst ? wsz : 0) + (haveMask ? dsz : 0); - BinaryFunc func = tab[CV_MAT_DEPTH(wtype)]; + BinaryFuncC func = tab[CV_MAT_DEPTH(wtype)]; if( !haveScalar ) { @@ -2115,10 +767,10 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, } if( !haveMask && !cvtdst ) - func( sptr1, 1, sptr2, 1, dptr, 1, bszn, usrdata ); + func( sptr1, 1, sptr2, 1, dptr, 1, bszn.width, bszn.height, usrdata ); else { - func( sptr1, 1, sptr2, 1, wbuf, 0, bszn, usrdata ); + func( sptr1, 1, sptr2, 1, wbuf, 0, bszn.width, bszn.height, usrdata ); if( !haveMask ) cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 ); else if( !cvtdst ) @@ -2178,10 +830,10 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, std::swap(sptr1, sptr2); if( !haveMask && !cvtdst ) - func( sptr1, 1, sptr2, 1, dptr, 1, bszn, usrdata ); + func( sptr1, 1, sptr2, 1, dptr, 1, bszn.width, bszn.height, usrdata ); else { - func( sptr1, 1, sptr2, 1, wbuf, 1, bszn, usrdata ); + func( sptr1, 1, sptr2, 1, wbuf, 1, bszn.width, bszn.height, usrdata ); if( !haveMask ) cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 ); else if( !cvtdst ) @@ -2202,42 +854,42 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, } } -static BinaryFunc* getAddTab() +static BinaryFuncC* getAddTab() { - static BinaryFunc addTab[] = + static BinaryFuncC addTab[] = { - (BinaryFunc)GET_OPTIMIZED(add8u), (BinaryFunc)GET_OPTIMIZED(add8s), - (BinaryFunc)GET_OPTIMIZED(add16u), (BinaryFunc)GET_OPTIMIZED(add16s), - (BinaryFunc)GET_OPTIMIZED(add32s), - (BinaryFunc)GET_OPTIMIZED(add32f), (BinaryFunc)add64f, + (BinaryFuncC)GET_OPTIMIZED(cv::hal::add8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::add8s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::add16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::add16s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::add32s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::add32f), (BinaryFuncC)cv::hal::add64f, 0 }; return addTab; } -static BinaryFunc* getSubTab() +static BinaryFuncC* getSubTab() { - static BinaryFunc subTab[] = + static BinaryFuncC subTab[] = { - (BinaryFunc)GET_OPTIMIZED(sub8u), (BinaryFunc)GET_OPTIMIZED(sub8s), - (BinaryFunc)GET_OPTIMIZED(sub16u), (BinaryFunc)GET_OPTIMIZED(sub16s), - (BinaryFunc)GET_OPTIMIZED(sub32s), - (BinaryFunc)GET_OPTIMIZED(sub32f), (BinaryFunc)sub64f, + (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub8s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub16s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub32s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub32f), (BinaryFuncC)cv::hal::sub64f, 0 }; return subTab; } -static BinaryFunc* getAbsDiffTab() +static BinaryFuncC* getAbsDiffTab() { - static BinaryFunc absDiffTab[] = + static BinaryFuncC absDiffTab[] = { - (BinaryFunc)GET_OPTIMIZED(absdiff8u), (BinaryFunc)GET_OPTIMIZED(absdiff8s), - (BinaryFunc)GET_OPTIMIZED(absdiff16u), (BinaryFunc)GET_OPTIMIZED(absdiff16s), - (BinaryFunc)GET_OPTIMIZED(absdiff32s), - (BinaryFunc)GET_OPTIMIZED(absdiff32f), (BinaryFunc)absdiff64f, + (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff8s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff16s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff32s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff32f), (BinaryFuncC)cv::hal::absdiff64f, 0 }; @@ -2323,1365 +975,37 @@ void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst ) namespace cv { -template -struct 
Mul_SIMD +static BinaryFuncC* getMulTab() { - int operator() (const T *, const T *, T *, int, WT) const + static BinaryFuncC mulTab[] = { - return 0; - } -}; - -#if CV_NEON - -template <> -struct Mul_SIMD -{ - int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); - uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); - uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); - v_dst1 = vmulq_f32(v_dst1, v_scale); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); - } - } - - return x; - } -}; - -template <> -struct Mul_SIMD -{ - int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); - int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); - int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); - v_dst1 = vmulq_f32(v_dst1, v_scale); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); - } - } - - return x; - } -}; - -template <> -struct Mul_SIMD -{ - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); - float32x4_t v_dst2 = 
vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); - v_dst1 = vmulq_f32(v_dst1, v_scale); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); - } - } - - return x; - } -}; - -template <> -struct Mul_SIMD -{ - int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1q_s16(dst + x, v_dst); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); - v_dst1 = vmulq_f32(v_dst1, v_scale); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1q_s16(dst + x, v_dst); - } - } - - return x; - } -}; - -template <> -struct Mul_SIMD -{ - int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - vst1q_f32(dst + x, v_dst1); - vst1q_f32(dst + x + 4, v_dst2); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - v_dst1 = vmulq_f32(v_dst1, v_scale); - - float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - vst1q_f32(dst + x, v_dst1); - vst1q_f32(dst + x + 4, v_dst2); - } - } - - return x; - } -}; - -#elif CV_SSE2 - -#if CV_SSE4_1 - -template <> -struct Mul_SIMD -{ - Mul_SIMD() - { - haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); - } - - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const - { - int x = 0; - - if (!haveSSE) - return x; - - __m128i v_zero = _mm_setzero_si128(); - - if( scale != 
1.0f ) - { - __m128 v_scale = _mm_set1_ps(scale); - for ( ; x <= width - 8; x += 8) - { - __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); - - __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), - _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero))); - v_dst1 = _mm_mul_ps(v_dst1, v_scale); - - __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), - _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero))); - v_dst2 = _mm_mul_ps(v_dst2, v_scale); - - __m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); - _mm_storeu_si128((__m128i *)(dst + x), v_dsti); - } - } - - return x; - } - - bool haveSSE; -}; - -#endif - -template <> -struct Mul_SIMD -{ - Mul_SIMD() - { - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const - { - int x = 0; - - if (!haveSSE) - return x; - - __m128i v_zero = _mm_setzero_si128(); - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); - __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); - - v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); - v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); - - __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); - - __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); - - __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); - } - else - { - __m128 v_scale = _mm_set1_ps(scale); - for ( ; x <= width - 8; x += 8) - { - __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); - __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); - - v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); - v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); - - __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); - v_dst1 = _mm_mul_ps(v_dst1, v_scale); - - __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); - v_dst2 = _mm_mul_ps(v_dst2, v_scale); - - __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); - } - } - - return x; - } - - bool haveSSE; -}; - -template <> -struct Mul_SIMD -{ - Mul_SIMD() - { - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const - { - int x = 0; - - if (!haveSSE) - return x; - - __m128i v_zero = _mm_setzero_si128(); - - if( scale != 1.0f ) - { - __m128 v_scale = _mm_set1_ps(scale); - for ( ; x <= width - 8; x += 8) - { - __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); - - __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 
16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); - v_dst1 = _mm_mul_ps(v_dst1, v_scale); - - __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); - v_dst2 = _mm_mul_ps(v_dst2, v_scale); - - __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); - _mm_storeu_si128((__m128i *)(dst + x), v_dsti); - } - } - - return x; - } - - bool haveSSE; -}; - -#endif - -template static void -mul_( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size size, WT scale ) -{ - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Mul_SIMD vop; - - if( scale == (WT)1. ) - { - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, size.width, scale); - #if CV_ENABLE_UNROLLED - for(; i <= size.width - 4; i += 4 ) - { - T t0; - T t1; - t0 = saturate_cast(src1[i ] * src2[i ]); - t1 = saturate_cast(src1[i+1] * src2[i+1]); - dst[i ] = t0; - dst[i+1] = t1; - - t0 = saturate_cast(src1[i+2] * src2[i+2]); - t1 = saturate_cast(src1[i+3] * src2[i+3]); - dst[i+2] = t0; - dst[i+3] = t1; - } - #endif - for( ; i < size.width; i++ ) - dst[i] = saturate_cast(src1[i] * src2[i]); - } - } - else - { - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, size.width, scale); - #if CV_ENABLE_UNROLLED - for(; i <= size.width - 4; i += 4 ) - { - T t0 = saturate_cast(scale*(WT)src1[i]*src2[i]); - T t1 = saturate_cast(scale*(WT)src1[i+1]*src2[i+1]); - dst[i] = t0; dst[i+1] = t1; - - t0 = saturate_cast(scale*(WT)src1[i+2]*src2[i+2]); - t1 = saturate_cast(scale*(WT)src1[i+3]*src2[i+3]); - dst[i+2] = t0; dst[i+3] = t1; - } - #endif - for( ; i < size.width; i++ ) - dst[i] = saturate_cast(scale*(WT)src1[i]*src2[i]); - } - } -} - -template -struct Div_SIMD -{ - int operator() (const T *, const T *, T *, int, double) const - { - return 0; - } -}; - -template -struct Recip_SIMD -{ - int operator() (const T *, T *, int, double) const - { - return 0; - } -}; - - -#if CV_SIMD128 - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_uint16x8 v_zero = v_setzero_u16(); - - for ( ; x <= width - 8; x += 8) - { - v_uint16x8 v_src1 = v_load_expand(src1 + x); - v_uint16x8 v_src2 = v_load_expand(src2 + x); - - v_uint32x4 t0, t1, t2, t3; - v_expand(v_src1, t0, t1); - v_expand(v_src2, t2, t3); - - v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); - v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - - v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); - v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_uint16x8 res = v_pack_u(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_pack_store(dst + x, res); - } - - return x; - } -}; - - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const - { - int x 
= 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int16x8 v_zero = v_setzero_s16(); - - for ( ; x <= width - 8; x += 8) - { - v_int16x8 v_src1 = v_load_expand(src1 + x); - v_int16x8 v_src2 = v_load_expand(src2 + x); - - v_int32x4 t0, t1, t2, t3; - v_expand(v_src1, t0, t1); - v_expand(v_src2, t2, t3); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - v_float32x4 f2 = v_cvt_f32(t2); - v_float32x4 f3 = v_cvt_f32(t3); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_int16x8 res = v_pack(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_pack_store(dst + x, res); - } - - return x; - } -}; - - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_uint16x8 v_zero = v_setzero_u16(); - - for ( ; x <= width - 8; x += 8) - { - v_uint16x8 v_src1 = v_load(src1 + x); - v_uint16x8 v_src2 = v_load(src2 + x); - - v_uint32x4 t0, t1, t2, t3; - v_expand(v_src1, t0, t1); - v_expand(v_src2, t2, t3); - - v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); - v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - - v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); - v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_uint16x8 res = v_pack_u(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_store(dst + x, res); - } - - return x; - } -}; - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int16x8 v_zero = v_setzero_s16(); - - for ( ; x <= width - 8; x += 8) - { - v_int16x8 v_src1 = v_load(src1 + x); - v_int16x8 v_src2 = v_load(src2 + x); - - v_int32x4 t0, t1, t2, t3; - v_expand(v_src1, t0, t1); - v_expand(v_src2, t2, t3); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - v_float32x4 f2 = v_cvt_f32(t2); - v_float32x4 f3 = v_cvt_f32(t3); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_int16x8 res = v_pack(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_store(dst + x, res); - } - - return x; - } -}; - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int32x4 v_zero = v_setzero_s32(); - - for ( ; x <= width - 8; x += 8) - { - v_int32x4 t0 = v_load(src1 + x); - v_int32x4 t1 = v_load(src1 + x + 4); - v_int32x4 t2 = v_load(src2 + x); - v_int32x4 t3 = v_load(src2 + x + 4); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - v_float32x4 f2 = v_cvt_f32(t2); - v_float32x4 f3 = v_cvt_f32(t3); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale 
/ f3; - - v_int32x4 res0 = v_round(f0), res1 = v_round(f1); - - res0 = v_select(t2 == v_zero, v_zero, res0); - res1 = v_select(t3 == v_zero, v_zero, res1); - v_store(dst + x, res0); - v_store(dst + x + 4, res1); - } - - return x; - } -}; - - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_float32x4 v_zero = v_setzero_f32(); - - for ( ; x <= width - 8; x += 8) - { - v_float32x4 f0 = v_load(src1 + x); - v_float32x4 f1 = v_load(src1 + x + 4); - v_float32x4 f2 = v_load(src2 + x); - v_float32x4 f3 = v_load(src2 + x + 4); - - v_float32x4 res0 = f0 * v_scale / f2; - v_float32x4 res1 = f1 * v_scale / f3; - - res0 = v_select(f2 == v_zero, v_zero, res0); - res1 = v_select(f3 == v_zero, v_zero, res1); - - v_store(dst + x, res0); - v_store(dst + x + 4, res1); - } - - return x; - } -}; - - -///////////////////////// RECIPROCAL ////////////////////// - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const uchar * src2, uchar * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_uint16x8 v_zero = v_setzero_u16(); - - for ( ; x <= width - 8; x += 8) - { - v_uint16x8 v_src2 = v_load_expand(src2 + x); - - v_uint32x4 t0, t1; - v_expand(v_src2, t0, t1); - - v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); - v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_uint16x8 res = v_pack_u(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_pack_store(dst + x, res); - } - - return x; - } -}; - - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const schar * src2, schar * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int16x8 v_zero = v_setzero_s16(); - - for ( ; x <= width - 8; x += 8) - { - v_int16x8 v_src2 = v_load_expand(src2 + x); - - v_int32x4 t0, t1; - v_expand(v_src2, t0, t1); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_int16x8 res = v_pack(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_pack_store(dst + x, res); - } - - return x; - } -}; - - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const ushort * src2, ushort * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_uint16x8 v_zero = v_setzero_u16(); - - for ( ; x <= width - 8; x += 8) - { - v_uint16x8 v_src2 = v_load(src2 + x); - - v_uint32x4 t0, t1; - v_expand(v_src2, t0, t1); - - v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); - v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 i0 = 
v_round(f0), i1 = v_round(f1); - v_uint16x8 res = v_pack_u(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_store(dst + x, res); - } - - return x; - } -}; - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const short * src2, short * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int16x8 v_zero = v_setzero_s16(); - - for ( ; x <= width - 8; x += 8) - { - v_int16x8 v_src2 = v_load(src2 + x); - - v_int32x4 t0, t1; - v_expand(v_src2, t0, t1); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_int16x8 res = v_pack(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_store(dst + x, res); - } - - return x; - } -}; - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const int * src2, int * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int32x4 v_zero = v_setzero_s32(); - - for ( ; x <= width - 8; x += 8) - { - v_int32x4 t0 = v_load(src2 + x); - v_int32x4 t1 = v_load(src2 + x + 4); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 res0 = v_round(f0), res1 = v_round(f1); - - res0 = v_select(t0 == v_zero, v_zero, res0); - res1 = v_select(t1 == v_zero, v_zero, res1); - v_store(dst + x, res0); - v_store(dst + x + 4, res1); - } - - return x; - } -}; - - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const float * src2, float * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_float32x4 v_zero = v_setzero_f32(); - - for ( ; x <= width - 8; x += 8) - { - v_float32x4 f0 = v_load(src2 + x); - v_float32x4 f1 = v_load(src2 + x + 4); - - v_float32x4 res0 = v_scale / f0; - v_float32x4 res1 = v_scale / f1; - - res0 = v_select(f0 == v_zero, v_zero, res0); - res1 = v_select(f1 == v_zero, v_zero, res1); - - v_store(dst + x, res0); - v_store(dst + x + 4, res1); - } - - return x; - } -}; - -#if CV_SIMD128_64F - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float64x2 v_scale = v_setall_f64(scale); - v_float64x2 v_zero = v_setzero_f64(); - - for ( ; x <= width - 4; x += 4) - { - v_float64x2 f0 = v_load(src1 + x); - v_float64x2 f1 = v_load(src1 + x + 2); - v_float64x2 f2 = v_load(src2 + x); - v_float64x2 f3 = v_load(src2 + x + 2); - - v_float64x2 res0 = f0 * v_scale / f2; - v_float64x2 res1 = f1 * v_scale / f3; - - res0 = v_select(f0 == v_zero, v_zero, res0); - res1 = v_select(f1 == v_zero, v_zero, res1); - - v_store(dst + x, res0); - v_store(dst + x + 2, res1); - } - - return x; - } -}; - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = 
checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const double * src2, double * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float64x2 v_scale = v_setall_f64(scale); - v_float64x2 v_zero = v_setzero_f64(); - - for ( ; x <= width - 4; x += 4) - { - v_float64x2 f0 = v_load(src2 + x); - v_float64x2 f1 = v_load(src2 + x + 2); - - v_float64x2 res0 = v_scale / f0; - v_float64x2 res1 = v_scale / f1; - - res0 = v_select(f0 == v_zero, v_zero, res0); - res1 = v_select(f1 == v_zero, v_zero, res1); - - v_store(dst + x, res0); - v_store(dst + x + 2, res1); - } - - return x; - } -}; - -#endif - -#endif - -template static void -div_i( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size size, double scale ) -{ - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Div_SIMD vop; - float scale_f = (float)scale; - - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, size.width, scale); - for( ; i < size.width; i++ ) - { - T num = src1[i], denom = src2[i]; - dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; - } - } -} - -template static void -div_f( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size size, double scale ) -{ - T scale_f = (T)scale; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Div_SIMD vop; - - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, size.width, scale); - for( ; i < size.width; i++ ) - { - T num = src1[i], denom = src2[i]; - dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; - } - } -} - -template static void -recip_i( const T*, size_t, const T* src2, size_t step2, - T* dst, size_t step, Size size, double scale ) -{ - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Recip_SIMD vop; - float scale_f = (float)scale; - - for( ; size.height--; src2 += step2, dst += step ) - { - int i = vop(src2, dst, size.width, scale); - for( ; i < size.width; i++ ) - { - T denom = src2[i]; - dst[i] = denom != 0 ? saturate_cast(scale_f/denom) : (T)0; - } - } -} - -template static void -recip_f( const T*, size_t, const T* src2, size_t step2, - T* dst, size_t step, Size size, double scale ) -{ - T scale_f = (T)scale; - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Recip_SIMD vop; - - for( ; size.height--; src2 += step2, dst += step ) - { - int i = vop(src2, dst, size.width, scale); - for( ; i < size.width; i++ ) - { - T denom = src2[i]; - dst[i] = denom != 0 ? 
saturate_cast(scale_f/denom) : (T)0; - } - } -} - - -static void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* scale) -{ - float fscale = (float)*(const double*)scale; -#if defined HAVE_IPP - CV_IPP_CHECK() - { - if (std::fabs(fscale - 1) <= FLT_EPSILON) - { - if (ippiMul_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - mul_(src1, step1, src2, step2, dst, step, sz, fscale); -} - -static void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* scale) -{ - mul_(src1, step1, src2, step2, dst, step, sz, (float)*(const double*)scale); -} - -static void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* scale) -{ - float fscale = (float)*(const double*)scale; -#if defined HAVE_IPP - CV_IPP_CHECK() - { - if (std::fabs(fscale - 1) <= FLT_EPSILON) - { - if (ippiMul_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - mul_(src1, step1, src2, step2, dst, step, sz, fscale); -} - -static void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* scale) -{ - float fscale = (float)*(const double*)scale; -#if defined HAVE_IPP - CV_IPP_CHECK() - { - if (std::fabs(fscale - 1) <= FLT_EPSILON) - { - if (ippiMul_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - mul_(src1, step1, src2, step2, dst, step, sz, fscale); -} - -static void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* scale) -{ - mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* scale) -{ - float fscale = (float)*(const double*)scale; -#if defined HAVE_IPP - CV_IPP_CHECK() - { - if (std::fabs(fscale - 1) <= FLT_EPSILON) - { - if (ippiMul_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)) >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - mul_(src1, step1, src2, step2, dst, step, sz, fscale); -} - -static void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* scale) -{ - mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* scale) -{ - if( src1 ) - div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); - else - recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* scale) -{ - div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* scale) -{ - div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void 
div16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* scale) -{ - div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void div32s( const int* src1, size_t step1, const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* scale) -{ - div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void div32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* scale) -{ - div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void div64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* scale) -{ - div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* scale) -{ - recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* scale) -{ - recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* scale) -{ - recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* scale) -{ - recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* scale) -{ - recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* scale) -{ - recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* scale) -{ - recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - - -static BinaryFunc* getMulTab() -{ - static BinaryFunc mulTab[] = - { - (BinaryFunc)mul8u, (BinaryFunc)mul8s, (BinaryFunc)mul16u, - (BinaryFunc)mul16s, (BinaryFunc)mul32s, (BinaryFunc)mul32f, - (BinaryFunc)mul64f, 0 + (BinaryFuncC)cv::hal::mul8u, (BinaryFuncC)cv::hal::mul8s, (BinaryFuncC)cv::hal::mul16u, + (BinaryFuncC)cv::hal::mul16s, (BinaryFuncC)cv::hal::mul32s, (BinaryFuncC)cv::hal::mul32f, + (BinaryFuncC)cv::hal::mul64f, 0 }; return mulTab; } -static BinaryFunc* getDivTab() +static BinaryFuncC* getDivTab() { - static BinaryFunc divTab[] = + static BinaryFuncC divTab[] = { - (BinaryFunc)div8u, (BinaryFunc)div8s, (BinaryFunc)div16u, - (BinaryFunc)div16s, (BinaryFunc)div32s, (BinaryFunc)div32f, - (BinaryFunc)div64f, 0 + (BinaryFuncC)cv::hal::div8u, (BinaryFuncC)cv::hal::div8s, (BinaryFuncC)cv::hal::div16u, + (BinaryFuncC)cv::hal::div16s, (BinaryFuncC)cv::hal::div32s, (BinaryFuncC)cv::hal::div32f, + (BinaryFuncC)cv::hal::div64f, 0 }; return divTab; } -static BinaryFunc* getRecipTab() +static BinaryFuncC* getRecipTab() { - static BinaryFunc recipTab[] = + static BinaryFuncC recipTab[] = { - (BinaryFunc)recip8u, (BinaryFunc)recip8s, 
(BinaryFunc)recip16u, - (BinaryFunc)recip16s, (BinaryFunc)recip32s, (BinaryFunc)recip32f, - (BinaryFunc)recip64f, 0 + (BinaryFuncC)cv::hal::recip8u, (BinaryFuncC)cv::hal::recip8s, (BinaryFuncC)cv::hal::recip16u, + (BinaryFuncC)cv::hal::recip16s, (BinaryFuncC)cv::hal::recip32s, (BinaryFuncC)cv::hal::recip32f, + (BinaryFuncC)cv::hal::recip64f, 0 }; return recipTab; @@ -3715,421 +1039,13 @@ void cv::divide(double scale, InputArray src2, namespace cv { -template -struct AddWeighted_SIMD +static BinaryFuncC* getAddWeightedTab() { - int operator() (const T *, const T *, T *, int, WT, WT, WT) const + static BinaryFuncC addWeightedTab[] = { - return 0; - } -}; - -#if CV_SSE2 - -template <> -struct AddWeighted_SIMD -{ - AddWeighted_SIMD() - { - haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - if (!haveSSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), - v_gamma = _mm_set1_ps(gamma); - - for( ; x <= width - 8; x += 8 ) - { - __m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); - - __m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); - __m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); - - __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha); - v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta)); - - __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha); - v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta)); - - __m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0), - _mm_cvtps_epi32(v_dstf1)); - - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero)); - } - - return x; - } - - bool haveSSE2; -}; - -template <> -struct AddWeighted_SIMD -{ - AddWeighted_SIMD() - { - haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - if (!haveSSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), - v_gamma = _mm_set1_ps(gamma); - - for( ; x <= width - 8; x += 8 ) - { - __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); - - __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha); - v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta)); - - __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha); - v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta)); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0), - _mm_cvtps_epi32(v_dstf1))); - } - - return x; - } - - bool haveSSE2; -}; - -#if CV_SSE4_1 - -template <> -struct AddWeighted_SIMD -{ - 
AddWeighted_SIMD() - { - haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); - } - - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - if (!haveSSE4_1) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), - v_gamma = _mm_set1_ps(gamma); - - for( ; x <= width - 8; x += 8 ) - { - __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); - - __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha); - v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta)); - - __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha); - v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta)); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0), - _mm_cvtps_epi32(v_dstf1))); - } - - return x; - } - - bool haveSSE4_1; -}; - -#endif - -#elif CV_NEON - -template <> -struct AddWeighted_SIMD -{ - int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - float32x4_t g = vdupq_n_f32 (gamma); - - for( ; x <= width - 8; x += 8 ) - { - int8x8_t in1 = vld1_s8(src1 + x); - int16x8_t in1_16 = vmovl_s8(in1); - float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16))); - float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16))); - - int8x8_t in2 = vld1_s8(src2+x); - int16x8_t in2_16 = vmovl_s8(in2); - float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16))); - float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16))); - - float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta)); - float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta)); - out_f_l = vaddq_f32(out_f_l, g); - out_f_h = vaddq_f32(out_f_h, g); - - int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l)); - int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h)); - - int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h); - int8x8_t out = vqmovn_s16(out_16); - - vst1_s8(dst + x, out); - } - - return x; - } -}; - -template <> -struct AddWeighted_SIMD -{ - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - float32x4_t g = vdupq_n_f32(gamma); - - for( ; x <= width - 8; x += 8 ) - { - uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); - - float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha); - float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta); - uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); - - v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha); - v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta); - uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); - - vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2)); - } - - return x; - } -}; - -template <> -struct AddWeighted_SIMD -{ - int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const - { - int 
x = 0; - - float32x4_t g = vdupq_n_f32(gamma); - - for( ; x <= width - 8; x += 8 ) - { - int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); - - float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha); - float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta); - int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); - - v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha); - v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta); - int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); - - vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2)); - } - - return x; - } -}; - -#endif - -template static void -addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size size, void* _scalars ) -{ - const double* scalars = (const double*)_scalars; - WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2]; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - AddWeighted_SIMD vop; - - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x = vop(src1, src2, dst, size.width, alpha, beta, gamma); - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) - { - T t0 = saturate_cast(src1[x]*alpha + src2[x]*beta + gamma); - T t1 = saturate_cast(src1[x+1]*alpha + src2[x+1]*beta + gamma); - dst[x] = t0; dst[x+1] = t1; - - t0 = saturate_cast(src1[x+2]*alpha + src2[x+2]*beta + gamma); - t1 = saturate_cast(src1[x+3]*alpha + src2[x+3]*beta + gamma); - dst[x+2] = t0; dst[x+3] = t1; - } - #endif - for( ; x < size.width; x++ ) - dst[x] = saturate_cast(src1[x]*alpha + src2[x]*beta + gamma); - } -} - - -static void -addWeighted8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size size, - void* _scalars ) -{ - const double* scalars = (const double*)_scalars; - float alpha = (float)scalars[0], beta = (float)scalars[1], gamma = (float)scalars[2]; - - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; - -#if CV_SSE2 - if( USE_SSE2 ) - { - __m128 a4 = _mm_set1_ps(alpha), b4 = _mm_set1_ps(beta), g4 = _mm_set1_ps(gamma); - __m128i z = _mm_setzero_si128(); - - for( ; x <= size.width - 8; x += 8 ) - { - __m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src1 + x)), z); - __m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src2 + x)), z); - - __m128 u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(u, z)); - __m128 u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(u, z)); - __m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, z)); - __m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, z)); - - u0 = _mm_add_ps(_mm_mul_ps(u0, a4), _mm_mul_ps(v0, b4)); - u1 = _mm_add_ps(_mm_mul_ps(u1, a4), _mm_mul_ps(v1, b4)); - u0 = _mm_add_ps(u0, g4); u1 = _mm_add_ps(u1, g4); - - u = _mm_packs_epi32(_mm_cvtps_epi32(u0), _mm_cvtps_epi32(u1)); - u = _mm_packus_epi16(u, u); - - _mm_storel_epi64((__m128i*)(dst + x), u); - } - } -#elif CV_NEON - float32x4_t g = vdupq_n_f32 (gamma); - - for( ; x <= size.width - 8; x += 8 ) - { - uint8x8_t in1 = vld1_u8(src1+x); - uint16x8_t in1_16 = vmovl_u8(in1); - float32x4_t in1_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in1_16))); - float32x4_t in1_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in1_16))); - - uint8x8_t in2 = vld1_u8(src2+x); - uint16x8_t in2_16 = vmovl_u8(in2); - float32x4_t in2_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in2_16))); - 
float32x4_t in2_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in2_16))); - - float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta)); - float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta)); - out_f_l = vaddq_f32(out_f_l, g); - out_f_h = vaddq_f32(out_f_h, g); - - uint16x4_t out_16_l = vqmovun_s32(cv_vrndq_s32_f32(out_f_l)); - uint16x4_t out_16_h = vqmovun_s32(cv_vrndq_s32_f32(out_f_h)); - - uint16x8_t out_16 = vcombine_u16(out_16_l, out_16_h); - uint8x8_t out = vqmovn_u16(out_16); - - vst1_u8(dst+x, out); - } -#endif - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) - { - float t0, t1; - t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; - t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma; - - dst[x] = saturate_cast(t0); - dst[x+1] = saturate_cast(t1); - - t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma; - t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma; - - dst[x+2] = saturate_cast(t0); - dst[x+3] = saturate_cast(t1); - } - #endif - - for( ; x < size.width; x++ ) - { - float t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; - dst[x] = saturate_cast(t0); - } - } -} - -static void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* scalars ) -{ - addWeighted_(src1, step1, src2, step2, dst, step, sz, scalars); -} - -static void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* scalars ) -{ - addWeighted_(src1, step1, src2, step2, dst, step, sz, scalars); -} - -static void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* scalars ) -{ - addWeighted_(src1, step1, src2, step2, dst, step, sz, scalars); -} - -static void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* scalars ) -{ - addWeighted_(src1, step1, src2, step2, dst, step, sz, scalars); -} - -static void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* scalars ) -{ - addWeighted_(src1, step1, src2, step2, dst, step, sz, scalars); -} - -static void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* scalars ) -{ - addWeighted_(src1, step1, src2, step2, dst, step, sz, scalars); -} - -static BinaryFunc* getAddWeightedTab() -{ - static BinaryFunc addWeightedTab[] = - { - (BinaryFunc)GET_OPTIMIZED(addWeighted8u), (BinaryFunc)GET_OPTIMIZED(addWeighted8s), (BinaryFunc)GET_OPTIMIZED(addWeighted16u), - (BinaryFunc)GET_OPTIMIZED(addWeighted16s), (BinaryFunc)GET_OPTIMIZED(addWeighted32s), (BinaryFunc)addWeighted32f, - (BinaryFunc)addWeighted64f, 0 + (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted8s), (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted16u), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted16s), (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted32s), (BinaryFuncC)cv::hal::addWeighted32f, + (BinaryFuncC)cv::hal::addWeighted64f, 0 }; return addWeightedTab; @@ -4152,720 +1068,14 @@ void cv::addWeighted( InputArray src1, double alpha, InputArray src2, namespace cv { -template -struct Cmp_SIMD +static BinaryFuncC getCmpFunc(int depth) { - explicit Cmp_SIMD(int) + static BinaryFuncC cmpTab[] = { - 
} - - int operator () (const T *, const T *, uchar *, int) const - { - return 0; - } -}; - -#if CV_NEON - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - CV_Assert(code == CMP_GT || code == CMP_LE || - code == CMP_EQ || code == CMP_NE); - - v_mask = vdupq_n_u8(255); - } - - int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const - { - int x = 0; - - if (code == CMP_GT) - for ( ; x <= width - 16; x += 16) - vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); - else if (code == CMP_LE) - for ( ; x <= width - 16; x += 16) - vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); - else if (code == CMP_EQ) - for ( ; x <= width - 16; x += 16) - vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); - else if (code == CMP_NE) - for ( ; x <= width - 16; x += 16) - vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask)); - - return x; - } - - int code; - uint8x16_t v_mask; -}; - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - CV_Assert(code == CMP_GT || code == CMP_LE || - code == CMP_EQ || code == CMP_NE); - - v_mask = vdup_n_u8(255); - } - - int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const - { - int x = 0; - - if (code == CMP_GT) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); - vst1_u8(dst + x, vmovn_u16(v_dst)); - } - else if (code == CMP_LE) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); - vst1_u8(dst + x, vmovn_u16(v_dst)); - } - else if (code == CMP_EQ) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); - vst1_u8(dst + x, vmovn_u16(v_dst)); - } - else if (code == CMP_NE) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); - vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask)); - } - - return x; - } - - int code; - uint8x8_t v_mask; -}; - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - CV_Assert(code == CMP_GT || code == CMP_LE || - code == CMP_EQ || code == CMP_NE); - - v_mask = vdup_n_u8(255); - } - - int operator () (const int * src1, const int * src2, uchar * dst, int width) const - { - int x = 0; - - if (code == CMP_GT) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); - uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_LE) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); - uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_EQ) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); - uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_NE) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); - uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 
+ x + 4), vld1q_s32(src2 + x + 4)); - uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); - vst1_u8(dst + x, veor_u8(v_dst, v_mask)); - } - - return x; - } - - int code; - uint8x8_t v_mask; -}; - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - CV_Assert(code == CMP_GT || code == CMP_LE || - code == CMP_EQ || code == CMP_NE); - - v_mask = vdup_n_u8(255); - } - - int operator () (const float * src1, const float * src2, uchar * dst, int width) const - { - int x = 0; - - if (code == CMP_GT) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_LE) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_EQ) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_NE) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); - vst1_u8(dst + x, veor_u8(v_dst, v_mask)); - } - - return x; - } - - int code; - uint8x8_t v_mask; -}; - -#elif CV_SSE2 - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - CV_Assert(code == CMP_GT || code == CMP_LE || - code == CMP_EQ || code == CMP_NE); - - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - - v_mask = _mm_set1_epi8(-1); - } - - int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const - { - int x = 0; - - if (!haveSSE) - return x; - - if (code == CMP_GT) - for ( ; x <= width - 16; x += 16) - _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x)))); - else if (code == CMP_LE) - for ( ; x <= width - 16; x += 16) - { - __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt)); - } - else if (code == CMP_EQ) - for ( ; x <= width - 16; x += 16) - _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x)))); - else if (code == CMP_NE) - for ( ; x <= width - 16; x += 16) - { - __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq)); - } - - return x; - } - - int code; - __m128i v_mask; - bool haveSSE; -}; - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - CV_Assert(code == CMP_GT || code == CMP_LE || - code == CMP_EQ || code == CMP_NE); - - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - - v_mask = 
_mm_set1_epi32(0xffffffff); - } - - int operator () (const int * src1, const int * src2, uchar * dst, int width) const - { - int x = 0; - - if (!haveSSE) - return x; - - if (code == CMP_GT) - for ( ; x <= width - 8; x += 8) - { - __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), - _mm_loadu_si128((const __m128i *)(src2 + x + 4))); - - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); - } - else if (code == CMP_LE) - for ( ; x <= width - 8; x += 8) - { - __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), - _mm_loadu_si128((const __m128i *)(src2 + x + 4))); - - _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask)); - } - else if (code == CMP_EQ) - for ( ; x <= width - 8; x += 8) - { - __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), - _mm_loadu_si128((const __m128i *)(src2 + x + 4))); - - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); - } - else if (code == CMP_NE) - for ( ; x <= width - 8; x += 8) - { - __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), - _mm_loadu_si128((const __m128i *)(src2 + x + 4))); - - _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask))); - } - - return x; - } - - int code; - __m128i v_mask; - bool haveSSE; -}; - -#endif - -template static void -cmp_(const T* src1, size_t step1, const T* src2, size_t step2, - uchar* dst, size_t step, Size size, int code) -{ - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - if( code == CMP_GE || code == CMP_LT ) - { - std::swap(src1, src2); - std::swap(step1, step2); - code = code == CMP_GE ? CMP_LE : CMP_GT; - } - - Cmp_SIMD vop(code); - - if( code == CMP_GT || code == CMP_LE ) - { - int m = code == CMP_GT ? 0 : 255; - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x = vop(src1, src2, dst, size.width); - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) - { - int t0, t1; - t0 = -(src1[x] > src2[x]) ^ m; - t1 = -(src1[x+1] > src2[x+1]) ^ m; - dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; - t0 = -(src1[x+2] > src2[x+2]) ^ m; - t1 = -(src1[x+3] > src2[x+3]) ^ m; - dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; - } - #endif - for( ; x < size.width; x++ ) - dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); - } - } - else if( code == CMP_EQ || code == CMP_NE ) - { - int m = code == CMP_EQ ? 
0 : 255; - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) - { - int t0, t1; - t0 = -(src1[x] == src2[x]) ^ m; - t1 = -(src1[x+1] == src2[x+1]) ^ m; - dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; - t0 = -(src1[x+2] == src2[x+2]) ^ m; - t1 = -(src1[x+3] == src2[x+3]) ^ m; - dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; - } - #endif - for( ; x < size.width; x++ ) - dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); - } - } -} - -#if ARITHM_USE_IPP -inline static IppCmpOp convert_cmp(int _cmpop) -{ - return _cmpop == CMP_EQ ? ippCmpEq : - _cmpop == CMP_GT ? ippCmpGreater : - _cmpop == CMP_GE ? ippCmpGreaterEq : - _cmpop == CMP_LT ? ippCmpLess : - _cmpop == CMP_LE ? ippCmpLessEq : - (IppCmpOp)-1; -} -#endif - -static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) -{ -#if ARITHM_USE_IPP - CV_IPP_CHECK() - { - IppCmpOp op = convert_cmp(*(int *)_cmpop); - if( op >= 0 ) - { - fixSteps(size, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiCompare_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - //vz optimized cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); - int code = *(int*)_cmpop; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - if( code == CMP_GE || code == CMP_LT ) - { - std::swap(src1, src2); - std::swap(step1, step2); - code = code == CMP_GE ? CMP_LE : CMP_GT; - } - - if( code == CMP_GT || code == CMP_LE ) - { - int m = code == CMP_GT ? 0 : 255; - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x =0; - #if CV_SSE2 - if( USE_SSE2 ) - { - __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (-1); - __m128i c128 = _mm_set1_epi8 (-128); - for( ; x <= size.width - 16; x += 16 ) - { - __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); - __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); - // no simd for 8u comparison, that's why we need the trick - r00 = _mm_sub_epi8(r00,c128); - r10 = _mm_sub_epi8(r10,c128); - - r00 =_mm_xor_si128(_mm_cmpgt_epi8(r00, r10), m128); - _mm_storeu_si128((__m128i*)(dst + x),r00); - - } - } - #elif CV_NEON - uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255); - - for( ; x <= size.width - 16; x += 16 ) - { - vst1q_u8(dst+x, veorq_u8(vcgtq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask)); - } - - #endif - - for( ; x < size.width; x++ ){ - dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); - } - } - } - else if( code == CMP_EQ || code == CMP_NE ) - { - int m = code == CMP_EQ ? 0 : 255; - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; - #if CV_SSE2 - if( USE_SSE2 ) - { - __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8 (-1); - for( ; x <= size.width - 16; x += 16 ) - { - __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); - __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); - r00 = _mm_xor_si128 ( _mm_cmpeq_epi8 (r00, r10), m128); - _mm_storeu_si128((__m128i*)(dst + x), r00); - } - } - #elif CV_NEON - uint8x16_t mask = code == CMP_EQ ? 
vdupq_n_u8(0) : vdupq_n_u8(255); - - for( ; x <= size.width - 16; x += 16 ) - { - vst1q_u8(dst+x, veorq_u8(vceqq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask)); - } - #endif - for( ; x < size.width; x++ ) - dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); - } - } -} - -static void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) -{ - cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); -} - -static void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) -{ -#if ARITHM_USE_IPP - CV_IPP_CHECK() - { - IppCmpOp op = convert_cmp(*(int *)_cmpop); - if( op >= 0 ) - { - fixSteps(size, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiCompare_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); -} - -static void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) -{ -#if ARITHM_USE_IPP - CV_IPP_CHECK() - { - IppCmpOp op = convert_cmp(*(int *)_cmpop); - if( op >= 0 ) - { - fixSteps(size, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiCompare_16s_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - //vz optimized cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); - - int code = *(int*)_cmpop; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - if( code == CMP_GE || code == CMP_LT ) - { - std::swap(src1, src2); - std::swap(step1, step2); - code = code == CMP_GE ? CMP_LE : CMP_GT; - } - - if( code == CMP_GT || code == CMP_LE ) - { - int m = code == CMP_GT ? 0 : 255; - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x =0; - #if CV_SSE2 - if( USE_SSE2) - { - __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (-1); - for( ; x <= size.width - 16; x += 16 ) - { - __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); - __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); - r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128); - __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8)); - __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8)); - r01 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r01, r11), m128); - r11 = _mm_packs_epi16(r00, r01); - _mm_storeu_si128((__m128i*)(dst + x), r11); - } - if( x <= size.width-8) - { - __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); - __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); - r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128); - r10 = _mm_packs_epi16(r00, r00); - _mm_storel_epi64((__m128i*)(dst + x), r10); - - x += 8; - } - } - #elif CV_NEON - uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255); - - for( ; x <= size.width - 16; x += 16 ) - { - int16x8_t in1 = vld1q_s16(src1 + x); - int16x8_t in2 = vld1q_s16(src2 + x); - uint8x8_t t1 = vmovn_u16(vcgtq_s16(in1, in2)); - - in1 = vld1q_s16(src1 + x + 8); - in2 = vld1q_s16(src2 + x + 8); - uint8x8_t t2 = vmovn_u16(vcgtq_s16(in1, in2)); - - vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask)); - } - #endif - - for( ; x < size.width; x++ ){ - dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); - } - } - } - else if( code == CMP_EQ || code == CMP_NE ) - { - int m = code == CMP_EQ ? 
0 : 255; - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; - #if CV_SSE2 - if( USE_SSE2 ) - { - __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi16 (-1); - for( ; x <= size.width - 16; x += 16 ) - { - __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); - __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); - r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128); - __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8)); - __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8)); - r01 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r01, r11), m128); - r11 = _mm_packs_epi16(r00, r01); - _mm_storeu_si128((__m128i*)(dst + x), r11); - } - if( x <= size.width - 8) - { - __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); - __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); - r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128); - r10 = _mm_packs_epi16(r00, r00); - _mm_storel_epi64((__m128i*)(dst + x), r10); - - x += 8; - } - } - #elif CV_NEON - uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255); - - for( ; x <= size.width - 16; x += 16 ) - { - int16x8_t in1 = vld1q_s16(src1 + x); - int16x8_t in2 = vld1q_s16(src2 + x); - uint8x8_t t1 = vmovn_u16(vceqq_s16(in1, in2)); - - in1 = vld1q_s16(src1 + x + 8); - in2 = vld1q_s16(src2 + x + 8); - uint8x8_t t2 = vmovn_u16(vceqq_s16(in1, in2)); - - vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask)); - } - #endif - for( ; x < size.width; x++ ) - dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); - } - } -} - -static void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) -{ - cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); -} - -static void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) -{ -#if ARITHM_USE_IPP - CV_IPP_CHECK() - { - IppCmpOp op = convert_cmp(*(int *)_cmpop); - if( op >= 0 ) - { - fixSteps(size, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiCompare_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); -} - -static void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) -{ - cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); -} - -static BinaryFunc getCmpFunc(int depth) -{ - static BinaryFunc cmpTab[] = - { - (BinaryFunc)GET_OPTIMIZED(cmp8u), (BinaryFunc)GET_OPTIMIZED(cmp8s), - (BinaryFunc)GET_OPTIMIZED(cmp16u), (BinaryFunc)GET_OPTIMIZED(cmp16s), - (BinaryFunc)GET_OPTIMIZED(cmp32s), - (BinaryFunc)GET_OPTIMIZED(cmp32f), (BinaryFunc)cmp64f, + (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp8s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp16s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp32s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp32f), (BinaryFuncC)cv::hal::cmp64f, 0 }; @@ -5020,7 +1230,7 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op) _dst.create(src1.size(), CV_8UC(cn)); Mat dst = _dst.getMat(); Size sz = getContinuousSize(src1, src2, dst, src1.channels()); - getCmpFunc(src1.depth())(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, &op); + getCmpFunc(src1.depth())(src1.ptr(), src1.step, 
src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, &op); return; } @@ -5032,7 +1242,7 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op) size_t esz = src1.elemSize(); size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz; - BinaryFunc func = getCmpFunc(depth1); + BinaryFuncC func = getCmpFunc(depth1); if( !haveScalar ) { @@ -5043,7 +1253,7 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op) size_t total = it.size; for( size_t i = 0; i < it.nplanes; i++, ++it ) - func( ptrs[0], 0, ptrs[1], 0, ptrs[2], 0, Size((int)total, 1), &op ); + func( ptrs[0], 0, ptrs[1], 0, ptrs[2], 0, (int)total, 1, &op ); } else { @@ -5095,7 +1305,7 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op) for( size_t j = 0; j < total; j += blocksize ) { int bsz = (int)MIN(total - j, blocksize); - func( ptrs[0], 0, buf, 0, ptrs[1], 0, Size(bsz, 1), &op); + func( ptrs[0], 0, buf, 0, ptrs[1], 0, bsz, 1, &op); ptrs[0] += bsz*esz; ptrs[1] += bsz; } diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index fbbea5e1b1..6c693a43a0 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -42,6 +42,7 @@ //M*/ #include "precomp.hpp" + #include "opencl_kernels_core.hpp" #ifdef __APPLE__ @@ -49,776 +50,37 @@ #define CV_NEON 0 #endif -namespace cv -{ /****************************************************************************************\ * split & merge * \****************************************************************************************/ -#if CV_NEON -template struct VSplit2; -template struct VSplit3; -template struct VSplit4; - -#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name \ - { \ - void operator()(const data_type* src, data_type* dst0, \ - data_type* dst1) const \ - { \ - reg_type r = load_func(src); \ - store_func(dst0, r.val[0]); \ - store_func(dst1, r.val[1]); \ - } \ - } - -#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name \ - { \ - void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ - data_type* dst2) const \ - { \ - reg_type r = load_func(src); \ - store_func(dst0, r.val[0]); \ - store_func(dst1, r.val[1]); \ - store_func(dst2, r.val[2]); \ - } \ - } - -#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name \ - { \ - void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ - data_type* dst2, data_type* dst3) const \ - { \ - reg_type r = load_func(src); \ - store_func(dst0, r.val[0]); \ - store_func(dst1, r.val[1]); \ - store_func(dst2, r.val[2]); \ - store_func(dst3, r.val[3]); \ - } \ - } - -SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 ); -SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16); -SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32); -SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 ); - -SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 ); -SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16); -SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32); -SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 ); - -SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 ); -SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16); 
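// --- Illustration only, not part of the patch ---------------------------------
// The compare() call sites above now reach the kernels through a plain-C style
// signature (raw pointers, byte steps, separate width/height ints) instead of the
// old Size-based BinaryFunc, which is what lets an external HAL replace them.
// A minimal sketch of driving one of the relocated comparison kernels directly;
// the buffers and sizes below are made up for the example.
#include <vector>
#include "opencv2/hal.hpp"

static void compare_rows_sketch()
{
    const int width = 8, height = 2;
    std::vector<uchar> a(width * height, 10), b(width * height, 12), mask(width * height);
    int op = cv::hal::CMP_LT;      // the comparison code travels through the void* slot
    // steps are byte strides between rows; rows are contiguous here, so step == width
    cv::hal::cmp8u(a.data(), width, b.data(), width,
                   mask.data(), width, width, height, &op);
    // mask[i] is 255 where a[i] < b[i], 0 otherwise
}
// -------------------------------------------------------------------------------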
-SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32); -SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 ); - -#elif CV_SSE2 - -template -struct VSplit2 -{ - VSplit2() : support(false) { } - void operator()(const T *, T *, T *) const { } - - bool support; -}; - -template -struct VSplit3 -{ - VSplit3() : support(false) { } - void operator()(const T *, T *, T *, T *) const { } - - bool support; -}; - -template -struct VSplit4 -{ - VSplit4() : support(false) { } - void operator()(const T *, T *, T *, T *, T *) const { } - - bool support; -}; - -#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ -template <> \ -struct VSplit2 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VSplit2() \ - { \ - support = checkHardwareSupport(CV_CPU_SSE2); \ - } \ - \ - void operator()(const data_type * src, \ - data_type * dst0, data_type * dst1) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ - reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ - reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ - \ - _mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \ - \ - _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ - } \ - \ - bool support; \ -} - -#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ -template <> \ -struct VSplit3 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VSplit3() \ - { \ - support = checkHardwareSupport(CV_CPU_SSE2); \ - } \ - \ - void operator()(const data_type * src, \ - data_type * dst0, data_type * dst1, data_type * dst2) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ - reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ - reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ - reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \ - reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \ - \ - _mm_deinterleave(v_src0, v_src1, v_src2, \ - v_src3, v_src4, v_src5); \ - \ - _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ - _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \ - _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \ - } \ - \ - bool support; \ -} - -#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ -template <> \ -struct VSplit4 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VSplit4() \ - { \ - support = checkHardwareSupport(CV_CPU_SSE2); \ - } \ - \ - void operator()(const data_type * src, data_type * dst0, data_type * dst1, \ - data_type * dst2, data_type * dst3) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ - reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + 
ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ - reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ - reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \ - reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \ - reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \ - reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \ - \ - _mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \ - v_src4, v_src5, v_src6, v_src7); \ - \ - _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ - _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \ - _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \ - _mm_storeu_##flavor((cast_type *)(dst3), v_src6); \ - _mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \ - } \ - \ - bool support; \ -} - -SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); -SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); -SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); - -SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); -SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); -SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); - -SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); -SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); -SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); - -#endif - -template static void -split_( const T* src, T** dst, int len, int cn ) -{ - int k = cn % 4 ? cn % 4 : 4; - int i, j; - if( k == 1 ) - { - T* dst0 = dst[0]; - - if(cn == 1) - { - memcpy(dst0, src, len * sizeof(T)); - } - else - { - for( i = 0, j = 0 ; i < len; i++, j += cn ) - dst0[i] = src[j]; - } - } - else if( k == 2 ) - { - T *dst0 = dst[0], *dst1 = dst[1]; - i = j = 0; - -#if CV_NEON - if(cn == 2) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 2 * inc_i; - - VSplit2 vsplit; - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i); - } -#elif CV_SSE2 - if (cn == 2) - { - int inc_i = 32/sizeof(T); - int inc_j = 2 * inc_i; - - VSplit2 vsplit; - if (vsplit.support) - { - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i); - } - } -#endif - for( ; i < len; i++, j += cn ) - { - dst0[i] = src[j]; - dst1[i] = src[j+1]; - } - } - else if( k == 3 ) - { - T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2]; - i = j = 0; - -#if CV_NEON - if(cn == 3) - { - int inc_i = (sizeof(T) == 8)? 
1: 16/sizeof(T); - int inc_j = 3 * inc_i; - - VSplit3 vsplit; - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); - } -#elif CV_SSE2 - if (cn == 3) - { - int inc_i = 32/sizeof(T); - int inc_j = 3 * inc_i; - - VSplit3 vsplit; - - if (vsplit.support) - { - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); - } - } -#endif - for( ; i < len; i++, j += cn ) - { - dst0[i] = src[j]; - dst1[i] = src[j+1]; - dst2[i] = src[j+2]; - } - } - else - { - T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3]; - i = j = 0; - -#if CV_NEON - if(cn == 4) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 4 * inc_i; - - VSplit4 vsplit; - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); - } -#elif CV_SSE2 - if (cn == 4) - { - int inc_i = 32/sizeof(T); - int inc_j = 4 * inc_i; - - VSplit4 vsplit; - if (vsplit.support) - { - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); - } - } -#endif - for( ; i < len; i++, j += cn ) - { - dst0[i] = src[j]; dst1[i] = src[j+1]; - dst2[i] = src[j+2]; dst3[i] = src[j+3]; - } - } - - for( ; k < cn; k += 4 ) - { - T *dst0 = dst[k], *dst1 = dst[k+1], *dst2 = dst[k+2], *dst3 = dst[k+3]; - for( i = 0, j = k; i < len; i++, j += cn ) - { - dst0[i] = src[j]; dst1[i] = src[j+1]; - dst2[i] = src[j+2]; dst3[i] = src[j+3]; - } - } -} - - -#if CV_NEON -template struct VMerge2; -template struct VMerge3; -template struct VMerge4; - -#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name{ \ - void operator()(const data_type* src0, const data_type* src1, \ - data_type* dst){ \ - reg_type r; \ - r.val[0] = load_func(src0); \ - r.val[1] = load_func(src1); \ - store_func(dst, r); \ - } \ - } - -#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name{ \ - void operator()(const data_type* src0, const data_type* src1, \ - const data_type* src2, data_type* dst){ \ - reg_type r; \ - r.val[0] = load_func(src0); \ - r.val[1] = load_func(src1); \ - r.val[2] = load_func(src2); \ - store_func(dst, r); \ - } \ - } - -#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name{ \ - void operator()(const data_type* src0, const data_type* src1, \ - const data_type* src2, const data_type* src3, \ - data_type* dst){ \ - reg_type r; \ - r.val[0] = load_func(src0); \ - r.val[1] = load_func(src1); \ - r.val[2] = load_func(src2); \ - r.val[3] = load_func(src3); \ - store_func(dst, r); \ - } \ - } - -MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 ); -MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16); -MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32); -MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 ); - -MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 ); -MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16); -MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32); -MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 ); - -MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 ); -MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16); -MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, 
vst4q_s32); -MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 ); - -#elif CV_SSE2 - -template -struct VMerge2 -{ - VMerge2() : support(false) { } - void operator()(const T *, const T *, T *) const { } - - bool support; -}; - -template -struct VMerge3 -{ - VMerge3() : support(false) { } - void operator()(const T *, const T *, const T *, T *) const { } - - bool support; -}; - -template -struct VMerge4 -{ - VMerge4() : support(false) { } - void operator()(const T *, const T *, const T *, const T *, T *) const { } - - bool support; -}; - -#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ -template <> \ -struct VMerge2 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VMerge2() \ - { \ - support = checkHardwareSupport(se); \ - } \ - \ - void operator()(const data_type * src0, const data_type * src1, \ - data_type * dst) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ - reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ - reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ - \ - _mm_interleave(v_src0, v_src1, v_src2, v_src3); \ - \ - _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ - } \ - \ - bool support; \ -} - -#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ -template <> \ -struct VMerge3 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VMerge3() \ - { \ - support = checkHardwareSupport(se); \ - } \ - \ - void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\ - data_type * dst) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ - reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ - reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ - reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \ - reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \ - \ - _mm_interleave(v_src0, v_src1, v_src2, \ - v_src3, v_src4, v_src5); \ - \ - _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \ - } \ - \ - bool support; \ -} - -#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ -template <> \ -struct VMerge4 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VMerge4() \ - { \ - support = checkHardwareSupport(se); \ - } \ - \ - void operator()(const data_type * src0, const data_type * src1, \ - const data_type * src2, const data_type * src3, \ - data_type * dst) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ - reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ - 
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ - reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ - reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \ - reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \ - reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3)); \ - reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \ - \ - _mm_interleave(v_src0, v_src1, v_src2, v_src3, \ - v_src4, v_src5, v_src6, v_src7); \ - \ - _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7); \ - } \ - \ - bool support; \ -} - -MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); -MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); -MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); - -#if CV_SSE4_1 -MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); -MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); -MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); -#endif - -MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); -MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); -MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); - -#endif - -template static void -merge_( const T** src, T* dst, int len, int cn ) -{ - int k = cn % 4 ? cn % 4 : 4; - int i, j; - if( k == 1 ) - { - const T* src0 = src[0]; - for( i = j = 0; i < len; i++, j += cn ) - dst[j] = src0[i]; - } - else if( k == 2 ) - { - const T *src0 = src[0], *src1 = src[1]; - i = j = 0; -#if CV_NEON - if(cn == 2) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 2 * inc_i; - - VMerge2 vmerge; - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, dst + j); - } -#elif CV_SSE2 - if(cn == 2) - { - int inc_i = 32/sizeof(T); - int inc_j = 2 * inc_i; - - VMerge2 vmerge; - if (vmerge.support) - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, dst + j); - } -#endif - for( ; i < len; i++, j += cn ) - { - dst[j] = src0[i]; - dst[j+1] = src1[i]; - } - } - else if( k == 3 ) - { - const T *src0 = src[0], *src1 = src[1], *src2 = src[2]; - i = j = 0; -#if CV_NEON - if(cn == 3) - { - int inc_i = (sizeof(T) == 8)? 
1: 16/sizeof(T); - int inc_j = 3 * inc_i; - - VMerge3 vmerge; - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, src2 + i, dst + j); - } -#elif CV_SSE2 - if(cn == 3) - { - int inc_i = 32/sizeof(T); - int inc_j = 3 * inc_i; - - VMerge3 vmerge; - if (vmerge.support) - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, src2 + i, dst + j); - } -#endif - for( ; i < len; i++, j += cn ) - { - dst[j] = src0[i]; - dst[j+1] = src1[i]; - dst[j+2] = src2[i]; - } - } - else - { - const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3]; - i = j = 0; -#if CV_NEON - if(cn == 4) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 4 * inc_i; - - VMerge4 vmerge; - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); - } -#elif CV_SSE2 - if(cn == 4) - { - int inc_i = 32/sizeof(T); - int inc_j = 4 * inc_i; - - VMerge4 vmerge; - if (vmerge.support) - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); - } -#endif - for( ; i < len; i++, j += cn ) - { - dst[j] = src0[i]; dst[j+1] = src1[i]; - dst[j+2] = src2[i]; dst[j+3] = src3[i]; - } - } - - for( ; k < cn; k += 4 ) - { - const T *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3]; - for( i = 0, j = k; i < len; i++, j += cn ) - { - dst[j] = src0[i]; dst[j+1] = src1[i]; - dst[j+2] = src2[i]; dst[j+3] = src3[i]; - } - } -} - -static void split8u(const uchar* src, uchar** dst, int len, int cn ) -{ - split_(src, dst, len, cn); -} - -static void split16u(const ushort* src, ushort** dst, int len, int cn ) -{ - split_(src, dst, len, cn); -} - -static void split32s(const int* src, int** dst, int len, int cn ) -{ - split_(src, dst, len, cn); -} - -static void split64s(const int64* src, int64** dst, int len, int cn ) -{ - split_(src, dst, len, cn); -} - -static void merge8u(const uchar** src, uchar* dst, int len, int cn ) -{ - merge_(src, dst, len, cn); -} - -static void merge16u(const ushort** src, ushort* dst, int len, int cn ) -{ - merge_(src, dst, len, cn); -} - -static void merge32s(const int** src, int* dst, int len, int cn ) -{ - merge_(src, dst, len, cn); -} - -static void merge64s(const int64** src, int64* dst, int len, int cn ) -{ - merge_(src, dst, len, cn); -} - typedef void (*SplitFunc)(const uchar* src, uchar** dst, int len, int cn); -typedef void (*MergeFunc)(const uchar** src, uchar* dst, int len, int cn); static SplitFunc getSplitFunc(int depth) { static SplitFunc splitTab[] = { - (SplitFunc)GET_OPTIMIZED(split8u), (SplitFunc)GET_OPTIMIZED(split8u), (SplitFunc)GET_OPTIMIZED(split16u), (SplitFunc)GET_OPTIMIZED(split16u), - (SplitFunc)GET_OPTIMIZED(split32s), (SplitFunc)GET_OPTIMIZED(split32s), (SplitFunc)GET_OPTIMIZED(split64s), 0 + (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), + (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), 0 }; return splitTab[depth]; } +typedef void (*MergeFunc)(const uchar** src, uchar* dst, int len, int cn); + static MergeFunc getMergeFunc(int depth) { static MergeFunc mergeTab[] = { - (MergeFunc)GET_OPTIMIZED(merge8u), (MergeFunc)GET_OPTIMIZED(merge8u), (MergeFunc)GET_OPTIMIZED(merge16u), (MergeFunc)GET_OPTIMIZED(merge16u), - (MergeFunc)GET_OPTIMIZED(merge32s), (MergeFunc)GET_OPTIMIZED(merge32s), 
(MergeFunc)GET_OPTIMIZED(merge64s), 0 + (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), + (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), 0 }; return mergeTab[depth]; } -} - void cv::split(const Mat& src, Mat* mv) { int k, depth = src.depth(), cn = src.channels(); diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index 6d19744820..d1f2ec22e1 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -83,6 +83,11 @@ typedef void (*BinaryFunc)(const uchar* src1, size_t step1, uchar* dst, size_t step, Size sz, void*); +typedef void (*BinaryFuncC)(const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, + void*); + BinaryFunc getConvertFunc(int sdepth, int ddepth); BinaryFunc getCopyMaskFunc(size_t esz); @@ -114,46 +119,6 @@ extern const uchar g_Saturate8u[]; void deleteThreadAllocData(); #endif -template struct OpAdd -{ - typedef T1 type1; - typedef T2 type2; - typedef T3 rtype; - T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a + b); } -}; - -template struct OpSub -{ - typedef T1 type1; - typedef T2 type2; - typedef T3 rtype; - T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a - b); } -}; - -template struct OpRSub -{ - typedef T1 type1; - typedef T2 type2; - typedef T3 rtype; - T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(b - a); } -}; - -template struct OpMin -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator ()(const T a, const T b) const { return std::min(a, b); } -}; - -template struct OpMax -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator ()(const T a, const T b) const { return std::max(a, b); } -}; - inline Size getContinuousSize_( int flags, int cols, int rows, int widthScale ) { int64 sz = (int64)cols * rows * widthScale; @@ -201,11 +166,6 @@ struct NoVec size_t operator()(const void*, const void*, void*, size_t) const { return 0; } }; -extern volatile bool USE_SSE2; -extern volatile bool USE_SSE4_2; -extern volatile bool USE_AVX; -extern volatile bool USE_AVX2; - enum { BLOCK_SIZE = 1024 }; #if defined HAVE_IPP && (IPP_VERSION_X100 >= 700) diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index dbe35ebfa4..ba2c9d536f 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -86,45 +86,6 @@ Mutex* __initialization_mutex_initializer = &getInitializationMutex(); #undef max #undef abs #include -#if defined _MSC_VER - #if _MSC_VER >= 1400 - #include - #elif defined _M_IX86 - static void __cpuid(int* cpuid_data, int) - { - __asm - { - push ebx - push edi - mov edi, cpuid_data - mov eax, 1 - cpuid - mov [edi], eax - mov [edi + 4], ebx - mov [edi + 8], ecx - mov [edi + 12], edx - pop edi - pop ebx - } - } - static void __cpuidex(int* cpuid_data, int, int) - { - __asm - { - push edi - mov edi, cpuid_data - mov eax, 7 - mov ecx, 0 - cpuid - mov [edi], eax - mov [edi + 4], ebx - mov [edi + 8], ecx - mov [edi + 12], edx - pop edi - } - } - #endif -#endif #ifdef WINRT #include @@ -237,160 +198,15 @@ void Exception::formatMessage() msg = format("%s:%d: error: (%d) %s\n", file.c_str(), line, code, err.c_str()); } -struct HWFeatures -{ - enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE }; - - HWFeatures(void) - { - memset( have, 0, 
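// --- Illustration only, not part of the patch ---------------------------------
// The split/merge kernels that were file-local in convert.cpp are now ordinary
// functions in cv::hal (see the split8u/merge8u declarations added to
// opencv2/hal.hpp further down), so they can be applied to raw interleaved
// buffers without going through cv::Mat. Sketch with made-up data:
#include <vector>
#include "opencv2/hal.hpp"

static void deinterleave_bgr_row(const uchar* interleaved, int len)
{
    std::vector<uchar> b(len), g(len), r(len);
    uchar* planes[3] = { b.data(), g.data(), r.data() };
    // len pixels, 3 channels: planes[c][i] = interleaved[i*3 + c]
    cv::hal::split8u(interleaved, planes, len, 3);
}
// -------------------------------------------------------------------------------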
sizeof(have) ); - x86_family = 0; - } - - static HWFeatures initialize(void) - { - HWFeatures f; - int cpuid_data[4] = { 0, 0, 0, 0 }; - - #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64) - __cpuid(cpuid_data, 1); - #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__) - #ifdef __x86_64__ - asm __volatile__ - ( - "movl $1, %%eax\n\t" - "cpuid\n\t" - :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3]) - : - : "cc" - ); - #else - asm volatile - ( - "pushl %%ebx\n\t" - "movl $1,%%eax\n\t" - "cpuid\n\t" - "popl %%ebx\n\t" - : "=a"(cpuid_data[0]), "=c"(cpuid_data[2]), "=d"(cpuid_data[3]) - : - : "cc" - ); - #endif - #endif - - f.x86_family = (cpuid_data[0] >> 8) & 15; - if( f.x86_family >= 6 ) - { - f.have[CV_CPU_MMX] = (cpuid_data[3] & (1 << 23)) != 0; - f.have[CV_CPU_SSE] = (cpuid_data[3] & (1<<25)) != 0; - f.have[CV_CPU_SSE2] = (cpuid_data[3] & (1<<26)) != 0; - f.have[CV_CPU_SSE3] = (cpuid_data[2] & (1<<0)) != 0; - f.have[CV_CPU_SSSE3] = (cpuid_data[2] & (1<<9)) != 0; - f.have[CV_CPU_FMA3] = (cpuid_data[2] & (1<<12)) != 0; - f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0; - f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0; - f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0; - f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX - - // make the second call to the cpuid command in order to get - // information about extended features like AVX2 - #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64) - __cpuidex(cpuid_data, 7, 0); - #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__) - #ifdef __x86_64__ - asm __volatile__ - ( - "movl $7, %%eax\n\t" - "movl $0, %%ecx\n\t" - "cpuid\n\t" - :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3]) - : - : "cc" - ); - #else - asm volatile - ( - "pushl %%ebx\n\t" - "movl $7,%%eax\n\t" - "movl $0,%%ecx\n\t" - "cpuid\n\t" - "movl %%ebx, %0\n\t" - "popl %%ebx\n\t" - : "=r"(cpuid_data[1]), "=c"(cpuid_data[2]) - : - : "cc" - ); - #endif - #endif - f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0; - - f.have[CV_CPU_AVX_512F] = (cpuid_data[1] & (1<<16)) != 0; - f.have[CV_CPU_AVX_512DQ] = (cpuid_data[1] & (1<<17)) != 0; - f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0; - f.have[CV_CPU_AVX_512PF] = (cpuid_data[1] & (1<<26)) != 0; - f.have[CV_CPU_AVX_512ER] = (cpuid_data[1] & (1<<27)) != 0; - f.have[CV_CPU_AVX_512CD] = (cpuid_data[1] & (1<<28)) != 0; - f.have[CV_CPU_AVX_512BW] = (cpuid_data[1] & (1<<30)) != 0; - f.have[CV_CPU_AVX_512VL] = (cpuid_data[1] & (1<<31)) != 0; - f.have[CV_CPU_AVX_512VBMI] = (cpuid_data[2] & (1<<1)) != 0; - } - - #if defined ANDROID || defined __linux__ - #ifdef __aarch64__ - f.have[CV_CPU_NEON] = true; - #else - int cpufile = open("/proc/self/auxv", O_RDONLY); - - if (cpufile >= 0) - { - Elf32_auxv_t auxv; - const size_t size_auxv_t = sizeof(auxv); - - while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t) - { - if (auxv.a_type == AT_HWCAP) - { - f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0; - break; - } - } - - close(cpufile); - } - #endif - #elif (defined __clang__ || defined __APPLE__) && (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__)) - f.have[CV_CPU_NEON] = true; - #endif - - return f; - } - - int x86_family; - bool have[MAX_FEATURE+1]; -}; - -static HWFeatures featuresEnabled = HWFeatures::initialize(), featuresDisabled = HWFeatures(); 
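// --- Illustration only, not part of the patch ---------------------------------
// The cpuid probing removed here is taken over by the hal module: core's
// checkHardwareSupport()/setUseOptimized() below become thin wrappers around
// cv::hal. A consumer-side sketch, assuming the usual CV_CPU_* ids from
// opencv2/hal/defs.h:
#include <cstdio>
#include "opencv2/hal/defs.h"

static void report_simd_support()
{
    std::printf("SSE2: %d  AVX2: %d  NEON: %d\n",
                (int)cv::hal::checkHardwareSupport(CV_CPU_SSE2),
                (int)cv::hal::checkHardwareSupport(CV_CPU_AVX2),
                (int)cv::hal::checkHardwareSupport(CV_CPU_NEON));

    cv::hal::setUseOptimized(false);   // feature checks report false from now on
    cv::hal::setUseOptimized(true);    // restore the detected feature set
}
// -------------------------------------------------------------------------------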
-static HWFeatures* currentFeatures = &featuresEnabled; - bool checkHardwareSupport(int feature) { CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE ); - return currentFeatures->have[feature]; + return cv::hal::checkHardwareSupport(feature); } - -volatile bool useOptimizedFlag = true; - -volatile bool USE_SSE2 = featuresEnabled.have[CV_CPU_SSE2]; -volatile bool USE_SSE4_2 = featuresEnabled.have[CV_CPU_SSE4_2]; -volatile bool USE_AVX = featuresEnabled.have[CV_CPU_AVX]; -volatile bool USE_AVX2 = featuresEnabled.have[CV_CPU_AVX2]; - void setUseOptimized( bool flag ) { - useOptimizedFlag = flag; - currentFeatures = flag ? &featuresEnabled : &featuresDisabled; - USE_SSE2 = currentFeatures->have[CV_CPU_SSE2]; + cv::hal::setUseOptimized(flag); ipp::setUseIPP(flag); #ifdef HAVE_OPENCL @@ -403,7 +219,7 @@ void setUseOptimized( bool flag ) bool useOptimized(void) { - return useOptimizedFlag; + return cv::hal::useOptimized(); } int64 getTickCount(void) @@ -683,12 +499,12 @@ redirectError( CvErrorCallback errCallback, void* userdata, void** prevUserdata) CV_IMPL int cvCheckHardwareSupport(int feature) { CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE ); - return cv::currentFeatures->have[feature]; + return cv::hal::checkHardwareSupport(feature); } CV_IMPL int cvUseOptimized( int flag ) { - int prevMode = cv::useOptimizedFlag; + int prevMode = cv::useOptimized(); cv::setUseOptimized( flag != 0 ); return prevMode; } diff --git a/modules/hal/CMakeLists.txt b/modules/hal/CMakeLists.txt index b04e96b9e7..982913dba7 100644 --- a/modules/hal/CMakeLists.txt +++ b/modules/hal/CMakeLists.txt @@ -2,10 +2,20 @@ set(the_description "The Hardware Acceleration Layer (HAL) module") set(OPENCV_MODULE_TYPE STATIC) +if(OPENCV_HAL_HEADERS AND OPENCV_HAL_LIBS) + set(OPENCV_HAL_HEADERS_INCLUDES "#include \"${OPENCV_HAL_HEADERS}\"") + set(DEPS "${OPENCV_HAL_LIBS}") +else() + set(OPENCV_HAL_HEADERS_INCLUDES "// using default HAL") + set(DEPS "") +endif() + +configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/custom_hal.hpp.in" "${CMAKE_BINARY_DIR}/custom_hal.hpp" @ONLY) + if(UNIX) if(CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") endif() endif() -ocv_define_module(hal) +ocv_define_module(hal ${DEPS}) diff --git a/modules/hal/include/opencv2/hal.hpp b/modules/hal/include/opencv2/hal.hpp index 9d448757d2..d1ec73b429 100644 --- a/modules/hal/include/opencv2/hal.hpp +++ b/modules/hal/include/opencv2/hal.hpp @@ -46,6 +46,7 @@ #define __OPENCV_HAL_HPP__ #include "opencv2/hal/defs.h" +#include "opencv2/hal/interface.hpp" /** @defgroup hal Hardware Acceleration Layer @@ -58,22 +59,19 @@ @} */ - namespace cv { namespace hal { //! @addtogroup hal //! 
@{ -namespace Error { - -enum +class Failure { - Ok = 0, - Unknown = -1 +public: + Failure(int code_ = Error::Unknown) : code(code_) {} +public: + int code; }; -} - int normHamming(const uchar* a, int n); int normHamming(const uchar* a, const uchar* b, int n); @@ -104,8 +102,194 @@ void sqrt(const double* src, double* dst, int len); void invSqrt(const float* src, float* dst, int len); void invSqrt(const double* src, double* dst, int len); +void split8u(const uchar* src, uchar** dst, int len, int cn ); +void split16u(const ushort* src, ushort** dst, int len, int cn ); +void split32s(const int* src, int** dst, int len, int cn ); +void split64s(const int64* src, int64** dst, int len, int cn ); + +void merge8u(const uchar** src, uchar* dst, int len, int cn ); +void merge16u(const ushort** src, ushort* dst, int len, int cn ); +void merge32s(const int** src, int* dst, int len, int cn ); +void merge64s(const int64** src, int64* dst, int len, int cn ); + +void add8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void add8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* ); +void add16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* ); +void add16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* ); +void add32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* ); +void add32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* ); +void add64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* ); + +void sub8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void sub8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* ); +void sub16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* ); +void sub16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* ); +void sub32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* ); +void sub32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* ); +void sub64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* ); + +void max8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void max8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* ); +void max16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* ); +void max16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* ); +void max32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, 
void* ); +void max32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* ); +void max64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* ); + +void min8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void min8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* ); +void min16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* ); +void min16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* ); +void min32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* ); +void min32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* ); +void min64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* ); + +void absdiff8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void absdiff8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* ); +void absdiff16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* ); +void absdiff16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* ); +void absdiff32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* ); +void absdiff32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* ); +void absdiff64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* ); + +void and8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void or8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void xor8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void not8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); + +void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); +void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); +void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); +void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); +void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); +void cmp32f(const float* src1, size_t step1, const float* src2, 
size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); +void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); + +void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale); +void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale); +void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale); +void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale); +void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale); +void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale); +void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale); + +void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale); +void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale); +void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale); +void div16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale); +void div32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale); +void div32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale); +void div64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale); + +void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale); +void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale); +void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale); +void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale); +void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale); +void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale); +void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale); + +void addWeighted8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _scalars ); +void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scalars ); +void addWeighted16u( const 
ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scalars ); +void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scalars ); +void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scalars ); +void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars ); +void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars ); //! @} }} //cv::hal +namespace cv { + +template struct OpAdd +{ + typedef T1 type1; + typedef T2 type2; + typedef T3 rtype; + T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a + b); } +}; + +template struct OpSub +{ + typedef T1 type1; + typedef T2 type2; + typedef T3 rtype; + T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a - b); } +}; + +template struct OpRSub +{ + typedef T1 type1; + typedef T2 type2; + typedef T3 rtype; + T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(b - a); } +}; + +template struct OpMin +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator ()(const T a, const T b) const { return std::min(a, b); } +}; + +template struct OpMax +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator ()(const T a, const T b) const { return std::max(a, b); } +}; + +template struct OpAbsDiff +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()(T a, T b) const { return (T)std::abs(a - b); } +}; + +template struct OpAbsDiffS +{ + typedef T type1; + typedef WT type2; + typedef T rtype; + T operator()(T a, WT b) const { return saturate_cast(std::abs(a - b)); } +}; + +template struct OpAnd +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()( T a, T b ) const { return a & b; } +}; + +template struct OpOr +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()( T a, T b ) const { return a | b; } +}; + +template struct OpXor +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()( T a, T b ) const { return a ^ b; } +}; + +template struct OpNot +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()( T a, T ) const { return ~a; } +}; + +} + #endif //__OPENCV_HAL_HPP__ diff --git a/modules/hal/include/opencv2/hal/defs.h b/modules/hal/include/opencv2/hal/defs.h index d04f003879..117ec60469 100644 --- a/modules/hal/include/opencv2/hal/defs.h +++ b/modules/hal/include/opencv2/hal/defs.h @@ -53,6 +53,7 @@ #endif #include +#include "opencv2/hal/interface.hpp" #if defined __ICL # define CV_ICC __ICL @@ -117,9 +118,38 @@ #define CV_CPU_NEON 100 -// when adding to this list remember to update the enum in core/utility.cpp +// when adding to this list remember to update the following enum #define CV_HARDWARE_MAX_FEATURE 255 +/** @brief Available CPU features. 
+*/ +enum CpuFeatures { + CPU_MMX = 1, + CPU_SSE = 2, + CPU_SSE2 = 3, + CPU_SSE3 = 4, + CPU_SSSE3 = 5, + CPU_SSE4_1 = 6, + CPU_SSE4_2 = 7, + CPU_POPCNT = 8, + + CPU_AVX = 10, + CPU_AVX2 = 11, + CPU_FMA3 = 12, + + CPU_AVX_512F = 13, + CPU_AVX_512BW = 14, + CPU_AVX_512CD = 15, + CPU_AVX_512DQ = 16, + CPU_AVX_512ER = 17, + CPU_AVX_512IFMA512 = 18, + CPU_AVX_512PF = 19, + CPU_AVX_512VBMI = 20, + CPU_AVX_512VL = 21, + + CPU_NEON = 100 +}; + // do not include SSE/AVX/NEON headers for NVCC compiler #ifndef __CUDACC__ @@ -257,49 +287,6 @@ # define CV_VFP 0 #endif -/* primitive types */ -/* - schar - signed 1 byte integer - uchar - unsigned 1 byte integer - short - signed 2 byte integer - ushort - unsigned 2 byte integer - int - signed 4 byte integer - uint - unsigned 4 byte integer - int64 - signed 8 byte integer - uint64 - unsigned 8 byte integer -*/ - -#if !defined _MSC_VER && !defined __BORLANDC__ -# if defined __cplusplus && __cplusplus >= 201103L && !defined __APPLE__ -# include - typedef std::uint32_t uint; -# else -# include - typedef uint32_t uint; -# endif -#else - typedef unsigned uint; -#endif - -typedef signed char schar; - -#ifndef __IPL_H__ - typedef unsigned char uchar; - typedef unsigned short ushort; -#endif - -#if defined _MSC_VER || defined __BORLANDC__ - typedef __int64 int64; - typedef unsigned __int64 uint64; -# define CV_BIG_INT(n) n##I64 -# define CV_BIG_UINT(n) n##UI64 -#else - typedef int64_t int64; - typedef uint64_t uint64; -# define CV_BIG_INT(n) n##LL -# define CV_BIG_UINT(n) n##ULL -#endif - /* fundamental constants */ #define CV_PI 3.1415926535897932384626433832795 #define CV_2PI 6.283185307179586476925286766559 @@ -321,6 +308,19 @@ typedef union Cv64suf } Cv64suf; +namespace cv { namespace hal { + +bool checkHardwareSupport(int feature); +void setUseOptimized(bool onoff); +bool useOptimized(); + +}} + +#define USE_SSE2 (cv::hal::checkHardwareSupport(CV_CPU_SSE)) +#define USE_SSE4_2 (cv::hal::checkHardwareSupport(CV_CPU_SSE4_2)) +#define USE_AVX (cv::hal::checkHardwareSupport(CV_CPU_AVX)) +#define USE_AVX2 (cv::hal::checkHardwareSupport(CV_CPU_AVX2)) + /****************************************************************************************\ * fast math * diff --git a/modules/hal/include/opencv2/hal/interface.hpp b/modules/hal/include/opencv2/hal/interface.hpp new file mode 100644 index 0000000000..2a5bff04d7 --- /dev/null +++ b/modules/hal/include/opencv2/hal/interface.hpp @@ -0,0 +1,91 @@ +#ifndef _HAL_INTERFACE_HPP_INCLUDED_ +#define _HAL_INTERFACE_HPP_INCLUDED_ + +#define CV_HAL_ERROR_OK 0 +#define CV_HAL_ERROR_NI 1 +#define CV_HAL_ERROR_UNKNOWN -1 + +#define CV_HAL_CMP_EQ 0 +#define CV_HAL_CMP_GT 1 +#define CV_HAL_CMP_GE 2 +#define CV_HAL_CMP_LT 3 +#define CV_HAL_CMP_LE 4 +#define CV_HAL_CMP_NE 5 + +#ifdef __cplusplus +namespace cv { namespace hal { + +namespace Error { + +enum +{ + Ok = 0, + NotImplemented = 1, + Unknown = -1 +}; + +} + +enum +{ + CMP_EQ = 0, + CMP_GT = 1, + CMP_GE = 2, + CMP_LT = 3, + CMP_LE = 4, + CMP_NE = 5 +}; + +}} +#endif + +#ifdef __cplusplus +#include +#else +#include +#endif + +/* primitive types */ +/* + schar - signed 1 byte integer + uchar - unsigned 1 byte integer + short - signed 2 byte integer + ushort - unsigned 2 byte integer + int - signed 4 byte integer + uint - unsigned 4 byte integer + int64 - signed 8 byte integer + uint64 - unsigned 8 byte integer +*/ + +#if !defined _MSC_VER && !defined __BORLANDC__ +# if defined __cplusplus && __cplusplus >= 201103L && !defined __APPLE__ +# include + typedef std::uint32_t uint; +# else 
+# include + typedef uint32_t uint; +# endif +#else + typedef unsigned uint; +#endif + +typedef signed char schar; + +#ifndef __IPL_H__ + typedef unsigned char uchar; + typedef unsigned short ushort; +#endif + +#if defined _MSC_VER || defined __BORLANDC__ + typedef __int64 int64; + typedef unsigned __int64 uint64; +# define CV_BIG_INT(n) n##I64 +# define CV_BIG_UINT(n) n##UI64 +#else + typedef int64_t int64; + typedef uint64_t uint64; +# define CV_BIG_INT(n) n##LL +# define CV_BIG_UINT(n) n##ULL +#endif + +#endif diff --git a/modules/core/include/opencv2/core/sse_utils.hpp b/modules/hal/include/opencv2/hal/sse_utils.hpp similarity index 99% rename from modules/core/include/opencv2/core/sse_utils.hpp rename to modules/hal/include/opencv2/hal/sse_utils.hpp index e0283eb3f3..9ce4098bad 100644 --- a/modules/core/include/opencv2/core/sse_utils.hpp +++ b/modules/hal/include/opencv2/hal/sse_utils.hpp @@ -46,6 +46,8 @@ # error sse_utils.hpp header must be compiled as C++ #endif +#include "opencv2/hal/defs.h" + #if CV_SSE2 inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1) diff --git a/modules/hal/samples/simple_hal/CMakeLists.txt b/modules/hal/samples/simple_hal/CMakeLists.txt new file mode 100644 index 0000000000..ee4ac22be9 --- /dev/null +++ b/modules/hal/samples/simple_hal/CMakeLists.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR) + +if(UNIX) + if(CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") + endif() +endif() + +add_library(simple_hal simple.cpp) +set(OPENCV_HAL_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..") +target_include_directories(simple_hal PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${OPENCV_HAL_DIR}/include) + diff --git a/modules/hal/samples/simple_hal/simple.cpp b/modules/hal/samples/simple_hal/simple.cpp new file mode 100644 index 0000000000..49d77b02ac --- /dev/null +++ b/modules/hal/samples/simple_hal/simple.cpp @@ -0,0 +1,34 @@ +#include "simple.hpp" + +int slow_and8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height) +{ + for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step) + for(int x = 0 ; x < width; x++ ) + dst[x] = src1[x] & src2[x]; + return cv::hal::Error::Ok; +} + +int slow_or8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height) +{ + for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step) + for(int x = 0 ; x < width; x++ ) + dst[x] = src1[x] | src2[x]; + return cv::hal::Error::Ok; +} + +int slow_xor8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height) +{ + for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step) + for(int x = 0 ; x < width; x++ ) + dst[x] = src1[x] ^ src2[x]; + return cv::hal::Error::Ok; +} + +int slow_not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height) +{ + for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step) + for(int x = 0 ; x < width; x++ ) + dst[x] = ~src1[x]; + return cv::hal::Error::Ok; +} + diff --git a/modules/hal/samples/simple_hal/simple.hpp b/modules/hal/samples/simple_hal/simple.hpp new file mode 100644 index 0000000000..85a16535de --- /dev/null +++ b/modules/hal/samples/simple_hal/simple.hpp @@ -0,0 +1,20 @@ +#ifndef _SIMPLE_HPP_INCLUDED_ +#define _SIMPLE_HPP_INCLUDED_ + +#include "opencv2/hal/interface.hpp" + 
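// --- Annotation, not part of the sample header --------------------------------
// The declarations just below are reference implementations with the signatures
// expected by the replacement mechanism: they take raw pointers, byte steps and
// width/height, and return cv::hal::Error::Ok on success (or
// cv::hal::Error::NotImplemented to fall back to OpenCV's built-in code). The
// #define lines then remap the corresponding hal_* hooks to them. A hypothetical
// extra override would follow exactly the same pattern:
//
//   int slow_add8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
//                  uchar* dst, size_t step, int width, int height)
//   {
//       for( ; height--; src1 += step1, src2 += step2, dst += step )
//           for( int x = 0; x < width; x++ )
//           {
//               int v = src1[x] + src2[x];
//               dst[x] = (uchar)(v < 255 ? v : 255);   // saturate like cv::hal::add8u
//           }
//       return cv::hal::Error::Ok;
//   }
//   #undef hal_add8u
//   #define hal_add8u slow_add8u
//
// The resulting static library is attached at configure time through the
// OPENCV_HAL_HEADERS / OPENCV_HAL_LIBS CMake variables, which are pulled into the
// hal module via the generated custom_hal.hpp (see the CMake changes above).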
+int slow_and8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height); +int slow_or8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height); +int slow_xor8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height); +int slow_not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height); + +#undef hal_and8u +#define hal_and8u slow_and8u +#undef hal_or8u +#define hal_or8u slow_or8u +#undef hal_xor8u +#define hal_xor8u slow_xor8u +#undef hal_not8u +#define hal_not8u slow_not8u + +#endif diff --git a/modules/hal/src/arithm.cpp b/modules/hal/src/arithm.cpp index a3f69facca..b0705c5137 100644 --- a/modules/hal/src/arithm.cpp +++ b/modules/hal/src/arithm.cpp @@ -7,11 +7,13 @@ // copy or use the software. // // -// License Agreement +// License Agreement // For Open Source Computer Vision Library // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, @@ -41,7 +43,1089 @@ //M*/ #include "precomp.hpp" +#include "arithm_simd.hpp" +#include "arithm_core.hpp" +#include "opencv2/hal/replacement.hpp" namespace cv { namespace hal { -}} +//======================================= + +#undef CALL_HAL +#define CALL_HAL(fun) \ + int res = fun(src1, step1, src2, step2, dst, step, width, height); \ + if (res == Error::Ok) \ + return; \ + else if (res != Error::NotImplemented) \ + throw Failure(res); + +#if (ARITHM_USE_IPP == 1) +static inline void fixSteps(width, height, size_t elemSize, size_t& step1, size_t& step2, size_t& step) +{ + if( height == 1 ) + step1 = step2 = step = width*elemSize; +} +#define CALL_IPP_BIN_12(fun) \ + CV_IPP_CHECK() \ + { \ + fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ + if (0 <= fun(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0)) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return; \ + } \ + setIppErrorStatus(); \ + } +#else +#define CALL_IPP_BIN_12(fun) +#endif + +//======================================= +// Add +//======================================= + +void add8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_add8u) + CALL_IPP_BIN_12(ippiAdd_8u_C1RSfs) + (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void add8s( const schar* src1, size_t step1, + const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_add8s) + vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height); +} + +void add16u( const ushort* src1, size_t step1, + const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_add16u) + CALL_IPP_BIN_12(ippiAdd_16u_C1RSfs) + (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void add16s( const short* src1, size_t step1, + const short* src2, size_t step2, + short* dst, size_t step, int width, int 
height, void* ) +{ + CALL_HAL(hal_add16s) + CALL_IPP_BIN_12(ippiAdd_16s_C1RSfs) + (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void add32s( const int* src1, size_t step1, + const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_add32s) + vBinOp32, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height); +} + +void add32f( const float* src1, size_t step1, + const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_add32f) + CALL_IPP_BIN_12(ippiAdd_32f_C1R) + (vBinOp32, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void add64f( const double* src1, size_t step1, + const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_add64f) + vBinOp64, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height); +} + +//======================================= + +#if (ARITHM_USE_IPP == 1) +#define CALL_IPP_BIN_21(fun) \ + CV_IPP_CHECK() \ + { \ + fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ + if (0 <= fun(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0)) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return; \ + } \ + setIppErrorStatus(); \ + } +#else +#define CALL_IPP_BIN_21(fun) +#endif + +//======================================= +// Subtract +//======================================= + +void sub8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub8u) + CALL_IPP_BIN_21(ippiSub_8u_C1RSfs) + (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void sub8s( const schar* src1, size_t step1, + const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub8s) + vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height); +} + +void sub16u( const ushort* src1, size_t step1, + const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub16u) + CALL_IPP_BIN_21(ippiSub_16u_C1RSfs) + (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void sub16s( const short* src1, size_t step1, + const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub16s) + CALL_IPP_BIN_21(ippiSub_16s_C1RSfs) + (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void sub32s( const int* src1, size_t step1, + const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub32s) + vBinOp32, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height); +} + +void sub32f( const float* src1, size_t step1, + const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub32f) + CALL_IPP_BIN_21(ippiSub_32f_C1R) + (vBinOp32, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void sub64f( const double* src1, size_t step1, + const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub64f) + vBinOp64, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height); +} + +//======================================= + +#if (ARITHM_USE_IPP == 1) +#define CALL_IPP_MIN_MAX(fun, type) \ + CV_IPP_CHECK() \ + { \ + type* s1 = (type*)src1; \ + type* s2 = 
(type*)src2; \ + type* d = dst; \ + fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ + int i = 0; \ + for(; i < height; i++) \ + { \ + if (0 > fun(s1, s2, d, width)) \ + break; \ + s1 = (type*)((uchar*)s1 + step1); \ + s2 = (type*)((uchar*)s2 + step2); \ + d = (type*)((uchar*)d + step); \ + } \ + if (i == height) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return; \ + } \ + setIppErrorStatus(); \ + } +#else +#define CALL_IPP_MIN_MAX(fun, type) +#endif + +//======================================= +// Max +//======================================= + +void max8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max8u) + CALL_IPP_MIN_MAX(ippsMaxEvery_8u, uchar) + vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +void max8s( const schar* src1, size_t step1, + const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max8s) + vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +void max16u( const ushort* src1, size_t step1, + const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max16u) + CALL_IPP_MIN_MAX(ippsMaxEvery_16u, ushort) + vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +void max16s( const short* src1, size_t step1, + const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max16s) + vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +void max32s( const int* src1, size_t step1, + const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max32s) + vBinOp32, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +void max32f( const float* src1, size_t step1, + const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max32f) + CALL_IPP_MIN_MAX(ippsMaxEvery_32f, float) + vBinOp32, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +void max64f( const double* src1, size_t step1, + const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max64f) + CALL_IPP_MIN_MAX(ippsMaxEvery_64f, double) + vBinOp64, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +//======================================= +// Min +//======================================= + +void min8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_min8u) + CALL_IPP_MIN_MAX(ippsMinEvery_8u, uchar) + vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +void min8s( const schar* src1, size_t step1, + const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_min8s) + vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +void min16u( const ushort* src1, size_t step1, + const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_min16u) + CALL_IPP_MIN_MAX(ippsMinEvery_16u, ushort) + vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +void min16s( const short* src1, size_t step1, + const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* 
) +{ + CALL_HAL(hal_min16s) + vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +void min32s( const int* src1, size_t step1, + const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_min32s) + vBinOp32, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +void min32f( const float* src1, size_t step1, + const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_min32f) + CALL_IPP_MIN_MAX(ippsMinEvery_32f, float) + vBinOp32, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +void min64f( const double* src1, size_t step1, + const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_min64f) + CALL_IPP_MIN_MAX(ippsMinEvery_64f, double) + vBinOp64, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +//======================================= +// AbsDiff +//======================================= + +void absdiff8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff8u) + CALL_IPP_BIN_12(ippiAbsDiff_8u_C1R) + (vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void absdiff8s( const schar* src1, size_t step1, + const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff8s) + vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); +} + +void absdiff16u( const ushort* src1, size_t step1, + const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff16u) + CALL_IPP_BIN_12(ippiAbsDiff_16u_C1R) + (vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void absdiff16s( const short* src1, size_t step1, + const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff16s) + vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); +} + +void absdiff32s( const int* src1, size_t step1, + const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff32s) + vBinOp32, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); +} + +void absdiff32f( const float* src1, size_t step1, + const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff32f) + CALL_IPP_BIN_12(ippiAbsDiff_32f_C1R) + (vBinOp32, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void absdiff64f( const double* src1, size_t step1, + const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff64f) + vBinOp64, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); +} + +//======================================= +// Logical +//======================================= + +void and8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_and8u) + CALL_IPP_BIN_12(ippiAnd_8u_C1R) + (vBinOp, IF_SIMD(VAnd)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void or8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_or8u) + 
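+    // Note on the dispatch order used by every wrapper in this file: CALL_HAL gives an
+    // externally supplied HAL hook the first chance -- Error::Ok means the work is done,
+    // Error::NotImplemented falls through, and any other code is turned into a Failure
+    // exception. After that the IPP call is attempted (only when ARITHM_USE_IPP is set),
+    // and the generic SIMD/scalar template is the final fallback.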
CALL_IPP_BIN_12(ippiOr_8u_C1R) + (vBinOp, IF_SIMD(VOr)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void xor8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_xor8u) + CALL_IPP_BIN_12(ippiXor_8u_C1R) + (vBinOp, IF_SIMD(VXor)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void not8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_not8u) + CALL_IPP_BIN_12(ippiNot_8u_C1R) + (vBinOp, IF_SIMD(VNot)>(src1, step1, src2, step2, dst, step, width, height)); +} + +//======================================= + +#undef CALL_HAL +#define CALL_HAL(fun) \ + int res = fun(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); \ + if (res == Error::Ok) \ + return; \ + else if (res != Error::NotImplemented) \ + throw Failure(res); + +#if ARITHM_USE_IPP +inline static IppCmpOp convert_cmp(int _cmpop) +{ + return _cmpop == CMP_EQ ? ippCmpEq : + _cmpop == CMP_GT ? ippCmpGreater : + _cmpop == CMP_GE ? ippCmpGreaterEq : + _cmpop == CMP_LT ? ippCmpLess : + _cmpop == CMP_LE ? ippCmpLessEq : + (IppCmpOp)-1; +} +#define CALL_IPP_CMP(fun) \ + CV_IPP_CHECK() \ + { \ + IppCmpOp op = convert_cmp(*(int *)_cmpop); \ + if( op >= 0 ) \ + { \ + fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ + if (0 <= fun(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), op)) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return; \ + } \ + setIppErrorStatus(); \ + } \ + } +#else +#define CALL_IPP_CMP(fun) +#endif + +//======================================= +// Compare +//======================================= + +void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp8u) + CALL_IPP_CMP(ippiCompare_8u_C1R) + //vz optimized cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); + int code = *(int*)_cmpop; + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + if( code == CMP_GE || code == CMP_LT ) + { + std::swap(src1, src2); + std::swap(step1, step2); + code = code == CMP_GE ? CMP_LE : CMP_GT; + } + + if( code == CMP_GT || code == CMP_LE ) + { + int m = code == CMP_GT ? 0 : 255; + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x =0; + #if CV_SSE2 + if( USE_SSE2 ) + { + __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (-1); + __m128i c128 = _mm_set1_epi8 (-128); + for( ; x <= width - 16; x += 16 ) + { + __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); + __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); + // no simd for 8u comparison, that's why we need the trick + r00 = _mm_sub_epi8(r00,c128); + r10 = _mm_sub_epi8(r10,c128); + + r00 =_mm_xor_si128(_mm_cmpgt_epi8(r00, r10), m128); + _mm_storeu_si128((__m128i*)(dst + x),r00); + + } + } + #elif CV_NEON + uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255); + + for( ; x <= width - 16; x += 16 ) + { + vst1q_u8(dst+x, veorq_u8(vcgtq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask)); + } + + #endif + + for( ; x < width; x++ ){ + dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); + } + } + } + else if( code == CMP_EQ || code == CMP_NE ) + { + int m = code == CMP_EQ ? 0 : 255; + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x = 0; + #if CV_SSE2 + if( USE_SSE2 ) + { + __m128i m128 = code == CMP_EQ ? 
_mm_setzero_si128() : _mm_set1_epi8 (-1); + for( ; x <= width - 16; x += 16 ) + { + __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); + __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); + r00 = _mm_xor_si128 ( _mm_cmpeq_epi8 (r00, r10), m128); + _mm_storeu_si128((__m128i*)(dst + x), r00); + } + } + #elif CV_NEON + uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255); + + for( ; x <= width - 16; x += 16 ) + { + vst1q_u8(dst+x, veorq_u8(vceqq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask)); + } + #endif + for( ; x < width; x++ ) + dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); + } + } +} + +void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp8s) + cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); +} + +void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp16u) + CALL_IPP_CMP(ippiCompare_16u_C1R) + cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); +} + +void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp16s) + CALL_IPP_CMP(ippiCompare_16s_C1R) + //vz optimized cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); + + int code = *(int*)_cmpop; + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + if( code == CMP_GE || code == CMP_LT ) + { + std::swap(src1, src2); + std::swap(step1, step2); + code = code == CMP_GE ? CMP_LE : CMP_GT; + } + + if( code == CMP_GT || code == CMP_LE ) + { + int m = code == CMP_GT ? 0 : 255; + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x =0; + #if CV_SSE2 + if( USE_SSE2) + { + __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (-1); + for( ; x <= width - 16; x += 16 ) + { + __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); + __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); + r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128); + __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8)); + __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8)); + r01 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r01, r11), m128); + r11 = _mm_packs_epi16(r00, r01); + _mm_storeu_si128((__m128i*)(dst + x), r11); + } + if( x <= width-8) + { + __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); + __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); + r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128); + r10 = _mm_packs_epi16(r00, r00); + _mm_storel_epi64((__m128i*)(dst + x), r10); + + x += 8; + } + } + #elif CV_NEON + uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255); + + for( ; x <= width - 16; x += 16 ) + { + int16x8_t in1 = vld1q_s16(src1 + x); + int16x8_t in2 = vld1q_s16(src2 + x); + uint8x8_t t1 = vmovn_u16(vcgtq_s16(in1, in2)); + + in1 = vld1q_s16(src1 + x + 8); + in2 = vld1q_s16(src2 + x + 8); + uint8x8_t t2 = vmovn_u16(vcgtq_s16(in1, in2)); + + vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask)); + } + #endif + + for( ; x < width; x++ ){ + dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); + } + } + } + else if( code == CMP_EQ || code == CMP_NE ) + { + int m = code == CMP_EQ ? 0 : 255; + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x = 0; + #if CV_SSE2 + if( USE_SSE2 ) + { + __m128i m128 = code == CMP_EQ ? 
_mm_setzero_si128() : _mm_set1_epi16 (-1); + for( ; x <= width - 16; x += 16 ) + { + __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); + __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); + r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128); + __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8)); + __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8)); + r01 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r01, r11), m128); + r11 = _mm_packs_epi16(r00, r01); + _mm_storeu_si128((__m128i*)(dst + x), r11); + } + if( x <= width - 8) + { + __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); + __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); + r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128); + r10 = _mm_packs_epi16(r00, r00); + _mm_storel_epi64((__m128i*)(dst + x), r10); + + x += 8; + } + } + #elif CV_NEON + uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255); + + for( ; x <= width - 16; x += 16 ) + { + int16x8_t in1 = vld1q_s16(src1 + x); + int16x8_t in2 = vld1q_s16(src2 + x); + uint8x8_t t1 = vmovn_u16(vceqq_s16(in1, in2)); + + in1 = vld1q_s16(src1 + x + 8); + in2 = vld1q_s16(src2 + x + 8); + uint8x8_t t2 = vmovn_u16(vceqq_s16(in1, in2)); + + vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask)); + } + #endif + for( ; x < width; x++ ) + dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); + } + } +} + +void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp32s) + cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); +} + +void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp32f) + CALL_IPP_CMP(ippiCompare_32f_C1R) + cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); +} + +void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp64f) + cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); +} + +//======================================= + +#undef CALL_HAL +#define CALL_HAL(fun) \ + int res = fun(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); \ + if (res == Error::Ok) \ + return; \ + else if (res != Error::NotImplemented) \ + throw Failure(res); + +#if defined HAVE_IPP +#define CALL_IPP_MUL(fun) \ + CV_IPP_CHECK() \ + { \ + if (std::fabs(fscale - 1) <= FLT_EPSILON) \ + { \ + if (fun(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0) >= 0) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return; \ + } \ + setIppErrorStatus(); \ + } \ + } +#else +#define CALL_IPP_MUL(fun) +#endif + +//======================================= +// Multilpy +//======================================= + +void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul8u) + float fscale = (float)*(const double*)scale; + CALL_IPP_MUL(ippiMul_8u_C1RSfs) + mul_(src1, step1, src2, step2, dst, step, width, height, fscale); +} + +void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul8s) + mul_(src1, step1, src2, step2, dst, step, width, height, (float)*(const double*)scale); +} + +void mul16u( const ushort* src1, size_t step1, const ushort* src2, 
size_t step2, + ushort* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul16u) + float fscale = (float)*(const double*)scale; + CALL_IPP_MUL(ippiMul_16u_C1RSfs) + mul_(src1, step1, src2, step2, dst, step, width, height, fscale); +} + +void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul16s) + float fscale = (float)*(const double*)scale; + CALL_IPP_MUL(ippiMul_16s_C1RSfs) + mul_(src1, step1, src2, step2, dst, step, width, height, fscale); +} + +void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul32s) + mul_(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul32f) + float fscale = (float)*(const double*)scale; + CALL_IPP_MUL(ippiMul_32f_C1R) + mul_(src1, step1, src2, step2, dst, step, width, height, fscale); +} + +void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul64f) + mul_(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +//======================================= +// Divide +//======================================= + +void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div8u) + if( src1 ) + div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); + else + recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div8s) + div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div16u) + div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void div16s( const short* src1, size_t step1, const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div16s) + div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void div32s( const int* src1, size_t step1, const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div32s) + div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void div32f( const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div32f) + div_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void div64f( const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div64f) + div_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +//======================================= +// Reciprocial +//======================================= + +void recip8u( const uchar* 
src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip8u) + recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip8s) + recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip16u) + recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip16s) + recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip32s) + recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip32f) + recip_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip64f) + recip_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +//======================================= + +#undef CALL_HAL +#define CALL_HAL(fun) \ + int res = fun(src1, step1, src2, step2, dst, step, width, height, scalars); \ + if (res == Error::Ok) \ + return; \ + else if (res != Error::NotImplemented) \ + throw Failure(res); + +//======================================= +// Add weighted +//======================================= + +void +addWeighted8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, + void* scalars ) +{ + CALL_HAL(hal_addWeighted8u) + const double* scalars_ = (const double*)scalars; + float alpha = (float)scalars_[0], beta = (float)scalars_[1], gamma = (float)scalars_[2]; + + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x = 0; + +#if CV_SSE2 + if( USE_SSE2 ) + { + __m128 a4 = _mm_set1_ps(alpha), b4 = _mm_set1_ps(beta), g4 = _mm_set1_ps(gamma); + __m128i z = _mm_setzero_si128(); + + for( ; x <= width - 8; x += 8 ) + { + __m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src1 + x)), z); + __m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src2 + x)), z); + + __m128 u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(u, z)); + __m128 u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(u, z)); + __m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, z)); + __m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, z)); + + u0 = _mm_add_ps(_mm_mul_ps(u0, a4), _mm_mul_ps(v0, b4)); + u1 = _mm_add_ps(_mm_mul_ps(u1, a4), _mm_mul_ps(v1, b4)); + u0 = _mm_add_ps(u0, g4); u1 = _mm_add_ps(u1, g4); + + u = _mm_packs_epi32(_mm_cvtps_epi32(u0), _mm_cvtps_epi32(u1)); + u = _mm_packus_epi16(u, u); + + _mm_storel_epi64((__m128i*)(dst + x), u); + } + } +#elif CV_NEON + float32x4_t g = 
vdupq_n_f32 (gamma); + + for( ; x <= width - 8; x += 8 ) + { + uint8x8_t in1 = vld1_u8(src1+x); + uint16x8_t in1_16 = vmovl_u8(in1); + float32x4_t in1_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in1_16))); + float32x4_t in1_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in1_16))); + + uint8x8_t in2 = vld1_u8(src2+x); + uint16x8_t in2_16 = vmovl_u8(in2); + float32x4_t in2_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in2_16))); + float32x4_t in2_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in2_16))); + + float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta)); + float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta)); + out_f_l = vaddq_f32(out_f_l, g); + out_f_h = vaddq_f32(out_f_h, g); + + uint16x4_t out_16_l = vqmovun_s32(cv_vrndq_s32_f32(out_f_l)); + uint16x4_t out_16_h = vqmovun_s32(cv_vrndq_s32_f32(out_f_h)); + + uint16x8_t out_16 = vcombine_u16(out_16_l, out_16_h); + uint8x8_t out = vqmovn_u16(out_16); + + vst1_u8(dst+x, out); + } +#endif + #if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + float t0, t1; + t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; + t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma; + + dst[x] = saturate_cast(t0); + dst[x+1] = saturate_cast(t1); + + t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma; + t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma; + + dst[x+2] = saturate_cast(t0); + dst[x+3] = saturate_cast(t1); + } + #endif + + for( ; x < width; x++ ) + { + float t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; + dst[x] = saturate_cast(t0); + } + } +} + +void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* scalars ) +{ + CALL_HAL(hal_addWeighted8s) + addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); +} + +void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* scalars ) +{ + CALL_HAL(hal_addWeighted16u) + addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); +} + +void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* scalars ) +{ + CALL_HAL(hal_addWeighted16s) + addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); +} + +void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* scalars ) +{ + CALL_HAL(hal_addWeighted32s) + addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); +} + +void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* scalars ) +{ + CALL_HAL(hal_addWeighted32f) + addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); +} + +void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* scalars ) +{ + CALL_HAL(hal_addWeighted64f) + addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); +} + +}} // cv::hal:: diff --git a/modules/hal/src/arithm_core.hpp b/modules/hal/src/arithm_core.hpp new file mode 100644 index 0000000000..a65e74c381 --- /dev/null +++ b/modules/hal/src/arithm_core.hpp @@ -0,0 +1,657 @@ 
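(Editorial note: the new arithm_core.hpp added below carries the type-generic element-wise kernels -- vBinOp/vBinOp32/vBinOp64, cmp_, mul_, div_i/div_f, recip_i/recip_f, addWeighted_ -- that the wrappers in arithm.cpp fall back to once the external HAL hook and IPP have declined a call. They all share one row-stepping convention: the step arguments are byte pitches, so each row advance casts through uchar*. A condensed sketch of that pattern, with the SIMD and unrolling branches stripped out; binop_rows is an illustrative name, not part of the patch, and it assumes the uchar typedef from the HAL interface header.

template<typename T, class Op>
static void binop_rows(const T* src1, size_t step1, const T* src2, size_t step2,
                       T* dst, size_t step, int width, int height, Op op)
{
    for( ; height--;
           src1 = (const T*)((const uchar*)src1 + step1),   // steps are in bytes
           src2 = (const T*)((const uchar*)src2 + step2),
           dst  = (T*)((uchar*)dst + step) )
    {
        for( int x = 0; x < width; x++ )
            dst[x] = op(src1[x], src2[x]);   // Op is e.g. OpAdd<T>, OpMin<T>, ...
    }
}
)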
+/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_HAL_ARITHM_CORE_HPP__ +#define __OPENCV_HAL_ARITHM_CORE_HPP__ + +#include "arithm_simd.hpp" + +const uchar g_Saturate8u[] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255 +}; + + +#define CV_FAST_CAST_8U(t) (assert(-256 <= (t) && (t) <= 512), 
g_Saturate8u[(t)+256]) +#define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b))) +#define CV_MAX_8U(a,b) ((a) + CV_FAST_CAST_8U((b) - (a))) + +const float g_8x32fTab[] = +{ + -128.f, -127.f, -126.f, -125.f, -124.f, -123.f, -122.f, -121.f, + -120.f, -119.f, -118.f, -117.f, -116.f, -115.f, -114.f, -113.f, + -112.f, -111.f, -110.f, -109.f, -108.f, -107.f, -106.f, -105.f, + -104.f, -103.f, -102.f, -101.f, -100.f, -99.f, -98.f, -97.f, + -96.f, -95.f, -94.f, -93.f, -92.f, -91.f, -90.f, -89.f, + -88.f, -87.f, -86.f, -85.f, -84.f, -83.f, -82.f, -81.f, + -80.f, -79.f, -78.f, -77.f, -76.f, -75.f, -74.f, -73.f, + -72.f, -71.f, -70.f, -69.f, -68.f, -67.f, -66.f, -65.f, + -64.f, -63.f, -62.f, -61.f, -60.f, -59.f, -58.f, -57.f, + -56.f, -55.f, -54.f, -53.f, -52.f, -51.f, -50.f, -49.f, + -48.f, -47.f, -46.f, -45.f, -44.f, -43.f, -42.f, -41.f, + -40.f, -39.f, -38.f, -37.f, -36.f, -35.f, -34.f, -33.f, + -32.f, -31.f, -30.f, -29.f, -28.f, -27.f, -26.f, -25.f, + -24.f, -23.f, -22.f, -21.f, -20.f, -19.f, -18.f, -17.f, + -16.f, -15.f, -14.f, -13.f, -12.f, -11.f, -10.f, -9.f, + -8.f, -7.f, -6.f, -5.f, -4.f, -3.f, -2.f, -1.f, + 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, + 24.f, 25.f, 26.f, 27.f, 28.f, 29.f, 30.f, 31.f, + 32.f, 33.f, 34.f, 35.f, 36.f, 37.f, 38.f, 39.f, + 40.f, 41.f, 42.f, 43.f, 44.f, 45.f, 46.f, 47.f, + 48.f, 49.f, 50.f, 51.f, 52.f, 53.f, 54.f, 55.f, + 56.f, 57.f, 58.f, 59.f, 60.f, 61.f, 62.f, 63.f, + 64.f, 65.f, 66.f, 67.f, 68.f, 69.f, 70.f, 71.f, + 72.f, 73.f, 74.f, 75.f, 76.f, 77.f, 78.f, 79.f, + 80.f, 81.f, 82.f, 83.f, 84.f, 85.f, 86.f, 87.f, + 88.f, 89.f, 90.f, 91.f, 92.f, 93.f, 94.f, 95.f, + 96.f, 97.f, 98.f, 99.f, 100.f, 101.f, 102.f, 103.f, + 104.f, 105.f, 106.f, 107.f, 108.f, 109.f, 110.f, 111.f, + 112.f, 113.f, 114.f, 115.f, 116.f, 117.f, 118.f, 119.f, + 120.f, 121.f, 122.f, 123.f, 124.f, 125.f, 126.f, 127.f, + 128.f, 129.f, 130.f, 131.f, 132.f, 133.f, 134.f, 135.f, + 136.f, 137.f, 138.f, 139.f, 140.f, 141.f, 142.f, 143.f, + 144.f, 145.f, 146.f, 147.f, 148.f, 149.f, 150.f, 151.f, + 152.f, 153.f, 154.f, 155.f, 156.f, 157.f, 158.f, 159.f, + 160.f, 161.f, 162.f, 163.f, 164.f, 165.f, 166.f, 167.f, + 168.f, 169.f, 170.f, 171.f, 172.f, 173.f, 174.f, 175.f, + 176.f, 177.f, 178.f, 179.f, 180.f, 181.f, 182.f, 183.f, + 184.f, 185.f, 186.f, 187.f, 188.f, 189.f, 190.f, 191.f, + 192.f, 193.f, 194.f, 195.f, 196.f, 197.f, 198.f, 199.f, + 200.f, 201.f, 202.f, 203.f, 204.f, 205.f, 206.f, 207.f, + 208.f, 209.f, 210.f, 211.f, 212.f, 213.f, 214.f, 215.f, + 216.f, 217.f, 218.f, 219.f, 220.f, 221.f, 222.f, 223.f, + 224.f, 225.f, 226.f, 227.f, 228.f, 229.f, 230.f, 231.f, + 232.f, 233.f, 234.f, 235.f, 236.f, 237.f, 238.f, 239.f, + 240.f, 241.f, 242.f, 243.f, 244.f, 245.f, 246.f, 247.f, + 248.f, 249.f, 250.f, 251.f, 252.f, 253.f, 254.f, 255.f +}; + +#define CV_8TO32F(x) g_8x32fTab[(x)+128] + +namespace cv { + +template<> inline uchar OpAdd::operator ()(uchar a, uchar b) const +{ return CV_FAST_CAST_8U(a + b); } + +template<> inline uchar OpSub::operator ()(uchar a, uchar b) const +{ return CV_FAST_CAST_8U(a - b); } + +template<> inline short OpAbsDiff::operator ()(short a, short b) const +{ return saturate_cast(std::abs(a - b)); } + +template<> inline schar OpAbsDiff::operator ()(schar a, schar b) const +{ return saturate_cast(std::abs(a - b)); } + +template<> inline uchar OpMin::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); } + +template<> inline uchar OpMax::operator ()(uchar a, uchar b) const { 
return CV_MAX_8U(a, b); } + +} + +namespace cv { namespace hal { + +template +void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, int width, int height) +{ +#if CV_SSE2 || CV_NEON + VOp vop; +#endif + Op op; + + for( ; height--; src1 = (const T *)((const uchar *)src1 + step1), + src2 = (const T *)((const uchar *)src2 + step2), + dst = (T *)((uchar *)dst + step) ) + { + int x = 0; + +#if CV_NEON || CV_SSE2 +#if CV_AVX2 + if( USE_AVX2 ) + { + for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) ) + { + typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); + r0 = vop(r0, VLoadStore256::load(src2 + x)); + VLoadStore256::store(dst + x, r0); + } + } +#else +#if CV_SSE2 + if( USE_SSE2 ) + { +#endif // CV_SSE2 + for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) ) + { + typename VLoadStore128::reg_type r0 = VLoadStore128::load(src1 + x ); + typename VLoadStore128::reg_type r1 = VLoadStore128::load(src1 + x + 16/sizeof(T)); + r0 = vop(r0, VLoadStore128::load(src2 + x )); + r1 = vop(r1, VLoadStore128::load(src2 + x + 16/sizeof(T))); + VLoadStore128::store(dst + x , r0); + VLoadStore128::store(dst + x + 16/sizeof(T), r1); + } +#if CV_SSE2 + } +#endif // CV_SSE2 +#endif // CV_AVX2 +#endif // CV_NEON || CV_SSE2 + +#if CV_AVX2 + // nothing +#elif CV_SSE2 + if( USE_SSE2 ) + { + for( ; x <= width - 8/(int)sizeof(T); x += 8/sizeof(T) ) + { + typename VLoadStore64::reg_type r = VLoadStore64::load(src1 + x); + r = vop(r, VLoadStore64::load(src2 + x)); + VLoadStore64::store(dst + x, r); + } + } +#endif + +#if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + T v0 = op(src1[x], src2[x]); + T v1 = op(src1[x+1], src2[x+1]); + dst[x] = v0; dst[x+1] = v1; + v0 = op(src1[x+2], src2[x+2]); + v1 = op(src1[x+3], src2[x+3]); + dst[x+2] = v0; dst[x+3] = v1; + } +#endif + + for( ; x < width; x++ ) + dst[x] = op(src1[x], src2[x]); + } +} + +template +void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, int width, int height) +{ +#if CV_SSE2 || CV_NEON + Op32 op32; +#endif + Op op; + + for( ; height--; src1 = (const T *)((const uchar *)src1 + step1), + src2 = (const T *)((const uchar *)src2 + step2), + dst = (T *)((uchar *)dst + step) ) + { + int x = 0; + +#if CV_AVX2 + if( USE_AVX2 ) + { + if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) + { + for( ; x <= width - 8; x += 8 ) + { + typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); + r0 = op32(r0, VLoadStore256Aligned::load(src2 + x)); + VLoadStore256Aligned::store(dst + x, r0); + } + } + } +#elif CV_SSE2 + if( USE_SSE2 ) + { + if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) + { + for( ; x <= width - 8; x += 8 ) + { + typename VLoadStore128Aligned::reg_type r0 = VLoadStore128Aligned::load(src1 + x ); + typename VLoadStore128Aligned::reg_type r1 = VLoadStore128Aligned::load(src1 + x + 4); + r0 = op32(r0, VLoadStore128Aligned::load(src2 + x )); + r1 = op32(r1, VLoadStore128Aligned::load(src2 + x + 4)); + VLoadStore128Aligned::store(dst + x , r0); + VLoadStore128Aligned::store(dst + x + 4, r1); + } + } + } +#endif // CV_AVX2 + +#if CV_NEON || CV_SSE2 +#if CV_AVX2 + if( USE_AVX2 ) + { + for( ; x <= width - 8; x += 8 ) + { + typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); + r0 = op32(r0, VLoadStore256::load(src2 + x)); + VLoadStore256::store(dst + x, r0); + } + } +#else +#if CV_SSE2 + if( USE_SSE2 ) + { +#endif // CV_SSE2 + for( ; x <= width - 8; x += 8 ) + { + typename VLoadStore128::reg_type r0 = 
VLoadStore128::load(src1 + x ); + typename VLoadStore128::reg_type r1 = VLoadStore128::load(src1 + x + 4); + r0 = op32(r0, VLoadStore128::load(src2 + x )); + r1 = op32(r1, VLoadStore128::load(src2 + x + 4)); + VLoadStore128::store(dst + x , r0); + VLoadStore128::store(dst + x + 4, r1); + } +#if CV_SSE2 + } +#endif // CV_SSE2 +#endif // CV_AVX2 +#endif // CV_NEON || CV_SSE2 + +#if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + T v0 = op(src1[x], src2[x]); + T v1 = op(src1[x+1], src2[x+1]); + dst[x] = v0; dst[x+1] = v1; + v0 = op(src1[x+2], src2[x+2]); + v1 = op(src1[x+3], src2[x+3]); + dst[x+2] = v0; dst[x+3] = v1; + } +#endif + + for( ; x < width; x++ ) + dst[x] = op(src1[x], src2[x]); + } +} + + +template +void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, int width, int height) +{ +#if CV_SSE2 + Op64 op64; +#endif + Op op; + + for( ; height--; src1 = (const T *)((const uchar *)src1 + step1), + src2 = (const T *)((const uchar *)src2 + step2), + dst = (T *)((uchar *)dst + step) ) + { + int x = 0; + +#if CV_AVX2 + if( USE_AVX2 ) + { + if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) + { + for( ; x <= width - 4; x += 4 ) + { + typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); + r0 = op64(r0, VLoadStore256Aligned::load(src2 + x)); + VLoadStore256Aligned::store(dst + x, r0); + } + } + } +#elif CV_SSE2 + if( USE_SSE2 ) + { + if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) + { + for( ; x <= width - 4; x += 4 ) + { + typename VLoadStore128Aligned::reg_type r0 = VLoadStore128Aligned::load(src1 + x ); + typename VLoadStore128Aligned::reg_type r1 = VLoadStore128Aligned::load(src1 + x + 2); + r0 = op64(r0, VLoadStore128Aligned::load(src2 + x )); + r1 = op64(r1, VLoadStore128Aligned::load(src2 + x + 2)); + VLoadStore128Aligned::store(dst + x , r0); + VLoadStore128Aligned::store(dst + x + 2, r1); + } + } + } +#endif + + for( ; x <= width - 4; x += 4 ) + { + T v0 = op(src1[x], src2[x]); + T v1 = op(src1[x+1], src2[x+1]); + dst[x] = v0; dst[x+1] = v1; + v0 = op(src1[x+2], src2[x+2]); + v1 = op(src1[x+3], src2[x+3]); + dst[x+2] = v0; dst[x+3] = v1; + } + + for( ; x < width; x++ ) + dst[x] = op(src1[x], src2[x]); + } +} + +template static void +cmp_(const T* src1, size_t step1, const T* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int code) +{ + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + if( code == CMP_GE || code == CMP_LT ) + { + std::swap(src1, src2); + std::swap(step1, step2); + code = code == CMP_GE ? CMP_LE : CMP_GT; + } + + Cmp_SIMD vop(code); + + if( code == CMP_GT || code == CMP_LE ) + { + int m = code == CMP_GT ? 0 : 255; + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x = vop(src1, src2, dst, width); + #if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + int t0, t1; + t0 = -(src1[x] > src2[x]) ^ m; + t1 = -(src1[x+1] > src2[x+1]) ^ m; + dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; + t0 = -(src1[x+2] > src2[x+2]) ^ m; + t1 = -(src1[x+3] > src2[x+3]) ^ m; + dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; + } + #endif + for( ; x < width; x++ ) + dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); + } + } + else if( code == CMP_EQ || code == CMP_NE ) + { + int m = code == CMP_EQ ? 
0 : 255; + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x = 0; + #if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + int t0, t1; + t0 = -(src1[x] == src2[x]) ^ m; + t1 = -(src1[x+1] == src2[x+1]) ^ m; + dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; + t0 = -(src1[x+2] == src2[x+2]) ^ m; + t1 = -(src1[x+3] == src2[x+3]) ^ m; + dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; + } + #endif + for( ; x < width; x++ ) + dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); + } + } +} + +template static void +mul_( const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, int width, int height, WT scale ) +{ + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Mul_SIMD vop; + + if( scale == (WT)1. ) + { + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int i = vop(src1, src2, dst, width, scale); + #if CV_ENABLE_UNROLLED + for(; i <= width - 4; i += 4 ) + { + T t0; + T t1; + t0 = saturate_cast(src1[i ] * src2[i ]); + t1 = saturate_cast(src1[i+1] * src2[i+1]); + dst[i ] = t0; + dst[i+1] = t1; + + t0 = saturate_cast(src1[i+2] * src2[i+2]); + t1 = saturate_cast(src1[i+3] * src2[i+3]); + dst[i+2] = t0; + dst[i+3] = t1; + } + #endif + for( ; i < width; i++ ) + dst[i] = saturate_cast(src1[i] * src2[i]); + } + } + else + { + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int i = vop(src1, src2, dst, width, scale); + #if CV_ENABLE_UNROLLED + for(; i <= width - 4; i += 4 ) + { + T t0 = saturate_cast(scale*(WT)src1[i]*src2[i]); + T t1 = saturate_cast(scale*(WT)src1[i+1]*src2[i+1]); + dst[i] = t0; dst[i+1] = t1; + + t0 = saturate_cast(scale*(WT)src1[i+2]*src2[i+2]); + t1 = saturate_cast(scale*(WT)src1[i+3]*src2[i+3]); + dst[i+2] = t0; dst[i+3] = t1; + } + #endif + for( ; i < width; i++ ) + dst[i] = saturate_cast(scale*(WT)src1[i]*src2[i]); + } + } +} + + +template static void +div_i( const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, int width, int height, double scale ) +{ + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Div_SIMD vop; + float scale_f = (float)scale; + + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int i = vop(src1, src2, dst, width, scale); + for( ; i < width; i++ ) + { + T num = src1[i], denom = src2[i]; + dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; + } + } +} + +template static void +div_f( const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, int width, int height, double scale ) +{ + T scale_f = (T)scale; + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Div_SIMD vop; + + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int i = vop(src1, src2, dst, width, scale); + for( ; i < width; i++ ) + { + T num = src1[i], denom = src2[i]; + dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; + } + } +} + +template static void +recip_i( const T*, size_t, const T* src2, size_t step2, + T* dst, size_t step, int width, int height, double scale ) +{ + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Recip_SIMD vop; + float scale_f = (float)scale; + + for( ; height--; src2 += step2, dst += step ) + { + int i = vop(src2, dst, width, scale); + for( ; i < width; i++ ) + { + T denom = src2[i]; + dst[i] = denom != 0 ? 
saturate_cast(scale_f/denom) : (T)0; + } + } +} + +template static void +recip_f( const T*, size_t, const T* src2, size_t step2, + T* dst, size_t step, int width, int height, double scale ) +{ + T scale_f = (T)scale; + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Recip_SIMD vop; + + for( ; height--; src2 += step2, dst += step ) + { + int i = vop(src2, dst, width, scale); + for( ; i < width; i++ ) + { + T denom = src2[i]; + dst[i] = denom != 0 ? saturate_cast(scale_f/denom) : (T)0; + } + } +} + +template static void +addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, int width, int height, void* _scalars ) +{ + const double* scalars = (const double*)_scalars; + WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2]; + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + AddWeighted_SIMD vop; + + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x = vop(src1, src2, dst, width, alpha, beta, gamma); + #if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + T t0 = saturate_cast(src1[x]*alpha + src2[x]*beta + gamma); + T t1 = saturate_cast(src1[x+1]*alpha + src2[x+1]*beta + gamma); + dst[x] = t0; dst[x+1] = t1; + + t0 = saturate_cast(src1[x+2]*alpha + src2[x+2]*beta + gamma); + t1 = saturate_cast(src1[x+3]*alpha + src2[x+3]*beta + gamma); + dst[x+2] = t0; dst[x+3] = t1; + } + #endif + for( ; x < width; x++ ) + dst[x] = saturate_cast(src1[x]*alpha + src2[x]*beta + gamma); + } +} + +}} // cv::hal:: + + +#endif // __OPENCV_HAL_ARITHM_CORE_HPP__ diff --git a/modules/hal/src/arithm_simd.hpp b/modules/hal/src/arithm_simd.hpp new file mode 100644 index 0000000000..4e4029875c --- /dev/null +++ b/modules/hal/src/arithm_simd.hpp @@ -0,0 +1,2025 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_HAL_ARITHM_SIMD_HPP__ +#define __OPENCV_HAL_ARITHM_SIMD_HPP__ + +namespace cv { namespace hal { + +struct NOP {}; + +#if CV_SSE2 || CV_NEON +#define IF_SIMD(op) op +#else +#define IF_SIMD(op) NOP +#endif + + +#if CV_SSE2 || CV_NEON + +#define FUNCTOR_TEMPLATE(name) \ + template struct name {} + +FUNCTOR_TEMPLATE(VLoadStore128); +#if CV_SSE2 +FUNCTOR_TEMPLATE(VLoadStore64); +FUNCTOR_TEMPLATE(VLoadStore128Aligned); +#if CV_AVX2 +FUNCTOR_TEMPLATE(VLoadStore256); +FUNCTOR_TEMPLATE(VLoadStore256Aligned); +#endif +#endif + +#endif + +#if CV_AVX2 + +#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body) \ + template <> \ + struct name{ \ + typedef register_type reg_type; \ + static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ + static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ + } + +#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \ + template <> \ + struct name{ \ + typedef register_type reg_type; \ + static reg_type load(const template_arg * p) { return load_body (p); } \ + static void store(template_arg * p, reg_type v) { store_body (p, v); } \ + } + +#define FUNCTOR_CLOSURE_2arg(name, template_arg, body) \ + template<> \ + struct name \ + { \ + VLoadStore256::reg_type operator()( \ + const VLoadStore256::reg_type & a, \ + const VLoadStore256::reg_type & b) const \ + { \ + body; \ + } \ + } + +#define FUNCTOR_CLOSURE_1arg(name, template_arg, body) \ + template<> \ + struct name \ + { \ + VLoadStore256::reg_type operator()( \ + const VLoadStore256::reg_type & a, \ + const VLoadStore256::reg_type & ) const \ + { \ + body; \ + } \ + } + +FUNCTOR_LOADSTORE_CAST(VLoadStore256, uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE_CAST(VLoadStore256, schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE_CAST(VLoadStore256, short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE_CAST(VLoadStore256, int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE( VLoadStore256, float, __m256 , _mm256_loadu_ps , _mm256_storeu_ps ); +FUNCTOR_LOADSTORE( VLoadStore256, double, __m256d, _mm256_loadu_pd , _mm256_storeu_pd ); + +FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned, int, __m256i, _mm256_load_si256, _mm256_store_si256); +FUNCTOR_LOADSTORE( VLoadStore256Aligned, float, __m256 , _mm256_load_ps , _mm256_store_ps ); +FUNCTOR_LOADSTORE( VLoadStore256Aligned, double, __m256d, _mm256_load_pd , _mm256_store_pd ); + +FUNCTOR_TEMPLATE(VAdd); +FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm256_adds_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm256_adds_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm256_adds_epi16(a, b)); 
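The FUNCTOR_* macros above expand into small per-type traits; note that the template parameter lists (e.g. `template <typename T>`, `VLoadStore256<T>`) have lost their angle brackets in this rendering of the patch. As a reconstruction of what one AVX2 expansion looks like (a sketch, assuming <immintrin.h> and AVX2 enabled; not text from the patch):

#include <immintrin.h>

template <typename T> struct VLoadStore256 {};   // FUNCTOR_TEMPLATE(VLoadStore256)
template <typename T> struct VAdd {};            // FUNCTOR_TEMPLATE(VAdd)

// FUNCTOR_LOADSTORE(VLoadStore256, float, __m256, _mm256_loadu_ps, _mm256_storeu_ps):
template <> struct VLoadStore256<float>
{
    typedef __m256 reg_type;
    static reg_type load(const float* p)    { return _mm256_loadu_ps(p); }
    static void store(float* p, reg_type v) { _mm256_storeu_ps(p, v); }
};

// FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps(a, b)):
template <> struct VAdd<float>
{
    VLoadStore256<float>::reg_type operator()(const VLoadStore256<float>::reg_type& a,
                                              const VLoadStore256<float>::reg_type& b) const
    { return _mm256_add_ps(a, b); }
};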
+FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm256_add_epi32 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd (a, b)); + +FUNCTOR_TEMPLATE(VSub); +FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm256_subs_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm256_subs_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b)); +FUNCTOR_CLOSURE_2arg(VSub, short, return _mm256_subs_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VSub, int, return _mm256_sub_epi32 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, float, return _mm256_sub_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd (a, b)); + +FUNCTOR_TEMPLATE(VMin); +FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm256_min_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMin, schar, return _mm256_min_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, short, return _mm256_min_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, int, return _mm256_min_epi32(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, float, return _mm256_min_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd (a, b)); + +FUNCTOR_TEMPLATE(VMax); +FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm256_max_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMax, schar, return _mm256_max_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, short, return _mm256_max_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, int, return _mm256_max_epi32(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, float, return _mm256_max_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd (a, b)); + + +static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, + 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; +static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff, + 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; + +FUNCTOR_TEMPLATE(VAbsDiff); +FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, + return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a)); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, + __m256i d = _mm256_subs_epi8(a, b); + __m256i m = _mm256_cmpgt_epi8(b, a); + return _mm256_subs_epi8(_mm256_xor_si256(d, m), m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, + return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a)); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, short, + __m256i M = _mm256_max_epi16(a, b); + __m256i m = _mm256_min_epi16(a, b); + return _mm256_subs_epi16(M, m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, int, + __m256i d = _mm256_sub_epi32(a, b); + __m256i m = _mm256_cmpgt_epi32(b, a); + return _mm256_sub_epi32(_mm256_xor_si256(d, m), m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, float, + return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, double, + return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask); + ); + +FUNCTOR_TEMPLATE(VAnd); +FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b)); +FUNCTOR_TEMPLATE(VOr); +FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b)); +FUNCTOR_TEMPLATE(VXor); +FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b)); +FUNCTOR_TEMPLATE(VNot); +FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a)); + +#elif CV_SSE2 + +#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\ + template <> \ + struct name{ \ + 
typedef register_type reg_type; \ + static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ + static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ + } + +#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ + template <> \ + struct name{ \ + typedef register_type reg_type; \ + static reg_type load(const template_arg * p) { return load_body (p); } \ + static void store(template_arg * p, reg_type v) { store_body (p, v); } \ + } + +#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ + template<> \ + struct name \ + { \ + VLoadStore128::reg_type operator()( \ + const VLoadStore128::reg_type & a, \ + const VLoadStore128::reg_type & b) const \ + { \ + body; \ + } \ + } + +#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ + template<> \ + struct name \ + { \ + VLoadStore128::reg_type operator()( \ + const VLoadStore128::reg_type & a, \ + const VLoadStore128::reg_type & ) const \ + { \ + body; \ + } \ + } + +FUNCTOR_LOADSTORE_CAST(VLoadStore128, uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128); +FUNCTOR_LOADSTORE_CAST(VLoadStore128, schar, __m128i, _mm_loadu_si128, _mm_storeu_si128); +FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128); +FUNCTOR_LOADSTORE_CAST(VLoadStore128, short, __m128i, _mm_loadu_si128, _mm_storeu_si128); +FUNCTOR_LOADSTORE_CAST(VLoadStore128, int, __m128i, _mm_loadu_si128, _mm_storeu_si128); +FUNCTOR_LOADSTORE( VLoadStore128, float, __m128 , _mm_loadu_ps , _mm_storeu_ps ); +FUNCTOR_LOADSTORE( VLoadStore128, double, __m128d, _mm_loadu_pd , _mm_storeu_pd ); + +FUNCTOR_LOADSTORE_CAST(VLoadStore64, uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); +FUNCTOR_LOADSTORE_CAST(VLoadStore64, schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); +FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64); +FUNCTOR_LOADSTORE_CAST(VLoadStore64, short, __m128i, _mm_loadl_epi64, _mm_storel_epi64); + +FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned, int, __m128i, _mm_load_si128, _mm_store_si128); +FUNCTOR_LOADSTORE( VLoadStore128Aligned, float, __m128 , _mm_load_ps , _mm_store_ps ); +FUNCTOR_LOADSTORE( VLoadStore128Aligned, double, __m128d, _mm_load_pd , _mm_store_pd ); + +FUNCTOR_TEMPLATE(VAdd); +FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm_adds_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm_adds_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm_adds_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm_add_epi32 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm_add_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd (a, b)); + +FUNCTOR_TEMPLATE(VSub); +FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm_subs_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm_subs_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b)); +FUNCTOR_CLOSURE_2arg(VSub, short, return _mm_subs_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VSub, int, return _mm_sub_epi32 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, float, return _mm_sub_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd (a, b)); + +FUNCTOR_TEMPLATE(VMin); +FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, schar, + __m128i m = _mm_cmpgt_epi8(a, b); + return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); + ); +FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b))); 
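For element types with no native SSE2 min/max intrinsic, VMin/VMax above fall back to a compare-and-blend or a saturating-subtraction trick. A scalar model of the two VMin identities (illustration only; the asserts simply exercise them exhaustively):

#include <algorithm>
#include <cassert>
#include <cstdint>

int main()
{
    // VMin<schar>: m = cmpgt(a, b); min = a ^ ((a ^ b) & m)
    for (int a = -128; a <= 127; a++)
        for (int b = -128; b <= 127; b++)
        {
            int8_t mask = (int8_t)((int8_t)a > (int8_t)b ? -1 : 0);
            int8_t m = (int8_t)((int8_t)a ^ (((int8_t)a ^ (int8_t)b) & mask));
            assert(m == std::min((int8_t)a, (int8_t)b));
        }

    // VMin<ushort>: min(a, b) = a - sat_sub(a, b), because sat_sub(a, b) == max(a - b, 0)
    for (int a = 0; a <= 65535; a += 257)
        for (int b = 0; b <= 65535; b += 257)
        {
            uint16_t ssub = (uint16_t)std::max(a - b, 0);
            assert((uint16_t)(a - ssub) == std::min(a, b));
        }
    return 0;
}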
+FUNCTOR_CLOSURE_2arg(VMin, short, return _mm_min_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, int, + __m128i m = _mm_cmpgt_epi32(a, b); + return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); + ); +FUNCTOR_CLOSURE_2arg(VMin, float, return _mm_min_ps(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b)); + +FUNCTOR_TEMPLATE(VMax); +FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, schar, + __m128i m = _mm_cmpgt_epi8(b, a); + return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); + ); +FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b)); +FUNCTOR_CLOSURE_2arg(VMax, short, return _mm_max_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, int, + __m128i m = _mm_cmpgt_epi32(b, a); + return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); + ); +FUNCTOR_CLOSURE_2arg(VMax, float, return _mm_max_ps(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b)); + + +static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; +static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; + +FUNCTOR_TEMPLATE(VAbsDiff); +FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, + return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, + __m128i d = _mm_subs_epi8(a, b); + __m128i m = _mm_cmpgt_epi8(b, a); + return _mm_subs_epi8(_mm_xor_si128(d, m), m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, + return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, short, + __m128i M = _mm_max_epi16(a, b); + __m128i m = _mm_min_epi16(a, b); + return _mm_subs_epi16(M, m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, int, + __m128i d = _mm_sub_epi32(a, b); + __m128i m = _mm_cmpgt_epi32(b, a); + return _mm_sub_epi32(_mm_xor_si128(d, m), m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, float, + return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, double, + return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask); + ); + +FUNCTOR_TEMPLATE(VAnd); +FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b)); +FUNCTOR_TEMPLATE(VOr); +FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b)); +FUNCTOR_TEMPLATE(VXor); +FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b)); +FUNCTOR_TEMPLATE(VNot); +FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a)); +#endif + +#if CV_NEON + +#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ + template <> \ + struct name{ \ + typedef register_type reg_type; \ + static reg_type load(const template_arg * p) { return load_body (p);}; \ + static void store(template_arg * p, reg_type v) { store_body (p, v);}; \ + } + +#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ + template<> \ + struct name \ + { \ + VLoadStore128::reg_type operator()( \ + VLoadStore128::reg_type a, \ + VLoadStore128::reg_type b) const \ + { \ + return body; \ + }; \ + } + +#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ + template<> \ + struct name \ + { \ + VLoadStore128::reg_type operator()( \ + VLoadStore128::reg_type a, \ + VLoadStore128::reg_type ) const \ + { \ + return body; \ + }; \ + } + +FUNCTOR_LOADSTORE(VLoadStore128, uchar, uint8x16_t, vld1q_u8 , vst1q_u8 ); +FUNCTOR_LOADSTORE(VLoadStore128, schar, int8x16_t, vld1q_s8 , vst1q_s8 ); +FUNCTOR_LOADSTORE(VLoadStore128, ushort, uint16x8_t, vld1q_u16, 
vst1q_u16); +FUNCTOR_LOADSTORE(VLoadStore128, short, int16x8_t, vld1q_s16, vst1q_s16); +FUNCTOR_LOADSTORE(VLoadStore128, int, int32x4_t, vld1q_s32, vst1q_s32); +FUNCTOR_LOADSTORE(VLoadStore128, float, float32x4_t, vld1q_f32, vst1q_f32); + +FUNCTOR_TEMPLATE(VAdd); +FUNCTOR_CLOSURE_2arg(VAdd, uchar, vqaddq_u8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, schar, vqaddq_s8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, short, vqaddq_s16(a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, int, vaddq_s32 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, float, vaddq_f32 (a, b)); + +FUNCTOR_TEMPLATE(VSub); +FUNCTOR_CLOSURE_2arg(VSub, uchar, vqsubq_u8 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, schar, vqsubq_s8 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b)); +FUNCTOR_CLOSURE_2arg(VSub, short, vqsubq_s16(a, b)); +FUNCTOR_CLOSURE_2arg(VSub, int, vsubq_s32 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, float, vsubq_f32 (a, b)); + +FUNCTOR_TEMPLATE(VMin); +FUNCTOR_CLOSURE_2arg(VMin, uchar, vminq_u8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMin, schar, vminq_s8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, short, vminq_s16(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, int, vminq_s32(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, float, vminq_f32(a, b)); + +FUNCTOR_TEMPLATE(VMax); +FUNCTOR_CLOSURE_2arg(VMax, uchar, vmaxq_u8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMax, schar, vmaxq_s8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, short, vmaxq_s16(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, int, vmaxq_s32(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, float, vmaxq_f32(a, b)); + +FUNCTOR_TEMPLATE(VAbsDiff); +FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, vabdq_u8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, vqabsq_s8 (vqsubq_s8(a, b))); +FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b)); +FUNCTOR_CLOSURE_2arg(VAbsDiff, short, vqabsq_s16(vqsubq_s16(a, b))); +FUNCTOR_CLOSURE_2arg(VAbsDiff, int, vabdq_s32 (a, b)); +FUNCTOR_CLOSURE_2arg(VAbsDiff, float, vabdq_f32 (a, b)); + +FUNCTOR_TEMPLATE(VAnd); +FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b)); +FUNCTOR_TEMPLATE(VOr); +FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b)); +FUNCTOR_TEMPLATE(VXor); +FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b)); +FUNCTOR_TEMPLATE(VNot); +FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a )); +#endif + + +template +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int) + { + } + + int operator () (const T *, const T *, uchar *, int) const + { + return 0; + } +}; + +#if CV_NEON + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + // CV_Assert(code == CMP_GT || code == CMP_LE || + // code == CMP_EQ || code == CMP_NE); + + v_mask = vdupq_n_u8(255); + } + + int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const + { + int x = 0; + + if (code == CMP_GT) + for ( ; x <= width - 16; x += 16) + vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); + else if (code == CMP_LE) + for ( ; x <= width - 16; x += 16) + vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); + else if (code == CMP_EQ) + for ( ; x <= width - 16; x += 16) + vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); + else if (code == CMP_NE) + for ( ; x <= width - 16; x += 16) + vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask)); + + return x; + } + + int code; + uint8x16_t v_mask; +}; + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + // CV_Assert(code == CMP_GT || 
code == CMP_LE || + // code == CMP_EQ || code == CMP_NE); + + v_mask = vdup_n_u8(255); + } + + int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const + { + int x = 0; + + if (code == CMP_GT) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); + vst1_u8(dst + x, vmovn_u16(v_dst)); + } + else if (code == CMP_LE) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); + vst1_u8(dst + x, vmovn_u16(v_dst)); + } + else if (code == CMP_EQ) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); + vst1_u8(dst + x, vmovn_u16(v_dst)); + } + else if (code == CMP_NE) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); + vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask)); + } + + return x; + } + + int code; + uint8x8_t v_mask; +}; + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + // CV_Assert(code == CMP_GT || code == CMP_LE || + // code == CMP_EQ || code == CMP_NE); + + v_mask = vdup_n_u8(255); + } + + int operator () (const int * src1, const int * src2, uchar * dst, int width) const + { + int x = 0; + + if (code == CMP_GT) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); + uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_LE) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); + uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_EQ) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); + uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_NE) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); + uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); + uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); + vst1_u8(dst + x, veor_u8(v_dst, v_mask)); + } + + return x; + } + + int code; + uint8x8_t v_mask; +}; + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + // CV_Assert(code == CMP_GT || code == CMP_LE || + // code == CMP_EQ || code == CMP_NE); + + v_mask = vdup_n_u8(255); + } + + int operator () (const float * src1, const float * src2, uchar * dst, int width) const + { + int x = 0; + + if (code == CMP_GT) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_LE) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), 
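The Cmp_SIMD specializations in this header all follow the usual OpenCV compare-mask convention: each destination byte becomes 255 when the predicate holds and 0 otherwise, and CMP_NE is produced by XOR-ing the CMP_EQ mask with all-ones. A per-lane scalar sketch of that convention (not part of the patch):

#include <cassert>

static unsigned char cmp_eq_lane(int a, int b) { return a == b ? 255 : 0; }
static unsigned char cmp_ne_lane(int a, int b) { return (unsigned char)(cmp_eq_lane(a, b) ^ 0xff); }

int main()
{
    assert(cmp_eq_lane(3, 3) == 255 && cmp_eq_lane(3, 4) == 0);
    assert(cmp_ne_lane(3, 3) == 0   && cmp_ne_lane(3, 4) == 255);
    return 0;
}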
vmovn_u32(v_dst2)))); + } + else if (code == CMP_EQ) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_NE) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); + vst1_u8(dst + x, veor_u8(v_dst, v_mask)); + } + + return x; + } + + int code; + uint8x8_t v_mask; +}; + +#elif CV_SSE2 + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + // CV_Assert(code == CMP_GT || code == CMP_LE || + // code == CMP_EQ || code == CMP_NE); + + haveSSE = checkHardwareSupport(CV_CPU_SSE2); + + v_mask = _mm_set1_epi8(-1); + } + + int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const + { + int x = 0; + + if (!haveSSE) + return x; + + if (code == CMP_GT) + for ( ; x <= width - 16; x += 16) + _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x)))); + else if (code == CMP_LE) + for ( ; x <= width - 16; x += 16) + { + __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt)); + } + else if (code == CMP_EQ) + for ( ; x <= width - 16; x += 16) + _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x)))); + else if (code == CMP_NE) + for ( ; x <= width - 16; x += 16) + { + __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq)); + } + + return x; + } + + int code; + __m128i v_mask; + bool haveSSE; +}; + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + // CV_Assert(code == CMP_GT || code == CMP_LE || + // code == CMP_EQ || code == CMP_NE); + + haveSSE = checkHardwareSupport(CV_CPU_SSE2); + + v_mask = _mm_set1_epi32(0xffffffff); + } + + int operator () (const int * src1, const int * src2, uchar * dst, int width) const + { + int x = 0; + + if (!haveSSE) + return x; + + if (code == CMP_GT) + for ( ; x <= width - 8; x += 8) + { + __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), + _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); + } + else if (code == CMP_LE) + for ( ; x <= width - 8; x += 8) + { + __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), + _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask)); + } + else if (code == CMP_EQ) + for ( ; x <= width - 8; 
x += 8) + { + __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), + _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); + } + else if (code == CMP_NE) + for ( ; x <= width - 8; x += 8) + { + __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), + _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask))); + } + + return x; + } + + int code; + __m128i v_mask; + bool haveSSE; +}; + +#endif + + +template +struct Mul_SIMD +{ + int operator() (const T *, const T *, T *, int, WT) const + { + return 0; + } +}; + +#if CV_NEON + +template <> +struct Mul_SIMD +{ + int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); + uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); + uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); + v_dst1 = vmulq_f32(v_dst1, v_scale); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + } + + return x; + } +}; + +template <> +struct Mul_SIMD +{ + int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); + int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); + int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), + 
vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); + v_dst1 = vmulq_f32(v_dst1, v_scale); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + } + + return x; + } +}; + +template <> +struct Mul_SIMD +{ + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); + v_dst1 = vmulq_f32(v_dst1, v_scale); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + } + + return x; + } +}; + +template <> +struct Mul_SIMD +{ + int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1q_s16(dst + x, v_dst); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); + v_dst1 = vmulq_f32(v_dst1, v_scale); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1q_s16(dst + x, v_dst); + } + } + + return x; + } +}; + +template <> +struct Mul_SIMD +{ + int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + float32x4_t v_dst2 = 
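The integer Mul_SIMD specializations in this header share one shape: widen eight lanes, convert to float32x4, multiply (and apply the scale when it is not 1), round with cv_vrndq_*_f32, then saturate-narrow back to the element type. A one-lane scalar model (sketch; the real code uses saturate_cast, which rounds via cvRound):

#include <algorithm>
#include <cmath>
#include <cstdio>

static unsigned char mul_lane_u8(unsigned char a, unsigned char b, float scale)
{
    float v = (float)a * (float)b * scale;              // widen and multiply in float
    int r = (int)std::lround(v);                        // round, like cv_vrndq_u32_f32
    return (unsigned char)std::min(std::max(r, 0), 255); // saturate-narrow back to uchar
}

int main()
{
    std::printf("%d %d\n", mul_lane_u8(10, 20, 1.0f), mul_lane_u8(200, 200, 1.0f)); // 200 255
    return 0;
}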
vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + vst1q_f32(dst + x, v_dst1); + vst1q_f32(dst + x + 4, v_dst2); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + v_dst1 = vmulq_f32(v_dst1, v_scale); + + float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + vst1q_f32(dst + x, v_dst1); + vst1q_f32(dst + x + 4, v_dst2); + } + } + + return x; + } +}; + +#elif CV_SSE2 + +#if CV_SSE4_1 + +template <> +struct Mul_SIMD +{ + Mul_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128i v_zero = _mm_setzero_si128(); + + if( scale != 1.0f ) + { + __m128 v_scale = _mm_set1_ps(scale); + for ( ; x <= width - 8; x += 8) + { + __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); + __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); + + __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), + _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero))); + v_dst1 = _mm_mul_ps(v_dst1, v_scale); + + __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), + _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero))); + v_dst2 = _mm_mul_ps(v_dst2, v_scale); + + __m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); + _mm_storeu_si128((__m128i *)(dst + x), v_dsti); + } + } + + return x; + } + + bool haveSSE; +}; + +#endif + +template <> +struct Mul_SIMD +{ + Mul_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128i v_zero = _mm_setzero_si128(); + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); + __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); + + v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); + v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); + + __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), + _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); + + __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), + _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); + + __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); + } + else + { + __m128 v_scale = _mm_set1_ps(scale); + for ( ; x <= width - 8; x += 8) + { + __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); + __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); + + v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); + v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); + + __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), + _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); + v_dst1 = _mm_mul_ps(v_dst1, v_scale); + + __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), + 
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); + v_dst2 = _mm_mul_ps(v_dst2, v_scale); + + __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); + } + } + + return x; + } + + bool haveSSE; +}; + +template <> +struct Mul_SIMD +{ + Mul_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128i v_zero = _mm_setzero_si128(); + + if( scale != 1.0f ) + { + __m128 v_scale = _mm_set1_ps(scale); + for ( ; x <= width - 8; x += 8) + { + __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); + __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); + + __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), + _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); + v_dst1 = _mm_mul_ps(v_dst1, v_scale); + + __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), + _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); + v_dst2 = _mm_mul_ps(v_dst2, v_scale); + + __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); + _mm_storeu_si128((__m128i *)(dst + x), v_dsti); + } + } + + return x; + } + + bool haveSSE; +}; + +#endif + +template +struct Div_SIMD +{ + int operator() (const T *, const T *, T *, int, double) const + { + return 0; + } +}; + +template +struct Recip_SIMD +{ + int operator() (const T *, T *, int, double) const + { + return 0; + } +}; + + +#if CV_SIMD128 + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); + + for ( ; x <= width - 8; x += 8) + { + v_uint16x8 v_src1 = v_load_expand(src1 + x); + v_uint16x8 v_src2 = v_load_expand(src2 + x); + + v_uint32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); + + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); + + v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); + v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); + } + + return x; + } +}; + + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); + + for ( ; x <= width - 8; x += 8) + { + v_int16x8 v_src1 = v_load_expand(src1 + x); + v_int16x8 v_src2 = v_load_expand(src2 + x); + + v_int32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + + v_float32x4 f2 = 
v_cvt_f32(t2); + v_float32x4 f3 = v_cvt_f32(t3); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); + } + + return x; + } +}; + + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); + + for ( ; x <= width - 8; x += 8) + { + v_uint16x8 v_src1 = v_load(src1 + x); + v_uint16x8 v_src2 = v_load(src2 + x); + + v_uint32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); + + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); + + v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); + v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); + } + + return x; + } +}; + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); + + for ( ; x <= width - 8; x += 8) + { + v_int16x8 v_src1 = v_load(src1 + x); + v_int16x8 v_src2 = v_load(src2 + x); + + v_int32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + + v_float32x4 f2 = v_cvt_f32(t2); + v_float32x4 f3 = v_cvt_f32(t3); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); + } + + return x; + } +}; + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int32x4 v_zero = v_setzero_s32(); + + for ( ; x <= width - 8; x += 8) + { + v_int32x4 t0 = v_load(src1 + x); + v_int32x4 t1 = v_load(src1 + x + 4); + v_int32x4 t2 = v_load(src2 + x); + v_int32x4 t3 = v_load(src2 + x + 4); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + v_float32x4 f2 = v_cvt_f32(t2); + v_float32x4 f3 = v_cvt_f32(t3); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 res0 = v_round(f0), res1 = v_round(f1); + + res0 = v_select(t2 == v_zero, v_zero, res0); + res1 = v_select(t3 == v_zero, v_zero, res1); + v_store(dst + x, res0); + v_store(dst + x + 4, res1); + } + + return x; + } +}; + + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const float * src1, 
const float * src2, float * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_float32x4 v_zero = v_setzero_f32(); + + for ( ; x <= width - 8; x += 8) + { + v_float32x4 f0 = v_load(src1 + x); + v_float32x4 f1 = v_load(src1 + x + 4); + v_float32x4 f2 = v_load(src2 + x); + v_float32x4 f3 = v_load(src2 + x + 4); + + v_float32x4 res0 = f0 * v_scale / f2; + v_float32x4 res1 = f1 * v_scale / f3; + + res0 = v_select(f2 == v_zero, v_zero, res0); + res1 = v_select(f3 == v_zero, v_zero, res1); + + v_store(dst + x, res0); + v_store(dst + x + 4, res1); + } + + return x; + } +}; + + +///////////////////////// RECIPROCAL ////////////////////// + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const uchar * src2, uchar * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); + + for ( ; x <= width - 8; x += 8) + { + v_uint16x8 v_src2 = v_load_expand(src2 + x); + + v_uint32x4 t0, t1; + v_expand(v_src2, t0, t1); + + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); + + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); + } + + return x; + } +}; + + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const schar * src2, schar * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); + + for ( ; x <= width - 8; x += 8) + { + v_int16x8 v_src2 = v_load_expand(src2 + x); + + v_int32x4 t0, t1; + v_expand(v_src2, t0, t1); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); + } + + return x; + } +}; + + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const ushort * src2, ushort * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); + + for ( ; x <= width - 8; x += 8) + { + v_uint16x8 v_src2 = v_load(src2 + x); + + v_uint32x4 t0, t1; + v_expand(v_src2, t0, t1); + + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); + + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); + } + + return x; + } +}; + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const short * src2, short * dst, int width, double scale) const + { + int x = 0; + + if 
(!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); + + for ( ; x <= width - 8; x += 8) + { + v_int16x8 v_src2 = v_load(src2 + x); + + v_int32x4 t0, t1; + v_expand(v_src2, t0, t1); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); + } + + return x; + } +}; + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const int * src2, int * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int32x4 v_zero = v_setzero_s32(); + + for ( ; x <= width - 8; x += 8) + { + v_int32x4 t0 = v_load(src2 + x); + v_int32x4 t1 = v_load(src2 + x + 4); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 res0 = v_round(f0), res1 = v_round(f1); + + res0 = v_select(t0 == v_zero, v_zero, res0); + res1 = v_select(t1 == v_zero, v_zero, res1); + v_store(dst + x, res0); + v_store(dst + x + 4, res1); + } + + return x; + } +}; + + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const float * src2, float * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_float32x4 v_zero = v_setzero_f32(); + + for ( ; x <= width - 8; x += 8) + { + v_float32x4 f0 = v_load(src2 + x); + v_float32x4 f1 = v_load(src2 + x + 4); + + v_float32x4 res0 = v_scale / f0; + v_float32x4 res1 = v_scale / f1; + + res0 = v_select(f0 == v_zero, v_zero, res0); + res1 = v_select(f1 == v_zero, v_zero, res1); + + v_store(dst + x, res0); + v_store(dst + x + 4, res1); + } + + return x; + } +}; + +#if CV_SIMD128_64F + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float64x2 v_scale = v_setall_f64(scale); + v_float64x2 v_zero = v_setzero_f64(); + + for ( ; x <= width - 4; x += 4) + { + v_float64x2 f0 = v_load(src1 + x); + v_float64x2 f1 = v_load(src1 + x + 2); + v_float64x2 f2 = v_load(src2 + x); + v_float64x2 f3 = v_load(src2 + x + 2); + + v_float64x2 res0 = f0 * v_scale / f2; + v_float64x2 res1 = f1 * v_scale / f3; + + res0 = v_select(f0 == v_zero, v_zero, res0); + res1 = v_select(f1 == v_zero, v_zero, res1); + + v_store(dst + x, res0); + v_store(dst + x + 2, res1); + } + + return x; + } +}; + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const double * src2, double * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float64x2 v_scale = v_setall_f64(scale); + v_float64x2 v_zero = v_setzero_f64(); + + for ( ; x <= width - 4; x += 4) + { + v_float64x2 f0 = v_load(src2 + x); + v_float64x2 f1 = v_load(src2 + x + 2); + + v_float64x2 res0 = v_scale / f0; + v_float64x2 
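Unlike the raw SSE/NEON functors earlier in this header, the Div_SIMD/Recip_SIMD specializations guarded by CV_SIMD128 are written with the universal intrinsic types (v_float32x4, v_uint16x8, ...), so one body serves both instruction sets; the division is computed for every lane and v_select then forces zero-denominator lanes to 0. A per-lane scalar model of that idiom (illustration only):

#include <cstdio>

// res = v_select(den == 0, 0, num * scale / den), expressed for a single lane:
static float div_lane(float num, float den, float scale)
{
    float q = num * scale / den;        // computed unconditionally, like the vector code
    return den == 0.f ? 0.f : q;        // the select zeroes the lanes with den == 0
}

int main()
{
    std::printf("%g %g\n", div_lane(6.f, 2.f, 1.f), div_lane(6.f, 0.f, 1.f)); // 3 0
    return 0;
}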
res1 = v_scale / f1; + + res0 = v_select(f0 == v_zero, v_zero, res0); + res1 = v_select(f1 == v_zero, v_zero, res1); + + v_store(dst + x, res0); + v_store(dst + x + 2, res1); + } + + return x; + } +}; + +#endif + +#endif + + +template +struct AddWeighted_SIMD +{ + int operator() (const T *, const T *, T *, int, WT, WT, WT) const + { + return 0; + } +}; + +#if CV_SSE2 + +template <> +struct AddWeighted_SIMD +{ + AddWeighted_SIMD() + { + haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + if (!haveSSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), + v_gamma = _mm_set1_ps(gamma); + + for( ; x <= width - 8; x += 8 ) + { + __m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x)); + __m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); + + __m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); + __m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); + + __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha); + v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), + _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta)); + + __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha); + v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), + _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta)); + + __m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0), + _mm_cvtps_epi32(v_dstf1)); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero)); + } + + return x; + } + + bool haveSSE2; +}; + +template <> +struct AddWeighted_SIMD +{ + AddWeighted_SIMD() + { + haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + if (!haveSSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), + v_gamma = _mm_set1_ps(gamma); + + for( ; x <= width - 8; x += 8 ) + { + __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); + __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + + __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha); + v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), + _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta)); + + __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha); + v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), + _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta)); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0), + _mm_cvtps_epi32(v_dstf1))); + } + + return x; + } + + bool haveSSE2; +}; + +#if CV_SSE4_1 + +template <> +struct AddWeighted_SIMD +{ + AddWeighted_SIMD() + { + haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + if (!haveSSE4_1) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 
v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), + v_gamma = _mm_set1_ps(gamma); + + for( ; x <= width - 8; x += 8 ) + { + __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); + __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + + __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha); + v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), + _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta)); + + __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha); + v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), + _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta)); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0), + _mm_cvtps_epi32(v_dstf1))); + } + + return x; + } + + bool haveSSE4_1; +}; + +#endif + +#elif CV_NEON + +template <> +struct AddWeighted_SIMD +{ + int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + float32x4_t g = vdupq_n_f32 (gamma); + + for( ; x <= width - 8; x += 8 ) + { + int8x8_t in1 = vld1_s8(src1 + x); + int16x8_t in1_16 = vmovl_s8(in1); + float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16))); + float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16))); + + int8x8_t in2 = vld1_s8(src2+x); + int16x8_t in2_16 = vmovl_s8(in2); + float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16))); + float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16))); + + float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta)); + float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta)); + out_f_l = vaddq_f32(out_f_l, g); + out_f_h = vaddq_f32(out_f_h, g); + + int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l)); + int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h)); + + int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h); + int8x8_t out = vqmovn_s16(out_16); + + vst1_s8(dst + x, out); + } + + return x; + } +}; + +template <> +struct AddWeighted_SIMD +{ + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + float32x4_t g = vdupq_n_f32(gamma); + + for( ; x <= width - 8; x += 8 ) + { + uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); + + float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha); + float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta); + uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + + v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha); + v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta); + uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + + vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2)); + } + + return x; + } +}; + +template <> +struct AddWeighted_SIMD +{ + int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + float32x4_t g = vdupq_n_f32(gamma); + + for( ; x <= width - 8; x += 8 ) + { + int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); + + float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha); + float32x4_t v_s2 = 
vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta); + int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + + v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha); + v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta); + int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + + vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2)); + } + + return x; + } +}; + +#endif + +}} + +#endif // __OPENCV_HAL_ARITHM_SIMD_HPP__ diff --git a/modules/hal/src/hardware.cpp b/modules/hal/src/hardware.cpp new file mode 100644 index 0000000000..6a08b9f44a --- /dev/null +++ b/modules/hal/src/hardware.cpp @@ -0,0 +1,221 @@ +#include "precomp.hpp" + +#if defined WIN32 || defined _WIN32 || defined WINCE +#include +#if defined _MSC_VER + #if _MSC_VER >= 1400 + #include + #elif defined _M_IX86 + static void __cpuid(int* cpuid_data, int) + { + __asm + { + push ebx + push edi + mov edi, cpuid_data + mov eax, 1 + cpuid + mov [edi], eax + mov [edi + 4], ebx + mov [edi + 8], ecx + mov [edi + 12], edx + pop edi + pop ebx + } + } + static void __cpuidex(int* cpuid_data, int, int) + { + __asm + { + push edi + mov edi, cpuid_data + mov eax, 7 + mov ecx, 0 + cpuid + mov [edi], eax + mov [edi + 4], ebx + mov [edi + 8], ecx + mov [edi + 12], edx + pop edi + } + } + #endif +#endif +#endif + +#if defined ANDROID || defined __linux__ +# include +# include +# include +# include +#endif + +#if defined __linux__ || defined __APPLE__ || defined __EMSCRIPTEN__ +#include +#include +#include +#if defined ANDROID +#include +#endif +#endif + +#ifdef ANDROID +# include +#endif + +struct HWFeatures +{ + enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE }; + + HWFeatures(void) + { + memset( have, 0, sizeof(have) ); + x86_family = 0; + } + + static HWFeatures initialize(void) + { + HWFeatures f; + int cpuid_data[4] = { 0, 0, 0, 0 }; + + #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64) + __cpuid(cpuid_data, 1); + #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__) + #ifdef __x86_64__ + asm __volatile__ + ( + "movl $1, %%eax\n\t" + "cpuid\n\t" + :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3]) + : + : "cc" + ); + #else + asm volatile + ( + "pushl %%ebx\n\t" + "movl $1,%%eax\n\t" + "cpuid\n\t" + "popl %%ebx\n\t" + : "=a"(cpuid_data[0]), "=c"(cpuid_data[2]), "=d"(cpuid_data[3]) + : + : "cc" + ); + #endif + #endif + + f.x86_family = (cpuid_data[0] >> 8) & 15; + if( f.x86_family >= 6 ) + { + f.have[CV_CPU_MMX] = (cpuid_data[3] & (1 << 23)) != 0; + f.have[CV_CPU_SSE] = (cpuid_data[3] & (1<<25)) != 0; + f.have[CV_CPU_SSE2] = (cpuid_data[3] & (1<<26)) != 0; + f.have[CV_CPU_SSE3] = (cpuid_data[2] & (1<<0)) != 0; + f.have[CV_CPU_SSSE3] = (cpuid_data[2] & (1<<9)) != 0; + f.have[CV_CPU_FMA3] = (cpuid_data[2] & (1<<12)) != 0; + f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0; + f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0; + f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0; + f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX + + // make the second call to the cpuid command in order to get + // information about extended features like AVX2 + #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64) + __cpuidex(cpuid_data, 7, 0); + #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__) + #ifdef __x86_64__ + asm __volatile__ + ( + "movl $7, 
%%eax\n\t" + "movl $0, %%ecx\n\t" + "cpuid\n\t" + :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3]) + : + : "cc" + ); + #else + asm volatile + ( + "pushl %%ebx\n\t" + "movl $7,%%eax\n\t" + "movl $0,%%ecx\n\t" + "cpuid\n\t" + "movl %%ebx, %0\n\t" + "popl %%ebx\n\t" + : "=r"(cpuid_data[1]), "=c"(cpuid_data[2]) + : + : "cc" + ); + #endif + #endif + f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0; + + f.have[CV_CPU_AVX_512F] = (cpuid_data[1] & (1<<16)) != 0; + f.have[CV_CPU_AVX_512DQ] = (cpuid_data[1] & (1<<17)) != 0; + f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0; + f.have[CV_CPU_AVX_512PF] = (cpuid_data[1] & (1<<26)) != 0; + f.have[CV_CPU_AVX_512ER] = (cpuid_data[1] & (1<<27)) != 0; + f.have[CV_CPU_AVX_512CD] = (cpuid_data[1] & (1<<28)) != 0; + f.have[CV_CPU_AVX_512BW] = (cpuid_data[1] & (1<<30)) != 0; + f.have[CV_CPU_AVX_512VL] = (cpuid_data[1] & (1<<31)) != 0; + f.have[CV_CPU_AVX_512VBMI] = (cpuid_data[2] & (1<<1)) != 0; + } + + #if defined ANDROID || defined __linux__ + #ifdef __aarch64__ + f.have[CV_CPU_NEON] = true; + #else + int cpufile = open("/proc/self/auxv", O_RDONLY); + + if (cpufile >= 0) + { + Elf32_auxv_t auxv; + const size_t size_auxv_t = sizeof(auxv); + + while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t) + { + if (auxv.a_type == AT_HWCAP) + { + f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0; + break; + } + } + + close(cpufile); + } + #endif + #elif (defined __clang__ || defined __APPLE__) && (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__)) + f.have[CV_CPU_NEON] = true; + #endif + + return f; + } + + int x86_family; + bool have[MAX_FEATURE+1]; +}; + +static HWFeatures featuresEnabled = HWFeatures::initialize(), featuresDisabled = HWFeatures(); +static HWFeatures* currentFeatures = &featuresEnabled; +volatile bool useOptimizedFlag = true; + +namespace cv { namespace hal { + +bool checkHardwareSupport(int feature) +{ +// CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE ); + return currentFeatures->have[feature]; +} + +void setUseOptimized( bool flag ) +{ + useOptimizedFlag = flag; + currentFeatures = flag ? &featuresEnabled : &featuresDisabled; +} + +bool useOptimized(void) +{ + return useOptimizedFlag; +} + +}} diff --git a/modules/hal/src/merge.cpp b/modules/hal/src/merge.cpp new file mode 100644 index 0000000000..982b24c250 --- /dev/null +++ b/modules/hal/src/merge.cpp @@ -0,0 +1,408 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +namespace cv { namespace hal { + +#if CV_NEON +template struct VMerge2; +template struct VMerge3; +template struct VMerge4; + +#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name{ \ + void operator()(const data_type* src0, const data_type* src1, \ + data_type* dst){ \ + reg_type r; \ + r.val[0] = load_func(src0); \ + r.val[1] = load_func(src1); \ + store_func(dst, r); \ + } \ + } + +#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name{ \ + void operator()(const data_type* src0, const data_type* src1, \ + const data_type* src2, data_type* dst){ \ + reg_type r; \ + r.val[0] = load_func(src0); \ + r.val[1] = load_func(src1); \ + r.val[2] = load_func(src2); \ + store_func(dst, r); \ + } \ + } + +#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name{ \ + void operator()(const data_type* src0, const data_type* src1, \ + const data_type* src2, const data_type* src3, \ + data_type* dst){ \ + reg_type r; \ + r.val[0] = load_func(src0); \ + r.val[1] = load_func(src1); \ + r.val[2] = load_func(src2); \ + r.val[3] = load_func(src3); \ + store_func(dst, r); \ + } \ + } + +MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 ); +MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16); +MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32); +MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 ); + +MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 ); +MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16); +MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32); +MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 ); + +MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 ); +MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16); +MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32); +MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 ); + +#elif CV_SSE2 + +template +struct VMerge2 +{ + VMerge2() : support(false) { } + void operator()(const T *, const T *, T *) const { } + + bool 
support; +}; + +template +struct VMerge3 +{ + VMerge3() : support(false) { } + void operator()(const T *, const T *, const T *, T *) const { } + + bool support; +}; + +template +struct VMerge4 +{ + VMerge4() : support(false) { } + void operator()(const T *, const T *, const T *, const T *, T *) const { } + + bool support; +}; + +#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ +template <> \ +struct VMerge2 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VMerge2() \ + { \ + support = checkHardwareSupport(se); \ + } \ + \ + void operator()(const data_type * src0, const data_type * src1, \ + data_type * dst) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ + reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ + reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ + \ + _mm_interleave(v_src0, v_src1, v_src2, v_src3); \ + \ + _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ + } \ + \ + bool support; \ +} + +#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ +template <> \ +struct VMerge3 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VMerge3() \ + { \ + support = checkHardwareSupport(se); \ + } \ + \ + void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\ + data_type * dst) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ + reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ + reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ + reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \ + reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \ + \ + _mm_interleave(v_src0, v_src1, v_src2, \ + v_src3, v_src4, v_src5); \ + \ + _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \ + } \ + \ + bool support; \ +} + +#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ +template <> \ +struct VMerge4 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VMerge4() \ + { \ + support = checkHardwareSupport(se); \ + } \ + \ + void operator()(const data_type * src0, const data_type * src1, \ + const data_type * src2, const data_type * src3, \ + data_type * dst) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ + reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ + reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ + reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \ + reg_type 
v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \ + reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3)); \ + reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \ + \ + _mm_interleave(v_src0, v_src1, v_src2, v_src3, \ + v_src4, v_src5, v_src6, v_src7); \ + \ + _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7); \ + } \ + \ + bool support; \ +} + +MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); +MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); +MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); + +#if CV_SSE4_1 +MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); +MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); +MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); +#endif + +MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); +MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); +MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); + +#endif + +template static void +merge_( const T** src, T* dst, int len, int cn ) +{ + int k = cn % 4 ? cn % 4 : 4; + int i, j; + if( k == 1 ) + { + const T* src0 = src[0]; + for( i = j = 0; i < len; i++, j += cn ) + dst[j] = src0[i]; + } + else if( k == 2 ) + { + const T *src0 = src[0], *src1 = src[1]; + i = j = 0; +#if CV_NEON + if(cn == 2) + { + int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); + int inc_j = 2 * inc_i; + + VMerge2 vmerge; + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, dst + j); + } +#elif CV_SSE2 + if(cn == 2) + { + int inc_i = 32/sizeof(T); + int inc_j = 2 * inc_i; + + VMerge2 vmerge; + if (vmerge.support) + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, dst + j); + } +#endif + for( ; i < len; i++, j += cn ) + { + dst[j] = src0[i]; + dst[j+1] = src1[i]; + } + } + else if( k == 3 ) + { + const T *src0 = src[0], *src1 = src[1], *src2 = src[2]; + i = j = 0; +#if CV_NEON + if(cn == 3) + { + int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); + int inc_j = 3 * inc_i; + + VMerge3 vmerge; + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, src2 + i, dst + j); + } +#elif CV_SSE2 + if(cn == 3) + { + int inc_i = 32/sizeof(T); + int inc_j = 3 * inc_i; + + VMerge3 vmerge; + if (vmerge.support) + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, src2 + i, dst + j); + } +#endif + for( ; i < len; i++, j += cn ) + { + dst[j] = src0[i]; + dst[j+1] = src1[i]; + dst[j+2] = src2[i]; + } + } + else + { + const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3]; + i = j = 0; +#if CV_NEON + if(cn == 4) + { + int inc_i = (sizeof(T) == 8)? 
1: 16/sizeof(T); + int inc_j = 4 * inc_i; + + VMerge4 vmerge; + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); + } +#elif CV_SSE2 + if(cn == 4) + { + int inc_i = 32/sizeof(T); + int inc_j = 4 * inc_i; + + VMerge4 vmerge; + if (vmerge.support) + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); + } +#endif + for( ; i < len; i++, j += cn ) + { + dst[j] = src0[i]; dst[j+1] = src1[i]; + dst[j+2] = src2[i]; dst[j+3] = src3[i]; + } + } + + for( ; k < cn; k += 4 ) + { + const T *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3]; + for( i = 0, j = k; i < len; i++, j += cn ) + { + dst[j] = src0[i]; dst[j+1] = src1[i]; + dst[j+2] = src2[i]; dst[j+3] = src3[i]; + } + } +} + + +void merge8u(const uchar** src, uchar* dst, int len, int cn ) +{ + merge_(src, dst, len, cn); +} + +void merge16u(const ushort** src, ushort* dst, int len, int cn ) +{ + merge_(src, dst, len, cn); +} + +void merge32s(const int** src, int* dst, int len, int cn ) +{ + merge_(src, dst, len, cn); +} + +void merge64s(const int64** src, int64* dst, int len, int cn ) +{ + merge_(src, dst, len, cn); +} + +}} diff --git a/modules/hal/src/precomp.hpp b/modules/hal/src/precomp.hpp index 630565bec3..e0181aaf35 100644 --- a/modules/hal/src/precomp.hpp +++ b/modules/hal/src/precomp.hpp @@ -47,3 +47,13 @@ #include #include #include +#include +#include + +#include "opencv2/hal/sse_utils.hpp" + +#if defined HAVE_IPP && (IPP_VERSION_X100 >= 700) +#define ARITHM_USE_IPP 1 +#else +#define ARITHM_USE_IPP 0 +#endif diff --git a/modules/hal/src/replacement.hpp b/modules/hal/src/replacement.hpp new file mode 100644 index 0000000000..c8cc19224e --- /dev/null +++ b/modules/hal/src/replacement.hpp @@ -0,0 +1,208 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_HAL_REPLACEMENT_HPP__ +#define __OPENCV_HAL_REPLACEMENT_HPP__ + +#include "opencv2/hal.hpp" + +inline int hal_t_add8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_add8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_add16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_add16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_add32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_add32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_add64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_max8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_max8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_max16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_max16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_max32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_max32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_max64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return 
cv::hal::Error::NotImplemented; } +inline int hal_t_min8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_min8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_min16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_min16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_min32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_min32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_min64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_and8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_or8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_xor8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_not8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } + +#define hal_add8u hal_t_add8u +#define hal_add8s hal_t_add8s +#define hal_add16u hal_t_add16u +#define hal_add16s hal_t_add16s +#define hal_add32s hal_t_add32s +#define hal_add32f hal_t_add32f +#define hal_add64f hal_t_add64f +#define hal_sub8u hal_t_sub8u +#define hal_sub8s hal_t_sub8s +#define hal_sub16u hal_t_sub16u +#define hal_sub16s hal_t_sub16s +#define hal_sub32s hal_t_sub32s +#define hal_sub32f hal_t_sub32f +#define hal_sub64f hal_t_sub64f +#define hal_max8u hal_t_max8u +#define hal_max8s hal_t_max8s +#define hal_max16u hal_t_max16u +#define hal_max16s hal_t_max16s +#define hal_max32s hal_t_max32s +#define hal_max32f hal_t_max32f +#define hal_max64f hal_t_max64f +#define hal_min8u hal_t_min8u +#define hal_min8s hal_t_min8s +#define hal_min16u hal_t_min16u +#define hal_min16s hal_t_min16s +#define hal_min32s hal_t_min32s +#define hal_min32f hal_t_min32f +#define hal_min64f hal_t_min64f +#define hal_absdiff8u hal_t_absdiff8u +#define hal_absdiff8s hal_t_absdiff8s +#define hal_absdiff16u 
hal_t_absdiff16u +#define hal_absdiff16s hal_t_absdiff16s +#define hal_absdiff32s hal_t_absdiff32s +#define hal_absdiff32f hal_t_absdiff32f +#define hal_absdiff64f hal_t_absdiff64f +#define hal_and8u hal_t_and8u +#define hal_or8u hal_t_or8u +#define hal_xor8u hal_t_xor8u +#define hal_not8u hal_t_not8u + +inline int hal_t_cmp8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_cmp8s(const schar*, size_t, const schar*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_cmp16u(const ushort*, size_t, const ushort*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_cmp16s(const short*, size_t, const short*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_cmp32s(const int*, size_t, const int*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_cmp32f(const float*, size_t, const float*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_cmp64f(const double*, size_t, const double*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } + +#define hal_cmp8u hal_t_cmp8u +#define hal_cmp8s hal_t_cmp8s +#define hal_cmp16u hal_t_cmp16u +#define hal_cmp16s hal_t_cmp16s +#define hal_cmp32s hal_t_cmp32s +#define hal_cmp32f hal_t_cmp32f +#define hal_cmp64f hal_t_cmp64f + +inline int hal_t_mul8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_mul8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_mul16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_mul16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_mul32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_mul32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_mul64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return 
cv::hal::Error::NotImplemented; } +inline int hal_t_recip8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_recip8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_recip16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_recip16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_recip32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_recip32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_recip64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } + +#define hal_mul8u hal_t_mul8u +#define hal_mul8s hal_t_mul8s +#define hal_mul16u hal_t_mul16u +#define hal_mul16s hal_t_mul16s +#define hal_mul32s hal_t_mul32s +#define hal_mul32f hal_t_mul32f +#define hal_mul64f hal_t_mul64f +#define hal_div8u hal_t_div8u +#define hal_div8s hal_t_div8s +#define hal_div16u hal_t_div16u +#define hal_div16s hal_t_div16s +#define hal_div32s hal_t_div32s +#define hal_div32f hal_t_div32f +#define hal_div64f hal_t_div64f +#define hal_recip8u hal_t_recip8u +#define hal_recip8s hal_t_recip8s +#define hal_recip16u hal_t_recip16u +#define hal_recip16s hal_t_recip16s +#define hal_recip32s hal_t_recip32s +#define hal_recip32f hal_t_recip32f +#define hal_recip64f hal_t_recip64f + +inline int hal_t_addWeighted8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } +inline int hal_t_addWeighted8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } +inline int hal_t_addWeighted16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } +inline int hal_t_addWeighted16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } +inline int hal_t_addWeighted32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } +inline int hal_t_addWeighted32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } +inline int hal_t_addWeighted64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } + +#define hal_addWeighted8u hal_t_addWeighted8u +#define hal_addWeighted8s hal_t_addWeighted8s +#define hal_addWeighted16u hal_t_addWeighted16u +#define hal_addWeighted16s hal_t_addWeighted16s +#define hal_addWeighted32s hal_t_addWeighted32s +#define hal_addWeighted32f hal_t_addWeighted32f +#define hal_addWeighted64f hal_t_addWeighted64f + +#include "custom_hal.hpp" + +#endif diff --git a/modules/hal/src/split.cpp b/modules/hal/src/split.cpp new file mode 100644 index 0000000000..c31bf8cc44 --- /dev/null +++ b/modules/hal/src/split.cpp @@ -0,0 +1,424 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, 
COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "precomp.hpp" + +namespace cv { namespace hal { + +#if CV_NEON +template struct VSplit2; +template struct VSplit3; +template struct VSplit4; + +#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name \ + { \ + void operator()(const data_type* src, data_type* dst0, \ + data_type* dst1) const \ + { \ + reg_type r = load_func(src); \ + store_func(dst0, r.val[0]); \ + store_func(dst1, r.val[1]); \ + } \ + } + +#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name \ + { \ + void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ + data_type* dst2) const \ + { \ + reg_type r = load_func(src); \ + store_func(dst0, r.val[0]); \ + store_func(dst1, r.val[1]); \ + store_func(dst2, r.val[2]); \ + } \ + } + +#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name \ + { \ + void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ + data_type* dst2, data_type* dst3) const \ + { \ + reg_type r = load_func(src); \ + store_func(dst0, r.val[0]); \ + store_func(dst1, r.val[1]); \ + store_func(dst2, r.val[2]); \ + store_func(dst3, r.val[3]); \ + } \ + } + +SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 ); +SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16); +SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32); +SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 ); + +SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 ); +SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16); +SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32); +SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 ); + +SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 ); +SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16); +SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32); +SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 ); + +#elif CV_SSE2 + +template +struct VSplit2 +{ + VSplit2() : support(false) { } + void operator()(const T *, T *, T *) const { } + + bool support; +}; + +template +struct VSplit3 +{ + VSplit3() : support(false) { } + void operator()(const T *, T *, T *, T *) const { } + + bool support; +}; + +template +struct VSplit4 +{ + VSplit4() : support(false) { } + void operator()(const T *, T *, T *, T *, T *) const { } + + bool support; +}; + +#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ +template <> \ +struct VSplit2 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VSplit2() \ + { \ + support = checkHardwareSupport(CV_CPU_SSE2); \ + } \ + \ + void operator()(const data_type * src, \ + data_type * dst0, data_type * dst1) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ + reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ + reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ + \ + _mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \ + \ + _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ + 
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ + } \ + \ + bool support; \ +} + +#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ +template <> \ +struct VSplit3 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VSplit3() \ + { \ + support = checkHardwareSupport(CV_CPU_SSE2); \ + } \ + \ + void operator()(const data_type * src, \ + data_type * dst0, data_type * dst1, data_type * dst2) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ + reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ + reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ + reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \ + reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \ + \ + _mm_deinterleave(v_src0, v_src1, v_src2, \ + v_src3, v_src4, v_src5); \ + \ + _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ + _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \ + _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \ + } \ + \ + bool support; \ +} + +#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ +template <> \ +struct VSplit4 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VSplit4() \ + { \ + support = checkHardwareSupport(CV_CPU_SSE2); \ + } \ + \ + void operator()(const data_type * src, data_type * dst0, data_type * dst1, \ + data_type * dst2, data_type * dst3) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ + reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ + reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ + reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \ + reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \ + reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \ + reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \ + \ + _mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \ + v_src4, v_src5, v_src6, v_src7); \ + \ + _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ + _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \ + _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \ + _mm_storeu_##flavor((cast_type *)(dst3), v_src6); \ + _mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \ + } \ + \ + bool support; \ +} + +SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); +SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); +SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); + +SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); 
+SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); +SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); + +SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); +SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); +SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); + +#endif + +template static void +split_( const T* src, T** dst, int len, int cn ) +{ + int k = cn % 4 ? cn % 4 : 4; + int i, j; + if( k == 1 ) + { + T* dst0 = dst[0]; + + if(cn == 1) + { + memcpy(dst0, src, len * sizeof(T)); + } + else + { + for( i = 0, j = 0 ; i < len; i++, j += cn ) + dst0[i] = src[j]; + } + } + else if( k == 2 ) + { + T *dst0 = dst[0], *dst1 = dst[1]; + i = j = 0; + +#if CV_NEON + if(cn == 2) + { + int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); + int inc_j = 2 * inc_i; + + VSplit2 vsplit; + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i); + } +#elif CV_SSE2 + if (cn == 2) + { + int inc_i = 32/sizeof(T); + int inc_j = 2 * inc_i; + + VSplit2 vsplit; + if (vsplit.support) + { + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i); + } + } +#endif + for( ; i < len; i++, j += cn ) + { + dst0[i] = src[j]; + dst1[i] = src[j+1]; + } + } + else if( k == 3 ) + { + T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2]; + i = j = 0; + +#if CV_NEON + if(cn == 3) + { + int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); + int inc_j = 3 * inc_i; + + VSplit3 vsplit; + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); + } +#elif CV_SSE2 + if (cn == 3) + { + int inc_i = 32/sizeof(T); + int inc_j = 3 * inc_i; + + VSplit3 vsplit; + + if (vsplit.support) + { + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); + } + } +#endif + for( ; i < len; i++, j += cn ) + { + dst0[i] = src[j]; + dst1[i] = src[j+1]; + dst2[i] = src[j+2]; + } + } + else + { + T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3]; + i = j = 0; + +#if CV_NEON + if(cn == 4) + { + int inc_i = (sizeof(T) == 8)? 
1: 16/sizeof(T); + int inc_j = 4 * inc_i; + + VSplit4 vsplit; + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); + } +#elif CV_SSE2 + if (cn == 4) + { + int inc_i = 32/sizeof(T); + int inc_j = 4 * inc_i; + + VSplit4 vsplit; + if (vsplit.support) + { + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); + } + } +#endif + for( ; i < len; i++, j += cn ) + { + dst0[i] = src[j]; dst1[i] = src[j+1]; + dst2[i] = src[j+2]; dst3[i] = src[j+3]; + } + } + + for( ; k < cn; k += 4 ) + { + T *dst0 = dst[k], *dst1 = dst[k+1], *dst2 = dst[k+2], *dst3 = dst[k+3]; + for( i = 0, j = k; i < len; i++, j += cn ) + { + dst0[i] = src[j]; dst1[i] = src[j+1]; + dst2[i] = src[j+2]; dst3[i] = src[j+3]; + } + } +} + +void split8u(const uchar* src, uchar** dst, int len, int cn ) +{ + split_(src, dst, len, cn); +} + +void split16u(const ushort* src, ushort** dst, int len, int cn ) +{ + split_(src, dst, len, cn); +} + +void split32s(const int* src, int** dst, int len, int cn ) +{ + split_(src, dst, len, cn); +} + +void split64s(const int64* src, int64** dst, int len, int cn ) +{ + split_(src, dst, len, cn); +} + +}} diff --git a/modules/imgproc/src/precomp.hpp b/modules/imgproc/src/precomp.hpp index 7a0cece2f2..3bb8d8e760 100644 --- a/modules/imgproc/src/precomp.hpp +++ b/modules/imgproc/src/precomp.hpp @@ -94,4 +94,6 @@ extern const float icv8x32fSqrTab[]; #include "_geom.h" #include "filterengine.hpp" +#include "opencv2/hal/sse_utils.hpp" + #endif /*__OPENCV_CV_INTERNAL_H_*/ From 98f5fcd86eb2b0144ba372c32cadc67cd4714abc Mon Sep 17 00:00:00 2001 From: Maksim Shabunin Date: Thu, 3 Dec 2015 14:56:15 +0300 Subject: [PATCH 2/5] HAL: fixed header path --- modules/hal/src/arithm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/hal/src/arithm.cpp b/modules/hal/src/arithm.cpp index b0705c5137..e30cd7d9e5 100644 --- a/modules/hal/src/arithm.cpp +++ b/modules/hal/src/arithm.cpp @@ -45,7 +45,7 @@ #include "precomp.hpp" #include "arithm_simd.hpp" #include "arithm_core.hpp" -#include "opencv2/hal/replacement.hpp" +#include "replacement.hpp" namespace cv { namespace hal { From 5473dbebedea1bb8c22e84e03f2ec3df475ff874 Mon Sep 17 00:00:00 2001 From: Maksim Shabunin Date: Thu, 3 Dec 2015 15:25:42 +0300 Subject: [PATCH 3/5] Fixed some build issues --- modules/hal/include/opencv2/hal.hpp | 10 +--------- modules/hal/samples/simple_hal/CMakeLists.txt | 1 - modules/hal/samples/simple_hal/simple.cpp | 1 - 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/modules/hal/include/opencv2/hal.hpp b/modules/hal/include/opencv2/hal.hpp index d1ec73b429..125bbc8248 100644 --- a/modules/hal/include/opencv2/hal.hpp +++ b/modules/hal/include/opencv2/hal.hpp @@ -247,15 +247,7 @@ template struct OpAbsDiff typedef T type1; typedef T type2; typedef T rtype; - T operator()(T a, T b) const { return (T)std::abs(a - b); } -}; - -template struct OpAbsDiffS -{ - typedef T type1; - typedef WT type2; - typedef T rtype; - T operator()(T a, WT b) const { return saturate_cast(std::abs(a - b)); } + T operator()(T a, T b) const { return a > b ? 
a - b : b - a; } }; template struct OpAnd diff --git a/modules/hal/samples/simple_hal/CMakeLists.txt b/modules/hal/samples/simple_hal/CMakeLists.txt index ee4ac22be9..dd0be70f2f 100644 --- a/modules/hal/samples/simple_hal/CMakeLists.txt +++ b/modules/hal/samples/simple_hal/CMakeLists.txt @@ -9,4 +9,3 @@ endif() add_library(simple_hal simple.cpp) set(OPENCV_HAL_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..") target_include_directories(simple_hal PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${OPENCV_HAL_DIR}/include) - diff --git a/modules/hal/samples/simple_hal/simple.cpp b/modules/hal/samples/simple_hal/simple.cpp index 49d77b02ac..564a611a5a 100644 --- a/modules/hal/samples/simple_hal/simple.cpp +++ b/modules/hal/samples/simple_hal/simple.cpp @@ -31,4 +31,3 @@ int slow_not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, dst[x] = ~src1[x]; return cv::hal::Error::Ok; } - From 5c5d0e6743c3e1289a018c5ac5af5709e3525aef Mon Sep 17 00:00:00 2001 From: Maksim Shabunin Date: Thu, 3 Dec 2015 15:40:55 +0300 Subject: [PATCH 4/5] Fixed NEON compilation issue --- modules/core/include/opencv2/core/base.hpp | 81 +----------- .../hal/include/opencv2/hal/neon_utils.hpp | 123 ++++++++++++++++++ modules/hal/src/precomp.hpp | 1 + 3 files changed, 125 insertions(+), 80 deletions(-) create mode 100644 modules/hal/include/opencv2/hal/neon_utils.hpp diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp index 9a0d4989b9..a2e8dfdb0e 100644 --- a/modules/core/include/opencv2/core/base.hpp +++ b/modules/core/include/opencv2/core/base.hpp @@ -679,86 +679,7 @@ CV_EXPORTS void setUseIPP(bool flag); //! @} core_utils -//! @addtogroup core_utils_neon -//! @{ - -#if CV_NEON - -inline int32x2_t cv_vrnd_s32_f32(float32x2_t v) -{ - static int32x2_t v_sign = vdup_n_s32(1 << 31), - v_05 = vreinterpret_s32_f32(vdup_n_f32(0.5f)); - - int32x2_t v_addition = vorr_s32(v_05, vand_s32(v_sign, vreinterpret_s32_f32(v))); - return vcvt_s32_f32(vadd_f32(v, vreinterpret_f32_s32(v_addition))); -} - -inline int32x4_t cv_vrndq_s32_f32(float32x4_t v) -{ - static int32x4_t v_sign = vdupq_n_s32(1 << 31), - v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f)); - - int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(v))); - return vcvtq_s32_f32(vaddq_f32(v, vreinterpretq_f32_s32(v_addition))); -} - -inline uint32x2_t cv_vrnd_u32_f32(float32x2_t v) -{ - static float32x2_t v_05 = vdup_n_f32(0.5f); - return vcvt_u32_f32(vadd_f32(v, v_05)); -} - -inline uint32x4_t cv_vrndq_u32_f32(float32x4_t v) -{ - static float32x4_t v_05 = vdupq_n_f32(0.5f); - return vcvtq_u32_f32(vaddq_f32(v, v_05)); -} - -inline float32x4_t cv_vrecpq_f32(float32x4_t val) -{ - float32x4_t reciprocal = vrecpeq_f32(val); - reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); - return reciprocal; -} - -inline float32x2_t cv_vrecp_f32(float32x2_t val) -{ - float32x2_t reciprocal = vrecpe_f32(val); - reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal); - reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal); - return reciprocal; -} - -inline float32x4_t cv_vrsqrtq_f32(float32x4_t val) -{ - float32x4_t e = vrsqrteq_f32(val); - e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e); - e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e); - return e; -} - -inline float32x2_t cv_vrsqrt_f32(float32x2_t val) -{ - float32x2_t e = vrsqrte_f32(val); - e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e); - e = 
vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e); - return e; -} - -inline float32x4_t cv_vsqrtq_f32(float32x4_t val) -{ - return cv_vrecpq_f32(cv_vrsqrtq_f32(val)); -} - -inline float32x2_t cv_vsqrt_f32(float32x2_t val) -{ - return cv_vrecp_f32(cv_vrsqrt_f32(val)); -} - -#endif - -//! @} core_utils_neon +#include "opencv2/hal/neon_utils.hpp" } // cv diff --git a/modules/hal/include/opencv2/hal/neon_utils.hpp b/modules/hal/include/opencv2/hal/neon_utils.hpp new file mode 100644 index 0000000000..21407a028b --- /dev/null +++ b/modules/hal/include/opencv2/hal/neon_utils.hpp @@ -0,0 +1,123 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_HAL_NEON_UTILS_HPP__ +#define __OPENCV_HAL_NEON_UTILS_HPP__ + +#include "opencv2/hal/defs.h" + +#if CV_NEON + +inline int32x2_t cv_vrnd_s32_f32(float32x2_t v) +{ + static int32x2_t v_sign = vdup_n_s32(1 << 31), + v_05 = vreinterpret_s32_f32(vdup_n_f32(0.5f)); + + int32x2_t v_addition = vorr_s32(v_05, vand_s32(v_sign, vreinterpret_s32_f32(v))); + return vcvt_s32_f32(vadd_f32(v, vreinterpret_f32_s32(v_addition))); +} + +inline int32x4_t cv_vrndq_s32_f32(float32x4_t v) +{ + static int32x4_t v_sign = vdupq_n_s32(1 << 31), + v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f)); + + int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(v))); + return vcvtq_s32_f32(vaddq_f32(v, vreinterpretq_f32_s32(v_addition))); +} + +inline uint32x2_t cv_vrnd_u32_f32(float32x2_t v) +{ + static float32x2_t v_05 = vdup_n_f32(0.5f); + return vcvt_u32_f32(vadd_f32(v, v_05)); +} + +inline uint32x4_t cv_vrndq_u32_f32(float32x4_t v) +{ + static float32x4_t v_05 = vdupq_n_f32(0.5f); + return vcvtq_u32_f32(vaddq_f32(v, v_05)); +} + +inline float32x4_t cv_vrecpq_f32(float32x4_t val) +{ + float32x4_t reciprocal = vrecpeq_f32(val); + reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); + return reciprocal; +} + +inline float32x2_t cv_vrecp_f32(float32x2_t val) +{ + float32x2_t reciprocal = vrecpe_f32(val); + reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal); + reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal); + return reciprocal; +} + +inline float32x4_t cv_vrsqrtq_f32(float32x4_t val) +{ + float32x4_t e = vrsqrteq_f32(val); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e); + return e; +} + +inline float32x2_t cv_vrsqrt_f32(float32x2_t val) +{ + float32x2_t e = vrsqrte_f32(val); + e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e); + e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e); + return e; +} + +inline float32x4_t cv_vsqrtq_f32(float32x4_t val) +{ + return cv_vrecpq_f32(cv_vrsqrtq_f32(val)); +} + +inline float32x2_t cv_vsqrt_f32(float32x2_t val) +{ + return cv_vrecp_f32(cv_vrsqrt_f32(val)); +} + +#endif + +#endif // __OPENCV_HAL_NEON_UTILS_HPP__ diff --git a/modules/hal/src/precomp.hpp b/modules/hal/src/precomp.hpp index e0181aaf35..16586368e4 100644 --- a/modules/hal/src/precomp.hpp +++ b/modules/hal/src/precomp.hpp @@ -51,6 +51,7 @@ #include #include "opencv2/hal/sse_utils.hpp" +#include "opencv2/hal/neon_utils.hpp" #if defined HAVE_IPP && (IPP_VERSION_X100 >= 700) #define ARITHM_USE_IPP 1 From 0e5c710757418812b76cdc7240193c49a4ebac3a Mon Sep 17 00:00:00 2001 From: Maksim Shabunin Date: Thu, 3 Dec 2015 17:30:45 +0300 Subject: [PATCH 5/5] Fix documentation warning --- modules/core/include/opencv2/core/base.hpp | 4 ++-- modules/hal/include/opencv2/hal/neon_utils.hpp | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp index a2e8dfdb0e..a8a0b23e12 100644 --- a/modules/core/include/opencv2/core/base.hpp +++ b/modules/core/include/opencv2/core/base.hpp @@ -679,8 +679,8 @@ CV_EXPORTS void setUseIPP(bool flag); //! 
@} core_utils
 
-#include "opencv2/hal/neon_utils.hpp"
-
 } // cv
 
+#include "opencv2/hal/neon_utils.hpp"
+
 #endif //__OPENCV_CORE_BASE_HPP__
diff --git a/modules/hal/include/opencv2/hal/neon_utils.hpp b/modules/hal/include/opencv2/hal/neon_utils.hpp
index 21407a028b..6026777a6f 100644
--- a/modules/hal/include/opencv2/hal/neon_utils.hpp
+++ b/modules/hal/include/opencv2/hal/neon_utils.hpp
@@ -44,6 +44,8 @@
 
 #include "opencv2/hal/defs.h"
 
+namespace cv {
+
 #if CV_NEON
 
 inline int32x2_t cv_vrnd_s32_f32(float32x2_t v)
@@ -120,4 +122,6 @@ inline float32x2_t cv_vsqrt_f32(float32x2_t val)
 
 #endif
 
+}
+
 #endif // __OPENCV_HAL_NEON_UTILS_HPP__
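
For reference, the replacement mechanism added above is driven by the hal_* macros: each one defaults to the matching hal_t_* stub, which simply returns cv::hal::Error::NotImplemented, and replacement.hpp ends with an include of "custom_hal.hpp" so an external header can re-point individual macros at vendor implementations. The sketch below shows what such a header might look like, modeled on the simple_hal sample (slow_not8u). The file name my_hal.hpp, the function my_not8u, and the #undef/#define override are illustrative assumptions, not part of the patch; only the argument layout and the return codes are taken from hal_t_not8u and the sample.

// my_hal.hpp -- hypothetical replacement header, pulled in through the
// custom_hal.hpp include at the end of replacement.hpp.
#ifndef _MY_HAL_HPP_
#define _MY_HAL_HPP_

#include "opencv2/hal.hpp"   // same header replacement.hpp uses for the Error codes

// Same argument layout as hal_t_not8u:
// (src1, step1, src2, step2, dst, step, width, height); src2 is unused here.
inline int my_not8u(const uchar* src1, size_t step1,
                    const uchar*, size_t,
                    uchar* dst, size_t step, int width, int height)
{
    for( ; height--; src1 += step1, dst += step )
        for( int x = 0; x < width; x++ )
            dst[x] = (uchar)~src1[x];
    return cv::hal::Error::Ok;   // Ok signals that the call was fully handled
}

// Re-point the dispatch macro from the default stub to the custom kernel.
// Macros left untouched keep their hal_t_* defaults.
#undef  hal_not8u
#define hal_not8u my_not8u

#endif // _MY_HAL_HPP_

Judging by the stub definitions, a replacement is also free to bail out at run time: returning cv::hal::Error::NotImplemented instead of Ok leaves the case to OpenCV's built-in implementation, while Ok tells the calling wrapper the work is done.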