From 80b62a41c6f65ccae09610284a45c8f935667d19 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Thu, 6 Sep 2018 19:36:59 +0300 Subject: [PATCH] Merge pull request #12411 from vpisarev:wide_convert * rewrote Mat::convertTo() and convertScaleAbs() to wide universal intrinsics; added always-available and SIMD-optimized FP16<=>FP32 conversion * fixed compile warnings * fix some more compile errors * slightly relaxed accuracy threshold for int->float conversion (since we now do it using single-precision arithmetics, not double-precision) * fixed compile errors on iOS, Android and in the baseline C++ version (intrin_cpp.hpp) * trying to fix ARM-neon builds * trying to fix ARM-neon builds * trying to fix ARM-neon builds * trying to fix ARM-neon builds --- modules/core/include/opencv2/core/cvdef.h | 122 +- .../core/include/opencv2/core/hal/intrin.hpp | 26 +- .../include/opencv2/core/hal/intrin_avx.hpp | 23 +- .../include/opencv2/core/hal/intrin_cpp.hpp | 22 + .../include/opencv2/core/hal/intrin_neon.hpp | 131 +- .../include/opencv2/core/hal/intrin_sse.hpp | 46 +- .../include/opencv2/core/hal/intrin_vsx.hpp | 18 + modules/core/perf/perf_addWeighted.cpp | 5 +- modules/core/perf/perf_convertTo.cpp | 2 +- modules/core/src/convert.avx2.cpp | 40 - modules/core/src/convert.cpp | 1482 +++----------- modules/core/src/convert.fp16.cpp | 126 -- modules/core/src/convert.hpp | 566 +++-- modules/core/src/convert.sse4_1.cpp | 203 -- modules/core/src/convert_scale.cpp | 1822 +++-------------- modules/core/test/test_intrin_utils.hpp | 10 +- modules/core/test/test_math.cpp | 73 +- platforms/ios/build_framework.py | 2 +- 18 files changed, 1178 insertions(+), 3541 deletions(-) delete mode 100644 modules/core/src/convert.avx2.cpp delete mode 100644 modules/core/src/convert.fp16.cpp delete mode 100644 modules/core/src/convert.sse4_1.cpp mode change 100644 => 100755 platforms/ios/build_framework.py diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 56403b3191..b9e5bce222 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -219,15 +219,10 @@ enum CpuFeatures { typedef union Cv16suf { short i; + ushort u; #if CV_FP16_TYPE __fp16 h; #endif - struct _fp16Format - { - unsigned int significand : 10; - unsigned int exponent : 5; - unsigned int sign : 1; - } fmt; } Cv16suf; @@ -236,12 +231,6 @@ typedef union Cv32suf int i; unsigned u; float f; - struct _fp32Format - { - unsigned int significand : 23; - unsigned int exponent : 8; - unsigned int sign : 1; - } fmt; } Cv32suf; @@ -548,6 +537,115 @@ typedef ::uint64_t uint64_t; #include #endif +#ifdef __cplusplus +namespace cv +{ + +class float16_t +{ +public: +#if CV_FP16_TYPE + + float16_t() {} + explicit float16_t(float x) { h = (__fp16)x; } + operator float() const { return (float)h; } + static float16_t fromBits(ushort w) + { + Cv16suf u; + u.u = w; + float16_t result; + result.h = u.h; + return result; + } + static float16_t zero() + { + float16_t result; + result.h = (__fp16)0; + return result; + } + ushort bits() const + { + Cv16suf u; + u.h = h; + return u.u; + } +protected: + __fp16 h; + +#else + float16_t() {} + explicit float16_t(float x) + { + #if CV_AVX2 + __m128 v = _mm_load_ss(&x); + w = (ushort)_mm_cvtsi128_si32(_mm_cvtps_ph(v, 0)); + #else + Cv32suf in; + in.f = x; + unsigned sign = in.u & 0x80000000; + in.u ^= sign; + + if( in.u >= 0x47800000 ) + w = (ushort)(in.u > 0x7f800000 ? 
0x7e00 : 0x7c00); + else + { + if (in.u < 0x38800000) + { + in.f += 0.5f; + w = (ushort)(in.u - 0x3f000000); + } + else + { + unsigned t = in.u + 0xc8000fff; + w = (ushort)((t + ((in.u >> 13) & 1)) >> 13); + } + } + + w = (ushort)(w | (sign >> 16)); + #endif + } + + operator float() const + { + #if CV_AVX2 + float f; + _mm_store_ss(&f, _mm_cvtph_ps(_mm_cvtsi32_si128(w))); + return f; + #else + Cv32suf out; + + unsigned t = ((w & 0x7fff) << 13) + 0x38000000; + unsigned sign = (w & 0x8000) << 16; + unsigned e = w & 0x7c00; + + out.u = t + (1 << 23); + out.u = (e >= 0x7c00 ? t + 0x38000000 : + e == 0 ? (out.f -= 6.103515625e-05f, out.u) : t) | sign; + return out.f; + #endif + } + + static float16_t fromBits(ushort b) + { + float16_t result; + result.w = b; + return result; + } + static float16_t zero() + { + float16_t result; + result.w = (ushort)0; + return result; + } + ushort bits() const { return w; } +protected: + ushort w; + +#endif +}; + +} +#endif //! @} diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 6505f255cb..a321627081 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -252,7 +252,8 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(unsigned, v_uint64, prefix) \ CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, prefix, load) \ CV_INTRIN_DEFINE_WIDE_INTRIN(int64, v_int64, s64, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load) + CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(float16_t, v_float32, prefix) template struct V_RegTraits { @@ -286,9 +287,6 @@ template struct V_RegTraits #if CV_SIMD128_64F CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4); #endif -#if CV_SIMD128_FP16 - CV_DEF_REG_TRAITS(v, v_float16x8, short, f16, v_float16x8, void, void, v_int16x8, v_int16x8); -#endif #endif #if CV_SIMD256 @@ -302,9 +300,6 @@ template struct V_RegTraits CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void); CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void); CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8); -#if CV_SIMD256_FP16 - CV_DEF_REG_TRAITS(v256, v_float16x16, short, f16, v_float16x16, void, void, v_int16x16, void); -#endif #endif #if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512) @@ -335,14 +330,6 @@ namespace CV__SIMD_NAMESPACE { #if CV_SIMD256_64F typedef v_float64x4 v_float64; #endif - #if CV_FP16 - #define vx_load_fp16_f32 v256_load_fp16_f32 - #define vx_store_fp16 v_store_fp16 - #endif - #if CV_SIMD256_FP16 - typedef v_float16x16 v_float16; - CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v256, load_f16) - #endif CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256) CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load) inline void vx_cleanup() { v256_cleanup(); } @@ -353,7 +340,6 @@ using namespace CV__SIMD_NAMESPACE; namespace CV__SIMD_NAMESPACE { #define CV_SIMD CV_SIMD128 #define CV_SIMD_64F CV_SIMD128_64F - #define CV_SIMD_FP16 CV_SIMD128_FP16 #define CV_SIMD_WIDTH 16 typedef v_uint8x16 v_uint8; typedef v_int8x16 v_int8; @@ -367,14 +353,6 @@ namespace CV__SIMD_NAMESPACE { #if CV_SIMD128_64F typedef v_float64x2 v_float64; #endif - #if CV_FP16 - #define vx_load_fp16_f32 v128_load_fp16_f32 - #define vx_store_fp16 
v_store_fp16 - #endif - #if CV_SIMD128_FP16 - typedef v_float16x8 v_float16; - CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v, load_f16) - #endif CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v) #if CV_SIMD128_64F CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index c21b46a58f..a38c25e385 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -1414,10 +1414,17 @@ inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b) { return v_int8x32(_v256_shuffle_odd_64(_mm256_packs_epi16(a.val, b.val))); } inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b) -{ return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val))); } +{ + __m256i t = _mm256_set1_epi16(255); + __m256i a1 = _mm256_min_epu16(a.val, t); + __m256i b1 = _mm256_min_epu16(b.val, t); + return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a1, b1))); +} inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b) -{ return v_pack(v_reinterpret_as_u16(a), v_reinterpret_as_u16(b)); } +{ + return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val))); +} inline void v_pack_store(schar* ptr, const v_int16x16& a) { v_store_low(ptr, v_pack(a, a)); } @@ -2390,6 +2397,18 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, un OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64) OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64) +// FP16 +inline v_float32x8 v256_load_expand(const float16_t* ptr) +{ + return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr))); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x8& a) +{ + __m128i ah = _mm256_cvtps_ph(a.val, 0); + _mm_storeu_si128((__m128i*)ptr, ah); +} + inline void v256_cleanup() { _mm256_zeroupper(); } //! @name Check SIMD256 support diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index ccd317682d..64a457a530 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -2062,6 +2062,28 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]); } +////// FP16 suport /////// + +inline v_reg::nlanes128> +v_load_expand(const float16_t* ptr) +{ + v_reg::nlanes128> v; + for( int i = 0; i < v.nlanes; i++ ) + { + v.s[i] = ptr[i]; + } + return v; +} + +inline void +v_pack_store(float16_t* ptr, v_reg::nlanes128>& v) +{ + for( int i = 0; i < v.nlanes; i++ ) + { + ptr[i] = float16_t(v.s[i]); + } +} + inline void v_cleanup() {} //! 
@} diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index c017b075f1..d87b4e2ba0 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -62,15 +62,6 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN #define CV_SIMD128_64F 0 #endif -#ifndef CV_SIMD128_FP16 -# if CV_FP16 && (defined(__GNUC__) && __GNUC__ >= 5) // #12027: float16x8_t is missing in GCC 4.8.2 -# define CV_SIMD128_FP16 1 -# endif -#endif -#ifndef CV_SIMD128_FP16 -# define CV_SIMD128_FP16 0 -#endif - #if CV_SIMD128_64F #define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \ template static inline \ @@ -329,53 +320,6 @@ inline void v_store_fp16(short* ptr, const v_float32x4& a) } #endif - -#if CV_SIMD128_FP16 -// Workaround for old compilers -static inline int16x8_t vreinterpretq_s16_f16(float16x8_t a) { return (int16x8_t)a; } -static inline float16x8_t vreinterpretq_f16_s16(int16x8_t a) { return (float16x8_t)a; } - -static inline float16x8_t cv_vld1q_f16(const void* ptr) -{ -#ifndef vld1q_f16 // APPLE compiler defines vld1_f16 as macro - return vreinterpretq_f16_s16(vld1q_s16((const short*)ptr)); -#else - return vld1q_f16((const __fp16*)ptr); -#endif -} -static inline void cv_vst1q_f16(void* ptr, float16x8_t a) -{ -#ifndef vst1q_f16 // APPLE compiler defines vst1_f16 as macro - vst1q_s16((short*)ptr, vreinterpretq_s16_f16(a)); -#else - vst1q_f16((__fp16*)ptr, a); -#endif -} - -struct v_float16x8 -{ - typedef short lane_type; - enum { nlanes = 8 }; - - v_float16x8() {} - explicit v_float16x8(float16x8_t v) : val(v) {} - v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) - { - short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; - val = cv_vld1q_f16(v); - } - short get0() const - { - return vgetq_lane_s16(vreinterpretq_s16_f16(val), 0); - } - float16x8_t val; -}; - -inline v_float16x8 v_setzero_f16() { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16((short)0))); } -inline v_float16x8 v_setall_f16(short v) { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16(v))); } - -#endif // CV_SIMD128_FP16 - #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \ inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \ inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \ @@ -934,24 +878,6 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32) OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64) #endif -#if CV_SIMD128_FP16 -// Workaround for old comiplers -inline v_float16x8 v_load_f16(const short* ptr) -{ return v_float16x8(cv_vld1q_f16(ptr)); } -inline v_float16x8 v_load_f16_aligned(const short* ptr) -{ return v_float16x8(cv_vld1q_f16(ptr)); } - -inline v_float16x8 v_load_f16_low(const short* ptr) -{ return v_float16x8(vcombine_f16(cv_vld1_f16(ptr), vdup_n_f16((float16_t)0))); } -inline v_float16x8 v_load_f16_halves(const short* ptr0, const short* ptr1) -{ return v_float16x8(vcombine_f16(cv_vld1_f16(ptr0), cv_vld1_f16(ptr1))); } - -inline void v_store(short* ptr, const v_float16x8& a) -{ cv_vst1q_f16(ptr, a.val); } -inline void v_store_aligned(short* ptr, const v_float16x8& a) -{ cv_vst1q_f16(ptr, a.val); } -#endif - #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \ inline scalartype v_reduce_##func(const _Tpvec& a) \ { \ @@ -1507,22 +1433,6 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) } #endif -#if CV_SIMD128_FP16 -inline 
v_float32x4 v_cvt_f32(const v_float16x8& a) -{ - return v_float32x4(vcvt_f32_f16(vget_low_f16(a.val))); -} -inline v_float32x4 v_cvt_f32_high(const v_float16x8& a) -{ - return v_float32x4(vcvt_f32_f16(vget_high_f16(a.val))); -} - -inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b) -{ - return v_float16x8(vcombine_f16(vcvt_f16_f32(a.val), vcvt_f16_f32(b.val))); -} -#endif - ////////////// Lookup table access //////////////////// inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) @@ -1588,6 +1498,47 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo } #endif +////// FP16 suport /////// +#if CV_FP16 +inline v_float32x4 v_load_expand(const float16_t* ptr) +{ + float16x4_t v = + #ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro + (float16x4_t)vld1_s16((const short*)ptr); + #else + vld1_f16((const __fp16*)ptr); + #endif + return v_float32x4(vcvt_f32_f16(v)); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +{ + float16x4_t hv = vcvt_f16_f32(v.val); + + #ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro + vst1_s16((short*)ptr, (int16x4_t)hv); + #else + vst1_f16((__fp16*)ptr, hv); + #endif +} +#else +inline v_float32x4 v_load_expand(const float16_t* ptr) +{ + const int N = 4; + float buf[N]; + for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i]; + return v_load(buf); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +{ + const int N = 4; + float buf[N]; + v_store(buf, v); + for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]); +} +#endif + inline void v_cleanup() {} //! @name Check SIMD support diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 159ef356b5..29c4f646ec 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -404,7 +404,7 @@ void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a) inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b) { return v_int8x16(_mm_packs_epi16(a.val, b.val)); } -inline void v_pack_store(schar* ptr, v_int16x8& a) +inline void v_pack_store(schar* ptr, const v_int16x8& a) { _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); } template inline @@ -2655,6 +2655,50 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo y = v_float64x2(_mm_unpackhi_pd(xy0, xy1)); } + +////////////// FP16 support /////////////////////////// + +inline v_float32x4 v_load_expand(const float16_t* ptr) +{ + const __m128i z = _mm_setzero_si128(), delta = _mm_set1_epi32(0x38000000); + const __m128i signmask = _mm_set1_epi32(0x80000000), maxexp = _mm_set1_epi32(0x7c000000); + const __m128 deltaf = _mm_castsi128_ps(_mm_set1_epi32(0x38800000)); + __m128i bits = _mm_unpacklo_epi16(z, _mm_loadl_epi64((const __m128i*)ptr)); // h << 16 + __m128i e = _mm_and_si128(bits, maxexp), sign = _mm_and_si128(bits, signmask); + __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_xor_si128(bits, sign), 3), delta); // ((h & 0x7fff) << 13) + delta + __m128i zt = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_add_epi32(t, _mm_set1_epi32(1 << 23))), deltaf)); + + t = _mm_add_epi32(t, _mm_and_si128(delta, _mm_cmpeq_epi32(maxexp, e))); + __m128i zmask = _mm_cmpeq_epi32(e, z); + __m128i ft = v_select_si128(zmask, zt, t); + return v_float32x4(_mm_castsi128_ps(_mm_or_si128(ft, sign))); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +{ + const __m128i signmask = 
_mm_set1_epi32(0x80000000); + const __m128i rval = _mm_set1_epi32(0x3f000000); + + __m128i t = _mm_castps_si128(v.val); + __m128i sign = _mm_srai_epi32(_mm_and_si128(t, signmask), 16); + t = _mm_andnot_si128(signmask, t); + + __m128i finitemask = _mm_cmpgt_epi32(_mm_set1_epi32(0x47800000), t); + __m128i isnan = _mm_cmpgt_epi32(t, _mm_set1_epi32(0x7f800000)); + __m128i naninf = v_select_si128(isnan, _mm_set1_epi32(0x7e00), _mm_set1_epi32(0x7c00)); + __m128i tinymask = _mm_cmpgt_epi32(_mm_set1_epi32(0x38800000), t); + __m128i tt = _mm_castps_si128(_mm_add_ps(_mm_castsi128_ps(t), _mm_castsi128_ps(rval))); + tt = _mm_sub_epi32(tt, rval); + __m128i odd = _mm_and_si128(_mm_srli_epi32(t, 13), _mm_set1_epi32(1)); + __m128i nt = _mm_add_epi32(t, _mm_set1_epi32(0xc8000fff)); + nt = _mm_srli_epi32(_mm_add_epi32(nt, odd), 13); + t = v_select_si128(tinymask, tt, nt); + t = v_select_si128(finitemask, t, naninf); + t = _mm_or_si128(t, sign); + t = _mm_packs_epi32(t, t); + _mm_storel_epi64((__m128i*)ptr, t); +} + inline void v_cleanup() {} //! @name Check SIMD support diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index a45e7a875f..fb81986f6c 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -916,6 +916,24 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]); } +/////// FP16 support //////// + +// [TODO] implement these 2 using VSX or universal intrinsics (copy from intrin_sse.cpp and adopt) +inline v_float32x4 v_load_expand(const float16_t* ptr) +{ + return v_float32x4((float)ptr[0], (float)ptr[1], (float)ptr[2], (float)ptr[3]); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +{ + float CV_DECL_ALIGNED(32) f[4]; + v_store_aligned(f, v); + ptr[0] = float16_t(f[0]); + ptr[1] = float16_t(f[1]); + ptr[2] = float16_t(f[2]); + ptr[3] = float16_t(f[3]); +} + inline void v_cleanup() {} diff --git a/modules/core/perf/perf_addWeighted.cpp b/modules/core/perf/perf_addWeighted.cpp index 15daced72e..2822bc61e7 100644 --- a/modules/core/perf/perf_addWeighted.cpp +++ b/modules/core/perf/perf_addWeighted.cpp @@ -11,6 +11,7 @@ PERF_TEST_P(Size_MatType, addWeighted, TYPICAL_MATS_ADWEIGHTED) { Size size = get<0>(GetParam()); int type = get<1>(GetParam()); + int depth = CV_MAT_DEPTH(type); Mat src1(size, type); Mat src2(size, type); double alpha = 3.75; @@ -21,7 +22,7 @@ PERF_TEST_P(Size_MatType, addWeighted, TYPICAL_MATS_ADWEIGHTED) declare.in(src1, src2, dst, WARMUP_RNG).out(dst); - if (CV_MAT_DEPTH(type) == CV_32S) + if (depth == CV_32S) { // there might be not enough precision for integers src1 /= 2048; @@ -30,7 +31,7 @@ PERF_TEST_P(Size_MatType, addWeighted, TYPICAL_MATS_ADWEIGHTED) TEST_CYCLE() cv::addWeighted( src1, alpha, src2, beta, gamma, dst, dst.type() ); - SANITY_CHECK(dst, 1); + SANITY_CHECK(dst, depth == CV_32S ? 4 : 1); } } // namespace diff --git a/modules/core/perf/perf_convertTo.cpp b/modules/core/perf/perf_convertTo.cpp index c6c157e704..344d81cb8a 100644 --- a/modules/core/perf/perf_convertTo.cpp +++ b/modules/core/perf/perf_convertTo.cpp @@ -33,7 +33,7 @@ PERF_TEST_P( Size_DepthSrc_DepthDst_Channels_alpha, convertTo, int runs = (sz.width <= 640) ? 8 : 1; TEST_CYCLE_MULTIRUN(runs) src.convertTo(dst, depthDst, alpha); - double eps = depthSrc <= CV_32S ? 
1e-12 : (FLT_EPSILON * maxValue); + double eps = depthSrc <= CV_32S && (depthDst <= CV_32S || depthDst == CV_64F) ? 1e-12 : (FLT_EPSILON * maxValue); eps = eps * std::max(1.0, fabs(alpha)); SANITY_CHECK(dst, eps); } diff --git a/modules/core/src/convert.avx2.cpp b/modules/core/src/convert.avx2.cpp deleted file mode 100644 index b724cbbf1e..0000000000 --- a/modules/core/src/convert.avx2.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html - - -#include "precomp.hpp" -#include "convert.hpp" - -namespace cv -{ -namespace opt_AVX2 -{ - -void cvtScale_s16s32f32Line_AVX2(const short* src, int* dst, float scale, float shift, int width) -{ - int x = 0; - - __m256 scale256 = _mm256_set1_ps(scale); - __m256 shift256 = _mm256_set1_ps(shift); - const int shuffle = 0xD8; - - for (; x <= width - 16; x += 16) - { - __m256i v_src = _mm256_loadu_si256((const __m256i *)(src + x)); - v_src = _mm256_permute4x64_epi64(v_src, shuffle); - __m256i v_src_lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16); - __m256i v_src_hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16); - __m256 v_dst0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_lo), scale256), shift256); - __m256 v_dst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_hi), scale256), shift256); - _mm256_storeu_si256((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0)); - _mm256_storeu_si256((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1)); - } - - for (; x < width; x++) - dst[x] = saturate_cast(src[x] * scale + shift); -} - -} -} // cv:: -/* End of file. */ diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 75b4967194..a54f4c1bcd 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -2,1093 +2,242 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html - #include "precomp.hpp" #include "opencl_kernels_core.hpp" #include "convert.hpp" -#include "opencv2/core/openvx/ovx_defs.hpp" namespace cv { -template -struct Cvt_SIMD -{ - int operator() (const T *, DT *, int) const - { - return 0; - } -}; +/*namespace hal { -#if CV_SIMD128 -// from uchar - -template <> -struct Cvt_SIMD +void cvt16f32f( const float16_t* src, float* dst, int len ) { - int operator() (const uchar * src, schar * dst, int width) const + int j = 0; +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + for( ; j < len; j += VECSZ ) { - int x = 0; - if (hasSIMD128()) + if( j > len - VECSZ ) { - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_int16x8 v_src = v_reinterpret_as_s16(v_load_expand(src + x)); - v_store_low(dst + x, v_pack(v_src, v_src)); - } + if( j == 0 ) + break; + j = len - VECSZ; } - return x; + v_store(dst + j, vx_load_expand(src + j)); } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const uchar * src, ushort * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_store(dst + x, v_load_expand(src + x)); - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const uchar * src, short * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_int16x8 v_src = 
v_reinterpret_as_s16(v_load_expand(src + x)); - v_store(dst + x, v_src); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const uchar * src, int * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint16x8 v_src = v_load_expand(src + x); - v_uint32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_reinterpret_as_s32(v_src1)); - v_store(dst + x + cWidth, v_reinterpret_as_s32(v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const uchar * src, float * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint16x8 v_src = v_load_expand(src + x); - v_uint32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_cvt_f32(v_reinterpret_as_s32(v_src1))); - v_store(dst + x + cWidth, v_cvt_f32(v_reinterpret_as_s32(v_src2))); - } - } - return x; - } -}; - -// from schar - -template <> -struct Cvt_SIMD -{ - int operator() (const schar * src, uchar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_pack_u_store(dst + x, v_load_expand(src + x)); - } - - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const schar * src, short * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_store(dst + x, v_load_expand(src + x)); - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const schar * src, ushort * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_int16x8 v_src = v_load_expand(src + x); - v_int32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_pack_u(v_src1, v_src2)); - } - } - return x; - } -}; - - -template <> -struct Cvt_SIMD -{ - int operator() (const schar * src, int * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int16x8 v_src = v_load_expand(src + x); - v_int32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_src1); - v_store(dst + x + cWidth, v_src2); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const schar * src, float * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int16x8 v_src = v_load_expand(src + x); - v_int32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_cvt_f32(v_src1)); - v_store(dst + x + cWidth, v_cvt_f32(v_src2)); - } - } - return x; - } -}; - -// from ushort - -template <> -struct Cvt_SIMD -{ - int operator() (const ushort * src, uchar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_store(dst + x, v_pack(v_src1, v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const ushort * src, schar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_uint16x8::nlanes; - 
for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_uint32x4 v_dst10, v_dst11, v_dst20, v_dst21; - v_expand(v_src1, v_dst10, v_dst11); - v_expand(v_src2, v_dst20, v_dst21); - - v_store(dst + x, v_pack( - v_pack(v_reinterpret_as_s32(v_dst10), v_reinterpret_as_s32(v_dst11)), - v_pack(v_reinterpret_as_s32(v_dst20), v_reinterpret_as_s32(v_dst21)))); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const ushort * src, short * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_dst0, v_dst1; - v_expand(v_src, v_dst0, v_dst1); - v_store(dst + x, v_pack(v_reinterpret_as_s32(v_dst0), v_reinterpret_as_s32(v_dst1))); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const ushort * src, int * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_reinterpret_as_s32(v_src1)); - v_store(dst + x + cWidth, v_reinterpret_as_s32(v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const ushort * src, float * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_cvt_f32(v_reinterpret_as_s32(v_src1))); - v_store(dst + x + cWidth, v_cvt_f32(v_reinterpret_as_s32(v_src2))); - } - } - return x; - } -}; - - -// from short - -template <> -struct Cvt_SIMD -{ - int operator() (const short * src, uchar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_store(dst + x, v_pack_u(v_src1, v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const short * src, schar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_store(dst + x, v_pack(v_src1, v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const short * src, ushort * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_int16x8 v_src = v_load(src + x); - v_int32x4 v_dst1, v_dst2; - v_expand(v_src, v_dst1, v_dst2); - v_store(dst + x, v_pack_u(v_dst1, v_dst2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const short * src, int * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int16x8 v_src = v_load(src + x); - v_int32x4 v_dst1, v_dst2; - v_expand(v_src, v_dst1, v_dst2); - v_store(dst + x, v_dst1); - v_store(dst + x + cWidth, v_dst2); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const short * src, float * dst, int 
width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int16x8 v_src = v_load(src + x); - v_int32x4 v_dst1, v_dst2; - v_expand(v_src, v_dst1, v_dst2); - v_store(dst + x, v_cvt_f32(v_dst1)); - v_store(dst + x + cWidth, v_cvt_f32(v_dst2)); - } - } - return x; - } -}; - -// from int - -template <> -struct Cvt_SIMD -{ - int operator() (const int * src, uchar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_int32x4 v_src3 = v_load(src + x + cWidth * 2), v_src4 = v_load(src + x + cWidth * 3); - v_int16x8 v_dst1 = v_pack(v_src1, v_src2); - v_int16x8 v_dst2 = v_pack(v_src3, v_src4); - v_store(dst + x, v_pack_u(v_dst1, v_dst2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const int * src, schar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_int32x4 v_src3 = v_load(src + x + cWidth * 2), v_src4 = v_load(src + x + cWidth * 3); - v_int16x8 v_dst1 = v_pack(v_src1, v_src2); - v_int16x8 v_dst2 = v_pack(v_src3, v_src4); - v_store(dst + x, v_pack(v_dst1, v_dst2)); - } - } - return x; - } -}; - - -template <> -struct Cvt_SIMD -{ - int operator() (const int * src, ushort * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_store(dst + x, v_pack_u(v_src1, v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const int * src, short * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_store(dst + x, v_pack(v_src1, v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const int * src, float * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_store(dst + x, v_cvt_f32(v_load(src + x))); - } - return x; - } -}; - -// from float - -template <> -struct Cvt_SIMD -{ - int operator() (const float * src, uchar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int32x4 v_src1 = v_round(v_load(src + x)); - v_int32x4 v_src2 = v_round(v_load(src + x + cWidth)); - v_int32x4 v_src3 = v_round(v_load(src + x + cWidth * 2)); - v_int32x4 v_src4 = v_round(v_load(src + x + cWidth * 3)); - v_uint16x8 v_dst1 = v_pack_u(v_src1, v_src2); - v_uint16x8 v_dst2 = v_pack_u(v_src3, v_src4); - v_store(dst + x, v_pack(v_dst1, v_dst2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const float * src, schar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int32x4 v_src1 = v_round(v_load(src + x)); - v_int32x4 v_src2 = v_round(v_load(src + x + cWidth)); - v_int32x4 v_src3 = v_round(v_load(src + x + 
cWidth * 2)); - v_int32x4 v_src4 = v_round(v_load(src + x + cWidth * 3)); - v_int16x8 v_dst1 = v_pack(v_src1, v_src2); - v_int16x8 v_dst2 = v_pack(v_src3, v_src4); - v_store(dst + x, v_pack(v_dst1, v_dst2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const float * src, ushort * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src1 = v_round(v_load(src + x)); - v_int32x4 v_src2 = v_round(v_load(src + x + cWidth)); - v_store(dst + x, v_pack_u(v_src1, v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const float * src, short * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src1 = v_round(v_load(src + x)); - v_int32x4 v_src2 = v_round(v_load(src + x + cWidth)); - v_store(dst + x, v_pack(v_src1, v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const float * src, int * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_store(dst + x, v_round(v_load(src + x))); - } - return x; - } -}; -#if CV_SIMD128_64F -// from double - -template <> -struct Cvt_SIMD -{ - int operator() (const double * src, uchar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); - v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); - v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2)); - v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3)); - - v_src0 = v_combine_low(v_src0, v_src1); - v_src1 = v_combine_low(v_src2, v_src3); - - v_int16x8 v_dst = v_pack(v_round(v_src0), v_round(v_src1)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const double * src, schar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); - v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); - v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2)); - v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3)); - - v_src0 = v_combine_low(v_src0, v_src1); - v_src1 = v_combine_low(v_src2, v_src3); - - v_int16x8 v_dst = v_pack(v_round(v_src0), v_round(v_src1)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const double * src, ushort * dst, int width) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::Cvt_SIMD_f64u16_SSE41(src, dst, width); #endif - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); - v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); - v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2)); - v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3)); + for( ; j < len; j++ ) + dst[j] = (float)src[j]; +} - v_src0 = v_combine_low(v_src0, v_src1); - v_src1 = v_combine_low(v_src2, v_src3); - - v_uint16x8 v_dst = v_pack_u(v_round(v_src0), 
v_round(v_src1)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD +void cvt32f16f( const float* src, float16_t* dst, int len ) { - int operator() (const double * src, short * dst, int width) const + int j = 0; +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + for( ; j < len; j += VECSZ ) { - int x = 0; - if (hasSIMD128()) + if( j > len - VECSZ ) { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); - v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); - v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2)); - v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3)); - - v_src0 = v_combine_low(v_src0, v_src1); - v_src1 = v_combine_low(v_src2, v_src3); - - v_int16x8 v_dst = v_pack(v_round(v_src0), v_round(v_src1)); - v_store(dst + x, v_dst); - } + if( j == 0 ) + break; + j = len - VECSZ; } - return x; + v_pack_store(dst + j, vx_load(src + j)); } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const double * src, int * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src0 = v_round(v_load(src + x)); - v_int32x4 v_src1 = v_round(v_load(src + x + cWidth)); - - v_store(dst + x, v_combine_low(v_src0, v_src1)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const double * src, float * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); - v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); - - v_store(dst + x, v_combine_low(v_src0, v_src1)); - } - } - return x; - } -}; - -// to double - -template <> -struct Cvt_SIMD -{ - int operator() (const uchar* src, double* dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_uint16x8 v_src = v_load_expand(src + x); - v_uint32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_cvt_f64(v_reinterpret_as_s32(v_src1))); - v_store(dst + x + cWidth, v_cvt_f64_high(v_reinterpret_as_s32(v_src1))); - v_store(dst + x + cWidth * 2, v_cvt_f64(v_reinterpret_as_s32(v_src2))); - v_store(dst + x + cWidth * 3, v_cvt_f64_high(v_reinterpret_as_s32(v_src2))); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const schar* src, double* dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int16x8 v_src = v_load_expand(src + x); - v_int32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_cvt_f64(v_src1)); - v_store(dst + x + cWidth, v_cvt_f64_high(v_src1)); - v_store(dst + x + cWidth * 2, v_cvt_f64(v_src2)); - v_store(dst + x + cWidth * 3, v_cvt_f64_high(v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const ushort* src, double* dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint32x4 v_src = v_load_expand(src + x); - - v_store(dst + x, v_cvt_f64(v_reinterpret_as_s32(v_src))); - v_store(dst + x + cWidth, v_cvt_f64_high(v_reinterpret_as_s32(v_src))); - } - } - return x; - } -}; - -template 
<> -struct Cvt_SIMD -{ - int operator() (const short* src, double* dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src = v_load_expand(src + x); - - v_store(dst + x, v_cvt_f64(v_src)); - v_store(dst + x + cWidth, v_cvt_f64_high(v_src)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const int* src, double* dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src = v_load(src + x); - - v_store(dst + x, v_cvt_f64(v_src)); - v_store(dst + x + cWidth, v_cvt_f64_high(v_src)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const float* src, double* dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src = v_load(src + x); - - v_store(dst + x, v_cvt_f64(v_src)); - v_store(dst + x + cWidth, v_cvt_f64_high(v_src)); - } - } - return x; - } -}; -#endif // CV_SIMD128_64F -#endif // CV_SIMD128 - - -#ifdef HAVE_OPENVX - -template -static bool _openvx_cvt(const T* src, size_t sstep, - DT* dst, size_t dstep, Size continuousSize) -{ - using namespace ivx; - - if(!(continuousSize.width > 0 && continuousSize.height > 0)) - { - return true; - } - - //.height is for number of continuous pieces - //.width is for length of one piece - Size imgSize = continuousSize; - if(continuousSize.height == 1) - { - if(sstep / sizeof(T) == dstep / sizeof(DT) && sstep / sizeof(T) > 0 && - continuousSize.width % (sstep / sizeof(T)) == 0) - { - //continuous n-lines image - imgSize.width = sstep / sizeof(T); - imgSize.height = continuousSize.width / (sstep / sizeof(T)); - } - else - { - //1-row image with possibly incorrect step - sstep = continuousSize.width * sizeof(T); - dstep = continuousSize.width * sizeof(DT); - } - } - - int srcType = DataType::type, dstType = DataType
::type; - - if (ovx::skipSmallImages(imgSize.width, imgSize.height)) - return false; - - try - { - Context context = ovx::getOpenVXContext(); - - // Other conversions are marked as "experimental" - if(context.vendorID() == VX_ID_KHRONOS && - !(srcType == CV_8U && dstType == CV_16S) && - !(srcType == CV_16S && dstType == CV_8U)) - { - return false; - } - - Image srcImage = Image::createFromHandle(context, Image::matTypeToFormat(srcType), - Image::createAddressing(imgSize.width, imgSize.height, - (vx_uint32)sizeof(T), (vx_uint32)sstep), - (void*)src); - Image dstImage = Image::createFromHandle(context, Image::matTypeToFormat(dstType), - Image::createAddressing(imgSize.width, imgSize.height, - (vx_uint32)sizeof(DT), (vx_uint32)dstep), - (void*)dst); - - IVX_CHECK_STATUS(vxuConvertDepth(context, srcImage, dstImage, VX_CONVERT_POLICY_SATURATE, 0)); - -#ifdef VX_VERSION_1_1 - //we should take user memory back before release - //(it's not done automatically according to standard) - srcImage.swapHandle(); dstImage.swapHandle(); #endif - } - catch (RuntimeError & e) - { - VX_DbgThrow(e.what()); - } - catch (WrapperError & e) - { - VX_DbgThrow(e.what()); - } - - return true; + for( ; j < len; j++ ) + dst[j] = float16_t(src[j]); } -template -static bool openvx_cvt(const T* src, size_t sstep, - DT* dst, size_t dstep, Size size) +/*void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len ) { - (void)src; (void)sstep; (void)dst; (void)dstep; (void)size; - return false; + // the loop is simple enough, so we let the compiler to vectorize it + for( int i = 0; i < len; i++ ) + arr[i] = scaleBiasPairs[i*2 + 1]; } -#define DEFINE_OVX_CVT_SPECIALIZATION(T, DT) \ -template<> \ -bool openvx_cvt(const T *src, size_t sstep, DT *dst, size_t dstep, Size size) \ -{ \ - return _openvx_cvt(src, sstep, dst, dstep, size); \ -} - -DEFINE_OVX_CVT_SPECIALIZATION(uchar, ushort) -DEFINE_OVX_CVT_SPECIALIZATION(uchar, short) -DEFINE_OVX_CVT_SPECIALIZATION(uchar, int) -DEFINE_OVX_CVT_SPECIALIZATION(ushort, uchar) -DEFINE_OVX_CVT_SPECIALIZATION(ushort, int) -DEFINE_OVX_CVT_SPECIALIZATION(short, uchar) -DEFINE_OVX_CVT_SPECIALIZATION(short, int) -DEFINE_OVX_CVT_SPECIALIZATION(int, uchar) -DEFINE_OVX_CVT_SPECIALIZATION(int, ushort) -DEFINE_OVX_CVT_SPECIALIZATION(int, short) - -#endif - -template static void -cvt_( const T* src, size_t sstep, - DT* dst, size_t dstep, Size size ) +void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len ) { - CV_OVX_RUN( - true, - openvx_cvt(src, sstep, dst, dstep, size) - ) - - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - Cvt_SIMD vop; - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = vop(src, dst, size.width); - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) - { - DT t0, t1; - t0 = saturate_cast
(src[x]); - t1 = saturate_cast
(src[x+1]); - dst[x] = t0; dst[x+1] = t1; - t0 = saturate_cast
(src[x+2]); - t1 = saturate_cast
(src[x+3]); - dst[x+2] = t0; dst[x+3] = t1; - } - #endif - for( ; x < size.width; x++ ) - dst[x] = saturate_cast
(src[x]); - } + // the loop is simple enough, so we let the compiler to vectorize it + for( int i = 0; i < len; i++ ) + arr[i] = scaleBiasPairs[i*2 + 1]; } -template static void -cpy_( const T* src, size_t sstep, T* dst, size_t dstep, Size size ) +}*/ + +template inline void +cvt_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size ) { sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); - for( ; size.height--; src += sstep, dst += dstep ) - memcpy(dst, src, size.width*sizeof(src[0])); + for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) + { + int j = 0; +#if CV_SIMD + const int VECSZ = _Twvec::nlanes*2; + for( ; j < size.width; j += VECSZ ) + { + if( j > size.width - VECSZ ) + { + if( j == 0 || src == (_Ts*)dst ) + break; + j = size.width - VECSZ; + } + _Twvec v0, v1; + vx_load_pair_as(src + j, v0, v1); + v_store_pair_as(dst + j, v0, v1); + } +#endif + for( ; j < size.width; j++ ) + dst[j] = saturate_cast<_Td>(src[j]); + } } +// in order to reduce the code size, for (16f <-> ...) conversions +// we add a conversion function without loop unrolling +template inline void +cvt1_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size ) +{ + sstep /= sizeof(src[0]); + dstep /= sizeof(dst[0]); + for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) + { + int j = 0; +#if CV_SIMD + const int VECSZ = _Twvec::nlanes; + for( ; j < size.width; j += VECSZ ) + { + if( j > size.width - VECSZ ) + { + if( j == 0 || src == (_Ts*)dst ) + break; + j = size.width - VECSZ; + } + _Twvec v; + vx_load_as(src + j, v); + v_store_as(dst + j, v); + } + vx_cleanup(); +#endif + for( ; j < size.width; j++ ) + dst[j] = saturate_cast<_Td>(src[j]); + } +} + +static void cvtCopy( const uchar* src, size_t sstep, + uchar* dst, size_t dstep, Size size, size_t elemsize) +{ + size_t len = size.width*elemsize; + for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) + { + memcpy( dst, src, len ); + } +} + +#define DEF_CVT_FUNC(suffix, cvtfunc, _Ts, _Td, _Twvec) \ +static void cvt##suffix(const _Ts* src, size_t sstep, uchar*, size_t, \ + _Td* dst, size_t dstep, Size size, void*) \ +{ cvtfunc<_Ts, _Td, _Twvec>(src, sstep, dst, dstep, size); } + +////////////////////// 8u -> ... //////////////////////// + +DEF_CVT_FUNC(8u8s, cvt_, uchar, schar, v_int16) +DEF_CVT_FUNC(8u16u, cvt_, uchar, ushort, v_uint16) +DEF_CVT_FUNC(8u16s, cvt_, uchar, short, v_int16) +DEF_CVT_FUNC(8u32s, cvt_, uchar, int, v_int32) +DEF_CVT_FUNC(8u32f, cvt_, uchar, float, v_float32) +DEF_CVT_FUNC(8u64f, cvt_, uchar, double, v_int32) +//DEF_CVT_FUNC(8u16f, cvt1_, uchar, float16_t, v_float32) + +////////////////////// 8s -> ... //////////////////////// + +DEF_CVT_FUNC(8s8u, cvt_, schar, uchar, v_int16) +DEF_CVT_FUNC(8s16u, cvt_, schar, ushort, v_uint16) +DEF_CVT_FUNC(8s16s, cvt_, schar, short, v_int16) +DEF_CVT_FUNC(8s32s, cvt_, schar, int, v_int32) +DEF_CVT_FUNC(8s32f, cvt_, schar, float, v_float32) +DEF_CVT_FUNC(8s64f, cvt_, schar, double, v_int32) +//DEF_CVT_FUNC(8s16f, cvt1_, schar, float16_t, v_float32) + +////////////////////// 16u -> ... //////////////////////// + +DEF_CVT_FUNC(16u8u, cvt_, ushort, uchar, v_uint16) +DEF_CVT_FUNC(16u8s, cvt_, ushort, schar, v_uint16) +DEF_CVT_FUNC(16u16s, cvt_, ushort, short, v_int32) +DEF_CVT_FUNC(16u32s, cvt_, ushort, int, v_int32) +DEF_CVT_FUNC(16u32f, cvt_, ushort, float, v_float32) +DEF_CVT_FUNC(16u64f, cvt_, ushort, double, v_int32) +//DEF_CVT_FUNC(16u16f, cvt1_,ushort, float16_t, v_float32) + +////////////////////// 16s -> ... 
//////////////////////// + +DEF_CVT_FUNC(16s8u, cvt_, short, uchar, v_int16) +DEF_CVT_FUNC(16s8s, cvt_, short, schar, v_int16) +DEF_CVT_FUNC(16s16u, cvt_, short, ushort, v_int32) +DEF_CVT_FUNC(16s32s, cvt_, short, int, v_int32) +DEF_CVT_FUNC(16s32f, cvt_, short, float, v_float32) +DEF_CVT_FUNC(16s64f, cvt_, short, double, v_int32) +//DEF_CVT_FUNC(16s16f, cvt1_,short, float16_t, v_float32) + +////////////////////// 32s -> ... //////////////////////// + +DEF_CVT_FUNC(32s8u, cvt_, int, uchar, v_int32) +DEF_CVT_FUNC(32s8s, cvt_, int, schar, v_int32) +DEF_CVT_FUNC(32s16u, cvt_, int, ushort, v_int32) +DEF_CVT_FUNC(32s16s, cvt_, int, short, v_int32) +DEF_CVT_FUNC(32s32f, cvt_, int, float, v_float32) +DEF_CVT_FUNC(32s64f, cvt_, int, double, v_int32) +//DEF_CVT_FUNC(32s16f, cvt1_,int, float16_t, v_float32) + +////////////////////// 32f -> ... //////////////////////// + +DEF_CVT_FUNC(32f8u, cvt_, float, uchar, v_float32) +DEF_CVT_FUNC(32f8s, cvt_, float, schar, v_float32) +DEF_CVT_FUNC(32f16u, cvt_, float, ushort, v_float32) +DEF_CVT_FUNC(32f16s, cvt_, float, short, v_float32) +DEF_CVT_FUNC(32f32s, cvt_, float, int, v_float32) +DEF_CVT_FUNC(32f64f, cvt_, float, double, v_float32) +DEF_CVT_FUNC(32f16f, cvt1_,float, float16_t, v_float32) + +////////////////////// 64f -> ... //////////////////////// + +DEF_CVT_FUNC(64f8u, cvt_, double, uchar, v_int32) +DEF_CVT_FUNC(64f8s, cvt_, double, schar, v_int32) +DEF_CVT_FUNC(64f16u, cvt_, double, ushort, v_int32) +DEF_CVT_FUNC(64f16s, cvt_, double, short, v_int32) +DEF_CVT_FUNC(64f32s, cvt_, double, int, v_int32) +DEF_CVT_FUNC(64f32f, cvt_, double, float, v_float32) +//DEF_CVT_FUNC(64f16f, cvt1_,double, float16_t, v_float32) + +////////////////////// 16f -> ... //////////////////////// + +//DEF_CVT_FUNC(16f8u, cvt_, float16_t, uchar, v_float32) +//DEF_CVT_FUNC(16f8s, cvt_, float16_t, schar, v_float32) +//DEF_CVT_FUNC(16f16u, cvt1_, float16_t, ushort, v_float32) +//DEF_CVT_FUNC(16f16s, cvt1_, float16_t, short, v_float32) +//DEF_CVT_FUNC(16f32s, cvt1_, float16_t, int, v_float32) +DEF_CVT_FUNC(16f32f, cvt1_, float16_t, float, v_float32) +//DEF_CVT_FUNC(16f64f, cvt1_, float16_t, double, v_float32) + +///////////// "conversion" w/o conversion /////////////// + +static void cvt8u(const uchar* src, size_t sstep, uchar*, size_t, uchar* dst, size_t dstep, Size size, void*) +{ cvtCopy(src, sstep, dst, dstep, size, 1); } + +static void cvt16u(const ushort* src, size_t sstep, uchar*, size_t, ushort* dst, size_t dstep, Size size, void*) +{ cvtCopy((const uchar*)src, sstep, (uchar*)dst, dstep, size, 2); } + +static void cvt32s(const int* src, size_t sstep, uchar*, size_t, int* dst, size_t dstep, Size size, void*) +{ cvtCopy((const uchar*)src, sstep, (uchar*)dst, dstep, size, 4); } + +static void cvt64s(const int64* src, size_t sstep, uchar*, size_t, int64* dst, size_t dstep, Size size, void*) +{ cvtCopy((const uchar*)src, sstep, (uchar*)dst, dstep, size, 8); } + + +/* [TODO] Recover IPP calls #if defined(HAVE_IPP) #define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \ static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ @@ -1129,7 +278,6 @@ static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ cpy_(src, sstep, dst, dstep, size); \ } - DEF_CPY_FUNC(8u, uchar) DEF_CVT_FUNC_F(8s8u, schar, uchar, 8s8u_C1Rs) DEF_CVT_FUNC_F(16u8u, ushort, uchar, 16u8u_C1R) @@ -1182,7 +330,7 @@ DEF_CVT_FUNC(16s64f, short, double) DEF_CVT_FUNC(32s64f, int, double) DEF_CVT_FUNC(32f64f, float, double) DEF_CPY_FUNC(64s, int64) - +*/ BinaryFunc 
getConvertFunc(int sdepth, int ddepth) { @@ -1191,114 +339,78 @@ BinaryFunc getConvertFunc(int sdepth, int ddepth) { (BinaryFunc)(cvt8u), (BinaryFunc)GET_OPTIMIZED(cvt8s8u), (BinaryFunc)GET_OPTIMIZED(cvt16u8u), (BinaryFunc)GET_OPTIMIZED(cvt16s8u), (BinaryFunc)GET_OPTIMIZED(cvt32s8u), (BinaryFunc)GET_OPTIMIZED(cvt32f8u), - (BinaryFunc)GET_OPTIMIZED(cvt64f8u), 0 + (BinaryFunc)GET_OPTIMIZED(cvt64f8u), 0 //(BinaryFunc)(cvt16f8u) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u8s), (BinaryFunc)cvt8u, (BinaryFunc)GET_OPTIMIZED(cvt16u8s), (BinaryFunc)GET_OPTIMIZED(cvt16s8s), (BinaryFunc)GET_OPTIMIZED(cvt32s8s), (BinaryFunc)GET_OPTIMIZED(cvt32f8s), - (BinaryFunc)GET_OPTIMIZED(cvt64f8s), 0 + (BinaryFunc)GET_OPTIMIZED(cvt64f8s), 0 //(BinaryFunc)(cvt16f8s) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u16u), (BinaryFunc)GET_OPTIMIZED(cvt8s16u), (BinaryFunc)cvt16u, (BinaryFunc)GET_OPTIMIZED(cvt16s16u), (BinaryFunc)GET_OPTIMIZED(cvt32s16u), (BinaryFunc)GET_OPTIMIZED(cvt32f16u), - (BinaryFunc)GET_OPTIMIZED(cvt64f16u), 0 + (BinaryFunc)GET_OPTIMIZED(cvt64f16u), 0 //(BinaryFunc)(cvt16f16u) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u16s), (BinaryFunc)GET_OPTIMIZED(cvt8s16s), (BinaryFunc)GET_OPTIMIZED(cvt16u16s), (BinaryFunc)cvt16u, (BinaryFunc)GET_OPTIMIZED(cvt32s16s), (BinaryFunc)GET_OPTIMIZED(cvt32f16s), - (BinaryFunc)GET_OPTIMIZED(cvt64f16s), 0 + (BinaryFunc)GET_OPTIMIZED(cvt64f16s), 0 //(BinaryFunc)(cvt16f16s) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u32s), (BinaryFunc)GET_OPTIMIZED(cvt8s32s), (BinaryFunc)GET_OPTIMIZED(cvt16u32s), (BinaryFunc)GET_OPTIMIZED(cvt16s32s), (BinaryFunc)cvt32s, (BinaryFunc)GET_OPTIMIZED(cvt32f32s), - (BinaryFunc)GET_OPTIMIZED(cvt64f32s), 0 + (BinaryFunc)GET_OPTIMIZED(cvt64f32s), 0 //(BinaryFunc)(cvt16f32s) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u32f), (BinaryFunc)GET_OPTIMIZED(cvt8s32f), (BinaryFunc)GET_OPTIMIZED(cvt16u32f), (BinaryFunc)GET_OPTIMIZED(cvt16s32f), (BinaryFunc)GET_OPTIMIZED(cvt32s32f), (BinaryFunc)cvt32s, - (BinaryFunc)GET_OPTIMIZED(cvt64f32f), 0 + (BinaryFunc)GET_OPTIMIZED(cvt64f32f), 0 //(BinaryFunc)(cvt16f32f) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u64f), (BinaryFunc)GET_OPTIMIZED(cvt8s64f), (BinaryFunc)GET_OPTIMIZED(cvt16u64f), (BinaryFunc)GET_OPTIMIZED(cvt16s64f), (BinaryFunc)GET_OPTIMIZED(cvt32s64f), (BinaryFunc)GET_OPTIMIZED(cvt32f64f), - (BinaryFunc)(cvt64s), 0 + (BinaryFunc)(cvt64s), 0 //(BinaryFunc)(cvt16f64f) }, { 0, 0, 0, 0, 0, 0, 0, 0 + //(BinaryFunc)(cvt8u16f), (BinaryFunc)(cvt8s16f), (BinaryFunc)(cvt16u16f), (BinaryFunc)(cvt16s16f), + //(BinaryFunc)(cvt32s16f), (BinaryFunc)(cvt32f16f), (BinaryFunc)(cvt64f16f), (BinaryFunc)(cvt16u) } }; - return cvtTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)]; } -} // cv:: - -#ifdef HAVE_IPP -namespace cv +#ifdef HAVE_OPENCL +static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int sdepth, int ddepth ) { -static bool ipp_convertTo(Mat &src, Mat &dst, double alpha, double beta) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP() + int type = _src.type(), cn = CV_MAT_CN(type); - IppDataType srcDepth = ippiGetDataType(src.depth()); - IppDataType dstDepth = ippiGetDataType(dst.depth()); - int channels = src.channels(); - - if(src.dims == 0) + _dst.createSameSize( _src, CV_MAKETYPE(ddepth, cn) ); + int kercn = 1; + int rowsPerWI = 1; + String build_opt = format("-D HALF_SUPPORT -D srcT=%s -D dstT=%s -D rowsPerWI=%d%s", + sdepth == CV_32F ? "float" : "half", + sdepth == CV_32F ? "half" : "float", + rowsPerWI, + sdepth == CV_32F ? 
" -D FLOAT_TO_HALF " : ""); + ocl::Kernel k("convertFp16", ocl::core::halfconvert_oclsrc, build_opt); + if (k.empty()) return false; - ::ipp::IwiImage iwSrc; - ::ipp::IwiImage iwDst; + UMat src = _src.getUMat(); + UMat dst = _dst.getUMat(); - try - { - IppHintAlgorithm mode = ippAlgHintFast; - if(dstDepth == ipp64f || - (dstDepth == ipp32f && (srcDepth == ipp32s || srcDepth == ipp64f)) || - (dstDepth == ipp32s && (srcDepth == ipp32s || srcDepth == ipp64f))) - mode = ippAlgHintAccurate; + ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), + dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn); - if(src.dims <= 2) - { - Size sz = getContinuousSize(src, dst, channels); + k.args(srcarg, dstarg); - iwSrc.Init(ippiSize(sz), srcDepth, 1, NULL, (void*)src.ptr(), src.step); - iwDst.Init(ippiSize(sz), dstDepth, 1, NULL, (void*)dst.ptr(), dst.step); - - CV_INSTRUMENT_FUN_IPP(::ipp::iwiScale, iwSrc, iwDst, alpha, beta, ::ipp::IwiScaleParams(mode)); - } - else - { - const Mat *arrays[] = {&src, &dst, NULL}; - uchar *ptrs[2] = {NULL}; - NAryMatIterator it(arrays, ptrs); - - iwSrc.Init(ippiSize(it.size, 1), srcDepth, channels); - iwDst.Init(ippiSize(it.size, 1), dstDepth, channels); - - for(size_t i = 0; i < it.nplanes; i++, ++it) - { - iwSrc.m_ptr = ptrs[0]; - iwDst.m_ptr = ptrs[1]; - - CV_INSTRUMENT_FUN_IPP(::ipp::iwiScale, iwSrc, iwDst, alpha, beta, ::ipp::IwiScaleParams(mode)); - } - } - } - catch (::ipp::IwException) - { - return false; - } - return true; -#else - CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(alpha); CV_UNUSED(beta); - return false; -#endif + size_t globalsize[2] = { (size_t)src.cols * cn / kercn, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI }; + return k.run(2, globalsize, NULL, false); } -} // cv:: #endif +} // cv:: void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const { @@ -1331,7 +443,6 @@ void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) _dst.create( dims, size, _type ); Mat dst = _dst.getMat(); - CV_IPP_RUN_FAST(ipp_convertTo(src, dst, alpha, beta )); BinaryFunc func = noScale ? 
getConvertFunc(sdepth, ddepth) : getConvertScaleFunc(sdepth, ddepth); double scale[] = {alpha, beta}; @@ -1341,7 +452,6 @@ void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) if( dims <= 2 ) { Size sz = getContinuousSize(src, dst, cn); - func( src.data, src.step, 0, 0, dst.data, dst.step, sz, scale ); } else @@ -1358,118 +468,30 @@ void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) //================================================================================================== -namespace cv { - -// template for FP16 HW conversion function -template static void -cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size); - -template<> void -cvtScaleHalf_( const float* src, size_t sstep, short* dst, size_t dstep, Size size ) -{ - CV_CPU_CALL_FP16_(cvtScaleHalf_SIMD32f16f, (src, sstep, dst, dstep, size)); - -#if !CV_CPU_FORCE_FP16 - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - for ( int x = 0; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } - } -#endif -} - -template<> void -cvtScaleHalf_( const short* src, size_t sstep, float* dst, size_t dstep, Size size ) -{ - CV_CPU_CALL_FP16_(cvtScaleHalf_SIMD16f32f, (src, sstep, dst, dstep, size)); - -#if !CV_CPU_FORCE_FP16 - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - for ( int x = 0; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } - } -#endif -} - -#define DEF_CVT_SCALE_FP16_FUNC(suffix, stype, dtype) \ -static void cvtScaleHalf##suffix( const stype* src, size_t sstep, \ -dtype* dst, size_t dstep, Size size, void*) \ -{ \ - cvtScaleHalf_(src, sstep, dst, dstep, size); \ -} - -DEF_CVT_SCALE_FP16_FUNC(32f16f, float, short) -DEF_CVT_SCALE_FP16_FUNC(16f32f, short, float) - -static UnaryFunc getConvertFuncFp16(int ddepth) -{ - static UnaryFunc cvtTab[] = - { - 0, 0, 0, - (UnaryFunc)(cvtScaleHalf32f16f), 0, (UnaryFunc)(cvtScaleHalf16f32f), - 0, 0, - }; - return cvtTab[CV_MAT_DEPTH(ddepth)]; -} - - -#ifdef HAVE_OPENCL - -static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int ddepth ) -{ - int type = _src.type(), cn = CV_MAT_CN(type); - - _dst.createSameSize( _src, CV_MAKETYPE(ddepth, cn) ); - int kercn = 1; - int rowsPerWI = 1; - String build_opt = format("-D HALF_SUPPORT -D dstT=%s -D srcT=%s -D rowsPerWI=%d%s", - ddepth == CV_16S ? "half" : "float", - ddepth == CV_16S ? "float" : "half", - rowsPerWI, - ddepth == CV_16S ? 
" -D FLOAT_TO_HALF " : ""); - ocl::Kernel k("convertFp16", ocl::core::halfconvert_oclsrc, build_opt); - if (k.empty()) - return false; - - UMat src = _src.getUMat(); - UMat dst = _dst.getUMat(); - - ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), - dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn); - - k.args(srcarg, dstarg); - - size_t globalsize[2] = { (size_t)src.cols * cn / kercn, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI }; - return k.run(2, globalsize, NULL, false); -} - -#endif - -} //cv:: - -void cv::convertFp16( InputArray _src, OutputArray _dst) +void cv::convertFp16( InputArray _src, OutputArray _dst ) { CV_INSTRUMENT_REGION() - int ddepth = 0; - switch( _src.depth() ) + int sdepth = _src.depth(), ddepth = 0; + BinaryFunc func = 0; + + switch( sdepth ) { case CV_32F: - ddepth = CV_16S; + if(_dst.fixedType()) + { + ddepth = _dst.depth(); + CV_Assert(ddepth == CV_16S /*|| ddepth == CV_16F*/); + CV_Assert(_dst.channels() == _src.channels()); + } + else + ddepth = CV_16S; + func = (BinaryFunc)cvt32f16f; break; case CV_16S: + //case CV_16F: ddepth = CV_32F; + func = (BinaryFunc)cvt16f32f; break; default: CV_Error(Error::StsUnsupportedFormat, "Unsupported input depth"); @@ -1477,21 +499,21 @@ void cv::convertFp16( InputArray _src, OutputArray _dst) } CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(), - ocl_convertFp16(_src, _dst, ddepth)) + ocl_convertFp16(_src, _dst, sdepth, ddepth)) Mat src = _src.getMat(); int type = CV_MAKETYPE(ddepth, src.channels()); _dst.create( src.dims, src.size, type ); Mat dst = _dst.getMat(); - UnaryFunc func = getConvertFuncFp16(ddepth); int cn = src.channels(); + CV_Assert( func != 0 ); if( src.dims <= 2 ) { Size sz = getContinuousSize(src, dst, cn); - func( src.data, src.step, dst.data, dst.step, sz, 0); + func( src.data, src.step, 0, 0, dst.data, dst.step, sz, 0); } else { @@ -1501,6 +523,6 @@ void cv::convertFp16( InputArray _src, OutputArray _dst) Size sz((int)(it.size*cn), 1); for( size_t i = 0; i < it.nplanes; i++, ++it ) - func(ptrs[0], 1, ptrs[1], 1, sz, 0); + func(ptrs[0], 0, 0, 0, ptrs[1], 0, sz, 0); } } diff --git a/modules/core/src/convert.fp16.cpp b/modules/core/src/convert.fp16.cpp deleted file mode 100644 index 7168e8d643..0000000000 --- a/modules/core/src/convert.fp16.cpp +++ /dev/null @@ -1,126 +0,0 @@ -// This file is part of OpenCV project. 
-// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html - - -#include "precomp.hpp" -#include "convert.hpp" - -namespace cv -{ -namespace opt_FP16 -{ -#if !defined(CV_NEON) || !CV_NEON -const static int cVectorWidth = 8; - -void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size ) -{ - CV_INSTRUMENT_REGION() - - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = 0; - for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth ) - { - __m256 v_src = _mm256_loadu_ps(src + x); - - // round to nearest even - __m128i v_dst = _mm256_cvtps_ph(v_src, 0); - - _mm_storeu_si128((__m128i*)(dst + x), v_dst); - } - - for ( ; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } - } -} - -void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size ) -{ - CV_INSTRUMENT_REGION() - - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = 0; - for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth ) - { - __m128i v_src = _mm_loadu_si128((__m128i*)(src + x)); - - __m256 v_dst = _mm256_cvtph_ps(v_src); - - _mm256_storeu_ps(dst + x, v_dst); - } - - for ( ; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } - } -} -#elif CV_NEON -const static int cVectorWidth = 4; - -void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size ) -{ - CV_INSTRUMENT_REGION() - - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = 0; - for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth) - { - float32x4_t v_src = vld1q_f32(src + x); - float16x4_t v_dst = vcvt_f16_f32(v_src); - - cv_vst1_f16(dst + x, v_dst); - } - - for ( ; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } - } -} - -void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size ) -{ - CV_INSTRUMENT_REGION() - - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = 0; - for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth ) - { - float16x4_t v_src = cv_vld1_f16((__fp16*)src + x); - - float32x4_t v_dst = vcvt_f32_f16(v_src); - - vst1q_f32(dst + x, v_dst); - } - - for ( ; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } - } -} -#else -#error "Unsupported build configuration" -#endif -} - -} // cv:: diff --git a/modules/core/src/convert.hpp b/modules/core/src/convert.hpp index 580076367e..0d0aa3a770 100644 --- a/modules/core/src/convert.hpp +++ b/modules/core/src/convert.hpp @@ -8,192 +8,402 @@ #include "opencv2/core/types.hpp" -namespace -{ -float convertFp16SW(short fp16); -short convertFp16SW(float fp32); - -#if !CV_FP16_TYPE -// const numbers for floating points format -const unsigned int kShiftSignificand = 13; -const unsigned int kMaskFp16Significand = 0x3ff; -const unsigned int kBiasFp16Exponent = 15; -const unsigned int kBiasFp32Exponent = 127; -#endif - -#if CV_FP16_TYPE -inline float convertFp16SW(short fp16) -{ - // Fp16 -> Fp32 - Cv16suf a; - a.i = fp16; - return (float)a.h; -} -#else -inline float convertFp16SW(short fp16) -{ - // Fp16 -> Fp32 - Cv16suf b; - b.i = fp16; - int exponent = b.fmt.exponent - kBiasFp16Exponent; - int significand = b.fmt.significand; - 
- Cv32suf a; - a.i = 0; - a.fmt.sign = b.fmt.sign; // sign bit - if( exponent == 16 ) - { - // Inf or NaN - a.i = a.i | 0x7F800000; - if( significand != 0 ) - { - // NaN -#if defined(__x86_64__) || defined(_M_X64) - // 64bit - a.i = a.i | 0x7FC00000; -#endif - a.fmt.significand = a.fmt.significand | (significand << kShiftSignificand); - } - return a.f; - } - else if ( exponent == -(int)kBiasFp16Exponent ) - { - // subnormal in Fp16 - if( significand == 0 ) - { - // zero - return a.f; - } - else - { - int shift = -1; - while( ( significand & 0x400 ) == 0 ) - { - significand = significand << 1; - shift++; - } - significand = significand & kMaskFp16Significand; - exponent -= shift; - } - } - - a.fmt.exponent = (exponent+kBiasFp32Exponent); - a.fmt.significand = significand << kShiftSignificand; - return a.f; -} -#endif - -#if CV_FP16_TYPE -inline short convertFp16SW(float fp32) -{ - // Fp32 -> Fp16 - Cv16suf a; - a.h = (__fp16)fp32; - return a.i; -} -#else -inline short convertFp16SW(float fp32) -{ - // Fp32 -> Fp16 - Cv32suf a; - a.f = fp32; - int exponent = a.fmt.exponent - kBiasFp32Exponent; - int significand = a.fmt.significand; - - Cv16suf result; - result.i = 0; - unsigned int absolute = a.i & 0x7fffffff; - if( 0x477ff000 <= absolute ) - { - // Inf in Fp16 - result.i = result.i | 0x7C00; - if( exponent == 128 && significand != 0 ) - { - // NaN - result.i = (short)( result.i | 0x200 | ( significand >> kShiftSignificand ) ); - } - } - else if ( absolute < 0x33000001 ) - { - // too small for fp16 - result.i = 0; - } - else if ( absolute < 0x387fe000 ) - { - // subnormal in Fp16 - int fp16Significand = significand | 0x800000; - int bitShift = (-exponent) - 1; - fp16Significand = fp16Significand >> bitShift; - - // special cases to round up - bitShift = exponent + 24; - int threshold = ( ( 0x400000 >> bitShift ) | ( ( ( significand & ( 0x800000 >> bitShift ) ) >> ( 126 - a.fmt.exponent ) ) ^ 1 ) ); - if( absolute == 0x33c00000 ) - { - result.i = 2; - } - else - { - if( threshold <= ( significand & ( 0xffffff >> ( exponent + 25 ) ) ) ) - { - fp16Significand++; - } - result.i = (short)fp16Significand; - } - } - else - { - // usual situation - // exponent - result.fmt.exponent = ( exponent + kBiasFp16Exponent ); - - // significand; - short fp16Significand = (short)(significand >> kShiftSignificand); - result.fmt.significand = fp16Significand; - - // special cases to round up - short lsb10bitsFp32 = (significand & 0x1fff); - short threshold = 0x1000 + ( ( fp16Significand & 0x1 ) ? 
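// [editor's note — illustration, not part of the patch] For reference while
// reading the bit manipulation in this (now removed) software converter: IEEE
// 754 binary16 packs 1 sign bit, 5 exponent bits (bias 15) and 10 significand
// bits, so a normal value decodes as (-1)^s * 2^(e-15) * (1 + m/1024).
// Worked examples: 0x3C00 -> +1.0 (s=0, e=15, m=0), 0xC000 -> -2.0
// (s=1, e=16, m=0), 0x3555 -> ~0.333 (s=0, e=13, m=341).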
0 : 1 ); - if( threshold <= lsb10bitsFp32 ) - { - result.i++; - } - else if ( fp16Significand == kMaskFp16Significand && exponent == -15) - { - result.i++; - } - } - - // sign bit - result.fmt.sign = a.fmt.sign; - return result.i; -} -#endif - -} - namespace cv { -namespace opt_FP16 + +#if CV_SIMD + +static inline void vx_load_as(const uchar* ptr, v_float32& a) +{ a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(ptr))); } + +static inline void vx_load_as(const schar* ptr, v_float32& a) +{ a = v_cvt_f32(vx_load_expand_q(ptr)); } + +static inline void vx_load_as(const ushort* ptr, v_float32& a) +{ a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(ptr))); } + +static inline void vx_load_as(const short* ptr, v_float32& a) +{ a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(ptr))); } + +static inline void vx_load_as(const int* ptr, v_float32& a) +{ a = v_cvt_f32(vx_load(ptr)); } + +static inline void vx_load_as(const float* ptr, v_float32& a) +{ a = vx_load(ptr); } + +static inline void vx_load_as(const float16_t* ptr, v_float32& a) +{ a = vx_load_expand(ptr); } + +static inline void v_store_as(ushort* ptr, const v_float32& a) +{ v_pack_u_store(ptr, v_round(a)); } + +static inline void v_store_as(short* ptr, const v_float32& a) +{ v_pack_store(ptr, v_round(a)); } + +static inline void v_store_as(int* ptr, const v_float32& a) +{ v_store(ptr, v_round(a)); } + +static inline void v_store_as(float* ptr, const v_float32& a) +{ v_store(ptr, a); } + +static inline void v_store_as(float16_t* ptr, const v_float32& a) +{ v_pack_store(ptr, a); } + +static inline void vx_load_pair_as(const uchar* ptr, v_uint16& a, v_uint16& b) +{ v_expand(vx_load(ptr), a, b); } + +static inline void vx_load_pair_as(const schar* ptr, v_uint16& a, v_uint16& b) { -void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size ); -void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size ); + const v_int8 z = vx_setzero_s8(); + v_int16 sa, sb; + v_expand(v_max(vx_load(ptr), z), sa, sb); + a = v_reinterpret_as_u16(sa); + b = v_reinterpret_as_u16(sb); } -namespace opt_AVX2 + +static inline void vx_load_pair_as(const ushort* ptr, v_uint16& a, v_uint16& b) +{ a = vx_load(ptr); b = vx_load(ptr + v_uint16::nlanes); } + +static inline void vx_load_pair_as(const uchar* ptr, v_int16& a, v_int16& b) { -void cvtScale_s16s32f32Line_AVX2(const short* src, int* dst, float scale, float shift, int width); + v_uint16 ua, ub; + v_expand(vx_load(ptr), ua, ub); + a = v_reinterpret_as_s16(ua); + b = v_reinterpret_as_s16(ub); } -namespace opt_SSE4_1 + +static inline void vx_load_pair_as(const schar* ptr, v_int16& a, v_int16& b) +{ v_expand(vx_load(ptr), a, b); } + +static inline void vx_load_pair_as(const short* ptr, v_int16& a, v_int16& b) +{ a = vx_load(ptr); b = vx_load(ptr + v_uint16::nlanes); } + +static inline void vx_load_pair_as(const uchar* ptr, v_int32& a, v_int32& b) { - int cvtScale_SIMD_u8u16f32_SSE41(const uchar * src, ushort * dst, int width, float scale, float shift); - int cvtScale_SIMD_s8u16f32_SSE41(const schar * src, ushort * dst, int width, float scale, float shift); - int cvtScale_SIMD_u16u16f32_SSE41(const ushort * src, ushort * dst, int width, float scale, float shift); - int cvtScale_SIMD_s16u16f32_SSE41(const short * src, ushort * dst, int width, float scale, float shift); - int cvtScale_SIMD_s32u16f32_SSE41(const int * src, ushort * dst, int width, float scale, float shift); - int cvtScale_SIMD_f32u16f32_SSE41(const float * src, ushort 
* dst, int width, float scale, float shift); - int cvtScale_SIMD_f64u16f32_SSE41(const double * src, ushort * dst, int width, float scale, float shift); - int Cvt_SIMD_f64u16_SSE41(const double * src, ushort * dst, int width); + v_uint32 ua, ub; + v_expand(vx_load_expand(ptr), ua, ub); + a = v_reinterpret_as_s32(ua); + b = v_reinterpret_as_s32(ub); } + +static inline void vx_load_pair_as(const schar* ptr, v_int32& a, v_int32& b) +{ v_expand(vx_load_expand(ptr), a, b); } + +static inline void vx_load_pair_as(const ushort* ptr, v_int32& a, v_int32& b) +{ + v_uint32 ua, ub; + v_expand(vx_load(ptr), ua, ub); + a = v_reinterpret_as_s32(ua); + b = v_reinterpret_as_s32(ub); +} + +static inline void vx_load_pair_as(const short* ptr, v_int32& a, v_int32& b) +{ + v_expand(vx_load(ptr), a, b); +} + +static inline void vx_load_pair_as(const int* ptr, v_int32& a, v_int32& b) +{ + a = vx_load(ptr); + b = vx_load(ptr + v_int32::nlanes); +} + +static inline void vx_load_pair_as(const uchar* ptr, v_float32& a, v_float32& b) +{ + v_uint32 ua, ub; + v_expand(vx_load_expand(ptr), ua, ub); + a = v_cvt_f32(v_reinterpret_as_s32(ua)); + b = v_cvt_f32(v_reinterpret_as_s32(ub)); +} + +static inline void vx_load_pair_as(const schar* ptr, v_float32& a, v_float32& b) +{ + v_int32 ia, ib; + v_expand(vx_load_expand(ptr), ia, ib); + a = v_cvt_f32(ia); + b = v_cvt_f32(ib); +} + +static inline void vx_load_pair_as(const ushort* ptr, v_float32& a, v_float32& b) +{ + v_uint32 ua, ub; + v_expand(vx_load(ptr), ua, ub); + a = v_cvt_f32(v_reinterpret_as_s32(ua)); + b = v_cvt_f32(v_reinterpret_as_s32(ub)); +} + +static inline void vx_load_pair_as(const short* ptr, v_float32& a, v_float32& b) +{ + v_int32 ia, ib; + v_expand(vx_load(ptr), ia, ib); + a = v_cvt_f32(ia); + b = v_cvt_f32(ib); +} + +static inline void vx_load_pair_as(const int* ptr, v_float32& a, v_float32& b) +{ + v_int32 ia = vx_load(ptr), ib = vx_load(ptr + v_int32::nlanes); + a = v_cvt_f32(ia); + b = v_cvt_f32(ib); +} + +static inline void vx_load_pair_as(const float* ptr, v_float32& a, v_float32& b) +{ a = vx_load(ptr); b = vx_load(ptr + v_float32::nlanes); } + +//static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32& b) +//{ +// a = vx_load_expand(ptr); +// b = vx_load_expand(ptr + v_float32::nlanes); +//} + + +static inline void v_store_pair_as(uchar* ptr, const v_uint16& a, const v_uint16& b) +{ + v_store(ptr, v_pack(a, b)); +} + +static inline void v_store_pair_as(schar* ptr, const v_uint16& a, const v_uint16& b) +{ + const v_uint8 maxval = vx_setall_u8((uchar)std::numeric_limits::max()); + v_uint8 v = v_pack(a, b); + v_store(ptr, v_reinterpret_as_s8(v_min(v, maxval))); +} + +static inline void v_store_pair_as(ushort* ptr, const v_uint16& a, const v_uint16& b) +{ v_store(ptr, a); v_store(ptr + v_uint16::nlanes, b); } + +static inline void v_store_pair_as(uchar* ptr, const v_int16& a, const v_int16& b) +{ v_store(ptr, v_pack_u(a, b)); } + +static inline void v_store_pair_as(schar* ptr, const v_int16& a, const v_int16& b) +{ v_store(ptr, v_pack(a, b)); } + +static inline void v_store_pair_as(short* ptr, const v_int16& a, const v_int16& b) +{ v_store(ptr, a); v_store(ptr + v_int16::nlanes, b); } + +static inline void v_store_pair_as(uchar* ptr, const v_int32& a, const v_int32& b) +{ v_pack_u_store(ptr, v_pack(a, b)); } + +static inline void v_store_pair_as(schar* ptr, const v_int32& a, const v_int32& b) +{ v_pack_store(ptr, v_pack(a, b)); } + +static inline void v_store_pair_as(ushort* ptr, const v_int32& a, const v_int32& b) +{ 
v_store(ptr, v_pack_u(a, b)); } + +static inline void v_store_pair_as(short* ptr, const v_int32& a, const v_int32& b) +{ v_store(ptr, v_pack(a, b)); } + +static inline void v_store_pair_as(int* ptr, const v_int32& a, const v_int32& b) +{ + v_store(ptr, a); + v_store(ptr + v_int32::nlanes, b); +} + +static inline void v_store_pair_as(uchar* ptr, const v_float32& a, const v_float32& b) +{ v_pack_u_store(ptr, v_pack(v_round(a), v_round(b))); } + +static inline void v_store_pair_as(schar* ptr, const v_float32& a, const v_float32& b) +{ v_pack_store(ptr, v_pack(v_round(a), v_round(b))); } + +static inline void v_store_pair_as(ushort* ptr, const v_float32& a, const v_float32& b) +{ v_store(ptr, v_pack_u(v_round(a), v_round(b))); } + +static inline void v_store_pair_as(short* ptr, const v_float32& a, const v_float32& b) +{ v_store(ptr, v_pack(v_round(a), v_round(b))); } + +static inline void v_store_pair_as(int* ptr, const v_float32& a, const v_float32& b) +{ + v_int32 ia = v_round(a), ib = v_round(b); + v_store(ptr, ia); + v_store(ptr + v_int32::nlanes, ib); +} + +static inline void v_store_pair_as(float* ptr, const v_float32& a, const v_float32& b) +{ v_store(ptr, a); v_store(ptr + v_float32::nlanes, b); } + +#if CV_SIMD_64F + +static inline void vx_load_as(const double* ptr, v_float32& a) +{ + v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes); + a = v_cvt_f32(v0, v1); +} + +static inline void vx_load_pair_as(const double* ptr, v_int32& a, v_int32& b) +{ + v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes); + v_float64 v2 = vx_load(ptr + v_float64::nlanes*2), v3 = vx_load(ptr + v_float64::nlanes*3); + v_int32 iv0 = v_round(v0), iv1 = v_round(v1); + v_int32 iv2 = v_round(v2), iv3 = v_round(v3); + a = v_combine_low(iv0, iv1); + b = v_combine_low(iv2, iv3); +} + +static inline void vx_load_pair_as(const double* ptr, v_float32& a, v_float32& b) +{ + v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes); + v_float64 v2 = vx_load(ptr + v_float64::nlanes*2), v3 = vx_load(ptr + v_float64::nlanes*3); + a = v_cvt_f32(v0, v1); + b = v_cvt_f32(v2, v3); +} + +static inline void vx_load_pair_as(const uchar* ptr, v_float64& a, v_float64& b) +{ + v_int32 v0 = v_reinterpret_as_s32(vx_load_expand_q(ptr)); + a = v_cvt_f64(v0); + b = v_cvt_f64_high(v0); +} + +static inline void vx_load_pair_as(const schar* ptr, v_float64& a, v_float64& b) +{ + v_int32 v0 = vx_load_expand_q(ptr); + a = v_cvt_f64(v0); + b = v_cvt_f64_high(v0); +} + +static inline void vx_load_pair_as(const ushort* ptr, v_float64& a, v_float64& b) +{ + v_int32 v0 = v_reinterpret_as_s32(vx_load_expand(ptr)); + a = v_cvt_f64(v0); + b = v_cvt_f64_high(v0); +} + +static inline void vx_load_pair_as(const short* ptr, v_float64& a, v_float64& b) +{ + v_int32 v0 = vx_load_expand(ptr); + a = v_cvt_f64(v0); + b = v_cvt_f64_high(v0); +} + +static inline void vx_load_pair_as(const int* ptr, v_float64& a, v_float64& b) +{ + v_int32 v0 = vx_load(ptr); + a = v_cvt_f64(v0); + b = v_cvt_f64_high(v0); +} + +static inline void vx_load_pair_as(const float* ptr, v_float64& a, v_float64& b) +{ + v_float32 v0 = vx_load(ptr); + a = v_cvt_f64(v0); + b = v_cvt_f64_high(v0); +} + +static inline void vx_load_pair_as(const double* ptr, v_float64& a, v_float64& b) +{ + a = vx_load(ptr); + b = vx_load(ptr + v_float64::nlanes); +} + +//static inline void vx_load_pair_as(const float16_t* ptr, v_float64& a, v_float64& b) +//{ +// v_float32 v0 = vx_load_expand(ptr); +// a = v_cvt_f64(v0); +// b = v_cvt_f64_high(v0); +//} + +static inline 
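// [editor's note — illustration, not part of the patch] In this CV_SIMD_64F
// branch a v_float32/v_int32 register holds twice as many lanes as v_float64,
// which is why every "pair" load from double above reads four f64 registers
// and recombines their lanes. Composition example (valid given the helpers
// already defined above):
//
//   v_int32 a, b;
//   vx_load_pair_as(srcDouble, a, b);   // 4 x v_float64 loads + rounding
//   v_store_pair_as(dstInt, a, b);      // 2 x v_int32 stores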
void v_store_as(double* ptr, const v_float32& a) +{ + v_float64 fa0 = v_cvt_f64(a), fa1 = v_cvt_f64_high(a); + v_store(ptr, fa0); + v_store(ptr + v_float64::nlanes, fa1); +} + +static inline void v_store_pair_as(double* ptr, const v_int32& a, const v_int32& b) +{ + v_float64 fa0 = v_cvt_f64(a), fa1 = v_cvt_f64_high(a); + v_float64 fb0 = v_cvt_f64(b), fb1 = v_cvt_f64_high(b); + + v_store(ptr, fa0); + v_store(ptr + v_float64::nlanes, fa1); + v_store(ptr + v_float64::nlanes*2, fb0); + v_store(ptr + v_float64::nlanes*3, fb1); +} + +static inline void v_store_pair_as(double* ptr, const v_float32& a, const v_float32& b) +{ + v_float64 fa0 = v_cvt_f64(a), fa1 = v_cvt_f64_high(a); + v_float64 fb0 = v_cvt_f64(b), fb1 = v_cvt_f64_high(b); + + v_store(ptr, fa0); + v_store(ptr + v_float64::nlanes, fa1); + v_store(ptr + v_float64::nlanes*2, fb0); + v_store(ptr + v_float64::nlanes*3, fb1); +} + +static inline void v_store_pair_as(double* ptr, const v_float64& a, const v_float64& b) +{ + v_store(ptr, a); + v_store(ptr + v_float64::nlanes, b); +} + +static inline void v_store_pair_as(int* ptr, const v_float64& a, const v_float64& b) +{ + v_int32 ia = v_round(a), ib = v_round(b); + v_store(ptr, v_combine_low(ia, ib)); +} + +static inline void v_store_pair_as(float* ptr, const v_float64& a, const v_float64& b) +{ + v_float32 v = v_cvt_f32(a, b); + v_store(ptr, v); +} + +//static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_float64& b) +//{ +// v_float32 v = v_cvt_f32(a, b); +// v_pack_store(ptr, v); +//} + +#else + +static inline void vx_load_as(const double* ptr, v_float32& a) +{ + const int VECSZ = v_float32::nlanes; + float buf[VECSZ*2]; + + for( int i = 0; i < VECSZ; i++ ) + buf[i] = saturate_cast(ptr[i]); + a = vx_load(buf); +} + +template +static inline void vx_load_pair_as(const double* ptr, _Tdvec& a, _Tdvec& b) +{ + const int VECSZ = _Tdvec::nlanes; + typename _Tdvec::lane_type buf[VECSZ*2]; + + for( int i = 0; i < VECSZ*2; i++ ) + buf[i] = saturate_cast(ptr[i]); + a = vx_load(buf); + b = vx_load(buf + VECSZ); +} + +static inline void v_store_as(double* ptr, const v_float32& a) +{ + const int VECSZ = v_float32::nlanes; + float buf[VECSZ]; + + v_store(buf, a); + for( int i = 0; i < VECSZ; i++ ) + ptr[i] = (double)buf[i]; +} + +template +static inline void v_store_pair_as(double* ptr, const _Tsvec& a, const _Tsvec& b) +{ + const int VECSZ = _Tsvec::nlanes; + typename _Tsvec::lane_type buf[VECSZ*2]; + + v_store(buf, a); v_store(buf + VECSZ, b); + for( int i = 0; i < VECSZ*2; i++ ) + ptr[i] = (double)buf[i]; +} + +#endif /////////// CV_SIMD_64F + +#endif /////////// CV_SIMD + } #endif // SRC_CONVERT_HPP diff --git a/modules/core/src/convert.sse4_1.cpp b/modules/core/src/convert.sse4_1.cpp deleted file mode 100644 index 3c18063d1d..0000000000 --- a/modules/core/src/convert.sse4_1.cpp +++ /dev/null @@ -1,203 +0,0 @@ -// This file is part of OpenCV project. 
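// [editor's note — illustration, not part of the patch] The hand-written
// SSE4.1 kernels in this deleted file are superseded by ISA-agnostic loops
// built from the wide universal intrinsics plus the convert.hpp helpers above.
// A sketch of the equivalent uchar -> ushort scaling step (the real cvtScale
// templates in convert_scale.cpp may arrange the loop differently):
#if CV_SIMD
static inline void scale_u8u16_block_sketch(const uchar* src, ushort* dst,
                                            float scale, float shift)
{
    v_float32 vscale = vx_setall_f32(scale), vshift = vx_setall_f32(shift);
    v_float32 f0, f1;
    vx_load_pair_as(src, f0, f1);        // expand v_float32::nlanes*2 uchars
    f0 = v_muladd(f0, vscale, vshift);   // src*scale + shift
    f1 = v_muladd(f1, vscale, vshift);
    v_store_pair_as(dst, f0, f1);        // round, saturate and pack to ushort
}
#endif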
-// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html - - -#include "precomp.hpp" -#include "convert.hpp" - -namespace cv -{ -namespace opt_SSE4_1 -{ - -int cvtScale_SIMD_u8u16f32_SSE41(const uchar * src, ushort * dst, int width, float scale, float shift) -{ - int x = 0; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int cvtScale_SIMD_s8u16f32_SSE41(const schar * src, ushort * dst, int width, float scale, float shift) -{ - int x = 0; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int cvtScale_SIMD_u16u16f32_SSE41(const ushort * src, ushort * dst, int width, float scale, float shift) -{ - int x = 0; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int cvtScale_SIMD_s16u16f32_SSE41(const short * src, ushort * dst, int width, float scale, float shift) -{ - int x = 0; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int cvtScale_SIMD_s32u16f32_SSE41(const int 
* src, ushort * dst, int width, float scale, float shift) -{ - int x = 0; - - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); - - v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int cvtScale_SIMD_f32u16f32_SSE41(const float * src, ushort * dst, int width, float scale, float shift) -{ - int x = 0; - - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128 v_src = _mm_loadu_ps(src + x); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_loadu_ps(src + x + 4); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int cvtScale_SIMD_f64u16f32_SSE41(const double * src, ushort * dst, int width, float scale, float shift) -{ - int x = 0; - - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int Cvt_SIMD_f64u16_SSE41(const double * src, ushort * dst, int width) -{ - int x = 0; - - for ( ; x <= width - 8; x += 8) - { - __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); - __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); - __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); - __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); - - v_src0 = _mm_movelh_ps(v_src0, v_src1); - v_src1 = _mm_movelh_ps(v_src2, v_src3); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_src0), - _mm_cvtps_epi32(v_src1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -} -} // cv:: - -/* End of file. 
*/ diff --git a/modules/core/src/convert_scale.cpp b/modules/core/src/convert_scale.cpp index 25f5a963b7..0d4b5151a3 100644 --- a/modules/core/src/convert_scale.cpp +++ b/modules/core/src/convert_scale.cpp @@ -14,1623 +14,278 @@ namespace cv { -template -struct cvtScaleAbs_SIMD +template inline void +cvtabs_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, + Size size, float a, float b ) { - int operator () (const T *, DT *, int, WT, WT) const - { - return 0; - } -}; - -#if CV_SIMD128 - -static inline void v_load_expand_from_u8_f32(const uchar* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) -{ - v_uint32x4 v_src0, v_src1; - v_expand(v_load_expand(src), v_src0, v_src1); - - a = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src0)); - b = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src1)); -} - -static inline void v_load_expand_from_s8_f32(const schar* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) -{ - v_int32x4 v_src0, v_src1; - v_expand(v_load_expand(src), v_src0, v_src1); - - a = v_shift + v_scale * v_cvt_f32(v_src0); - b = v_shift + v_scale * v_cvt_f32(v_src1); -} - -static inline void v_load_expand_from_u16_f32(const ushort* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) -{ - v_uint32x4 v_src0, v_src1; - v_expand(v_load(src), v_src0, v_src1); - - a = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src0)); - b = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src1)); -} - -static inline void v_load_expand_from_s16_f32(const short* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) -{ - v_int32x4 v_src0, v_src1; - v_expand(v_load(src), v_src0, v_src1); - - a = v_shift + v_scale * v_cvt_f32(v_src0); - b = v_shift + v_scale * v_cvt_f32(v_src1); -} - -static inline void v_load_expand_from_s32_f32(const int* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) -{ - a = v_shift + v_scale * v_cvt_f32(v_load(src)); - b = v_shift + v_scale * v_cvt_f32(v_load(src + v_int32x4::nlanes)); -} - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const uchar * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift); - v_float32x4 v_scale = v_setall_f32(scale); - const int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst_0, v_dst_1, v_dst_2, v_dst_3; - v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_dst_0, v_dst_1); - v_load_expand_from_u8_f32(src + x + cWidth, v_scale, v_shift, v_dst_2, v_dst_3); - v_dst_0 = v_abs(v_dst_0); - v_dst_1 = v_abs(v_dst_1); - v_dst_2 = v_abs(v_dst_2); - v_dst_3 = v_abs(v_dst_3); - - v_int16x8 v_dsti_0 = v_pack(v_round(v_dst_0), v_round(v_dst_1)); - v_int16x8 v_dsti_1 = v_pack(v_round(v_dst_2), v_round(v_dst_3)); - v_store(dst + x, v_pack_u(v_dsti_0, v_dsti_1)); - } - } - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const schar * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift); - v_float32x4 v_scale = v_setall_f32(scale); - const int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth*2; x += cWidth*2) - { - v_float32x4 v_dst_0, v_dst_1, v_dst_2, v_dst_3; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_dst_0, v_dst_1); - 
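// [editor's note — illustration, not part of the patch] The new cvtabs_32f()
// kernel above replaces all of the per-type cvtScaleAbs_SIMD specializations
// below with one loop; its vector step most likely follows the same pattern as
// the other kernels, with an absolute value inserted before the saturating
// store:
//
//   v_float32 v0, v1;
//   vx_load_pair_as(src + j, v0, v1);
//   v0 = v_abs(v_muladd(v0, va, vb));
//   v1 = v_abs(v_muladd(v1, va, vb));
//   v_store_pair_as(dst + j, v0, v1);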
v_load_expand_from_s8_f32(src + x + cWidth, v_scale, v_shift, v_dst_2, v_dst_3); - v_dst_0 = v_abs(v_dst_0); - v_dst_1 = v_abs(v_dst_1); - v_dst_2 = v_abs(v_dst_2); - v_dst_3 = v_abs(v_dst_3); - - v_uint16x8 v_dsti_0 = v_pack_u(v_round(v_dst_0), v_round(v_dst_1)); - v_uint16x8 v_dsti_1 = v_pack_u(v_round(v_dst_2), v_round(v_dst_3)); - v_store(dst + x, v_pack(v_dsti_0, v_dsti_1)); - } - } - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const ushort * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift); - v_float32x4 v_scale = v_setall_f32(scale); - const int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_dst0, v_dst1; - v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_dst0, v_dst1); - v_dst0 = v_abs(v_dst0); - v_dst1 = v_abs(v_dst1); - - v_int16x8 v_dst = v_pack(v_round(v_dst0), v_round(v_dst1)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const short * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift); - v_float32x4 v_scale = v_setall_f32(scale); - const int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_dst0, v_dst1; - v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_dst0, v_dst1); - v_dst0 = v_abs(v_dst0); - v_dst1 = v_abs(v_dst1); - - v_int16x8 v_dst = v_pack(v_round(v_dst0), v_round(v_dst1)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const int * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - v_float32x4 v_shift = v_setall_f32(shift); - v_float32x4 v_scale = v_setall_f32(scale); - const int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst_0 = v_cvt_f32(v_load(src + x)) * v_scale; - v_dst_0 = v_abs(v_dst_0 + v_shift); - - v_float32x4 v_dst_1 = v_cvt_f32(v_load(src + x + cWidth)) * v_scale; - v_dst_1 = v_abs(v_dst_1 + v_shift); - - v_int16x8 v_dst = v_pack(v_round(v_dst_0), v_round(v_dst_1)); - v_pack_u_store(dst + x, v_dst); - } - - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const float * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - v_float32x4 v_shift = v_setall_f32(shift); - v_float32x4 v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst_0 = v_load(src + x) * v_scale; - v_dst_0 = v_abs(v_dst_0 + v_shift); - - v_float32x4 v_dst_1 = v_load(src + x + cWidth) * v_scale; - v_dst_1 = v_abs(v_dst_1 + v_shift); - - v_int16x8 v_dst = v_pack(v_round(v_dst_0), v_round(v_dst_1)); - v_pack_u_store(dst + x, v_dst); - } - return x; - } -}; - -#if CV_SIMD128_64F -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const double * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - - if (hasSIMD128()) - { - v_float32x4 v_scale = v_setall_f32(scale); - v_float32x4 v_shift = v_setall_f32(shift); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_src1, v_src2, v_dummy; - v_recombine(v_cvt_f32(v_load(src + x)), v_cvt_f32(v_load(src + x + cWidth)), v_src1, v_dummy); - 
v_recombine(v_cvt_f32(v_load(src + x + cWidth * 2)), v_cvt_f32(v_load(src + x + cWidth * 3)), v_src2, v_dummy); - - v_float32x4 v_dst1 = v_abs((v_src1 * v_scale) + v_shift); - v_float32x4 v_dst2 = v_abs((v_src2 * v_scale) + v_shift); - - v_int16x8 v_dst_i = v_pack(v_round(v_dst1), v_round(v_dst2)); - v_pack_u_store(dst + x, v_dst_i); - } - } - - return x; - } -}; -#endif // CV_SIMD128_64F - +#if CV_SIMD + v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b); + const int VECSZ = v_float32::nlanes*2; #endif - -template static void -cvtScaleAbs_( const T* src, size_t sstep, - DT* dst, size_t dstep, Size size, - WT scale, WT shift ) -{ - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - cvtScaleAbs_SIMD vop; - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = vop(src, dst, size.width, scale, shift); - - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) - { - DT t0, t1; - t0 = saturate_cast
<DT>(std::abs(src[x]*scale + shift));
 - t1 = saturate_cast<DT>(std::abs(src[x+1]*scale + shift));
 - dst[x] = t0; dst[x+1] = t1;
 - t0 = saturate_cast<DT>(std::abs(src[x+2]*scale + shift));
 - t1 = saturate_cast<DT>(std::abs(src[x+3]*scale + shift));
 - dst[x+2] = t0; dst[x+3] = t1;
 - }
 - #endif
 - for( ; x < size.width; x++ )
 - dst[x] = saturate_cast<DT>
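// [editor's note] saturate_cast<> clamps to the destination range instead of
// wrapping and rounds to nearest for float -> integer conversions, e.g.
// saturate_cast<uchar>(300) == 255 and saturate_cast<uchar>(-5.2f) == 0; the
// vectorized v_pack_u/v_round stores used by the new kernels keep the same
// semantics.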
(std::abs(src[x]*scale + shift)); - } -} - -template -struct cvtScale_SIMD -{ - int operator () (const T *, DT *, int, WT, WT) const - { - return 0; - } -}; - -#if CV_SIMD128 - -// from uchar - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::cvtScale_SIMD_u8u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, int * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_store(dst + x, v_round(v_src1)); - v_store(dst + x + cWidth, v_round(v_src2)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, float * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u8_f32(src + x, v_scale, 
v_shift, v_src1, v_src2); - - v_store(dst + x, v_src1); - v_store(dst + x + cWidth, v_src2); - } - } - return x; - } -}; - -// from schar - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::cvtScale_SIMD_s8u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, int * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_store(dst + x, v_round(v_src1)); - v_store(dst + x + cWidth, v_round(v_src2)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, float * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_store(dst + x, v_src1); - 
v_store(dst + x + cWidth, v_src2); - } - } - return x; - } -}; - -// from ushort - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::cvtScale_SIMD_u16u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, int * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_store(dst + x, v_round(v_src1)); - v_store(dst + x + cWidth, v_round(v_src2)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, float * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_store(dst + x, v_src1); - v_store(dst + x + cWidth, v_src2); - } - } - 
return x; - } -}; - -// from short - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::cvtScale_SIMD_s16u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, float * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_store(dst + x, v_src1); - v_store(dst + x + cWidth, v_src2); - } - } - return x; - } -}; - -// from int - -template <> -struct cvtScale_SIMD -{ - int operator () (const int * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template 
<> -struct cvtScale_SIMD -{ - int operator () (const int * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const int * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::cvtScale_SIMD_s32u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const int * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -#if CV_SIMD128_64F -template <> -struct cvtScale_SIMD -{ - int operator () (const int * src, int * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - double v_srcbuf[] = { (double)src[x], (double)src[x+1], (double)src[x+2], (double)src[x+3] }; - v_float64x2 v_src1 = v_shift + v_scale * v_load(v_srcbuf); - v_float64x2 v_src2 = v_shift + v_scale * v_load(v_srcbuf + 2); - v_store(dst + x, v_combine_low(v_round(v_src1), v_round(v_src2))); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const int * src, float * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - double v_srcbuf[] = { (double)src[x], (double)src[x+1], (double)src[x+2], (double)src[x+3] }; - v_float64x2 v_src1 = v_shift + v_scale * v_load(v_srcbuf); - v_float64x2 v_src2 = v_shift + v_scale * v_load(v_srcbuf + 2); - v_store(dst + x, v_combine_low(v_cvt_f32(v_src1), v_cvt_f32(v_src2))); - } - } - return x; - } -}; -#endif //CV_SIMD128_64F - -// from float - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst1 = v_shift + 
v_scale * v_load(src + x); - v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); - - v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); - v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); - - v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::cvtScale_SIMD_f32u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); - v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); - - v_uint16x8 v_dst = v_pack_u(v_round(v_dst1), v_round(v_dst2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); - v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); - - v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, int * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_store(dst + x, v_round(v_load(src + x) * v_scale + v_shift)); - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, float * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_store(dst + x, v_load(src + x) * v_scale + v_shift); - } - return x; - } -}; - -#if CV_SIMD128_64F - -static inline void v_load_scale_shift(const double* src, const v_float64x2& v_scale, const v_float64x2 &v_shift, v_float32x4& v_dst1, v_float32x4 &v_dst2) -{ - int cWidth = v_float64x2::nlanes; - v_float64x2 v_src1 = v_shift + v_scale * v_load(src); - v_float64x2 v_src2 = v_shift + v_scale * v_load(src + cWidth); - v_float64x2 v_src3 = v_shift + v_scale * v_load(src + cWidth * 2); - v_float64x2 v_src4 = v_shift + v_scale * v_load(src + cWidth * 3); - v_dst1 = v_combine_low(v_cvt_f32(v_src1), v_cvt_f32(v_src2)); - v_dst2 = 
v_combine_low(v_cvt_f32(v_src3), v_cvt_f32(v_src4)); -} - -static inline void v_store_scale_shift_s32_to_f64(double *dst, const v_float64x2 &v_scale, const v_float64x2 &v_shift, const v_int32x4 &v1, const v_int32x4 &v2) -{ - v_float64x2 v_dst1 = v_shift + v_scale * v_cvt_f64(v1); - v_float64x2 v_dst2 = v_shift + v_scale * v_cvt_f64_high(v1); - v_float64x2 v_dst3 = v_shift + v_scale * v_cvt_f64(v2); - v_float64x2 v_dst4 = v_shift + v_scale * v_cvt_f64_high(v2); - - v_store(dst, v_dst1); - v_store(dst + v_float64x2::nlanes, v_dst2); - v_store(dst + v_float64x2::nlanes * 2, v_dst3); - v_store(dst + v_float64x2::nlanes * 3, v_dst4); -} - -static inline void v_store_scale_shift_f32_to_f64(double *dst, const v_float64x2 &v_scale, const v_float64x2 &v_shift, const v_float32x4 &v1, const v_float32x4 &v2) -{ - v_float64x2 v_dst1 = v_shift + v_scale * v_cvt_f64(v1); - v_float64x2 v_dst2 = v_shift + v_scale * v_cvt_f64_high(v1); - v_float64x2 v_dst3 = v_shift + v_scale * v_cvt_f64(v2); - v_float64x2 v_dst4 = v_shift + v_scale * v_cvt_f64_high(v2); - - v_store(dst, v_dst1); - v_store(dst + v_float64x2::nlanes, v_dst2); - v_store(dst + v_float64x2::nlanes * 2, v_dst3); - v_store(dst + v_float64x2::nlanes * 3, v_dst4); -} - -// from double - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_dst1, v_dst2; - v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); - v_pack_u_store(dst + x, v_pack(v_round(v_dst1), v_round(v_dst2))); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_dst1, v_dst2; - v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); - v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); - v_pack_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::cvtScale_SIMD_f64u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_dst1, v_dst2; - v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); - v_uint16x8 v_dst = v_pack_u(v_round(v_dst1), v_round(v_dst2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_dst1, v_dst2; - v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); - v_int16x8 v_dst = 
v_pack(v_round(v_dst1), v_round(v_dst2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, int * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float64x2 v_src1 = v_shift + v_scale * v_load(src + x); - v_float64x2 v_src2 = v_shift + v_scale * v_load(src + x + cWidth); - - v_store(dst + x, v_combine_low(v_round(v_src1), v_round(v_src2))); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, float * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float64x2 v_src1 = v_shift + v_scale * v_load(src + x); - v_float64x2 v_src2 = v_shift + v_scale * v_load(src + x + cWidth); - v_float32x4 v_dst1 = v_cvt_f32(v_src1); - v_float32x4 v_dst2 = v_cvt_f32(v_src2); - - v_store(dst + x, v_combine_low(v_dst1, v_dst2)); - } - } - return x; - } -}; - -// to double - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_uint32x4 v_src1, v_src2; - v_expand(v_load_expand(src + x), v_src1, v_src2); - v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift - , v_reinterpret_as_s32(v_src1), v_reinterpret_as_s32(v_src2)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int32x4 v_src1, v_src2; - v_expand(v_load_expand(src + x), v_src1, v_src2); - v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_uint32x4 v_src1, v_src2; - v_expand(v_load(src + x), v_src1, v_src2); - v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift - , v_reinterpret_as_s32(v_src1), v_reinterpret_as_s32(v_src2)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int32x4 v_src1, v_src2; - v_expand(v_load(src + x), v_src1, v_src2); - v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); - } - } - return x; - } -}; - -template <> -struct 
cvtScale_SIMD -{ - int operator () (const int * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src1 = v_load(src + x); - v_int32x4 v_src2 = v_load(src + x + cWidth); - v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1 = v_load(src + x); - v_float32x4 v_src2 = v_load(src + x + cWidth); - v_store_scale_shift_f32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float64x2 v_src1 = v_shift + v_scale * v_load(src + x); - v_float64x2 v_src2 = v_shift + v_scale * v_load(src + x + cWidth); - v_store(dst + x, v_src1); - v_store(dst + x + cWidth, v_src2); - } - } - return x; - } -}; -#endif -#endif - -template static void -cvtScale_( const T* src, size_t sstep, - DT* dst, size_t dstep, Size size, - WT scale, WT shift ) -{ sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); - cvtScale_SIMD vop; - - for( ; size.height--; src += sstep, dst += dstep ) + for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) { - int x = vop(src, dst, size.width, scale, shift); - - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) + int j = 0; +#if CV_SIMD + for( ; j < size.width; j += VECSZ ) { - DT t0, t1; - t0 = saturate_cast
<DT>(src[x]*scale + shift);
-            t1 = saturate_cast<DT>(src[x+1]*scale + shift);
-            dst[x] = t0; dst[x+1] = t1;
-            t0 = saturate_cast<DT>(src[x+2]*scale + shift);
-            t1 = saturate_cast<DT>(src[x+3]*scale + shift);
-            dst[x+2] = t0; dst[x+3] = t1;
+            if( j > size.width - VECSZ )
+            {
+                if( j == 0 || src == (_Ts*)dst )
+                    break;
+                j = size.width - VECSZ;
+            }
+            v_float32 v0, v1;
+            vx_load_pair_as(src + j, v0, v1);
+            v0 = v_fma(v0, va, vb);
+            v1 = v_fma(v1, va, vb);
+            v_store_pair_as(dst + j, v_abs(v0), v_abs(v1));
         }
-        #endif
-
-        for( ; x < size.width; x++ )
-            dst[x] = saturate_cast<DT>(src[x]*scale + shift);
+#endif
+        for( ; j < size.width; j++ )
+            dst[j] = saturate_cast<_Td>(std::abs(src[j]*a + b));
+    }
 }
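The tail handling in the wide loop above is repeated by every new kernel in this patch, so it is worth spelling out: when fewer than VECSZ elements remain in a row, the index steps back to size.width - VECSZ and reprocesses a few already-converted elements instead of running a scalar epilogue. That is only valid when the row is at least one vector wide and the conversion is out of place, hence the j == 0 || src == (_Ts*)dst bail-out to the scalar loop. A minimal sketch of the idea, with a plain scalar transform standing in for the universal intrinsics (convert_batch/convert_row are illustrative names, not part of the patch):

#include <cmath>

enum { VECSZ = 8 };                      // stand-in for v_float32::nlanes*2

// stand-in for one SIMD batch (vx_load_pair_as + v_fma + v_store_pair_as)
static void convert_batch(const float* src, float* dst, float a, float b)
{
    for (int k = 0; k < VECSZ; k++)
        dst[k] = std::abs(src[k]*a + b);
}

static void convert_row(const float* src, float* dst, int width, float a, float b)
{
    int j = 0;
    for (; j < width; j += VECSZ)
    {
        if (j > width - VECSZ)           // partial batch left at the end of the row
        {
            if (j == 0 || src == dst)    // row shorter than one vector, or in-place:
                break;                   // let the scalar loop below finish it
            j = width - VECSZ;           // otherwise step back and overlap with
        }                                // elements that were already converted
        convert_batch(src + j, dst + j, a, b);
    }
    for (; j < width; j++)               // scalar fallback
        dst[j] = std::abs(src[j]*a + b);
}

Overlapping the last batch keeps the hot loop branch-light and avoids a per-element epilogue; the price is that the kernel must be a pure per-element transform, which all of the cvt*/cvtabs* routines here are.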
-template<> void
-cvtScale_<short, int, float>( const short* src, size_t sstep,
-           int* dst, size_t dstep, Size size,
-           float scale, float shift )
+// variant for conversions 16f <-> ... w/o unrolling
+template<typename _Ts, typename _Td> inline void
+cvtabs1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
+             Size size, float a, float b )
 {
+#if CV_SIMD
+    v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
+    const int VECSZ = v_float32::nlanes*2;
+#endif
     sstep /= sizeof(src[0]);
     dstep /= sizeof(dst[0]);

-    for( ; size.height--; src += sstep, dst += dstep )
+    for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
     {
-        int x = 0;
-        #if CV_TRY_AVX2
-        if (CV_CPU_HAS_SUPPORT_AVX2)
+        int j = 0;
+#if CV_SIMD
+        for( ; j < size.width; j += VECSZ )
         {
-            opt_AVX2::cvtScale_s16s32f32Line_AVX2(src, dst, scale, shift, size.width);
-            continue;
-        }
-        #endif
-        #if CV_SIMD128
-        if (hasSIMD128())
-        {
-            v_float32x4 v_shift = v_setall_f32(shift);
-            v_float32x4 v_scale = v_setall_f32(scale);
-            int cWidth = v_int32x4::nlanes;
-            for (; x <= size.width - cWidth * 2; x += cWidth * 2)
+            if( j > size.width - VECSZ )
             {
-                v_int16x8 v_src = v_load(src + x);
-                v_int32x4 v_src1, v_src2;
-                v_expand(v_src, v_src1, v_src2);
-                v_float32x4 v_tmp1 = v_cvt_f32(v_src1);
-                v_float32x4 v_tmp2 = v_cvt_f32(v_src2);
-
-                v_tmp1 = v_tmp1 * v_scale + v_shift;
-                v_tmp2 = v_tmp2 * v_scale + v_shift;
-
-                v_store(dst + x, v_round(v_tmp1));
-                v_store(dst + x + cWidth, v_round(v_tmp2));
+                if( j == 0 || src == (_Ts*)dst )
+                    break;
+                j = size.width - VECSZ;
             }
+            v_float32 v0;
+            vx_load_as(src + j, v0);
+            v0 = v_fma(v0, va, vb);
+            v_store_as(dst + j, v_abs(v0));
         }
-        #endif
-
-        for(; x < size.width; x++ )
-            dst[x] = saturate_cast<int>(src[x]*scale + shift);
+#endif
+        for( ; j < size.width; j++ )
+            dst[j] = saturate_cast<_Td>(src[j]*a + b);
     }
 }

+template<typename _Ts, typename _Td> inline void
+cvt_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
+         Size size, float a, float b )
+{
+#if CV_SIMD
+    v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
+    const int VECSZ = v_float32::nlanes*2;
+#endif
+    sstep /= sizeof(src[0]);
+    dstep /= sizeof(dst[0]);
+
+    for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
+    {
+        int j = 0;
+#if CV_SIMD
+        for( ; j < size.width; j += VECSZ )
+        {
+            if( j > size.width - VECSZ )
+            {
+                if( j == 0 || src == (_Ts*)dst )
+                    break;
+                j = size.width - VECSZ;
+            }
+            v_float32 v0, v1;
+            vx_load_pair_as(src + j, v0, v1);
+            v0 = v_fma(v0, va, vb);
+            v1 = v_fma(v1, va, vb);
+            v_store_pair_as(dst + j, v0, v1);
+        }
+#endif
+        for( ; j < size.width; j++ )
+            dst[j] = saturate_cast<_Td>(src[j]*a + b);
+    }
+}
+
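The scalar fallback saturate_cast<_Td>(src[j]*a + b) defines the semantics the SIMD paths have to reproduce: round to nearest, then clamp to the destination range. A small, self-contained illustration of that behaviour (values chosen arbitrarily; not part of the patch):

#include <opencv2/core.hpp>
#include <cstdio>

int main()
{
    const float a = 0.5f, b = 10.f;
    const float src[] = { 100.f, 700.f, -80.f };
    for (int i = 0; i < 3; i++)
    {
        // rounds to nearest, then clamps to [0, 255]
        unsigned char d = cv::saturate_cast<unsigned char>(src[i]*a + b);
        std::printf("%6.1f -> %3d\n", src[i], (int)d);
    }
    // prints: 100.0 -> 60, 700.0 -> 255 (clamped), -80.0 -> 0 (clamped)
    return 0;
}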
+// variant for conversions 16f <-> ... w/o unrolling
+template<typename _Ts, typename _Td> inline void
+cvt1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
+          Size size, float a, float b )
+{
+#if CV_SIMD
+    v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
+    const int VECSZ = v_float32::nlanes;
+#endif
+    sstep /= sizeof(src[0]);
+    dstep /= sizeof(dst[0]);
+
+    for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
+    {
+        int j = 0;
+#if CV_SIMD
+        for( ; j < size.width; j += VECSZ )
+        {
+            if( j > size.width - VECSZ )
+            {
+                if( j == 0 || src == (_Ts*)dst )
+                    break;
+                j = size.width - VECSZ;
+            }
+            v_float32 v0;
+            vx_load_as(src + j, v0);
+            v0 = v_fma(v0, va, vb);
+            v_store_as(dst + j, v0);
+        }
+#endif
+        for( ; j < size.width; j++ )
+            dst[j] = saturate_cast<_Td>(src[j]*a + b);
+    }
+}
+
+
+template<typename _Ts, typename _Td> inline void
+cvt_64f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
+         Size size, double a, double b )
+{
+#if CV_SIMD_64F
+    v_float64 va = vx_setall_f64(a), vb = vx_setall_f64(b);
+    const int VECSZ = v_float64::nlanes*2;
+#endif
+    sstep /= sizeof(src[0]);
+    dstep /= sizeof(dst[0]);
+
+    for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
+    {
+        int j = 0;
+#if CV_SIMD_64F
+        for( ; j < size.width; j += VECSZ )
+        {
+            if( j > size.width - VECSZ )
+            {
+                if( j == 0 || src == (_Ts*)dst )
+                    break;
+                j = size.width - VECSZ;
+            }
+            v_float64 v0, v1;
+            vx_load_pair_as(src + j, v0, v1);
+            v0 = v_fma(v0, va, vb);
+            v1 = v_fma(v1, va, vb);
+            v_store_pair_as(dst + j, v0, v1);
+        }
+#endif
+        for( ; j < size.width; j++ )
+            dst[j] = saturate_cast<_Td>(src[j]*a + b);
+    }
+}

 //==================================================================================================

-#define DEF_CVT_SCALE_ABS_FUNC(suffix, tfunc, stype, dtype, wtype) \
+#define DEF_CVT_SCALE_ABS_FUNC(suffix, cvt, stype, dtype, wtype) \
 static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
-                         dtype* dst, size_t dstep, Size size, double* scale) \
+                                  dtype* dst, size_t dstep, Size size, double* scale) \
 { \
-    tfunc(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \
+    cvt(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \
 }

-#define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \
+#define DEF_CVT_SCALE_FUNC(suffix, cvt, stype, dtype, wtype) \
 static void cvtScale##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
-dtype* dst, size_t dstep, Size size, double* scale) \
+                              dtype* dst, size_t dstep, Size size, double* scale) \
 { \
-    cvtScale_(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \
+    cvt(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \
 }

-DEF_CVT_SCALE_ABS_FUNC(8u, cvtScaleAbs_, uchar, uchar, float)
-DEF_CVT_SCALE_ABS_FUNC(8s8u, cvtScaleAbs_, schar, uchar, float)
-DEF_CVT_SCALE_ABS_FUNC(16u8u, cvtScaleAbs_, ushort, uchar, float)
-DEF_CVT_SCALE_ABS_FUNC(16s8u, cvtScaleAbs_, short, uchar, float)
-DEF_CVT_SCALE_ABS_FUNC(32s8u, cvtScaleAbs_, int, uchar, float)
-DEF_CVT_SCALE_ABS_FUNC(32f8u, cvtScaleAbs_, float, uchar, float)
-DEF_CVT_SCALE_ABS_FUNC(64f8u, cvtScaleAbs_, double, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(8u, cvtabs_32f, uchar, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(8s8u, cvtabs_32f, schar, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(16u8u, cvtabs_32f, ushort, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(16s8u, cvtabs_32f, short, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(32s8u, cvtabs_32f, int, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(32f8u, cvtabs_32f, float, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(64f8u, cvtabs_32f, double, uchar, float)
+DEF_CVT_SCALE_FUNC(8u,
cvt_32f, uchar, uchar, float) +DEF_CVT_SCALE_FUNC(8s8u, cvt_32f, schar, uchar, float) +DEF_CVT_SCALE_FUNC(16u8u, cvt_32f, ushort, uchar, float) +DEF_CVT_SCALE_FUNC(16s8u, cvt_32f, short, uchar, float) +DEF_CVT_SCALE_FUNC(32s8u, cvt_32f, int, uchar, float) +DEF_CVT_SCALE_FUNC(32f8u, cvt_32f, float, uchar, float) +DEF_CVT_SCALE_FUNC(64f8u, cvt_32f, double, uchar, float) +//DEF_CVT_SCALE_FUNC(16f8u, cvt_32f, float16_t, uchar, float) -DEF_CVT_SCALE_FUNC(8u, uchar, uchar, float) -DEF_CVT_SCALE_FUNC(8s8u, schar, uchar, float) -DEF_CVT_SCALE_FUNC(16u8u, ushort, uchar, float) -DEF_CVT_SCALE_FUNC(16s8u, short, uchar, float) -DEF_CVT_SCALE_FUNC(32s8u, int, uchar, float) -DEF_CVT_SCALE_FUNC(32f8u, float, uchar, float) -DEF_CVT_SCALE_FUNC(64f8u, double, uchar, float) +DEF_CVT_SCALE_FUNC(8u8s, cvt_32f, uchar, schar, float) +DEF_CVT_SCALE_FUNC(8s, cvt_32f, schar, schar, float) +DEF_CVT_SCALE_FUNC(16u8s, cvt_32f, ushort, schar, float) +DEF_CVT_SCALE_FUNC(16s8s, cvt_32f, short, schar, float) +DEF_CVT_SCALE_FUNC(32s8s, cvt_32f, int, schar, float) +DEF_CVT_SCALE_FUNC(32f8s, cvt_32f, float, schar, float) +DEF_CVT_SCALE_FUNC(64f8s, cvt_32f, double, schar, float) +//DEF_CVT_SCALE_FUNC(16f8s, cvt_32f, float16_t, schar, float) -DEF_CVT_SCALE_FUNC(8u8s, uchar, schar, float) -DEF_CVT_SCALE_FUNC(8s, schar, schar, float) -DEF_CVT_SCALE_FUNC(16u8s, ushort, schar, float) -DEF_CVT_SCALE_FUNC(16s8s, short, schar, float) -DEF_CVT_SCALE_FUNC(32s8s, int, schar, float) -DEF_CVT_SCALE_FUNC(32f8s, float, schar, float) -DEF_CVT_SCALE_FUNC(64f8s, double, schar, float) +DEF_CVT_SCALE_FUNC(8u16u, cvt_32f, uchar, ushort, float) +DEF_CVT_SCALE_FUNC(8s16u, cvt_32f, schar, ushort, float) +DEF_CVT_SCALE_FUNC(16u, cvt_32f, ushort, ushort, float) +DEF_CVT_SCALE_FUNC(16s16u, cvt_32f, short, ushort, float) +DEF_CVT_SCALE_FUNC(32s16u, cvt_32f, int, ushort, float) +DEF_CVT_SCALE_FUNC(32f16u, cvt_32f, float, ushort, float) +DEF_CVT_SCALE_FUNC(64f16u, cvt_32f, double, ushort, float) +//DEF_CVT_SCALE_FUNC(16f16u, cvt1_32f, float16_t, ushort, float) -DEF_CVT_SCALE_FUNC(8u16u, uchar, ushort, float) -DEF_CVT_SCALE_FUNC(8s16u, schar, ushort, float) -DEF_CVT_SCALE_FUNC(16u, ushort, ushort, float) -DEF_CVT_SCALE_FUNC(16s16u, short, ushort, float) -DEF_CVT_SCALE_FUNC(32s16u, int, ushort, float) -DEF_CVT_SCALE_FUNC(32f16u, float, ushort, float) -DEF_CVT_SCALE_FUNC(64f16u, double, ushort, float) +DEF_CVT_SCALE_FUNC(8u16s, cvt_32f, uchar, short, float) +DEF_CVT_SCALE_FUNC(8s16s, cvt_32f, schar, short, float) +DEF_CVT_SCALE_FUNC(16u16s, cvt_32f, ushort, short, float) +DEF_CVT_SCALE_FUNC(16s, cvt_32f, short, short, float) +DEF_CVT_SCALE_FUNC(32s16s, cvt_32f, int, short, float) +DEF_CVT_SCALE_FUNC(32f16s, cvt_32f, float, short, float) +DEF_CVT_SCALE_FUNC(64f16s, cvt_32f, double, short, float) +//DEF_CVT_SCALE_FUNC(16f16s, cvt1_32f, float16_t, short, float) -DEF_CVT_SCALE_FUNC(8u16s, uchar, short, float) -DEF_CVT_SCALE_FUNC(8s16s, schar, short, float) -DEF_CVT_SCALE_FUNC(16u16s, ushort, short, float) -DEF_CVT_SCALE_FUNC(16s, short, short, float) -DEF_CVT_SCALE_FUNC(32s16s, int, short, float) -DEF_CVT_SCALE_FUNC(32f16s, float, short, float) -DEF_CVT_SCALE_FUNC(64f16s, double, short, float) +DEF_CVT_SCALE_FUNC(8u32s, cvt_32f, uchar, int, float) +DEF_CVT_SCALE_FUNC(8s32s, cvt_32f, schar, int, float) +DEF_CVT_SCALE_FUNC(16u32s, cvt_32f, ushort, int, float) +DEF_CVT_SCALE_FUNC(16s32s, cvt_32f, short, int, float) +DEF_CVT_SCALE_FUNC(32s, cvt_64f, int, int, double) +DEF_CVT_SCALE_FUNC(32f32s, cvt_32f, float, int, float) +DEF_CVT_SCALE_FUNC(64f32s, cvt_64f, double, 
int, double) +//DEF_CVT_SCALE_FUNC(16f32s, cvt1_32f, float16_t, int, float) -DEF_CVT_SCALE_FUNC(8u32s, uchar, int, float) -DEF_CVT_SCALE_FUNC(8s32s, schar, int, float) -DEF_CVT_SCALE_FUNC(16u32s, ushort, int, float) -DEF_CVT_SCALE_FUNC(16s32s, short, int, float) -DEF_CVT_SCALE_FUNC(32s, int, int, double) -DEF_CVT_SCALE_FUNC(32f32s, float, int, float) -DEF_CVT_SCALE_FUNC(64f32s, double, int, double) +DEF_CVT_SCALE_FUNC(8u32f, cvt_32f, uchar, float, float) +DEF_CVT_SCALE_FUNC(8s32f, cvt_32f, schar, float, float) +DEF_CVT_SCALE_FUNC(16u32f, cvt_32f, ushort, float, float) +DEF_CVT_SCALE_FUNC(16s32f, cvt_32f, short, float, float) +DEF_CVT_SCALE_FUNC(32s32f, cvt_32f, int, float, float) +DEF_CVT_SCALE_FUNC(32f, cvt_32f, float, float, float) +DEF_CVT_SCALE_FUNC(64f32f, cvt_64f, double, float, double) +//DEF_CVT_SCALE_FUNC(16f32f, cvt1_32f, float16_t, float, float) -DEF_CVT_SCALE_FUNC(8u32f, uchar, float, float) -DEF_CVT_SCALE_FUNC(8s32f, schar, float, float) -DEF_CVT_SCALE_FUNC(16u32f, ushort, float, float) -DEF_CVT_SCALE_FUNC(16s32f, short, float, float) -DEF_CVT_SCALE_FUNC(32s32f, int, float, double) -DEF_CVT_SCALE_FUNC(32f, float, float, float) -DEF_CVT_SCALE_FUNC(64f32f, double, float, double) +DEF_CVT_SCALE_FUNC(8u64f, cvt_64f, uchar, double, double) +DEF_CVT_SCALE_FUNC(8s64f, cvt_64f, schar, double, double) +DEF_CVT_SCALE_FUNC(16u64f, cvt_64f, ushort, double, double) +DEF_CVT_SCALE_FUNC(16s64f, cvt_64f, short, double, double) +DEF_CVT_SCALE_FUNC(32s64f, cvt_64f, int, double, double) +DEF_CVT_SCALE_FUNC(32f64f, cvt_64f, float, double, double) +DEF_CVT_SCALE_FUNC(64f, cvt_64f, double, double, double) +//DEF_CVT_SCALE_FUNC(16f64f, cvt_64f, float16_t, double, double) -DEF_CVT_SCALE_FUNC(8u64f, uchar, double, double) -DEF_CVT_SCALE_FUNC(8s64f, schar, double, double) -DEF_CVT_SCALE_FUNC(16u64f, ushort, double, double) -DEF_CVT_SCALE_FUNC(16s64f, short, double, double) -DEF_CVT_SCALE_FUNC(32s64f, int, double, double) -DEF_CVT_SCALE_FUNC(32f64f, float, double, double) -DEF_CVT_SCALE_FUNC(64f, double, double, double) +/*DEF_CVT_SCALE_FUNC(8u16f, cvt1_32f, uchar, float16_t, float) +DEF_CVT_SCALE_FUNC(8s16f, cvt1_32f, schar, float16_t, float) +DEF_CVT_SCALE_FUNC(16u16f, cvt1_32f, ushort, float16_t, float) +DEF_CVT_SCALE_FUNC(16s16f, cvt1_32f, short, float16_t, float) +DEF_CVT_SCALE_FUNC(32s16f, cvt1_32f, int, float16_t, float) +DEF_CVT_SCALE_FUNC(32f16f, cvt1_32f, float, float16_t, float) +DEF_CVT_SCALE_FUNC(64f16f, cvt_64f, double, float16_t, double) +DEF_CVT_SCALE_FUNC(16f, cvt1_32f, float16_t, float16_t, float)*/ static BinaryFunc getCvtScaleAbsFunc(int depth) { @@ -1651,41 +306,44 @@ BinaryFunc getConvertScaleFunc(int sdepth, int ddepth) { (BinaryFunc)GET_OPTIMIZED(cvtScale8u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8u), - (BinaryFunc)cvtScale64f8u, 0 + (BinaryFunc)cvtScale64f8u, 0 //(BinaryFunc)cvtScale16f8u }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8s), - (BinaryFunc)cvtScale64f8s, 0 + (BinaryFunc)cvtScale64f8s, 0 //(BinaryFunc)cvtScale16f8s }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u16u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16s16u), 
(BinaryFunc)GET_OPTIMIZED(cvtScale32s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16u), - (BinaryFunc)cvtScale64f16u, 0 + (BinaryFunc)cvtScale64f16u, 0 //(BinaryFunc)cvtScale16f16u }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16s), - (BinaryFunc)cvtScale64f16s, 0 + (BinaryFunc)cvtScale64f16s, 0 //(BinaryFunc)cvtScale16f16s }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u32s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f32s), - (BinaryFunc)cvtScale64f32s, 0 + (BinaryFunc)cvtScale64f32s, 0 //(BinaryFunc)cvtScale16f32s }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32f), - (BinaryFunc)cvtScale64f32f, 0 + (BinaryFunc)cvtScale64f32f, 0 //(BinaryFunc)cvtScale16f32f }, { (BinaryFunc)cvtScale8u64f, (BinaryFunc)cvtScale8s64f, (BinaryFunc)cvtScale16u64f, (BinaryFunc)cvtScale16s64f, (BinaryFunc)cvtScale32s64f, (BinaryFunc)cvtScale32f64f, - (BinaryFunc)cvtScale64f, 0 + (BinaryFunc)cvtScale64f, 0 //(BinaryFunc)cvtScale16f64f }, { 0, 0, 0, 0, 0, 0, 0, 0 - } + /*(BinaryFunc)cvtScale8u16f, (BinaryFunc)cvtScale8s16f, (BinaryFunc)cvtScale16u16f, + (BinaryFunc)cvtScale16s16f, (BinaryFunc)cvtScale32s16f, (BinaryFunc)cvtScale32f16f, + (BinaryFunc)cvtScale64f16f, (BinaryFunc)cvtScale16f*/ + }, }; return cvtScaleTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)]; diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index a1409f0979..6666bc4253 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -1123,7 +1123,6 @@ template struct TheTest return *this; } -#if CV_FP16 TheTest & test_loadstore_fp16_f32() { printf("test_loadstore_fp16_f32 ...\n"); @@ -1133,14 +1132,14 @@ template struct TheTest AlignedData data_f32; data_f32.a.clear(); AlignedData out; - R r1 = vx_load_fp16_f32((short*)data.a.d); + R r1 = vx_load_expand((const cv::float16_t*)data.a.d); R r2(r1); EXPECT_EQ(1.0f, r1.get0()); vx_store(data_f32.a.d, r2); EXPECT_EQ(-2.0f, data_f32.a.d[R::nlanes - 1]); out.a.clear(); - vx_store_fp16((short*)out.a.d, r2); + v_pack_store((cv::float16_t*)out.a.d, r2); for (int i = 0; i < R::nlanes; ++i) { EXPECT_EQ(data.a[i], out.a[i]) << "i=" << i; @@ -1148,9 +1147,8 @@ template struct TheTest return *this; } -#endif -#if CV_SIMD_FP16 +#if 0 TheTest & test_loadstore_fp16() { printf("test_loadstore_fp16 ...\n"); @@ -1165,7 +1163,7 @@ template struct TheTest // check some initialization methods R r1 = data.u; - R r2 = vx_load_f16(data.a.d); + R r2 = vx_load_expand((const float16_t*)data.a.d); R r3(r2); EXPECT_EQ(data.u[0], r1.get0()); EXPECT_EQ(data.a[0], r2.get0()); diff --git a/modules/core/test/test_math.cpp b/modules/core/test/test_math.cpp index 68dfc3c969..610d16252a 100644 --- a/modules/core/test/test_math.cpp +++ b/modules/core/test/test_math.cpp @@ -3230,6 +3230,22 @@ softdouble naiveExp(softdouble x) } } +static float makeFP32(int sign, int exponent, int significand) +{ + Cv32suf x; + x.u = (unsigned)(((sign & 1) << 31) | ((exponent&255) << 23) | (significand & 0x7fffff)); + 
return x.f; +} + +static float makeRandomFP32(RNG& rng, int sign, int exprange) +{ + if( sign == -1 ) + sign = rng() % 2; + int exponent = rng() % exprange; + int significand = rng() % (1 << 23); + return makeFP32(sign, exponent, significand); +} + TEST(Core_SoftFloat, exp32) { //special cases @@ -3246,13 +3262,11 @@ TEST(Core_SoftFloat, exp32) inputs.push_back(softfloat::min()); for(int i = 0; i < 50000; i++) { - Cv32suf x; - x.fmt.sign = rng() % 2; - x.fmt.exponent = rng() % (10 + 127); //bigger exponent will produce inf - x.fmt.significand = rng() % (1 << 23); - if(softfloat(x.f) > ln_max) - x.f = rng.uniform(0.0f, (float)ln_max); - inputs.push_back(softfloat(x.f)); + float x = makeRandomFP32(rng, -1, 10+127 //bigger exponent will produce inf + ); + if(softfloat(x) > ln_max) + x = rng.uniform(0.0f, (float)ln_max); + inputs.push_back(softfloat(x)); } for(size_t i = 0; i < inputs.size(); i++) @@ -3323,11 +3337,7 @@ TEST(Core_SoftFloat, log32) EXPECT_TRUE(log(softfloat::nan()).isNaN()); for(int i = 0; i < nValues; i++) { - Cv32suf x; - x.fmt.sign = 1; - x.fmt.exponent = rng() % 255; - x.fmt.significand = rng() % (1 << 23); - softfloat x32(x.f); + softfloat x32(makeRandomFP32(rng, 1, 255)); ASSERT_TRUE(log(x32).isNaN()); } EXPECT_TRUE(log(softfloat::zero()).isInf()); @@ -3340,11 +3350,7 @@ TEST(Core_SoftFloat, log32) inputs.push_back(softfloat::max()); for(int i = 0; i < nValues; i++) { - Cv32suf x; - x.fmt.sign = 0; - x.fmt.exponent = rng() % 255; - x.fmt.significand = rng() % (1 << 23); - inputs.push_back(softfloat(x.f)); + inputs.push_back(softfloat(makeRandomFP32(rng, 0, 255))); } for(size_t i = 0; i < inputs.size(); i++) @@ -3426,11 +3432,7 @@ TEST(Core_SoftFloat, cbrt32) inputs.push_back(softfloat::min()); for(int i = 0; i < 50000; i++) { - Cv32suf x; - x.fmt.sign = rng() % 2; - x.fmt.exponent = rng() % 255; - x.fmt.significand = rng() % (1 << 23); - inputs.push_back(softfloat(x.f)); + inputs.push_back(softfloat(makeRandomFP32(rng, -1, 255))); } for(size_t i = 0; i < inputs.size(); i++) @@ -3522,11 +3524,8 @@ TEST(Core_SoftFloat, pow32) // inf ** y == inf, if y > 0 for(size_t i = 0; i < nValues; i++) { - Cv32suf x; - x.fmt.sign = 0; - x.fmt.exponent = rng() % 255; - x.fmt.significand = rng() % (1 << 23); - softfloat x32 = softfloat(x.f); + float x = makeRandomFP32(rng, 0, 255); + softfloat x32 = softfloat(x); ASSERT_TRUE(pow( inf, x32).isInf()); ASSERT_TRUE(pow(-inf, x32).isInf()); ASSERT_EQ(pow( inf, -x32), zero); @@ -3538,17 +3537,9 @@ TEST(Core_SoftFloat, pow32) // x ** y == nan, if x < 0 and y is not integer for(size_t i = 0; i < nValues; i++) { - Cv32suf x; - x.fmt.sign = 1; - x.fmt.exponent = rng() % 255; - x.fmt.significand = rng() % (1 << 23); - softfloat x32(x.f); - Cv32suf y; - y.fmt.sign = rng() % 2; - //bigger exponent produces integer numbers only - y.fmt.exponent = rng() % (23 + 127); - y.fmt.significand = rng() % (1 << 23); - softfloat y32(y.f); + softfloat x32(makeRandomFP32(rng, 1, 255)); + softfloat y32(makeRandomFP32(rng, -1, 23+127 //bigger exponent produces integer numbers only + )); int yi = cvRound(y32); if(y32 != softfloat(yi)) ASSERT_TRUE(pow(x32, y32).isNaN()); @@ -3565,11 +3556,7 @@ TEST(Core_SoftFloat, pow32) // 0 ** y == 0, if y > 0 for(size_t i = 0; i < nValues; i++) { - Cv32suf x; - x.fmt.sign = 0; - x.fmt.exponent = rng() % 255; - x.fmt.significand = rng() % (1 << 23); - softfloat x32(x.f); + softfloat x32(makeRandomFP32(rng, 0, 255)); ASSERT_TRUE(pow(zero, -x32).isInf()); if(x32 != one) { diff --git a/platforms/ios/build_framework.py 
b/platforms/ios/build_framework.py old mode 100644 new mode 100755 index 32305f9a08..d624e08d90 --- a/platforms/ios/build_framework.py +++ b/platforms/ios/build_framework.py @@ -183,7 +183,7 @@ class Builder: cmakecmd = self.getCMakeArgs(arch, target) + \ (["-DCMAKE_TOOLCHAIN_FILE=%s" % toolchain] if toolchain is not None else []) if target.lower().startswith("iphoneos"): - cmakecmd.append("-DENABLE_NEON=ON") + cmakecmd.append("-DCPU_BASELINE=NEON;FP16") cmakecmd.append(self.opencv) cmakecmd.extend(cmakeargs) execute(cmakecmd, cwd = builddir)
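A usage-level note (illustrative only, not part of the patch): the kernels above sit behind Mat::convertTo() and cv::convertScaleAbs(), so a typical call that now dispatches through cvt_32f / cvtabs_32f (via the cvtScale32f8u / cvtScaleAbs32f8u entries in the tables) looks like this:

#include <opencv2/core.hpp>

int main()
{
    cv::Mat src(480, 640, CV_32FC1);
    cv::randu(src, cv::Scalar::all(-1000), cv::Scalar::all(1000));

    // dst8u = saturate_cast<uchar>(src*alpha + beta), row by row
    cv::Mat dst8u;
    src.convertTo(dst8u, CV_8U, 0.1, 128.0);

    // same scaling, but the absolute value is taken before saturating
    cv::Mat dstAbs;
    cv::convertScaleAbs(src, dstAbs, 0.1, 128.0);
    return 0;
}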