From 80b62a41c6f65ccae09610284a45c8f935667d19 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Thu, 6 Sep 2018 19:36:59 +0300 Subject: [PATCH] Merge pull request #12411 from vpisarev:wide_convert * rewrote Mat::convertTo() and convertScaleAbs() to wide universal intrinsics; added always-available and SIMD-optimized FP16<=>FP32 conversion * fixed compile warnings * fix some more compile errors * slightly relaxed accuracy threshold for int->float conversion (since we now do it using single-precision arithmetics, not double-precision) * fixed compile errors on iOS, Android and in the baseline C++ version (intrin_cpp.hpp) * trying to fix ARM-neon builds * trying to fix ARM-neon builds * trying to fix ARM-neon builds * trying to fix ARM-neon builds --- modules/core/include/opencv2/core/cvdef.h | 122 +- .../core/include/opencv2/core/hal/intrin.hpp | 26 +- .../include/opencv2/core/hal/intrin_avx.hpp | 23 +- .../include/opencv2/core/hal/intrin_cpp.hpp | 22 + .../include/opencv2/core/hal/intrin_neon.hpp | 131 +- .../include/opencv2/core/hal/intrin_sse.hpp | 46 +- .../include/opencv2/core/hal/intrin_vsx.hpp | 18 + modules/core/perf/perf_addWeighted.cpp | 5 +- modules/core/perf/perf_convertTo.cpp | 2 +- modules/core/src/convert.avx2.cpp | 40 - modules/core/src/convert.cpp | 1482 +++----------- modules/core/src/convert.fp16.cpp | 126 -- modules/core/src/convert.hpp | 566 +++-- modules/core/src/convert.sse4_1.cpp | 203 -- modules/core/src/convert_scale.cpp | 1822 +++-------------- modules/core/test/test_intrin_utils.hpp | 10 +- modules/core/test/test_math.cpp | 73 +- platforms/ios/build_framework.py | 2 +- 18 files changed, 1178 insertions(+), 3541 deletions(-) delete mode 100644 modules/core/src/convert.avx2.cpp delete mode 100644 modules/core/src/convert.fp16.cpp delete mode 100644 modules/core/src/convert.sse4_1.cpp mode change 100644 => 100755 platforms/ios/build_framework.py diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 56403b3191..b9e5bce222 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -219,15 +219,10 @@ enum CpuFeatures { typedef union Cv16suf { short i; + ushort u; #if CV_FP16_TYPE __fp16 h; #endif - struct _fp16Format - { - unsigned int significand : 10; - unsigned int exponent : 5; - unsigned int sign : 1; - } fmt; } Cv16suf; @@ -236,12 +231,6 @@ typedef union Cv32suf int i; unsigned u; float f; - struct _fp32Format - { - unsigned int significand : 23; - unsigned int exponent : 8; - unsigned int sign : 1; - } fmt; } Cv32suf; @@ -548,6 +537,115 @@ typedef ::uint64_t uint64_t; #include #endif +#ifdef __cplusplus +namespace cv +{ + +class float16_t +{ +public: +#if CV_FP16_TYPE + + float16_t() {} + explicit float16_t(float x) { h = (__fp16)x; } + operator float() const { return (float)h; } + static float16_t fromBits(ushort w) + { + Cv16suf u; + u.u = w; + float16_t result; + result.h = u.h; + return result; + } + static float16_t zero() + { + float16_t result; + result.h = (__fp16)0; + return result; + } + ushort bits() const + { + Cv16suf u; + u.h = h; + return u.u; + } +protected: + __fp16 h; + +#else + float16_t() {} + explicit float16_t(float x) + { + #if CV_AVX2 + __m128 v = _mm_load_ss(&x); + w = (ushort)_mm_cvtsi128_si32(_mm_cvtps_ph(v, 0)); + #else + Cv32suf in; + in.f = x; + unsigned sign = in.u & 0x80000000; + in.u ^= sign; + + if( in.u >= 0x47800000 ) + w = (ushort)(in.u > 0x7f800000 ? 
0x7e00 : 0x7c00); + else + { + if (in.u < 0x38800000) + { + in.f += 0.5f; + w = (ushort)(in.u - 0x3f000000); + } + else + { + unsigned t = in.u + 0xc8000fff; + w = (ushort)((t + ((in.u >> 13) & 1)) >> 13); + } + } + + w = (ushort)(w | (sign >> 16)); + #endif + } + + operator float() const + { + #if CV_AVX2 + float f; + _mm_store_ss(&f, _mm_cvtph_ps(_mm_cvtsi32_si128(w))); + return f; + #else + Cv32suf out; + + unsigned t = ((w & 0x7fff) << 13) + 0x38000000; + unsigned sign = (w & 0x8000) << 16; + unsigned e = w & 0x7c00; + + out.u = t + (1 << 23); + out.u = (e >= 0x7c00 ? t + 0x38000000 : + e == 0 ? (out.f -= 6.103515625e-05f, out.u) : t) | sign; + return out.f; + #endif + } + + static float16_t fromBits(ushort b) + { + float16_t result; + result.w = b; + return result; + } + static float16_t zero() + { + float16_t result; + result.w = (ushort)0; + return result; + } + ushort bits() const { return w; } +protected: + ushort w; + +#endif +}; + +} +#endif //! @} diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 6505f255cb..a321627081 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -252,7 +252,8 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(unsigned, v_uint64, prefix) \ CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, prefix, load) \ CV_INTRIN_DEFINE_WIDE_INTRIN(int64, v_int64, s64, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load) + CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(float16_t, v_float32, prefix) template struct V_RegTraits { @@ -286,9 +287,6 @@ template struct V_RegTraits #if CV_SIMD128_64F CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4); #endif -#if CV_SIMD128_FP16 - CV_DEF_REG_TRAITS(v, v_float16x8, short, f16, v_float16x8, void, void, v_int16x8, v_int16x8); -#endif #endif #if CV_SIMD256 @@ -302,9 +300,6 @@ template struct V_RegTraits CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void); CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void); CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8); -#if CV_SIMD256_FP16 - CV_DEF_REG_TRAITS(v256, v_float16x16, short, f16, v_float16x16, void, void, v_int16x16, void); -#endif #endif #if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512) @@ -335,14 +330,6 @@ namespace CV__SIMD_NAMESPACE { #if CV_SIMD256_64F typedef v_float64x4 v_float64; #endif - #if CV_FP16 - #define vx_load_fp16_f32 v256_load_fp16_f32 - #define vx_store_fp16 v_store_fp16 - #endif - #if CV_SIMD256_FP16 - typedef v_float16x16 v_float16; - CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v256, load_f16) - #endif CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256) CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load) inline void vx_cleanup() { v256_cleanup(); } @@ -353,7 +340,6 @@ using namespace CV__SIMD_NAMESPACE; namespace CV__SIMD_NAMESPACE { #define CV_SIMD CV_SIMD128 #define CV_SIMD_64F CV_SIMD128_64F - #define CV_SIMD_FP16 CV_SIMD128_FP16 #define CV_SIMD_WIDTH 16 typedef v_uint8x16 v_uint8; typedef v_int8x16 v_int8; @@ -367,14 +353,6 @@ namespace CV__SIMD_NAMESPACE { #if CV_SIMD128_64F typedef v_float64x2 v_float64; #endif - #if CV_FP16 - #define vx_load_fp16_f32 v128_load_fp16_f32 - #define vx_store_fp16 
v_store_fp16 - #endif - #if CV_SIMD128_FP16 - typedef v_float16x8 v_float16; - CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v, load_f16) - #endif CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v) #if CV_SIMD128_64F CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index c21b46a58f..a38c25e385 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -1414,10 +1414,17 @@ inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b) { return v_int8x32(_v256_shuffle_odd_64(_mm256_packs_epi16(a.val, b.val))); } inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b) -{ return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val))); } +{ + __m256i t = _mm256_set1_epi16(255); + __m256i a1 = _mm256_min_epu16(a.val, t); + __m256i b1 = _mm256_min_epu16(b.val, t); + return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a1, b1))); +} inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b) -{ return v_pack(v_reinterpret_as_u16(a), v_reinterpret_as_u16(b)); } +{ + return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val))); +} inline void v_pack_store(schar* ptr, const v_int16x16& a) { v_store_low(ptr, v_pack(a, a)); } @@ -2390,6 +2397,18 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, un OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64) OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64) +// FP16 +inline v_float32x8 v256_load_expand(const float16_t* ptr) +{ + return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr))); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x8& a) +{ + __m128i ah = _mm256_cvtps_ph(a.val, 0); + _mm_storeu_si128((__m128i*)ptr, ah); +} + inline void v256_cleanup() { _mm256_zeroupper(); } //! @name Check SIMD256 support diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index ccd317682d..64a457a530 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -2062,6 +2062,28 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]); } +////// FP16 suport /////// + +inline v_reg::nlanes128> +v_load_expand(const float16_t* ptr) +{ + v_reg::nlanes128> v; + for( int i = 0; i < v.nlanes; i++ ) + { + v.s[i] = ptr[i]; + } + return v; +} + +inline void +v_pack_store(float16_t* ptr, v_reg::nlanes128>& v) +{ + for( int i = 0; i < v.nlanes; i++ ) + { + ptr[i] = float16_t(v.s[i]); + } +} + inline void v_cleanup() {} //! 
@} diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index c017b075f1..d87b4e2ba0 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -62,15 +62,6 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN #define CV_SIMD128_64F 0 #endif -#ifndef CV_SIMD128_FP16 -# if CV_FP16 && (defined(__GNUC__) && __GNUC__ >= 5) // #12027: float16x8_t is missing in GCC 4.8.2 -# define CV_SIMD128_FP16 1 -# endif -#endif -#ifndef CV_SIMD128_FP16 -# define CV_SIMD128_FP16 0 -#endif - #if CV_SIMD128_64F #define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \ template static inline \ @@ -329,53 +320,6 @@ inline void v_store_fp16(short* ptr, const v_float32x4& a) } #endif - -#if CV_SIMD128_FP16 -// Workaround for old compilers -static inline int16x8_t vreinterpretq_s16_f16(float16x8_t a) { return (int16x8_t)a; } -static inline float16x8_t vreinterpretq_f16_s16(int16x8_t a) { return (float16x8_t)a; } - -static inline float16x8_t cv_vld1q_f16(const void* ptr) -{ -#ifndef vld1q_f16 // APPLE compiler defines vld1_f16 as macro - return vreinterpretq_f16_s16(vld1q_s16((const short*)ptr)); -#else - return vld1q_f16((const __fp16*)ptr); -#endif -} -static inline void cv_vst1q_f16(void* ptr, float16x8_t a) -{ -#ifndef vst1q_f16 // APPLE compiler defines vst1_f16 as macro - vst1q_s16((short*)ptr, vreinterpretq_s16_f16(a)); -#else - vst1q_f16((__fp16*)ptr, a); -#endif -} - -struct v_float16x8 -{ - typedef short lane_type; - enum { nlanes = 8 }; - - v_float16x8() {} - explicit v_float16x8(float16x8_t v) : val(v) {} - v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) - { - short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; - val = cv_vld1q_f16(v); - } - short get0() const - { - return vgetq_lane_s16(vreinterpretq_s16_f16(val), 0); - } - float16x8_t val; -}; - -inline v_float16x8 v_setzero_f16() { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16((short)0))); } -inline v_float16x8 v_setall_f16(short v) { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16(v))); } - -#endif // CV_SIMD128_FP16 - #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \ inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \ inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \ @@ -934,24 +878,6 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32) OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64) #endif -#if CV_SIMD128_FP16 -// Workaround for old comiplers -inline v_float16x8 v_load_f16(const short* ptr) -{ return v_float16x8(cv_vld1q_f16(ptr)); } -inline v_float16x8 v_load_f16_aligned(const short* ptr) -{ return v_float16x8(cv_vld1q_f16(ptr)); } - -inline v_float16x8 v_load_f16_low(const short* ptr) -{ return v_float16x8(vcombine_f16(cv_vld1_f16(ptr), vdup_n_f16((float16_t)0))); } -inline v_float16x8 v_load_f16_halves(const short* ptr0, const short* ptr1) -{ return v_float16x8(vcombine_f16(cv_vld1_f16(ptr0), cv_vld1_f16(ptr1))); } - -inline void v_store(short* ptr, const v_float16x8& a) -{ cv_vst1q_f16(ptr, a.val); } -inline void v_store_aligned(short* ptr, const v_float16x8& a) -{ cv_vst1q_f16(ptr, a.val); } -#endif - #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \ inline scalartype v_reduce_##func(const _Tpvec& a) \ { \ @@ -1507,22 +1433,6 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) } #endif -#if CV_SIMD128_FP16 -inline 
v_float32x4 v_cvt_f32(const v_float16x8& a) -{ - return v_float32x4(vcvt_f32_f16(vget_low_f16(a.val))); -} -inline v_float32x4 v_cvt_f32_high(const v_float16x8& a) -{ - return v_float32x4(vcvt_f32_f16(vget_high_f16(a.val))); -} - -inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b) -{ - return v_float16x8(vcombine_f16(vcvt_f16_f32(a.val), vcvt_f16_f32(b.val))); -} -#endif - ////////////// Lookup table access //////////////////// inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) @@ -1588,6 +1498,47 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo } #endif +////// FP16 suport /////// +#if CV_FP16 +inline v_float32x4 v_load_expand(const float16_t* ptr) +{ + float16x4_t v = + #ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro + (float16x4_t)vld1_s16((const short*)ptr); + #else + vld1_f16((const __fp16*)ptr); + #endif + return v_float32x4(vcvt_f32_f16(v)); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +{ + float16x4_t hv = vcvt_f16_f32(v.val); + + #ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro + vst1_s16((short*)ptr, (int16x4_t)hv); + #else + vst1_f16((__fp16*)ptr, hv); + #endif +} +#else +inline v_float32x4 v_load_expand(const float16_t* ptr) +{ + const int N = 4; + float buf[N]; + for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i]; + return v_load(buf); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +{ + const int N = 4; + float buf[N]; + v_store(buf, v); + for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]); +} +#endif + inline void v_cleanup() {} //! @name Check SIMD support diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 159ef356b5..29c4f646ec 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -404,7 +404,7 @@ void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a) inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b) { return v_int8x16(_mm_packs_epi16(a.val, b.val)); } -inline void v_pack_store(schar* ptr, v_int16x8& a) +inline void v_pack_store(schar* ptr, const v_int16x8& a) { _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); } template inline @@ -2655,6 +2655,50 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo y = v_float64x2(_mm_unpackhi_pd(xy0, xy1)); } + +////////////// FP16 support /////////////////////////// + +inline v_float32x4 v_load_expand(const float16_t* ptr) +{ + const __m128i z = _mm_setzero_si128(), delta = _mm_set1_epi32(0x38000000); + const __m128i signmask = _mm_set1_epi32(0x80000000), maxexp = _mm_set1_epi32(0x7c000000); + const __m128 deltaf = _mm_castsi128_ps(_mm_set1_epi32(0x38800000)); + __m128i bits = _mm_unpacklo_epi16(z, _mm_loadl_epi64((const __m128i*)ptr)); // h << 16 + __m128i e = _mm_and_si128(bits, maxexp), sign = _mm_and_si128(bits, signmask); + __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_xor_si128(bits, sign), 3), delta); // ((h & 0x7fff) << 13) + delta + __m128i zt = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_add_epi32(t, _mm_set1_epi32(1 << 23))), deltaf)); + + t = _mm_add_epi32(t, _mm_and_si128(delta, _mm_cmpeq_epi32(maxexp, e))); + __m128i zmask = _mm_cmpeq_epi32(e, z); + __m128i ft = v_select_si128(zmask, zt, t); + return v_float32x4(_mm_castsi128_ps(_mm_or_si128(ft, sign))); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +{ + const __m128i signmask = 
_mm_set1_epi32(0x80000000); + const __m128i rval = _mm_set1_epi32(0x3f000000); + + __m128i t = _mm_castps_si128(v.val); + __m128i sign = _mm_srai_epi32(_mm_and_si128(t, signmask), 16); + t = _mm_andnot_si128(signmask, t); + + __m128i finitemask = _mm_cmpgt_epi32(_mm_set1_epi32(0x47800000), t); + __m128i isnan = _mm_cmpgt_epi32(t, _mm_set1_epi32(0x7f800000)); + __m128i naninf = v_select_si128(isnan, _mm_set1_epi32(0x7e00), _mm_set1_epi32(0x7c00)); + __m128i tinymask = _mm_cmpgt_epi32(_mm_set1_epi32(0x38800000), t); + __m128i tt = _mm_castps_si128(_mm_add_ps(_mm_castsi128_ps(t), _mm_castsi128_ps(rval))); + tt = _mm_sub_epi32(tt, rval); + __m128i odd = _mm_and_si128(_mm_srli_epi32(t, 13), _mm_set1_epi32(1)); + __m128i nt = _mm_add_epi32(t, _mm_set1_epi32(0xc8000fff)); + nt = _mm_srli_epi32(_mm_add_epi32(nt, odd), 13); + t = v_select_si128(tinymask, tt, nt); + t = v_select_si128(finitemask, t, naninf); + t = _mm_or_si128(t, sign); + t = _mm_packs_epi32(t, t); + _mm_storel_epi64((__m128i*)ptr, t); +} + inline void v_cleanup() {} //! @name Check SIMD support diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index a45e7a875f..fb81986f6c 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -916,6 +916,24 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]); } +/////// FP16 support //////// + +// [TODO] implement these 2 using VSX or universal intrinsics (copy from intrin_sse.cpp and adopt) +inline v_float32x4 v_load_expand(const float16_t* ptr) +{ + return v_float32x4((float)ptr[0], (float)ptr[1], (float)ptr[2], (float)ptr[3]); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +{ + float CV_DECL_ALIGNED(32) f[4]; + v_store_aligned(f, v); + ptr[0] = float16_t(f[0]); + ptr[1] = float16_t(f[1]); + ptr[2] = float16_t(f[2]); + ptr[3] = float16_t(f[3]); +} + inline void v_cleanup() {} diff --git a/modules/core/perf/perf_addWeighted.cpp b/modules/core/perf/perf_addWeighted.cpp index 15daced72e..2822bc61e7 100644 --- a/modules/core/perf/perf_addWeighted.cpp +++ b/modules/core/perf/perf_addWeighted.cpp @@ -11,6 +11,7 @@ PERF_TEST_P(Size_MatType, addWeighted, TYPICAL_MATS_ADWEIGHTED) { Size size = get<0>(GetParam()); int type = get<1>(GetParam()); + int depth = CV_MAT_DEPTH(type); Mat src1(size, type); Mat src2(size, type); double alpha = 3.75; @@ -21,7 +22,7 @@ PERF_TEST_P(Size_MatType, addWeighted, TYPICAL_MATS_ADWEIGHTED) declare.in(src1, src2, dst, WARMUP_RNG).out(dst); - if (CV_MAT_DEPTH(type) == CV_32S) + if (depth == CV_32S) { // there might be not enough precision for integers src1 /= 2048; @@ -30,7 +31,7 @@ PERF_TEST_P(Size_MatType, addWeighted, TYPICAL_MATS_ADWEIGHTED) TEST_CYCLE() cv::addWeighted( src1, alpha, src2, beta, gamma, dst, dst.type() ); - SANITY_CHECK(dst, 1); + SANITY_CHECK(dst, depth == CV_32S ? 4 : 1); } } // namespace diff --git a/modules/core/perf/perf_convertTo.cpp b/modules/core/perf/perf_convertTo.cpp index c6c157e704..344d81cb8a 100644 --- a/modules/core/perf/perf_convertTo.cpp +++ b/modules/core/perf/perf_convertTo.cpp @@ -33,7 +33,7 @@ PERF_TEST_P( Size_DepthSrc_DepthDst_Channels_alpha, convertTo, int runs = (sz.width <= 640) ? 8 : 1; TEST_CYCLE_MULTIRUN(runs) src.convertTo(dst, depthDst, alpha); - double eps = depthSrc <= CV_32S ? 
1e-12 : (FLT_EPSILON * maxValue); + double eps = depthSrc <= CV_32S && (depthDst <= CV_32S || depthDst == CV_64F) ? 1e-12 : (FLT_EPSILON * maxValue); eps = eps * std::max(1.0, fabs(alpha)); SANITY_CHECK(dst, eps); } diff --git a/modules/core/src/convert.avx2.cpp b/modules/core/src/convert.avx2.cpp deleted file mode 100644 index b724cbbf1e..0000000000 --- a/modules/core/src/convert.avx2.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html - - -#include "precomp.hpp" -#include "convert.hpp" - -namespace cv -{ -namespace opt_AVX2 -{ - -void cvtScale_s16s32f32Line_AVX2(const short* src, int* dst, float scale, float shift, int width) -{ - int x = 0; - - __m256 scale256 = _mm256_set1_ps(scale); - __m256 shift256 = _mm256_set1_ps(shift); - const int shuffle = 0xD8; - - for (; x <= width - 16; x += 16) - { - __m256i v_src = _mm256_loadu_si256((const __m256i *)(src + x)); - v_src = _mm256_permute4x64_epi64(v_src, shuffle); - __m256i v_src_lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16); - __m256i v_src_hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16); - __m256 v_dst0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_lo), scale256), shift256); - __m256 v_dst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_hi), scale256), shift256); - _mm256_storeu_si256((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0)); - _mm256_storeu_si256((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1)); - } - - for (; x < width; x++) - dst[x] = saturate_cast(src[x] * scale + shift); -} - -} -} // cv:: -/* End of file. */ diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 75b4967194..a54f4c1bcd 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -2,1093 +2,242 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html - #include "precomp.hpp" #include "opencl_kernels_core.hpp" #include "convert.hpp" -#include "opencv2/core/openvx/ovx_defs.hpp" namespace cv { -template -struct Cvt_SIMD -{ - int operator() (const T *, DT *, int) const - { - return 0; - } -}; +/*namespace hal { -#if CV_SIMD128 -// from uchar - -template <> -struct Cvt_SIMD +void cvt16f32f( const float16_t* src, float* dst, int len ) { - int operator() (const uchar * src, schar * dst, int width) const + int j = 0; +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + for( ; j < len; j += VECSZ ) { - int x = 0; - if (hasSIMD128()) + if( j > len - VECSZ ) { - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_int16x8 v_src = v_reinterpret_as_s16(v_load_expand(src + x)); - v_store_low(dst + x, v_pack(v_src, v_src)); - } + if( j == 0 ) + break; + j = len - VECSZ; } - return x; + v_store(dst + j, vx_load_expand(src + j)); } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const uchar * src, ushort * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_store(dst + x, v_load_expand(src + x)); - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const uchar * src, short * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_int16x8 v_src = 
v_reinterpret_as_s16(v_load_expand(src + x)); - v_store(dst + x, v_src); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const uchar * src, int * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint16x8 v_src = v_load_expand(src + x); - v_uint32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_reinterpret_as_s32(v_src1)); - v_store(dst + x + cWidth, v_reinterpret_as_s32(v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const uchar * src, float * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint16x8 v_src = v_load_expand(src + x); - v_uint32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_cvt_f32(v_reinterpret_as_s32(v_src1))); - v_store(dst + x + cWidth, v_cvt_f32(v_reinterpret_as_s32(v_src2))); - } - } - return x; - } -}; - -// from schar - -template <> -struct Cvt_SIMD -{ - int operator() (const schar * src, uchar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_pack_u_store(dst + x, v_load_expand(src + x)); - } - - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const schar * src, short * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_store(dst + x, v_load_expand(src + x)); - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const schar * src, ushort * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_int16x8 v_src = v_load_expand(src + x); - v_int32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_pack_u(v_src1, v_src2)); - } - } - return x; - } -}; - - -template <> -struct Cvt_SIMD -{ - int operator() (const schar * src, int * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int16x8 v_src = v_load_expand(src + x); - v_int32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_src1); - v_store(dst + x + cWidth, v_src2); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const schar * src, float * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int16x8 v_src = v_load_expand(src + x); - v_int32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_cvt_f32(v_src1)); - v_store(dst + x + cWidth, v_cvt_f32(v_src2)); - } - } - return x; - } -}; - -// from ushort - -template <> -struct Cvt_SIMD -{ - int operator() (const ushort * src, uchar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_store(dst + x, v_pack(v_src1, v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const ushort * src, schar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_uint16x8::nlanes; - 
for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_uint32x4 v_dst10, v_dst11, v_dst20, v_dst21; - v_expand(v_src1, v_dst10, v_dst11); - v_expand(v_src2, v_dst20, v_dst21); - - v_store(dst + x, v_pack( - v_pack(v_reinterpret_as_s32(v_dst10), v_reinterpret_as_s32(v_dst11)), - v_pack(v_reinterpret_as_s32(v_dst20), v_reinterpret_as_s32(v_dst21)))); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const ushort * src, short * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_dst0, v_dst1; - v_expand(v_src, v_dst0, v_dst1); - v_store(dst + x, v_pack(v_reinterpret_as_s32(v_dst0), v_reinterpret_as_s32(v_dst1))); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const ushort * src, int * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_reinterpret_as_s32(v_src1)); - v_store(dst + x + cWidth, v_reinterpret_as_s32(v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const ushort * src, float * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_cvt_f32(v_reinterpret_as_s32(v_src1))); - v_store(dst + x + cWidth, v_cvt_f32(v_reinterpret_as_s32(v_src2))); - } - } - return x; - } -}; - - -// from short - -template <> -struct Cvt_SIMD -{ - int operator() (const short * src, uchar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_store(dst + x, v_pack_u(v_src1, v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const short * src, schar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_store(dst + x, v_pack(v_src1, v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const short * src, ushort * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_int16x8 v_src = v_load(src + x); - v_int32x4 v_dst1, v_dst2; - v_expand(v_src, v_dst1, v_dst2); - v_store(dst + x, v_pack_u(v_dst1, v_dst2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const short * src, int * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int16x8 v_src = v_load(src + x); - v_int32x4 v_dst1, v_dst2; - v_expand(v_src, v_dst1, v_dst2); - v_store(dst + x, v_dst1); - v_store(dst + x + cWidth, v_dst2); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const short * src, float * dst, int 
width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int16x8 v_src = v_load(src + x); - v_int32x4 v_dst1, v_dst2; - v_expand(v_src, v_dst1, v_dst2); - v_store(dst + x, v_cvt_f32(v_dst1)); - v_store(dst + x + cWidth, v_cvt_f32(v_dst2)); - } - } - return x; - } -}; - -// from int - -template <> -struct Cvt_SIMD -{ - int operator() (const int * src, uchar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_int32x4 v_src3 = v_load(src + x + cWidth * 2), v_src4 = v_load(src + x + cWidth * 3); - v_int16x8 v_dst1 = v_pack(v_src1, v_src2); - v_int16x8 v_dst2 = v_pack(v_src3, v_src4); - v_store(dst + x, v_pack_u(v_dst1, v_dst2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const int * src, schar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_int32x4 v_src3 = v_load(src + x + cWidth * 2), v_src4 = v_load(src + x + cWidth * 3); - v_int16x8 v_dst1 = v_pack(v_src1, v_src2); - v_int16x8 v_dst2 = v_pack(v_src3, v_src4); - v_store(dst + x, v_pack(v_dst1, v_dst2)); - } - } - return x; - } -}; - - -template <> -struct Cvt_SIMD -{ - int operator() (const int * src, ushort * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_store(dst + x, v_pack_u(v_src1, v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const int * src, short * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_store(dst + x, v_pack(v_src1, v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const int * src, float * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_store(dst + x, v_cvt_f32(v_load(src + x))); - } - return x; - } -}; - -// from float - -template <> -struct Cvt_SIMD -{ - int operator() (const float * src, uchar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int32x4 v_src1 = v_round(v_load(src + x)); - v_int32x4 v_src2 = v_round(v_load(src + x + cWidth)); - v_int32x4 v_src3 = v_round(v_load(src + x + cWidth * 2)); - v_int32x4 v_src4 = v_round(v_load(src + x + cWidth * 3)); - v_uint16x8 v_dst1 = v_pack_u(v_src1, v_src2); - v_uint16x8 v_dst2 = v_pack_u(v_src3, v_src4); - v_store(dst + x, v_pack(v_dst1, v_dst2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const float * src, schar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int32x4 v_src1 = v_round(v_load(src + x)); - v_int32x4 v_src2 = v_round(v_load(src + x + cWidth)); - v_int32x4 v_src3 = v_round(v_load(src + x + 
cWidth * 2)); - v_int32x4 v_src4 = v_round(v_load(src + x + cWidth * 3)); - v_int16x8 v_dst1 = v_pack(v_src1, v_src2); - v_int16x8 v_dst2 = v_pack(v_src3, v_src4); - v_store(dst + x, v_pack(v_dst1, v_dst2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const float * src, ushort * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src1 = v_round(v_load(src + x)); - v_int32x4 v_src2 = v_round(v_load(src + x + cWidth)); - v_store(dst + x, v_pack_u(v_src1, v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const float * src, short * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src1 = v_round(v_load(src + x)); - v_int32x4 v_src2 = v_round(v_load(src + x + cWidth)); - v_store(dst + x, v_pack(v_src1, v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const float * src, int * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_store(dst + x, v_round(v_load(src + x))); - } - return x; - } -}; -#if CV_SIMD128_64F -// from double - -template <> -struct Cvt_SIMD -{ - int operator() (const double * src, uchar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); - v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); - v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2)); - v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3)); - - v_src0 = v_combine_low(v_src0, v_src1); - v_src1 = v_combine_low(v_src2, v_src3); - - v_int16x8 v_dst = v_pack(v_round(v_src0), v_round(v_src1)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const double * src, schar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); - v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); - v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2)); - v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3)); - - v_src0 = v_combine_low(v_src0, v_src1); - v_src1 = v_combine_low(v_src2, v_src3); - - v_int16x8 v_dst = v_pack(v_round(v_src0), v_round(v_src1)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const double * src, ushort * dst, int width) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::Cvt_SIMD_f64u16_SSE41(src, dst, width); #endif - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); - v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); - v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2)); - v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3)); + for( ; j < len; j++ ) + dst[j] = (float)src[j]; +} - v_src0 = v_combine_low(v_src0, v_src1); - v_src1 = v_combine_low(v_src2, v_src3); - - v_uint16x8 v_dst = v_pack_u(v_round(v_src0), 
v_round(v_src1)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD +void cvt32f16f( const float* src, float16_t* dst, int len ) { - int operator() (const double * src, short * dst, int width) const + int j = 0; +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + for( ; j < len; j += VECSZ ) { - int x = 0; - if (hasSIMD128()) + if( j > len - VECSZ ) { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); - v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); - v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2)); - v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3)); - - v_src0 = v_combine_low(v_src0, v_src1); - v_src1 = v_combine_low(v_src2, v_src3); - - v_int16x8 v_dst = v_pack(v_round(v_src0), v_round(v_src1)); - v_store(dst + x, v_dst); - } + if( j == 0 ) + break; + j = len - VECSZ; } - return x; + v_pack_store(dst + j, vx_load(src + j)); } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const double * src, int * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src0 = v_round(v_load(src + x)); - v_int32x4 v_src1 = v_round(v_load(src + x + cWidth)); - - v_store(dst + x, v_combine_low(v_src0, v_src1)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const double * src, float * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); - v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); - - v_store(dst + x, v_combine_low(v_src0, v_src1)); - } - } - return x; - } -}; - -// to double - -template <> -struct Cvt_SIMD -{ - int operator() (const uchar* src, double* dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_uint16x8 v_src = v_load_expand(src + x); - v_uint32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_cvt_f64(v_reinterpret_as_s32(v_src1))); - v_store(dst + x + cWidth, v_cvt_f64_high(v_reinterpret_as_s32(v_src1))); - v_store(dst + x + cWidth * 2, v_cvt_f64(v_reinterpret_as_s32(v_src2))); - v_store(dst + x + cWidth * 3, v_cvt_f64_high(v_reinterpret_as_s32(v_src2))); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const schar* src, double* dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int16x8 v_src = v_load_expand(src + x); - v_int32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_cvt_f64(v_src1)); - v_store(dst + x + cWidth, v_cvt_f64_high(v_src1)); - v_store(dst + x + cWidth * 2, v_cvt_f64(v_src2)); - v_store(dst + x + cWidth * 3, v_cvt_f64_high(v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const ushort* src, double* dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint32x4 v_src = v_load_expand(src + x); - - v_store(dst + x, v_cvt_f64(v_reinterpret_as_s32(v_src))); - v_store(dst + x + cWidth, v_cvt_f64_high(v_reinterpret_as_s32(v_src))); - } - } - return x; - } -}; - -template 
<> -struct Cvt_SIMD -{ - int operator() (const short* src, double* dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src = v_load_expand(src + x); - - v_store(dst + x, v_cvt_f64(v_src)); - v_store(dst + x + cWidth, v_cvt_f64_high(v_src)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const int* src, double* dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src = v_load(src + x); - - v_store(dst + x, v_cvt_f64(v_src)); - v_store(dst + x + cWidth, v_cvt_f64_high(v_src)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const float* src, double* dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src = v_load(src + x); - - v_store(dst + x, v_cvt_f64(v_src)); - v_store(dst + x + cWidth, v_cvt_f64_high(v_src)); - } - } - return x; - } -}; -#endif // CV_SIMD128_64F -#endif // CV_SIMD128 - - -#ifdef HAVE_OPENVX - -template -static bool _openvx_cvt(const T* src, size_t sstep, - DT* dst, size_t dstep, Size continuousSize) -{ - using namespace ivx; - - if(!(continuousSize.width > 0 && continuousSize.height > 0)) - { - return true; - } - - //.height is for number of continuous pieces - //.width is for length of one piece - Size imgSize = continuousSize; - if(continuousSize.height == 1) - { - if(sstep / sizeof(T) == dstep / sizeof(DT) && sstep / sizeof(T) > 0 && - continuousSize.width % (sstep / sizeof(T)) == 0) - { - //continuous n-lines image - imgSize.width = sstep / sizeof(T); - imgSize.height = continuousSize.width / (sstep / sizeof(T)); - } - else - { - //1-row image with possibly incorrect step - sstep = continuousSize.width * sizeof(T); - dstep = continuousSize.width * sizeof(DT); - } - } - - int srcType = DataType::type, dstType = DataType
::type; - - if (ovx::skipSmallImages(imgSize.width, imgSize.height)) - return false; - - try - { - Context context = ovx::getOpenVXContext(); - - // Other conversions are marked as "experimental" - if(context.vendorID() == VX_ID_KHRONOS && - !(srcType == CV_8U && dstType == CV_16S) && - !(srcType == CV_16S && dstType == CV_8U)) - { - return false; - } - - Image srcImage = Image::createFromHandle(context, Image::matTypeToFormat(srcType), - Image::createAddressing(imgSize.width, imgSize.height, - (vx_uint32)sizeof(T), (vx_uint32)sstep), - (void*)src); - Image dstImage = Image::createFromHandle(context, Image::matTypeToFormat(dstType), - Image::createAddressing(imgSize.width, imgSize.height, - (vx_uint32)sizeof(DT), (vx_uint32)dstep), - (void*)dst); - - IVX_CHECK_STATUS(vxuConvertDepth(context, srcImage, dstImage, VX_CONVERT_POLICY_SATURATE, 0)); - -#ifdef VX_VERSION_1_1 - //we should take user memory back before release - //(it's not done automatically according to standard) - srcImage.swapHandle(); dstImage.swapHandle(); #endif - } - catch (RuntimeError & e) - { - VX_DbgThrow(e.what()); - } - catch (WrapperError & e) - { - VX_DbgThrow(e.what()); - } - - return true; + for( ; j < len; j++ ) + dst[j] = float16_t(src[j]); } -template -static bool openvx_cvt(const T* src, size_t sstep, - DT* dst, size_t dstep, Size size) +/*void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len ) { - (void)src; (void)sstep; (void)dst; (void)dstep; (void)size; - return false; + // the loop is simple enough, so we let the compiler to vectorize it + for( int i = 0; i < len; i++ ) + arr[i] = scaleBiasPairs[i*2 + 1]; } -#define DEFINE_OVX_CVT_SPECIALIZATION(T, DT) \ -template<> \ -bool openvx_cvt(const T *src, size_t sstep, DT *dst, size_t dstep, Size size) \ -{ \ - return _openvx_cvt(src, sstep, dst, dstep, size); \ -} - -DEFINE_OVX_CVT_SPECIALIZATION(uchar, ushort) -DEFINE_OVX_CVT_SPECIALIZATION(uchar, short) -DEFINE_OVX_CVT_SPECIALIZATION(uchar, int) -DEFINE_OVX_CVT_SPECIALIZATION(ushort, uchar) -DEFINE_OVX_CVT_SPECIALIZATION(ushort, int) -DEFINE_OVX_CVT_SPECIALIZATION(short, uchar) -DEFINE_OVX_CVT_SPECIALIZATION(short, int) -DEFINE_OVX_CVT_SPECIALIZATION(int, uchar) -DEFINE_OVX_CVT_SPECIALIZATION(int, ushort) -DEFINE_OVX_CVT_SPECIALIZATION(int, short) - -#endif - -template static void -cvt_( const T* src, size_t sstep, - DT* dst, size_t dstep, Size size ) +void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len ) { - CV_OVX_RUN( - true, - openvx_cvt(src, sstep, dst, dstep, size) - ) - - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - Cvt_SIMD vop; - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = vop(src, dst, size.width); - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) - { - DT t0, t1; - t0 = saturate_cast
(src[x]); - t1 = saturate_cast
(src[x+1]); - dst[x] = t0; dst[x+1] = t1; - t0 = saturate_cast
(src[x+2]); - t1 = saturate_cast
(src[x+3]); - dst[x+2] = t0; dst[x+3] = t1; - } - #endif - for( ; x < size.width; x++ ) - dst[x] = saturate_cast
(src[x]); - } + // the loop is simple enough, so we let the compiler to vectorize it + for( int i = 0; i < len; i++ ) + arr[i] = scaleBiasPairs[i*2 + 1]; } -template static void -cpy_( const T* src, size_t sstep, T* dst, size_t dstep, Size size ) +}*/ + +template inline void +cvt_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size ) { sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); - for( ; size.height--; src += sstep, dst += dstep ) - memcpy(dst, src, size.width*sizeof(src[0])); + for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) + { + int j = 0; +#if CV_SIMD + const int VECSZ = _Twvec::nlanes*2; + for( ; j < size.width; j += VECSZ ) + { + if( j > size.width - VECSZ ) + { + if( j == 0 || src == (_Ts*)dst ) + break; + j = size.width - VECSZ; + } + _Twvec v0, v1; + vx_load_pair_as(src + j, v0, v1); + v_store_pair_as(dst + j, v0, v1); + } +#endif + for( ; j < size.width; j++ ) + dst[j] = saturate_cast<_Td>(src[j]); + } } +// in order to reduce the code size, for (16f <-> ...) conversions +// we add a conversion function without loop unrolling +template inline void +cvt1_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size ) +{ + sstep /= sizeof(src[0]); + dstep /= sizeof(dst[0]); + for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) + { + int j = 0; +#if CV_SIMD + const int VECSZ = _Twvec::nlanes; + for( ; j < size.width; j += VECSZ ) + { + if( j > size.width - VECSZ ) + { + if( j == 0 || src == (_Ts*)dst ) + break; + j = size.width - VECSZ; + } + _Twvec v; + vx_load_as(src + j, v); + v_store_as(dst + j, v); + } + vx_cleanup(); +#endif + for( ; j < size.width; j++ ) + dst[j] = saturate_cast<_Td>(src[j]); + } +} + +static void cvtCopy( const uchar* src, size_t sstep, + uchar* dst, size_t dstep, Size size, size_t elemsize) +{ + size_t len = size.width*elemsize; + for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) + { + memcpy( dst, src, len ); + } +} + +#define DEF_CVT_FUNC(suffix, cvtfunc, _Ts, _Td, _Twvec) \ +static void cvt##suffix(const _Ts* src, size_t sstep, uchar*, size_t, \ + _Td* dst, size_t dstep, Size size, void*) \ +{ cvtfunc<_Ts, _Td, _Twvec>(src, sstep, dst, dstep, size); } + +////////////////////// 8u -> ... //////////////////////// + +DEF_CVT_FUNC(8u8s, cvt_, uchar, schar, v_int16) +DEF_CVT_FUNC(8u16u, cvt_, uchar, ushort, v_uint16) +DEF_CVT_FUNC(8u16s, cvt_, uchar, short, v_int16) +DEF_CVT_FUNC(8u32s, cvt_, uchar, int, v_int32) +DEF_CVT_FUNC(8u32f, cvt_, uchar, float, v_float32) +DEF_CVT_FUNC(8u64f, cvt_, uchar, double, v_int32) +//DEF_CVT_FUNC(8u16f, cvt1_, uchar, float16_t, v_float32) + +////////////////////// 8s -> ... //////////////////////// + +DEF_CVT_FUNC(8s8u, cvt_, schar, uchar, v_int16) +DEF_CVT_FUNC(8s16u, cvt_, schar, ushort, v_uint16) +DEF_CVT_FUNC(8s16s, cvt_, schar, short, v_int16) +DEF_CVT_FUNC(8s32s, cvt_, schar, int, v_int32) +DEF_CVT_FUNC(8s32f, cvt_, schar, float, v_float32) +DEF_CVT_FUNC(8s64f, cvt_, schar, double, v_int32) +//DEF_CVT_FUNC(8s16f, cvt1_, schar, float16_t, v_float32) + +////////////////////// 16u -> ... //////////////////////// + +DEF_CVT_FUNC(16u8u, cvt_, ushort, uchar, v_uint16) +DEF_CVT_FUNC(16u8s, cvt_, ushort, schar, v_uint16) +DEF_CVT_FUNC(16u16s, cvt_, ushort, short, v_int32) +DEF_CVT_FUNC(16u32s, cvt_, ushort, int, v_int32) +DEF_CVT_FUNC(16u32f, cvt_, ushort, float, v_float32) +DEF_CVT_FUNC(16u64f, cvt_, ushort, double, v_int32) +//DEF_CVT_FUNC(16u16f, cvt1_,ushort, float16_t, v_float32) + +////////////////////// 16s -> ... 
//////////////////////// + +DEF_CVT_FUNC(16s8u, cvt_, short, uchar, v_int16) +DEF_CVT_FUNC(16s8s, cvt_, short, schar, v_int16) +DEF_CVT_FUNC(16s16u, cvt_, short, ushort, v_int32) +DEF_CVT_FUNC(16s32s, cvt_, short, int, v_int32) +DEF_CVT_FUNC(16s32f, cvt_, short, float, v_float32) +DEF_CVT_FUNC(16s64f, cvt_, short, double, v_int32) +//DEF_CVT_FUNC(16s16f, cvt1_,short, float16_t, v_float32) + +////////////////////// 32s -> ... //////////////////////// + +DEF_CVT_FUNC(32s8u, cvt_, int, uchar, v_int32) +DEF_CVT_FUNC(32s8s, cvt_, int, schar, v_int32) +DEF_CVT_FUNC(32s16u, cvt_, int, ushort, v_int32) +DEF_CVT_FUNC(32s16s, cvt_, int, short, v_int32) +DEF_CVT_FUNC(32s32f, cvt_, int, float, v_float32) +DEF_CVT_FUNC(32s64f, cvt_, int, double, v_int32) +//DEF_CVT_FUNC(32s16f, cvt1_,int, float16_t, v_float32) + +////////////////////// 32f -> ... //////////////////////// + +DEF_CVT_FUNC(32f8u, cvt_, float, uchar, v_float32) +DEF_CVT_FUNC(32f8s, cvt_, float, schar, v_float32) +DEF_CVT_FUNC(32f16u, cvt_, float, ushort, v_float32) +DEF_CVT_FUNC(32f16s, cvt_, float, short, v_float32) +DEF_CVT_FUNC(32f32s, cvt_, float, int, v_float32) +DEF_CVT_FUNC(32f64f, cvt_, float, double, v_float32) +DEF_CVT_FUNC(32f16f, cvt1_,float, float16_t, v_float32) + +////////////////////// 64f -> ... //////////////////////// + +DEF_CVT_FUNC(64f8u, cvt_, double, uchar, v_int32) +DEF_CVT_FUNC(64f8s, cvt_, double, schar, v_int32) +DEF_CVT_FUNC(64f16u, cvt_, double, ushort, v_int32) +DEF_CVT_FUNC(64f16s, cvt_, double, short, v_int32) +DEF_CVT_FUNC(64f32s, cvt_, double, int, v_int32) +DEF_CVT_FUNC(64f32f, cvt_, double, float, v_float32) +//DEF_CVT_FUNC(64f16f, cvt1_,double, float16_t, v_float32) + +////////////////////// 16f -> ... //////////////////////// + +//DEF_CVT_FUNC(16f8u, cvt_, float16_t, uchar, v_float32) +//DEF_CVT_FUNC(16f8s, cvt_, float16_t, schar, v_float32) +//DEF_CVT_FUNC(16f16u, cvt1_, float16_t, ushort, v_float32) +//DEF_CVT_FUNC(16f16s, cvt1_, float16_t, short, v_float32) +//DEF_CVT_FUNC(16f32s, cvt1_, float16_t, int, v_float32) +DEF_CVT_FUNC(16f32f, cvt1_, float16_t, float, v_float32) +//DEF_CVT_FUNC(16f64f, cvt1_, float16_t, double, v_float32) + +///////////// "conversion" w/o conversion /////////////// + +static void cvt8u(const uchar* src, size_t sstep, uchar*, size_t, uchar* dst, size_t dstep, Size size, void*) +{ cvtCopy(src, sstep, dst, dstep, size, 1); } + +static void cvt16u(const ushort* src, size_t sstep, uchar*, size_t, ushort* dst, size_t dstep, Size size, void*) +{ cvtCopy((const uchar*)src, sstep, (uchar*)dst, dstep, size, 2); } + +static void cvt32s(const int* src, size_t sstep, uchar*, size_t, int* dst, size_t dstep, Size size, void*) +{ cvtCopy((const uchar*)src, sstep, (uchar*)dst, dstep, size, 4); } + +static void cvt64s(const int64* src, size_t sstep, uchar*, size_t, int64* dst, size_t dstep, Size size, void*) +{ cvtCopy((const uchar*)src, sstep, (uchar*)dst, dstep, size, 8); } + + +/* [TODO] Recover IPP calls #if defined(HAVE_IPP) #define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \ static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ @@ -1129,7 +278,6 @@ static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ cpy_(src, sstep, dst, dstep, size); \ } - DEF_CPY_FUNC(8u, uchar) DEF_CVT_FUNC_F(8s8u, schar, uchar, 8s8u_C1Rs) DEF_CVT_FUNC_F(16u8u, ushort, uchar, 16u8u_C1R) @@ -1182,7 +330,7 @@ DEF_CVT_FUNC(16s64f, short, double) DEF_CVT_FUNC(32s64f, int, double) DEF_CVT_FUNC(32f64f, float, double) DEF_CPY_FUNC(64s, int64) - +*/ BinaryFunc 
getConvertFunc(int sdepth, int ddepth) { @@ -1191,114 +339,78 @@ BinaryFunc getConvertFunc(int sdepth, int ddepth) { (BinaryFunc)(cvt8u), (BinaryFunc)GET_OPTIMIZED(cvt8s8u), (BinaryFunc)GET_OPTIMIZED(cvt16u8u), (BinaryFunc)GET_OPTIMIZED(cvt16s8u), (BinaryFunc)GET_OPTIMIZED(cvt32s8u), (BinaryFunc)GET_OPTIMIZED(cvt32f8u), - (BinaryFunc)GET_OPTIMIZED(cvt64f8u), 0 + (BinaryFunc)GET_OPTIMIZED(cvt64f8u), 0 //(BinaryFunc)(cvt16f8u) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u8s), (BinaryFunc)cvt8u, (BinaryFunc)GET_OPTIMIZED(cvt16u8s), (BinaryFunc)GET_OPTIMIZED(cvt16s8s), (BinaryFunc)GET_OPTIMIZED(cvt32s8s), (BinaryFunc)GET_OPTIMIZED(cvt32f8s), - (BinaryFunc)GET_OPTIMIZED(cvt64f8s), 0 + (BinaryFunc)GET_OPTIMIZED(cvt64f8s), 0 //(BinaryFunc)(cvt16f8s) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u16u), (BinaryFunc)GET_OPTIMIZED(cvt8s16u), (BinaryFunc)cvt16u, (BinaryFunc)GET_OPTIMIZED(cvt16s16u), (BinaryFunc)GET_OPTIMIZED(cvt32s16u), (BinaryFunc)GET_OPTIMIZED(cvt32f16u), - (BinaryFunc)GET_OPTIMIZED(cvt64f16u), 0 + (BinaryFunc)GET_OPTIMIZED(cvt64f16u), 0 //(BinaryFunc)(cvt16f16u) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u16s), (BinaryFunc)GET_OPTIMIZED(cvt8s16s), (BinaryFunc)GET_OPTIMIZED(cvt16u16s), (BinaryFunc)cvt16u, (BinaryFunc)GET_OPTIMIZED(cvt32s16s), (BinaryFunc)GET_OPTIMIZED(cvt32f16s), - (BinaryFunc)GET_OPTIMIZED(cvt64f16s), 0 + (BinaryFunc)GET_OPTIMIZED(cvt64f16s), 0 //(BinaryFunc)(cvt16f16s) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u32s), (BinaryFunc)GET_OPTIMIZED(cvt8s32s), (BinaryFunc)GET_OPTIMIZED(cvt16u32s), (BinaryFunc)GET_OPTIMIZED(cvt16s32s), (BinaryFunc)cvt32s, (BinaryFunc)GET_OPTIMIZED(cvt32f32s), - (BinaryFunc)GET_OPTIMIZED(cvt64f32s), 0 + (BinaryFunc)GET_OPTIMIZED(cvt64f32s), 0 //(BinaryFunc)(cvt16f32s) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u32f), (BinaryFunc)GET_OPTIMIZED(cvt8s32f), (BinaryFunc)GET_OPTIMIZED(cvt16u32f), (BinaryFunc)GET_OPTIMIZED(cvt16s32f), (BinaryFunc)GET_OPTIMIZED(cvt32s32f), (BinaryFunc)cvt32s, - (BinaryFunc)GET_OPTIMIZED(cvt64f32f), 0 + (BinaryFunc)GET_OPTIMIZED(cvt64f32f), 0 //(BinaryFunc)(cvt16f32f) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u64f), (BinaryFunc)GET_OPTIMIZED(cvt8s64f), (BinaryFunc)GET_OPTIMIZED(cvt16u64f), (BinaryFunc)GET_OPTIMIZED(cvt16s64f), (BinaryFunc)GET_OPTIMIZED(cvt32s64f), (BinaryFunc)GET_OPTIMIZED(cvt32f64f), - (BinaryFunc)(cvt64s), 0 + (BinaryFunc)(cvt64s), 0 //(BinaryFunc)(cvt16f64f) }, { 0, 0, 0, 0, 0, 0, 0, 0 + //(BinaryFunc)(cvt8u16f), (BinaryFunc)(cvt8s16f), (BinaryFunc)(cvt16u16f), (BinaryFunc)(cvt16s16f), + //(BinaryFunc)(cvt32s16f), (BinaryFunc)(cvt32f16f), (BinaryFunc)(cvt64f16f), (BinaryFunc)(cvt16u) } }; - return cvtTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)]; } -} // cv:: - -#ifdef HAVE_IPP -namespace cv +#ifdef HAVE_OPENCL +static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int sdepth, int ddepth ) { -static bool ipp_convertTo(Mat &src, Mat &dst, double alpha, double beta) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP() + int type = _src.type(), cn = CV_MAT_CN(type); - IppDataType srcDepth = ippiGetDataType(src.depth()); - IppDataType dstDepth = ippiGetDataType(dst.depth()); - int channels = src.channels(); - - if(src.dims == 0) + _dst.createSameSize( _src, CV_MAKETYPE(ddepth, cn) ); + int kercn = 1; + int rowsPerWI = 1; + String build_opt = format("-D HALF_SUPPORT -D srcT=%s -D dstT=%s -D rowsPerWI=%d%s", + sdepth == CV_32F ? "float" : "half", + sdepth == CV_32F ? "half" : "float", + rowsPerWI, + sdepth == CV_32F ? 
" -D FLOAT_TO_HALF " : ""); + ocl::Kernel k("convertFp16", ocl::core::halfconvert_oclsrc, build_opt); + if (k.empty()) return false; - ::ipp::IwiImage iwSrc; - ::ipp::IwiImage iwDst; + UMat src = _src.getUMat(); + UMat dst = _dst.getUMat(); - try - { - IppHintAlgorithm mode = ippAlgHintFast; - if(dstDepth == ipp64f || - (dstDepth == ipp32f && (srcDepth == ipp32s || srcDepth == ipp64f)) || - (dstDepth == ipp32s && (srcDepth == ipp32s || srcDepth == ipp64f))) - mode = ippAlgHintAccurate; + ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), + dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn); - if(src.dims <= 2) - { - Size sz = getContinuousSize(src, dst, channels); + k.args(srcarg, dstarg); - iwSrc.Init(ippiSize(sz), srcDepth, 1, NULL, (void*)src.ptr(), src.step); - iwDst.Init(ippiSize(sz), dstDepth, 1, NULL, (void*)dst.ptr(), dst.step); - - CV_INSTRUMENT_FUN_IPP(::ipp::iwiScale, iwSrc, iwDst, alpha, beta, ::ipp::IwiScaleParams(mode)); - } - else - { - const Mat *arrays[] = {&src, &dst, NULL}; - uchar *ptrs[2] = {NULL}; - NAryMatIterator it(arrays, ptrs); - - iwSrc.Init(ippiSize(it.size, 1), srcDepth, channels); - iwDst.Init(ippiSize(it.size, 1), dstDepth, channels); - - for(size_t i = 0; i < it.nplanes; i++, ++it) - { - iwSrc.m_ptr = ptrs[0]; - iwDst.m_ptr = ptrs[1]; - - CV_INSTRUMENT_FUN_IPP(::ipp::iwiScale, iwSrc, iwDst, alpha, beta, ::ipp::IwiScaleParams(mode)); - } - } - } - catch (::ipp::IwException) - { - return false; - } - return true; -#else - CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(alpha); CV_UNUSED(beta); - return false; -#endif + size_t globalsize[2] = { (size_t)src.cols * cn / kercn, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI }; + return k.run(2, globalsize, NULL, false); } -} // cv:: #endif +} // cv:: void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const { @@ -1331,7 +443,6 @@ void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) _dst.create( dims, size, _type ); Mat dst = _dst.getMat(); - CV_IPP_RUN_FAST(ipp_convertTo(src, dst, alpha, beta )); BinaryFunc func = noScale ? 
getConvertFunc(sdepth, ddepth) : getConvertScaleFunc(sdepth, ddepth); double scale[] = {alpha, beta}; @@ -1341,7 +452,6 @@ void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) if( dims <= 2 ) { Size sz = getContinuousSize(src, dst, cn); - func( src.data, src.step, 0, 0, dst.data, dst.step, sz, scale ); } else @@ -1358,118 +468,30 @@ void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) //================================================================================================== -namespace cv { - -// template for FP16 HW conversion function -template static void -cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size); - -template<> void -cvtScaleHalf_( const float* src, size_t sstep, short* dst, size_t dstep, Size size ) -{ - CV_CPU_CALL_FP16_(cvtScaleHalf_SIMD32f16f, (src, sstep, dst, dstep, size)); - -#if !CV_CPU_FORCE_FP16 - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - for ( int x = 0; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } - } -#endif -} - -template<> void -cvtScaleHalf_( const short* src, size_t sstep, float* dst, size_t dstep, Size size ) -{ - CV_CPU_CALL_FP16_(cvtScaleHalf_SIMD16f32f, (src, sstep, dst, dstep, size)); - -#if !CV_CPU_FORCE_FP16 - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - for ( int x = 0; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } - } -#endif -} - -#define DEF_CVT_SCALE_FP16_FUNC(suffix, stype, dtype) \ -static void cvtScaleHalf##suffix( const stype* src, size_t sstep, \ -dtype* dst, size_t dstep, Size size, void*) \ -{ \ - cvtScaleHalf_(src, sstep, dst, dstep, size); \ -} - -DEF_CVT_SCALE_FP16_FUNC(32f16f, float, short) -DEF_CVT_SCALE_FP16_FUNC(16f32f, short, float) - -static UnaryFunc getConvertFuncFp16(int ddepth) -{ - static UnaryFunc cvtTab[] = - { - 0, 0, 0, - (UnaryFunc)(cvtScaleHalf32f16f), 0, (UnaryFunc)(cvtScaleHalf16f32f), - 0, 0, - }; - return cvtTab[CV_MAT_DEPTH(ddepth)]; -} - - -#ifdef HAVE_OPENCL - -static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int ddepth ) -{ - int type = _src.type(), cn = CV_MAT_CN(type); - - _dst.createSameSize( _src, CV_MAKETYPE(ddepth, cn) ); - int kercn = 1; - int rowsPerWI = 1; - String build_opt = format("-D HALF_SUPPORT -D dstT=%s -D srcT=%s -D rowsPerWI=%d%s", - ddepth == CV_16S ? "half" : "float", - ddepth == CV_16S ? "float" : "half", - rowsPerWI, - ddepth == CV_16S ? 
" -D FLOAT_TO_HALF " : ""); - ocl::Kernel k("convertFp16", ocl::core::halfconvert_oclsrc, build_opt); - if (k.empty()) - return false; - - UMat src = _src.getUMat(); - UMat dst = _dst.getUMat(); - - ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), - dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn); - - k.args(srcarg, dstarg); - - size_t globalsize[2] = { (size_t)src.cols * cn / kercn, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI }; - return k.run(2, globalsize, NULL, false); -} - -#endif - -} //cv:: - -void cv::convertFp16( InputArray _src, OutputArray _dst) +void cv::convertFp16( InputArray _src, OutputArray _dst ) { CV_INSTRUMENT_REGION() - int ddepth = 0; - switch( _src.depth() ) + int sdepth = _src.depth(), ddepth = 0; + BinaryFunc func = 0; + + switch( sdepth ) { case CV_32F: - ddepth = CV_16S; + if(_dst.fixedType()) + { + ddepth = _dst.depth(); + CV_Assert(ddepth == CV_16S /*|| ddepth == CV_16F*/); + CV_Assert(_dst.channels() == _src.channels()); + } + else + ddepth = CV_16S; + func = (BinaryFunc)cvt32f16f; break; case CV_16S: + //case CV_16F: ddepth = CV_32F; + func = (BinaryFunc)cvt16f32f; break; default: CV_Error(Error::StsUnsupportedFormat, "Unsupported input depth"); @@ -1477,21 +499,21 @@ void cv::convertFp16( InputArray _src, OutputArray _dst) } CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(), - ocl_convertFp16(_src, _dst, ddepth)) + ocl_convertFp16(_src, _dst, sdepth, ddepth)) Mat src = _src.getMat(); int type = CV_MAKETYPE(ddepth, src.channels()); _dst.create( src.dims, src.size, type ); Mat dst = _dst.getMat(); - UnaryFunc func = getConvertFuncFp16(ddepth); int cn = src.channels(); + CV_Assert( func != 0 ); if( src.dims <= 2 ) { Size sz = getContinuousSize(src, dst, cn); - func( src.data, src.step, dst.data, dst.step, sz, 0); + func( src.data, src.step, 0, 0, dst.data, dst.step, sz, 0); } else { @@ -1501,6 +523,6 @@ void cv::convertFp16( InputArray _src, OutputArray _dst) Size sz((int)(it.size*cn), 1); for( size_t i = 0; i < it.nplanes; i++, ++it ) - func(ptrs[0], 1, ptrs[1], 1, sz, 0); + func(ptrs[0], 0, 0, 0, ptrs[1], 0, sz, 0); } } diff --git a/modules/core/src/convert.fp16.cpp b/modules/core/src/convert.fp16.cpp deleted file mode 100644 index 7168e8d643..0000000000 --- a/modules/core/src/convert.fp16.cpp +++ /dev/null @@ -1,126 +0,0 @@ -// This file is part of OpenCV project. 
-// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html - - -#include "precomp.hpp" -#include "convert.hpp" - -namespace cv -{ -namespace opt_FP16 -{ -#if !defined(CV_NEON) || !CV_NEON -const static int cVectorWidth = 8; - -void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size ) -{ - CV_INSTRUMENT_REGION() - - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = 0; - for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth ) - { - __m256 v_src = _mm256_loadu_ps(src + x); - - // round to nearest even - __m128i v_dst = _mm256_cvtps_ph(v_src, 0); - - _mm_storeu_si128((__m128i*)(dst + x), v_dst); - } - - for ( ; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } - } -} - -void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size ) -{ - CV_INSTRUMENT_REGION() - - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = 0; - for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth ) - { - __m128i v_src = _mm_loadu_si128((__m128i*)(src + x)); - - __m256 v_dst = _mm256_cvtph_ps(v_src); - - _mm256_storeu_ps(dst + x, v_dst); - } - - for ( ; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } - } -} -#elif CV_NEON -const static int cVectorWidth = 4; - -void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size ) -{ - CV_INSTRUMENT_REGION() - - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = 0; - for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth) - { - float32x4_t v_src = vld1q_f32(src + x); - float16x4_t v_dst = vcvt_f16_f32(v_src); - - cv_vst1_f16(dst + x, v_dst); - } - - for ( ; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } - } -} - -void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size ) -{ - CV_INSTRUMENT_REGION() - - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = 0; - for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth ) - { - float16x4_t v_src = cv_vld1_f16((__fp16*)src + x); - - float32x4_t v_dst = vcvt_f32_f16(v_src); - - vst1q_f32(dst + x, v_dst); - } - - for ( ; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } - } -} -#else -#error "Unsupported build configuration" -#endif -} - -} // cv:: diff --git a/modules/core/src/convert.hpp b/modules/core/src/convert.hpp index 580076367e..0d0aa3a770 100644 --- a/modules/core/src/convert.hpp +++ b/modules/core/src/convert.hpp @@ -8,192 +8,402 @@ #include "opencv2/core/types.hpp" -namespace -{ -float convertFp16SW(short fp16); -short convertFp16SW(float fp32); - -#if !CV_FP16_TYPE -// const numbers for floating points format -const unsigned int kShiftSignificand = 13; -const unsigned int kMaskFp16Significand = 0x3ff; -const unsigned int kBiasFp16Exponent = 15; -const unsigned int kBiasFp32Exponent = 127; -#endif - -#if CV_FP16_TYPE -inline float convertFp16SW(short fp16) -{ - // Fp16 -> Fp32 - Cv16suf a; - a.i = fp16; - return (float)a.h; -} -#else -inline float convertFp16SW(short fp16) -{ - // Fp16 -> Fp32 - Cv16suf b; - b.i = fp16; - int exponent = b.fmt.exponent - kBiasFp16Exponent; - int significand = b.fmt.significand; - 
- Cv32suf a; - a.i = 0; - a.fmt.sign = b.fmt.sign; // sign bit - if( exponent == 16 ) - { - // Inf or NaN - a.i = a.i | 0x7F800000; - if( significand != 0 ) - { - // NaN -#if defined(__x86_64__) || defined(_M_X64) - // 64bit - a.i = a.i | 0x7FC00000; -#endif - a.fmt.significand = a.fmt.significand | (significand << kShiftSignificand); - } - return a.f; - } - else if ( exponent == -(int)kBiasFp16Exponent ) - { - // subnormal in Fp16 - if( significand == 0 ) - { - // zero - return a.f; - } - else - { - int shift = -1; - while( ( significand & 0x400 ) == 0 ) - { - significand = significand << 1; - shift++; - } - significand = significand & kMaskFp16Significand; - exponent -= shift; - } - } - - a.fmt.exponent = (exponent+kBiasFp32Exponent); - a.fmt.significand = significand << kShiftSignificand; - return a.f; -} -#endif - -#if CV_FP16_TYPE -inline short convertFp16SW(float fp32) -{ - // Fp32 -> Fp16 - Cv16suf a; - a.h = (__fp16)fp32; - return a.i; -} -#else -inline short convertFp16SW(float fp32) -{ - // Fp32 -> Fp16 - Cv32suf a; - a.f = fp32; - int exponent = a.fmt.exponent - kBiasFp32Exponent; - int significand = a.fmt.significand; - - Cv16suf result; - result.i = 0; - unsigned int absolute = a.i & 0x7fffffff; - if( 0x477ff000 <= absolute ) - { - // Inf in Fp16 - result.i = result.i | 0x7C00; - if( exponent == 128 && significand != 0 ) - { - // NaN - result.i = (short)( result.i | 0x200 | ( significand >> kShiftSignificand ) ); - } - } - else if ( absolute < 0x33000001 ) - { - // too small for fp16 - result.i = 0; - } - else if ( absolute < 0x387fe000 ) - { - // subnormal in Fp16 - int fp16Significand = significand | 0x800000; - int bitShift = (-exponent) - 1; - fp16Significand = fp16Significand >> bitShift; - - // special cases to round up - bitShift = exponent + 24; - int threshold = ( ( 0x400000 >> bitShift ) | ( ( ( significand & ( 0x800000 >> bitShift ) ) >> ( 126 - a.fmt.exponent ) ) ^ 1 ) ); - if( absolute == 0x33c00000 ) - { - result.i = 2; - } - else - { - if( threshold <= ( significand & ( 0xffffff >> ( exponent + 25 ) ) ) ) - { - fp16Significand++; - } - result.i = (short)fp16Significand; - } - } - else - { - // usual situation - // exponent - result.fmt.exponent = ( exponent + kBiasFp16Exponent ); - - // significand; - short fp16Significand = (short)(significand >> kShiftSignificand); - result.fmt.significand = fp16Significand; - - // special cases to round up - short lsb10bitsFp32 = (significand & 0x1fff); - short threshold = 0x1000 + ( ( fp16Significand & 0x1 ) ? 
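// [editor's note — illustration, not part of the patch] For reference while
// reading the bit manipulation in this (now removed) software converter: IEEE
// 754 binary16 packs 1 sign bit, 5 exponent bits (bias 15) and 10 significand
// bits, so a normal value decodes as (-1)^s * 2^(e-15) * (1 + m/1024).
// Worked examples: 0x3C00 -> +1.0 (s=0, e=15, m=0), 0xC000 -> -2.0
// (s=1, e=16, m=0), 0x3555 -> ~0.333 (s=0, e=13, m=341).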
0 : 1 ); - if( threshold <= lsb10bitsFp32 ) - { - result.i++; - } - else if ( fp16Significand == kMaskFp16Significand && exponent == -15) - { - result.i++; - } - } - - // sign bit - result.fmt.sign = a.fmt.sign; - return result.i; -} -#endif - -} - namespace cv { -namespace opt_FP16 + +#if CV_SIMD + +static inline void vx_load_as(const uchar* ptr, v_float32& a) +{ a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(ptr))); } + +static inline void vx_load_as(const schar* ptr, v_float32& a) +{ a = v_cvt_f32(vx_load_expand_q(ptr)); } + +static inline void vx_load_as(const ushort* ptr, v_float32& a) +{ a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(ptr))); } + +static inline void vx_load_as(const short* ptr, v_float32& a) +{ a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(ptr))); } + +static inline void vx_load_as(const int* ptr, v_float32& a) +{ a = v_cvt_f32(vx_load(ptr)); } + +static inline void vx_load_as(const float* ptr, v_float32& a) +{ a = vx_load(ptr); } + +static inline void vx_load_as(const float16_t* ptr, v_float32& a) +{ a = vx_load_expand(ptr); } + +static inline void v_store_as(ushort* ptr, const v_float32& a) +{ v_pack_u_store(ptr, v_round(a)); } + +static inline void v_store_as(short* ptr, const v_float32& a) +{ v_pack_store(ptr, v_round(a)); } + +static inline void v_store_as(int* ptr, const v_float32& a) +{ v_store(ptr, v_round(a)); } + +static inline void v_store_as(float* ptr, const v_float32& a) +{ v_store(ptr, a); } + +static inline void v_store_as(float16_t* ptr, const v_float32& a) +{ v_pack_store(ptr, a); } + +static inline void vx_load_pair_as(const uchar* ptr, v_uint16& a, v_uint16& b) +{ v_expand(vx_load(ptr), a, b); } + +static inline void vx_load_pair_as(const schar* ptr, v_uint16& a, v_uint16& b) { -void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size ); -void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size ); + const v_int8 z = vx_setzero_s8(); + v_int16 sa, sb; + v_expand(v_max(vx_load(ptr), z), sa, sb); + a = v_reinterpret_as_u16(sa); + b = v_reinterpret_as_u16(sb); } -namespace opt_AVX2 + +static inline void vx_load_pair_as(const ushort* ptr, v_uint16& a, v_uint16& b) +{ a = vx_load(ptr); b = vx_load(ptr + v_uint16::nlanes); } + +static inline void vx_load_pair_as(const uchar* ptr, v_int16& a, v_int16& b) { -void cvtScale_s16s32f32Line_AVX2(const short* src, int* dst, float scale, float shift, int width); + v_uint16 ua, ub; + v_expand(vx_load(ptr), ua, ub); + a = v_reinterpret_as_s16(ua); + b = v_reinterpret_as_s16(ub); } -namespace opt_SSE4_1 + +static inline void vx_load_pair_as(const schar* ptr, v_int16& a, v_int16& b) +{ v_expand(vx_load(ptr), a, b); } + +static inline void vx_load_pair_as(const short* ptr, v_int16& a, v_int16& b) +{ a = vx_load(ptr); b = vx_load(ptr + v_uint16::nlanes); } + +static inline void vx_load_pair_as(const uchar* ptr, v_int32& a, v_int32& b) { - int cvtScale_SIMD_u8u16f32_SSE41(const uchar * src, ushort * dst, int width, float scale, float shift); - int cvtScale_SIMD_s8u16f32_SSE41(const schar * src, ushort * dst, int width, float scale, float shift); - int cvtScale_SIMD_u16u16f32_SSE41(const ushort * src, ushort * dst, int width, float scale, float shift); - int cvtScale_SIMD_s16u16f32_SSE41(const short * src, ushort * dst, int width, float scale, float shift); - int cvtScale_SIMD_s32u16f32_SSE41(const int * src, ushort * dst, int width, float scale, float shift); - int cvtScale_SIMD_f32u16f32_SSE41(const float * src, ushort 
* dst, int width, float scale, float shift); - int cvtScale_SIMD_f64u16f32_SSE41(const double * src, ushort * dst, int width, float scale, float shift); - int Cvt_SIMD_f64u16_SSE41(const double * src, ushort * dst, int width); + v_uint32 ua, ub; + v_expand(vx_load_expand(ptr), ua, ub); + a = v_reinterpret_as_s32(ua); + b = v_reinterpret_as_s32(ub); } + +static inline void vx_load_pair_as(const schar* ptr, v_int32& a, v_int32& b) +{ v_expand(vx_load_expand(ptr), a, b); } + +static inline void vx_load_pair_as(const ushort* ptr, v_int32& a, v_int32& b) +{ + v_uint32 ua, ub; + v_expand(vx_load(ptr), ua, ub); + a = v_reinterpret_as_s32(ua); + b = v_reinterpret_as_s32(ub); +} + +static inline void vx_load_pair_as(const short* ptr, v_int32& a, v_int32& b) +{ + v_expand(vx_load(ptr), a, b); +} + +static inline void vx_load_pair_as(const int* ptr, v_int32& a, v_int32& b) +{ + a = vx_load(ptr); + b = vx_load(ptr + v_int32::nlanes); +} + +static inline void vx_load_pair_as(const uchar* ptr, v_float32& a, v_float32& b) +{ + v_uint32 ua, ub; + v_expand(vx_load_expand(ptr), ua, ub); + a = v_cvt_f32(v_reinterpret_as_s32(ua)); + b = v_cvt_f32(v_reinterpret_as_s32(ub)); +} + +static inline void vx_load_pair_as(const schar* ptr, v_float32& a, v_float32& b) +{ + v_int32 ia, ib; + v_expand(vx_load_expand(ptr), ia, ib); + a = v_cvt_f32(ia); + b = v_cvt_f32(ib); +} + +static inline void vx_load_pair_as(const ushort* ptr, v_float32& a, v_float32& b) +{ + v_uint32 ua, ub; + v_expand(vx_load(ptr), ua, ub); + a = v_cvt_f32(v_reinterpret_as_s32(ua)); + b = v_cvt_f32(v_reinterpret_as_s32(ub)); +} + +static inline void vx_load_pair_as(const short* ptr, v_float32& a, v_float32& b) +{ + v_int32 ia, ib; + v_expand(vx_load(ptr), ia, ib); + a = v_cvt_f32(ia); + b = v_cvt_f32(ib); +} + +static inline void vx_load_pair_as(const int* ptr, v_float32& a, v_float32& b) +{ + v_int32 ia = vx_load(ptr), ib = vx_load(ptr + v_int32::nlanes); + a = v_cvt_f32(ia); + b = v_cvt_f32(ib); +} + +static inline void vx_load_pair_as(const float* ptr, v_float32& a, v_float32& b) +{ a = vx_load(ptr); b = vx_load(ptr + v_float32::nlanes); } + +//static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32& b) +//{ +// a = vx_load_expand(ptr); +// b = vx_load_expand(ptr + v_float32::nlanes); +//} + + +static inline void v_store_pair_as(uchar* ptr, const v_uint16& a, const v_uint16& b) +{ + v_store(ptr, v_pack(a, b)); +} + +static inline void v_store_pair_as(schar* ptr, const v_uint16& a, const v_uint16& b) +{ + const v_uint8 maxval = vx_setall_u8((uchar)std::numeric_limits::max()); + v_uint8 v = v_pack(a, b); + v_store(ptr, v_reinterpret_as_s8(v_min(v, maxval))); +} + +static inline void v_store_pair_as(ushort* ptr, const v_uint16& a, const v_uint16& b) +{ v_store(ptr, a); v_store(ptr + v_uint16::nlanes, b); } + +static inline void v_store_pair_as(uchar* ptr, const v_int16& a, const v_int16& b) +{ v_store(ptr, v_pack_u(a, b)); } + +static inline void v_store_pair_as(schar* ptr, const v_int16& a, const v_int16& b) +{ v_store(ptr, v_pack(a, b)); } + +static inline void v_store_pair_as(short* ptr, const v_int16& a, const v_int16& b) +{ v_store(ptr, a); v_store(ptr + v_int16::nlanes, b); } + +static inline void v_store_pair_as(uchar* ptr, const v_int32& a, const v_int32& b) +{ v_pack_u_store(ptr, v_pack(a, b)); } + +static inline void v_store_pair_as(schar* ptr, const v_int32& a, const v_int32& b) +{ v_pack_store(ptr, v_pack(a, b)); } + +static inline void v_store_pair_as(ushort* ptr, const v_int32& a, const v_int32& b) +{ 
v_store(ptr, v_pack_u(a, b)); } + +static inline void v_store_pair_as(short* ptr, const v_int32& a, const v_int32& b) +{ v_store(ptr, v_pack(a, b)); } + +static inline void v_store_pair_as(int* ptr, const v_int32& a, const v_int32& b) +{ + v_store(ptr, a); + v_store(ptr + v_int32::nlanes, b); +} + +static inline void v_store_pair_as(uchar* ptr, const v_float32& a, const v_float32& b) +{ v_pack_u_store(ptr, v_pack(v_round(a), v_round(b))); } + +static inline void v_store_pair_as(schar* ptr, const v_float32& a, const v_float32& b) +{ v_pack_store(ptr, v_pack(v_round(a), v_round(b))); } + +static inline void v_store_pair_as(ushort* ptr, const v_float32& a, const v_float32& b) +{ v_store(ptr, v_pack_u(v_round(a), v_round(b))); } + +static inline void v_store_pair_as(short* ptr, const v_float32& a, const v_float32& b) +{ v_store(ptr, v_pack(v_round(a), v_round(b))); } + +static inline void v_store_pair_as(int* ptr, const v_float32& a, const v_float32& b) +{ + v_int32 ia = v_round(a), ib = v_round(b); + v_store(ptr, ia); + v_store(ptr + v_int32::nlanes, ib); +} + +static inline void v_store_pair_as(float* ptr, const v_float32& a, const v_float32& b) +{ v_store(ptr, a); v_store(ptr + v_float32::nlanes, b); } + +#if CV_SIMD_64F + +static inline void vx_load_as(const double* ptr, v_float32& a) +{ + v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes); + a = v_cvt_f32(v0, v1); +} + +static inline void vx_load_pair_as(const double* ptr, v_int32& a, v_int32& b) +{ + v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes); + v_float64 v2 = vx_load(ptr + v_float64::nlanes*2), v3 = vx_load(ptr + v_float64::nlanes*3); + v_int32 iv0 = v_round(v0), iv1 = v_round(v1); + v_int32 iv2 = v_round(v2), iv3 = v_round(v3); + a = v_combine_low(iv0, iv1); + b = v_combine_low(iv2, iv3); +} + +static inline void vx_load_pair_as(const double* ptr, v_float32& a, v_float32& b) +{ + v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes); + v_float64 v2 = vx_load(ptr + v_float64::nlanes*2), v3 = vx_load(ptr + v_float64::nlanes*3); + a = v_cvt_f32(v0, v1); + b = v_cvt_f32(v2, v3); +} + +static inline void vx_load_pair_as(const uchar* ptr, v_float64& a, v_float64& b) +{ + v_int32 v0 = v_reinterpret_as_s32(vx_load_expand_q(ptr)); + a = v_cvt_f64(v0); + b = v_cvt_f64_high(v0); +} + +static inline void vx_load_pair_as(const schar* ptr, v_float64& a, v_float64& b) +{ + v_int32 v0 = vx_load_expand_q(ptr); + a = v_cvt_f64(v0); + b = v_cvt_f64_high(v0); +} + +static inline void vx_load_pair_as(const ushort* ptr, v_float64& a, v_float64& b) +{ + v_int32 v0 = v_reinterpret_as_s32(vx_load_expand(ptr)); + a = v_cvt_f64(v0); + b = v_cvt_f64_high(v0); +} + +static inline void vx_load_pair_as(const short* ptr, v_float64& a, v_float64& b) +{ + v_int32 v0 = vx_load_expand(ptr); + a = v_cvt_f64(v0); + b = v_cvt_f64_high(v0); +} + +static inline void vx_load_pair_as(const int* ptr, v_float64& a, v_float64& b) +{ + v_int32 v0 = vx_load(ptr); + a = v_cvt_f64(v0); + b = v_cvt_f64_high(v0); +} + +static inline void vx_load_pair_as(const float* ptr, v_float64& a, v_float64& b) +{ + v_float32 v0 = vx_load(ptr); + a = v_cvt_f64(v0); + b = v_cvt_f64_high(v0); +} + +static inline void vx_load_pair_as(const double* ptr, v_float64& a, v_float64& b) +{ + a = vx_load(ptr); + b = vx_load(ptr + v_float64::nlanes); +} + +//static inline void vx_load_pair_as(const float16_t* ptr, v_float64& a, v_float64& b) +//{ +// v_float32 v0 = vx_load_expand(ptr); +// a = v_cvt_f64(v0); +// b = v_cvt_f64_high(v0); +//} + +static inline 
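// [editor's note — illustration, not part of the patch] In this CV_SIMD_64F
// branch a v_float32/v_int32 register holds twice as many lanes as v_float64,
// which is why every "pair" load from double above reads four f64 registers
// and recombines their lanes. Composition example (valid given the helpers
// already defined above):
//
//   v_int32 a, b;
//   vx_load_pair_as(srcDouble, a, b);   // 4 x v_float64 loads + rounding
//   v_store_pair_as(dstInt, a, b);      // 2 x v_int32 stores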
void v_store_as(double* ptr, const v_float32& a) +{ + v_float64 fa0 = v_cvt_f64(a), fa1 = v_cvt_f64_high(a); + v_store(ptr, fa0); + v_store(ptr + v_float64::nlanes, fa1); +} + +static inline void v_store_pair_as(double* ptr, const v_int32& a, const v_int32& b) +{ + v_float64 fa0 = v_cvt_f64(a), fa1 = v_cvt_f64_high(a); + v_float64 fb0 = v_cvt_f64(b), fb1 = v_cvt_f64_high(b); + + v_store(ptr, fa0); + v_store(ptr + v_float64::nlanes, fa1); + v_store(ptr + v_float64::nlanes*2, fb0); + v_store(ptr + v_float64::nlanes*3, fb1); +} + +static inline void v_store_pair_as(double* ptr, const v_float32& a, const v_float32& b) +{ + v_float64 fa0 = v_cvt_f64(a), fa1 = v_cvt_f64_high(a); + v_float64 fb0 = v_cvt_f64(b), fb1 = v_cvt_f64_high(b); + + v_store(ptr, fa0); + v_store(ptr + v_float64::nlanes, fa1); + v_store(ptr + v_float64::nlanes*2, fb0); + v_store(ptr + v_float64::nlanes*3, fb1); +} + +static inline void v_store_pair_as(double* ptr, const v_float64& a, const v_float64& b) +{ + v_store(ptr, a); + v_store(ptr + v_float64::nlanes, b); +} + +static inline void v_store_pair_as(int* ptr, const v_float64& a, const v_float64& b) +{ + v_int32 ia = v_round(a), ib = v_round(b); + v_store(ptr, v_combine_low(ia, ib)); +} + +static inline void v_store_pair_as(float* ptr, const v_float64& a, const v_float64& b) +{ + v_float32 v = v_cvt_f32(a, b); + v_store(ptr, v); +} + +//static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_float64& b) +//{ +// v_float32 v = v_cvt_f32(a, b); +// v_pack_store(ptr, v); +//} + +#else + +static inline void vx_load_as(const double* ptr, v_float32& a) +{ + const int VECSZ = v_float32::nlanes; + float buf[VECSZ*2]; + + for( int i = 0; i < VECSZ; i++ ) + buf[i] = saturate_cast(ptr[i]); + a = vx_load(buf); +} + +template +static inline void vx_load_pair_as(const double* ptr, _Tdvec& a, _Tdvec& b) +{ + const int VECSZ = _Tdvec::nlanes; + typename _Tdvec::lane_type buf[VECSZ*2]; + + for( int i = 0; i < VECSZ*2; i++ ) + buf[i] = saturate_cast(ptr[i]); + a = vx_load(buf); + b = vx_load(buf + VECSZ); +} + +static inline void v_store_as(double* ptr, const v_float32& a) +{ + const int VECSZ = v_float32::nlanes; + float buf[VECSZ]; + + v_store(buf, a); + for( int i = 0; i < VECSZ; i++ ) + ptr[i] = (double)buf[i]; +} + +template +static inline void v_store_pair_as(double* ptr, const _Tsvec& a, const _Tsvec& b) +{ + const int VECSZ = _Tsvec::nlanes; + typename _Tsvec::lane_type buf[VECSZ*2]; + + v_store(buf, a); v_store(buf + VECSZ, b); + for( int i = 0; i < VECSZ*2; i++ ) + ptr[i] = (double)buf[i]; +} + +#endif /////////// CV_SIMD_64F + +#endif /////////// CV_SIMD + } #endif // SRC_CONVERT_HPP diff --git a/modules/core/src/convert.sse4_1.cpp b/modules/core/src/convert.sse4_1.cpp deleted file mode 100644 index 3c18063d1d..0000000000 --- a/modules/core/src/convert.sse4_1.cpp +++ /dev/null @@ -1,203 +0,0 @@ -// This file is part of OpenCV project. 
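// [editor's note — illustration, not part of the patch] The hand-written
// SSE4.1 kernels in this deleted file are superseded by ISA-agnostic loops
// built from the wide universal intrinsics plus the convert.hpp helpers above.
// A sketch of the equivalent uchar -> ushort scaling step (the real cvtScale
// templates in convert_scale.cpp may arrange the loop differently):
#if CV_SIMD
static inline void scale_u8u16_block_sketch(const uchar* src, ushort* dst,
                                            float scale, float shift)
{
    v_float32 vscale = vx_setall_f32(scale), vshift = vx_setall_f32(shift);
    v_float32 f0, f1;
    vx_load_pair_as(src, f0, f1);        // expand v_float32::nlanes*2 uchars
    f0 = v_muladd(f0, vscale, vshift);   // src*scale + shift
    f1 = v_muladd(f1, vscale, vshift);
    v_store_pair_as(dst, f0, f1);        // round, saturate and pack to ushort
}
#endif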
-// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html - - -#include "precomp.hpp" -#include "convert.hpp" - -namespace cv -{ -namespace opt_SSE4_1 -{ - -int cvtScale_SIMD_u8u16f32_SSE41(const uchar * src, ushort * dst, int width, float scale, float shift) -{ - int x = 0; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int cvtScale_SIMD_s8u16f32_SSE41(const schar * src, ushort * dst, int width, float scale, float shift) -{ - int x = 0; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int cvtScale_SIMD_u16u16f32_SSE41(const ushort * src, ushort * dst, int width, float scale, float shift) -{ - int x = 0; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int cvtScale_SIMD_s16u16f32_SSE41(const short * src, ushort * dst, int width, float scale, float shift) -{ - int x = 0; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int cvtScale_SIMD_s32u16f32_SSE41(const int 
* src, ushort * dst, int width, float scale, float shift) -{ - int x = 0; - - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); - - v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int cvtScale_SIMD_f32u16f32_SSE41(const float * src, ushort * dst, int width, float scale, float shift) -{ - int x = 0; - - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128 v_src = _mm_loadu_ps(src + x); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_loadu_ps(src + x + 4); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int cvtScale_SIMD_f64u16f32_SSE41(const double * src, ushort * dst, int width, float scale, float shift) -{ - int x = 0; - - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int Cvt_SIMD_f64u16_SSE41(const double * src, ushort * dst, int width) -{ - int x = 0; - - for ( ; x <= width - 8; x += 8) - { - __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); - __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); - __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); - __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); - - v_src0 = _mm_movelh_ps(v_src0, v_src1); - v_src1 = _mm_movelh_ps(v_src2, v_src3); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_src0), - _mm_cvtps_epi32(v_src1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -} -} // cv:: - -/* End of file. 
*/ diff --git a/modules/core/src/convert_scale.cpp b/modules/core/src/convert_scale.cpp index 25f5a963b7..0d4b5151a3 100644 --- a/modules/core/src/convert_scale.cpp +++ b/modules/core/src/convert_scale.cpp @@ -14,1623 +14,278 @@ namespace cv { -template -struct cvtScaleAbs_SIMD +template inline void +cvtabs_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, + Size size, float a, float b ) { - int operator () (const T *, DT *, int, WT, WT) const - { - return 0; - } -}; - -#if CV_SIMD128 - -static inline void v_load_expand_from_u8_f32(const uchar* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) -{ - v_uint32x4 v_src0, v_src1; - v_expand(v_load_expand(src), v_src0, v_src1); - - a = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src0)); - b = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src1)); -} - -static inline void v_load_expand_from_s8_f32(const schar* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) -{ - v_int32x4 v_src0, v_src1; - v_expand(v_load_expand(src), v_src0, v_src1); - - a = v_shift + v_scale * v_cvt_f32(v_src0); - b = v_shift + v_scale * v_cvt_f32(v_src1); -} - -static inline void v_load_expand_from_u16_f32(const ushort* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) -{ - v_uint32x4 v_src0, v_src1; - v_expand(v_load(src), v_src0, v_src1); - - a = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src0)); - b = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src1)); -} - -static inline void v_load_expand_from_s16_f32(const short* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) -{ - v_int32x4 v_src0, v_src1; - v_expand(v_load(src), v_src0, v_src1); - - a = v_shift + v_scale * v_cvt_f32(v_src0); - b = v_shift + v_scale * v_cvt_f32(v_src1); -} - -static inline void v_load_expand_from_s32_f32(const int* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) -{ - a = v_shift + v_scale * v_cvt_f32(v_load(src)); - b = v_shift + v_scale * v_cvt_f32(v_load(src + v_int32x4::nlanes)); -} - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const uchar * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift); - v_float32x4 v_scale = v_setall_f32(scale); - const int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst_0, v_dst_1, v_dst_2, v_dst_3; - v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_dst_0, v_dst_1); - v_load_expand_from_u8_f32(src + x + cWidth, v_scale, v_shift, v_dst_2, v_dst_3); - v_dst_0 = v_abs(v_dst_0); - v_dst_1 = v_abs(v_dst_1); - v_dst_2 = v_abs(v_dst_2); - v_dst_3 = v_abs(v_dst_3); - - v_int16x8 v_dsti_0 = v_pack(v_round(v_dst_0), v_round(v_dst_1)); - v_int16x8 v_dsti_1 = v_pack(v_round(v_dst_2), v_round(v_dst_3)); - v_store(dst + x, v_pack_u(v_dsti_0, v_dsti_1)); - } - } - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const schar * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift); - v_float32x4 v_scale = v_setall_f32(scale); - const int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth*2; x += cWidth*2) - { - v_float32x4 v_dst_0, v_dst_1, v_dst_2, v_dst_3; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_dst_0, v_dst_1); - 
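// [editor's note — illustration, not part of the patch] The new cvtabs_32f()
// kernel above replaces all of the per-type cvtScaleAbs_SIMD specializations
// below with one loop; its vector step most likely follows the same pattern as
// the other kernels, with an absolute value inserted before the saturating
// store:
//
//   v_float32 v0, v1;
//   vx_load_pair_as(src + j, v0, v1);
//   v0 = v_abs(v_muladd(v0, va, vb));
//   v1 = v_abs(v_muladd(v1, va, vb));
//   v_store_pair_as(dst + j, v0, v1);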
v_load_expand_from_s8_f32(src + x + cWidth, v_scale, v_shift, v_dst_2, v_dst_3); - v_dst_0 = v_abs(v_dst_0); - v_dst_1 = v_abs(v_dst_1); - v_dst_2 = v_abs(v_dst_2); - v_dst_3 = v_abs(v_dst_3); - - v_uint16x8 v_dsti_0 = v_pack_u(v_round(v_dst_0), v_round(v_dst_1)); - v_uint16x8 v_dsti_1 = v_pack_u(v_round(v_dst_2), v_round(v_dst_3)); - v_store(dst + x, v_pack(v_dsti_0, v_dsti_1)); - } - } - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const ushort * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift); - v_float32x4 v_scale = v_setall_f32(scale); - const int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_dst0, v_dst1; - v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_dst0, v_dst1); - v_dst0 = v_abs(v_dst0); - v_dst1 = v_abs(v_dst1); - - v_int16x8 v_dst = v_pack(v_round(v_dst0), v_round(v_dst1)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const short * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift); - v_float32x4 v_scale = v_setall_f32(scale); - const int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_dst0, v_dst1; - v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_dst0, v_dst1); - v_dst0 = v_abs(v_dst0); - v_dst1 = v_abs(v_dst1); - - v_int16x8 v_dst = v_pack(v_round(v_dst0), v_round(v_dst1)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const int * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - v_float32x4 v_shift = v_setall_f32(shift); - v_float32x4 v_scale = v_setall_f32(scale); - const int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst_0 = v_cvt_f32(v_load(src + x)) * v_scale; - v_dst_0 = v_abs(v_dst_0 + v_shift); - - v_float32x4 v_dst_1 = v_cvt_f32(v_load(src + x + cWidth)) * v_scale; - v_dst_1 = v_abs(v_dst_1 + v_shift); - - v_int16x8 v_dst = v_pack(v_round(v_dst_0), v_round(v_dst_1)); - v_pack_u_store(dst + x, v_dst); - } - - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const float * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - v_float32x4 v_shift = v_setall_f32(shift); - v_float32x4 v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst_0 = v_load(src + x) * v_scale; - v_dst_0 = v_abs(v_dst_0 + v_shift); - - v_float32x4 v_dst_1 = v_load(src + x + cWidth) * v_scale; - v_dst_1 = v_abs(v_dst_1 + v_shift); - - v_int16x8 v_dst = v_pack(v_round(v_dst_0), v_round(v_dst_1)); - v_pack_u_store(dst + x, v_dst); - } - return x; - } -}; - -#if CV_SIMD128_64F -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const double * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - - if (hasSIMD128()) - { - v_float32x4 v_scale = v_setall_f32(scale); - v_float32x4 v_shift = v_setall_f32(shift); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_src1, v_src2, v_dummy; - v_recombine(v_cvt_f32(v_load(src + x)), v_cvt_f32(v_load(src + x + cWidth)), v_src1, v_dummy); - 
v_recombine(v_cvt_f32(v_load(src + x + cWidth * 2)), v_cvt_f32(v_load(src + x + cWidth * 3)), v_src2, v_dummy); - - v_float32x4 v_dst1 = v_abs((v_src1 * v_scale) + v_shift); - v_float32x4 v_dst2 = v_abs((v_src2 * v_scale) + v_shift); - - v_int16x8 v_dst_i = v_pack(v_round(v_dst1), v_round(v_dst2)); - v_pack_u_store(dst + x, v_dst_i); - } - } - - return x; - } -}; -#endif // CV_SIMD128_64F - +#if CV_SIMD + v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b); + const int VECSZ = v_float32::nlanes*2; #endif - -template static void -cvtScaleAbs_( const T* src, size_t sstep, - DT* dst, size_t dstep, Size size, - WT scale, WT shift ) -{ - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - cvtScaleAbs_SIMD vop; - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = vop(src, dst, size.width, scale, shift); - - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) - { - DT t0, t1; - t0 = saturate_cast
<DT>(std::abs(src[x]*scale + shift));
 - t1 = saturate_cast<DT>(std::abs(src[x+1]*scale + shift));
 - dst[x] = t0; dst[x+1] = t1;
 - t0 = saturate_cast<DT>(std::abs(src[x+2]*scale + shift));
 - t1 = saturate_cast<DT>(std::abs(src[x+3]*scale + shift));
 - dst[x+2] = t0; dst[x+3] = t1;
 - }
 - #endif
 - for( ; x < size.width; x++ )
 - dst[x] = saturate_cast<DT>
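// [editor's note] saturate_cast<> clamps to the destination range instead of
// wrapping and rounds to nearest for float -> integer conversions, e.g.
// saturate_cast<uchar>(300) == 255 and saturate_cast<uchar>(-5.2f) == 0; the
// vectorized v_pack_u/v_round stores used by the new kernels keep the same
// semantics.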
(std::abs(src[x]*scale + shift)); - } -} - -template -struct cvtScale_SIMD -{ - int operator () (const T *, DT *, int, WT, WT) const - { - return 0; - } -}; - -#if CV_SIMD128 - -// from uchar - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::cvtScale_SIMD_u8u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, int * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_store(dst + x, v_round(v_src1)); - v_store(dst + x + cWidth, v_round(v_src2)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, float * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u8_f32(src + x, v_scale, 
v_shift, v_src1, v_src2); - - v_store(dst + x, v_src1); - v_store(dst + x + cWidth, v_src2); - } - } - return x; - } -}; - -// from schar - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::cvtScale_SIMD_s8u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, int * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_store(dst + x, v_round(v_src1)); - v_store(dst + x + cWidth, v_round(v_src2)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, float * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_store(dst + x, v_src1); - 
v_store(dst + x + cWidth, v_src2); - } - } - return x; - } -}; - -// from ushort - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::cvtScale_SIMD_u16u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, int * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_store(dst + x, v_round(v_src1)); - v_store(dst + x + cWidth, v_round(v_src2)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, float * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_store(dst + x, v_src1); - v_store(dst + x + cWidth, v_src2); - } - } - 
return x; - } -}; - -// from short - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::cvtScale_SIMD_s16u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, float * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_store(dst + x, v_src1); - v_store(dst + x + cWidth, v_src2); - } - } - return x; - } -}; - -// from int - -template <> -struct cvtScale_SIMD -{ - int operator () (const int * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template 
<> -struct cvtScale_SIMD -{ - int operator () (const int * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const int * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::cvtScale_SIMD_s32u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const int * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -#if CV_SIMD128_64F -template <> -struct cvtScale_SIMD -{ - int operator () (const int * src, int * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - double v_srcbuf[] = { (double)src[x], (double)src[x+1], (double)src[x+2], (double)src[x+3] }; - v_float64x2 v_src1 = v_shift + v_scale * v_load(v_srcbuf); - v_float64x2 v_src2 = v_shift + v_scale * v_load(v_srcbuf + 2); - v_store(dst + x, v_combine_low(v_round(v_src1), v_round(v_src2))); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const int * src, float * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - double v_srcbuf[] = { (double)src[x], (double)src[x+1], (double)src[x+2], (double)src[x+3] }; - v_float64x2 v_src1 = v_shift + v_scale * v_load(v_srcbuf); - v_float64x2 v_src2 = v_shift + v_scale * v_load(v_srcbuf + 2); - v_store(dst + x, v_combine_low(v_cvt_f32(v_src1), v_cvt_f32(v_src2))); - } - } - return x; - } -}; -#endif //CV_SIMD128_64F - -// from float - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst1 = v_shift + 
v_scale * v_load(src + x); - v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); - - v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); - v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); - - v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::cvtScale_SIMD_f32u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); - v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); - - v_uint16x8 v_dst = v_pack_u(v_round(v_dst1), v_round(v_dst2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); - v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); - - v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, int * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_store(dst + x, v_round(v_load(src + x) * v_scale + v_shift)); - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, float * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_store(dst + x, v_load(src + x) * v_scale + v_shift); - } - return x; - } -}; - -#if CV_SIMD128_64F - -static inline void v_load_scale_shift(const double* src, const v_float64x2& v_scale, const v_float64x2 &v_shift, v_float32x4& v_dst1, v_float32x4 &v_dst2) -{ - int cWidth = v_float64x2::nlanes; - v_float64x2 v_src1 = v_shift + v_scale * v_load(src); - v_float64x2 v_src2 = v_shift + v_scale * v_load(src + cWidth); - v_float64x2 v_src3 = v_shift + v_scale * v_load(src + cWidth * 2); - v_float64x2 v_src4 = v_shift + v_scale * v_load(src + cWidth * 3); - v_dst1 = v_combine_low(v_cvt_f32(v_src1), v_cvt_f32(v_src2)); - v_dst2 = 
v_combine_low(v_cvt_f32(v_src3), v_cvt_f32(v_src4)); -} - -static inline void v_store_scale_shift_s32_to_f64(double *dst, const v_float64x2 &v_scale, const v_float64x2 &v_shift, const v_int32x4 &v1, const v_int32x4 &v2) -{ - v_float64x2 v_dst1 = v_shift + v_scale * v_cvt_f64(v1); - v_float64x2 v_dst2 = v_shift + v_scale * v_cvt_f64_high(v1); - v_float64x2 v_dst3 = v_shift + v_scale * v_cvt_f64(v2); - v_float64x2 v_dst4 = v_shift + v_scale * v_cvt_f64_high(v2); - - v_store(dst, v_dst1); - v_store(dst + v_float64x2::nlanes, v_dst2); - v_store(dst + v_float64x2::nlanes * 2, v_dst3); - v_store(dst + v_float64x2::nlanes * 3, v_dst4); -} - -static inline void v_store_scale_shift_f32_to_f64(double *dst, const v_float64x2 &v_scale, const v_float64x2 &v_shift, const v_float32x4 &v1, const v_float32x4 &v2) -{ - v_float64x2 v_dst1 = v_shift + v_scale * v_cvt_f64(v1); - v_float64x2 v_dst2 = v_shift + v_scale * v_cvt_f64_high(v1); - v_float64x2 v_dst3 = v_shift + v_scale * v_cvt_f64(v2); - v_float64x2 v_dst4 = v_shift + v_scale * v_cvt_f64_high(v2); - - v_store(dst, v_dst1); - v_store(dst + v_float64x2::nlanes, v_dst2); - v_store(dst + v_float64x2::nlanes * 2, v_dst3); - v_store(dst + v_float64x2::nlanes * 3, v_dst4); -} - -// from double - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_dst1, v_dst2; - v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); - v_pack_u_store(dst + x, v_pack(v_round(v_dst1), v_round(v_dst2))); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_dst1, v_dst2; - v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); - v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); - v_pack_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::cvtScale_SIMD_f64u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_dst1, v_dst2; - v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); - v_uint16x8 v_dst = v_pack_u(v_round(v_dst1), v_round(v_dst2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_dst1, v_dst2; - v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); - v_int16x8 v_dst = 
v_pack(v_round(v_dst1), v_round(v_dst2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, int * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float64x2 v_src1 = v_shift + v_scale * v_load(src + x); - v_float64x2 v_src2 = v_shift + v_scale * v_load(src + x + cWidth); - - v_store(dst + x, v_combine_low(v_round(v_src1), v_round(v_src2))); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, float * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float64x2 v_src1 = v_shift + v_scale * v_load(src + x); - v_float64x2 v_src2 = v_shift + v_scale * v_load(src + x + cWidth); - v_float32x4 v_dst1 = v_cvt_f32(v_src1); - v_float32x4 v_dst2 = v_cvt_f32(v_src2); - - v_store(dst + x, v_combine_low(v_dst1, v_dst2)); - } - } - return x; - } -}; - -// to double - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_uint32x4 v_src1, v_src2; - v_expand(v_load_expand(src + x), v_src1, v_src2); - v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift - , v_reinterpret_as_s32(v_src1), v_reinterpret_as_s32(v_src2)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int32x4 v_src1, v_src2; - v_expand(v_load_expand(src + x), v_src1, v_src2); - v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_uint32x4 v_src1, v_src2; - v_expand(v_load(src + x), v_src1, v_src2); - v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift - , v_reinterpret_as_s32(v_src1), v_reinterpret_as_s32(v_src2)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int32x4 v_src1, v_src2; - v_expand(v_load(src + x), v_src1, v_src2); - v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); - } - } - return x; - } -}; - -template <> -struct 
cvtScale_SIMD -{ - int operator () (const int * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src1 = v_load(src + x); - v_int32x4 v_src2 = v_load(src + x + cWidth); - v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1 = v_load(src + x); - v_float32x4 v_src2 = v_load(src + x + cWidth); - v_store_scale_shift_f32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float64x2 v_src1 = v_shift + v_scale * v_load(src + x); - v_float64x2 v_src2 = v_shift + v_scale * v_load(src + x + cWidth); - v_store(dst + x, v_src1); - v_store(dst + x + cWidth, v_src2); - } - } - return x; - } -}; -#endif -#endif - -template static void -cvtScale_( const T* src, size_t sstep, - DT* dst, size_t dstep, Size size, - WT scale, WT shift ) -{ sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); - cvtScale_SIMD vop; - - for( ; size.height--; src += sstep, dst += dstep ) + for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) { - int x = vop(src, dst, size.width, scale, shift); - - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) + int j = 0; +#if CV_SIMD + for( ; j < size.width; j += VECSZ ) { - DT t0, t1; - t0 = saturate_cast
<DT>(src[x]*scale + shift);
-            t1 = saturate_cast<DT>(src[x+1]*scale + shift);
-            dst[x] = t0; dst[x+1] = t1;
-            t0 = saturate_cast<DT>(src[x+2]*scale + shift);
-            t1 = saturate_cast<DT>(src[x+3]*scale + shift);
-            dst[x+2] = t0; dst[x+3] = t1;
+            if( j > size.width - VECSZ )
+            {
+                if( j == 0 || src == (_Ts*)dst )
+                    break;
+                j = size.width - VECSZ;
+            }
+            v_float32 v0, v1;
+            vx_load_pair_as(src + j, v0, v1);
+            v0 = v_fma(v0, va, vb);
+            v1 = v_fma(v1, va, vb);
+            v_store_pair_as(dst + j, v_abs(v0), v_abs(v1));
         }
-        #endif
-
-        for( ; x < size.width; x++ )
-            dst[x] = saturate_cast<DT>(src[x]*scale + shift);
+#endif
+        for( ; j < size.width; j++ )
+            dst[j] = saturate_cast<_Td>(std::abs(src[j]*a + b));
+    }
 }
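The tail handling in the wide loop above is repeated by every new kernel in this patch, so it is worth spelling out: when fewer than VECSZ elements remain in a row, the index steps back to size.width - VECSZ and reprocesses a few already-converted elements instead of running a scalar epilogue. That is only valid when the row is at least one vector wide and the conversion is out of place, hence the j == 0 || src == (_Ts*)dst bail-out to the scalar loop. A minimal sketch of the idea, with a plain scalar transform standing in for the universal intrinsics (convert_batch/convert_row are illustrative names, not part of the patch):

#include <cmath>

enum { VECSZ = 8 };                      // stand-in for v_float32::nlanes*2

// stand-in for one SIMD batch (vx_load_pair_as + v_fma + v_store_pair_as)
static void convert_batch(const float* src, float* dst, float a, float b)
{
    for (int k = 0; k < VECSZ; k++)
        dst[k] = std::abs(src[k]*a + b);
}

static void convert_row(const float* src, float* dst, int width, float a, float b)
{
    int j = 0;
    for (; j < width; j += VECSZ)
    {
        if (j > width - VECSZ)           // partial batch left at the end of the row
        {
            if (j == 0 || src == dst)    // row shorter than one vector, or in-place:
                break;                   // let the scalar loop below finish it
            j = width - VECSZ;           // otherwise step back and overlap with
        }                                // elements that were already converted
        convert_batch(src + j, dst + j, a, b);
    }
    for (; j < width; j++)               // scalar fallback
        dst[j] = std::abs(src[j]*a + b);
}

Overlapping the last batch keeps the hot loop branch-light and avoids a per-element epilogue; the price is that the kernel must be a pure per-element transform, which all of the cvt*/cvtabs* routines here are.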
-template<> void
-cvtScale_<short, int, float>( const short* src, size_t sstep,
-           int* dst, size_t dstep, Size size,
-           float scale, float shift )
+// variant for conversions 16f <-> ... w/o unrolling
+template<typename _Ts, typename _Td> inline void
+cvtabs1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
+             Size size, float a, float b )
 {
+#if CV_SIMD
+    v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
+    const int VECSZ = v_float32::nlanes*2;
+#endif
     sstep /= sizeof(src[0]);
     dstep /= sizeof(dst[0]);

-    for( ; size.height--; src += sstep, dst += dstep )
+    for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
     {
-        int x = 0;
-        #if CV_TRY_AVX2
-        if (CV_CPU_HAS_SUPPORT_AVX2)
+        int j = 0;
+#if CV_SIMD
+        for( ; j < size.width; j += VECSZ )
         {
-            opt_AVX2::cvtScale_s16s32f32Line_AVX2(src, dst, scale, shift, size.width);
-            continue;
-        }
-        #endif
-        #if CV_SIMD128
-        if (hasSIMD128())
-        {
-            v_float32x4 v_shift = v_setall_f32(shift);
-            v_float32x4 v_scale = v_setall_f32(scale);
-            int cWidth = v_int32x4::nlanes;
-            for (; x <= size.width - cWidth * 2; x += cWidth * 2)
+            if( j > size.width - VECSZ )
             {
-                v_int16x8 v_src = v_load(src + x);
-                v_int32x4 v_src1, v_src2;
-                v_expand(v_src, v_src1, v_src2);
-                v_float32x4 v_tmp1 = v_cvt_f32(v_src1);
-                v_float32x4 v_tmp2 = v_cvt_f32(v_src2);
-
-                v_tmp1 = v_tmp1 * v_scale + v_shift;
-                v_tmp2 = v_tmp2 * v_scale + v_shift;
-
-                v_store(dst + x, v_round(v_tmp1));
-                v_store(dst + x + cWidth, v_round(v_tmp2));
+                if( j == 0 || src == (_Ts*)dst )
+                    break;
+                j = size.width - VECSZ;
             }
+            v_float32 v0;
+            vx_load_as(src + j, v0);
+            v0 = v_fma(v0, va, vb);
+            v_store_as(dst + j, v_abs(v0));
         }
-        #endif
-
-        for(; x < size.width; x++ )
-            dst[x] = saturate_cast<int>(src[x]*scale + shift);
+#endif
+        for( ; j < size.width; j++ )
+            dst[j] = saturate_cast<_Td>(src[j]*a + b);
     }
 }

+template<typename _Ts, typename _Td> inline void
+cvt_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
+         Size size, float a, float b )
+{
+#if CV_SIMD
+    v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
+    const int VECSZ = v_float32::nlanes*2;
+#endif
+    sstep /= sizeof(src[0]);
+    dstep /= sizeof(dst[0]);
+
+    for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
+    {
+        int j = 0;
+#if CV_SIMD
+        for( ; j < size.width; j += VECSZ )
+        {
+            if( j > size.width - VECSZ )
+            {
+                if( j == 0 || src == (_Ts*)dst )
+                    break;
+                j = size.width - VECSZ;
+            }
+            v_float32 v0, v1;
+            vx_load_pair_as(src + j, v0, v1);
+            v0 = v_fma(v0, va, vb);
+            v1 = v_fma(v1, va, vb);
+            v_store_pair_as(dst + j, v0, v1);
+        }
+#endif
+        for( ; j < size.width; j++ )
+            dst[j] = saturate_cast<_Td>(src[j]*a + b);
+    }
+}
+
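The scalar fallback saturate_cast<_Td>(src[j]*a + b) defines the semantics the SIMD paths have to reproduce: round to nearest, then clamp to the destination range. A small, self-contained illustration of that behaviour (values chosen arbitrarily; not part of the patch):

#include <opencv2/core.hpp>
#include <cstdio>

int main()
{
    const float a = 0.5f, b = 10.f;
    const float src[] = { 100.f, 700.f, -80.f };
    for (int i = 0; i < 3; i++)
    {
        // rounds to nearest, then clamps to [0, 255]
        unsigned char d = cv::saturate_cast<unsigned char>(src[i]*a + b);
        std::printf("%6.1f -> %3d\n", src[i], (int)d);
    }
    // prints: 100.0 -> 60, 700.0 -> 255 (clamped), -80.0 -> 0 (clamped)
    return 0;
}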
+// variant for conversions 16f <-> ... w/o unrolling
+template<typename _Ts, typename _Td> inline void
+cvt1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
+          Size size, float a, float b )
+{
+#if CV_SIMD
+    v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
+    const int VECSZ = v_float32::nlanes;
+#endif
+    sstep /= sizeof(src[0]);
+    dstep /= sizeof(dst[0]);
+
+    for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
+    {
+        int j = 0;
+#if CV_SIMD
+        for( ; j < size.width; j += VECSZ )
+        {
+            if( j > size.width - VECSZ )
+            {
+                if( j == 0 || src == (_Ts*)dst )
+                    break;
+                j = size.width - VECSZ;
+            }
+            v_float32 v0;
+            vx_load_as(src + j, v0);
+            v0 = v_fma(v0, va, vb);
+            v_store_as(dst + j, v0);
+        }
+#endif
+        for( ; j < size.width; j++ )
+            dst[j] = saturate_cast<_Td>(src[j]*a + b);
+    }
+}
+
+
+template<typename _Ts, typename _Td> inline void
+cvt_64f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
+         Size size, double a, double b )
+{
+#if CV_SIMD_64F
+    v_float64 va = vx_setall_f64(a), vb = vx_setall_f64(b);
+    const int VECSZ = v_float64::nlanes*2;
+#endif
+    sstep /= sizeof(src[0]);
+    dstep /= sizeof(dst[0]);
+
+    for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
+    {
+        int j = 0;
+#if CV_SIMD_64F
+        for( ; j < size.width; j += VECSZ )
+        {
+            if( j > size.width - VECSZ )
+            {
+                if( j == 0 || src == (_Ts*)dst )
+                    break;
+                j = size.width - VECSZ;
+            }
+            v_float64 v0, v1;
+            vx_load_pair_as(src + j, v0, v1);
+            v0 = v_fma(v0, va, vb);
+            v1 = v_fma(v1, va, vb);
+            v_store_pair_as(dst + j, v0, v1);
+        }
+#endif
+        for( ; j < size.width; j++ )
+            dst[j] = saturate_cast<_Td>(src[j]*a + b);
+    }
+}

 //==================================================================================================

-#define DEF_CVT_SCALE_ABS_FUNC(suffix, tfunc, stype, dtype, wtype) \
+#define DEF_CVT_SCALE_ABS_FUNC(suffix, cvt, stype, dtype, wtype) \
 static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
-                         dtype* dst, size_t dstep, Size size, double* scale) \
+                                  dtype* dst, size_t dstep, Size size, double* scale) \
 { \
-    tfunc(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \
+    cvt(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \
 }

-#define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \
+#define DEF_CVT_SCALE_FUNC(suffix, cvt, stype, dtype, wtype) \
 static void cvtScale##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
-dtype* dst, size_t dstep, Size size, double* scale) \
+                              dtype* dst, size_t dstep, Size size, double* scale) \
 { \
-    cvtScale_(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \
+    cvt(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \
 }

-DEF_CVT_SCALE_ABS_FUNC(8u, cvtScaleAbs_, uchar, uchar, float)
-DEF_CVT_SCALE_ABS_FUNC(8s8u, cvtScaleAbs_, schar, uchar, float)
-DEF_CVT_SCALE_ABS_FUNC(16u8u, cvtScaleAbs_, ushort, uchar, float)
-DEF_CVT_SCALE_ABS_FUNC(16s8u, cvtScaleAbs_, short, uchar, float)
-DEF_CVT_SCALE_ABS_FUNC(32s8u, cvtScaleAbs_, int, uchar, float)
-DEF_CVT_SCALE_ABS_FUNC(32f8u, cvtScaleAbs_, float, uchar, float)
-DEF_CVT_SCALE_ABS_FUNC(64f8u, cvtScaleAbs_, double, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(8u, cvtabs_32f, uchar, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(8s8u, cvtabs_32f, schar, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(16u8u, cvtabs_32f, ushort, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(16s8u, cvtabs_32f, short, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(32s8u, cvtabs_32f, int, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(32f8u, cvtabs_32f, float, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(64f8u, cvtabs_32f, double, uchar, float)
+DEF_CVT_SCALE_FUNC(8u,
cvt_32f, uchar, uchar, float) +DEF_CVT_SCALE_FUNC(8s8u, cvt_32f, schar, uchar, float) +DEF_CVT_SCALE_FUNC(16u8u, cvt_32f, ushort, uchar, float) +DEF_CVT_SCALE_FUNC(16s8u, cvt_32f, short, uchar, float) +DEF_CVT_SCALE_FUNC(32s8u, cvt_32f, int, uchar, float) +DEF_CVT_SCALE_FUNC(32f8u, cvt_32f, float, uchar, float) +DEF_CVT_SCALE_FUNC(64f8u, cvt_32f, double, uchar, float) +//DEF_CVT_SCALE_FUNC(16f8u, cvt_32f, float16_t, uchar, float) -DEF_CVT_SCALE_FUNC(8u, uchar, uchar, float) -DEF_CVT_SCALE_FUNC(8s8u, schar, uchar, float) -DEF_CVT_SCALE_FUNC(16u8u, ushort, uchar, float) -DEF_CVT_SCALE_FUNC(16s8u, short, uchar, float) -DEF_CVT_SCALE_FUNC(32s8u, int, uchar, float) -DEF_CVT_SCALE_FUNC(32f8u, float, uchar, float) -DEF_CVT_SCALE_FUNC(64f8u, double, uchar, float) +DEF_CVT_SCALE_FUNC(8u8s, cvt_32f, uchar, schar, float) +DEF_CVT_SCALE_FUNC(8s, cvt_32f, schar, schar, float) +DEF_CVT_SCALE_FUNC(16u8s, cvt_32f, ushort, schar, float) +DEF_CVT_SCALE_FUNC(16s8s, cvt_32f, short, schar, float) +DEF_CVT_SCALE_FUNC(32s8s, cvt_32f, int, schar, float) +DEF_CVT_SCALE_FUNC(32f8s, cvt_32f, float, schar, float) +DEF_CVT_SCALE_FUNC(64f8s, cvt_32f, double, schar, float) +//DEF_CVT_SCALE_FUNC(16f8s, cvt_32f, float16_t, schar, float) -DEF_CVT_SCALE_FUNC(8u8s, uchar, schar, float) -DEF_CVT_SCALE_FUNC(8s, schar, schar, float) -DEF_CVT_SCALE_FUNC(16u8s, ushort, schar, float) -DEF_CVT_SCALE_FUNC(16s8s, short, schar, float) -DEF_CVT_SCALE_FUNC(32s8s, int, schar, float) -DEF_CVT_SCALE_FUNC(32f8s, float, schar, float) -DEF_CVT_SCALE_FUNC(64f8s, double, schar, float) +DEF_CVT_SCALE_FUNC(8u16u, cvt_32f, uchar, ushort, float) +DEF_CVT_SCALE_FUNC(8s16u, cvt_32f, schar, ushort, float) +DEF_CVT_SCALE_FUNC(16u, cvt_32f, ushort, ushort, float) +DEF_CVT_SCALE_FUNC(16s16u, cvt_32f, short, ushort, float) +DEF_CVT_SCALE_FUNC(32s16u, cvt_32f, int, ushort, float) +DEF_CVT_SCALE_FUNC(32f16u, cvt_32f, float, ushort, float) +DEF_CVT_SCALE_FUNC(64f16u, cvt_32f, double, ushort, float) +//DEF_CVT_SCALE_FUNC(16f16u, cvt1_32f, float16_t, ushort, float) -DEF_CVT_SCALE_FUNC(8u16u, uchar, ushort, float) -DEF_CVT_SCALE_FUNC(8s16u, schar, ushort, float) -DEF_CVT_SCALE_FUNC(16u, ushort, ushort, float) -DEF_CVT_SCALE_FUNC(16s16u, short, ushort, float) -DEF_CVT_SCALE_FUNC(32s16u, int, ushort, float) -DEF_CVT_SCALE_FUNC(32f16u, float, ushort, float) -DEF_CVT_SCALE_FUNC(64f16u, double, ushort, float) +DEF_CVT_SCALE_FUNC(8u16s, cvt_32f, uchar, short, float) +DEF_CVT_SCALE_FUNC(8s16s, cvt_32f, schar, short, float) +DEF_CVT_SCALE_FUNC(16u16s, cvt_32f, ushort, short, float) +DEF_CVT_SCALE_FUNC(16s, cvt_32f, short, short, float) +DEF_CVT_SCALE_FUNC(32s16s, cvt_32f, int, short, float) +DEF_CVT_SCALE_FUNC(32f16s, cvt_32f, float, short, float) +DEF_CVT_SCALE_FUNC(64f16s, cvt_32f, double, short, float) +//DEF_CVT_SCALE_FUNC(16f16s, cvt1_32f, float16_t, short, float) -DEF_CVT_SCALE_FUNC(8u16s, uchar, short, float) -DEF_CVT_SCALE_FUNC(8s16s, schar, short, float) -DEF_CVT_SCALE_FUNC(16u16s, ushort, short, float) -DEF_CVT_SCALE_FUNC(16s, short, short, float) -DEF_CVT_SCALE_FUNC(32s16s, int, short, float) -DEF_CVT_SCALE_FUNC(32f16s, float, short, float) -DEF_CVT_SCALE_FUNC(64f16s, double, short, float) +DEF_CVT_SCALE_FUNC(8u32s, cvt_32f, uchar, int, float) +DEF_CVT_SCALE_FUNC(8s32s, cvt_32f, schar, int, float) +DEF_CVT_SCALE_FUNC(16u32s, cvt_32f, ushort, int, float) +DEF_CVT_SCALE_FUNC(16s32s, cvt_32f, short, int, float) +DEF_CVT_SCALE_FUNC(32s, cvt_64f, int, int, double) +DEF_CVT_SCALE_FUNC(32f32s, cvt_32f, float, int, float) +DEF_CVT_SCALE_FUNC(64f32s, cvt_64f, double, 
int, double) +//DEF_CVT_SCALE_FUNC(16f32s, cvt1_32f, float16_t, int, float) -DEF_CVT_SCALE_FUNC(8u32s, uchar, int, float) -DEF_CVT_SCALE_FUNC(8s32s, schar, int, float) -DEF_CVT_SCALE_FUNC(16u32s, ushort, int, float) -DEF_CVT_SCALE_FUNC(16s32s, short, int, float) -DEF_CVT_SCALE_FUNC(32s, int, int, double) -DEF_CVT_SCALE_FUNC(32f32s, float, int, float) -DEF_CVT_SCALE_FUNC(64f32s, double, int, double) +DEF_CVT_SCALE_FUNC(8u32f, cvt_32f, uchar, float, float) +DEF_CVT_SCALE_FUNC(8s32f, cvt_32f, schar, float, float) +DEF_CVT_SCALE_FUNC(16u32f, cvt_32f, ushort, float, float) +DEF_CVT_SCALE_FUNC(16s32f, cvt_32f, short, float, float) +DEF_CVT_SCALE_FUNC(32s32f, cvt_32f, int, float, float) +DEF_CVT_SCALE_FUNC(32f, cvt_32f, float, float, float) +DEF_CVT_SCALE_FUNC(64f32f, cvt_64f, double, float, double) +//DEF_CVT_SCALE_FUNC(16f32f, cvt1_32f, float16_t, float, float) -DEF_CVT_SCALE_FUNC(8u32f, uchar, float, float) -DEF_CVT_SCALE_FUNC(8s32f, schar, float, float) -DEF_CVT_SCALE_FUNC(16u32f, ushort, float, float) -DEF_CVT_SCALE_FUNC(16s32f, short, float, float) -DEF_CVT_SCALE_FUNC(32s32f, int, float, double) -DEF_CVT_SCALE_FUNC(32f, float, float, float) -DEF_CVT_SCALE_FUNC(64f32f, double, float, double) +DEF_CVT_SCALE_FUNC(8u64f, cvt_64f, uchar, double, double) +DEF_CVT_SCALE_FUNC(8s64f, cvt_64f, schar, double, double) +DEF_CVT_SCALE_FUNC(16u64f, cvt_64f, ushort, double, double) +DEF_CVT_SCALE_FUNC(16s64f, cvt_64f, short, double, double) +DEF_CVT_SCALE_FUNC(32s64f, cvt_64f, int, double, double) +DEF_CVT_SCALE_FUNC(32f64f, cvt_64f, float, double, double) +DEF_CVT_SCALE_FUNC(64f, cvt_64f, double, double, double) +//DEF_CVT_SCALE_FUNC(16f64f, cvt_64f, float16_t, double, double) -DEF_CVT_SCALE_FUNC(8u64f, uchar, double, double) -DEF_CVT_SCALE_FUNC(8s64f, schar, double, double) -DEF_CVT_SCALE_FUNC(16u64f, ushort, double, double) -DEF_CVT_SCALE_FUNC(16s64f, short, double, double) -DEF_CVT_SCALE_FUNC(32s64f, int, double, double) -DEF_CVT_SCALE_FUNC(32f64f, float, double, double) -DEF_CVT_SCALE_FUNC(64f, double, double, double) +/*DEF_CVT_SCALE_FUNC(8u16f, cvt1_32f, uchar, float16_t, float) +DEF_CVT_SCALE_FUNC(8s16f, cvt1_32f, schar, float16_t, float) +DEF_CVT_SCALE_FUNC(16u16f, cvt1_32f, ushort, float16_t, float) +DEF_CVT_SCALE_FUNC(16s16f, cvt1_32f, short, float16_t, float) +DEF_CVT_SCALE_FUNC(32s16f, cvt1_32f, int, float16_t, float) +DEF_CVT_SCALE_FUNC(32f16f, cvt1_32f, float, float16_t, float) +DEF_CVT_SCALE_FUNC(64f16f, cvt_64f, double, float16_t, double) +DEF_CVT_SCALE_FUNC(16f, cvt1_32f, float16_t, float16_t, float)*/ static BinaryFunc getCvtScaleAbsFunc(int depth) { @@ -1651,41 +306,44 @@ BinaryFunc getConvertScaleFunc(int sdepth, int ddepth) { (BinaryFunc)GET_OPTIMIZED(cvtScale8u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8u), - (BinaryFunc)cvtScale64f8u, 0 + (BinaryFunc)cvtScale64f8u, 0 //(BinaryFunc)cvtScale16f8u }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8s), - (BinaryFunc)cvtScale64f8s, 0 + (BinaryFunc)cvtScale64f8s, 0 //(BinaryFunc)cvtScale16f8s }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u16u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16s16u), 
(BinaryFunc)GET_OPTIMIZED(cvtScale32s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16u), - (BinaryFunc)cvtScale64f16u, 0 + (BinaryFunc)cvtScale64f16u, 0 //(BinaryFunc)cvtScale16f16u }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16s), - (BinaryFunc)cvtScale64f16s, 0 + (BinaryFunc)cvtScale64f16s, 0 //(BinaryFunc)cvtScale16f16s }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u32s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f32s), - (BinaryFunc)cvtScale64f32s, 0 + (BinaryFunc)cvtScale64f32s, 0 //(BinaryFunc)cvtScale16f32s }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32f), - (BinaryFunc)cvtScale64f32f, 0 + (BinaryFunc)cvtScale64f32f, 0 //(BinaryFunc)cvtScale16f32f }, { (BinaryFunc)cvtScale8u64f, (BinaryFunc)cvtScale8s64f, (BinaryFunc)cvtScale16u64f, (BinaryFunc)cvtScale16s64f, (BinaryFunc)cvtScale32s64f, (BinaryFunc)cvtScale32f64f, - (BinaryFunc)cvtScale64f, 0 + (BinaryFunc)cvtScale64f, 0 //(BinaryFunc)cvtScale16f64f }, { 0, 0, 0, 0, 0, 0, 0, 0 - } + /*(BinaryFunc)cvtScale8u16f, (BinaryFunc)cvtScale8s16f, (BinaryFunc)cvtScale16u16f, + (BinaryFunc)cvtScale16s16f, (BinaryFunc)cvtScale32s16f, (BinaryFunc)cvtScale32f16f, + (BinaryFunc)cvtScale64f16f, (BinaryFunc)cvtScale16f*/ + }, }; return cvtScaleTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)]; diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index a1409f0979..6666bc4253 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -1123,7 +1123,6 @@ template struct TheTest return *this; } -#if CV_FP16 TheTest & test_loadstore_fp16_f32() { printf("test_loadstore_fp16_f32 ...\n"); @@ -1133,14 +1132,14 @@ template struct TheTest AlignedData data_f32; data_f32.a.clear(); AlignedData out; - R r1 = vx_load_fp16_f32((short*)data.a.d); + R r1 = vx_load_expand((const cv::float16_t*)data.a.d); R r2(r1); EXPECT_EQ(1.0f, r1.get0()); vx_store(data_f32.a.d, r2); EXPECT_EQ(-2.0f, data_f32.a.d[R::nlanes - 1]); out.a.clear(); - vx_store_fp16((short*)out.a.d, r2); + v_pack_store((cv::float16_t*)out.a.d, r2); for (int i = 0; i < R::nlanes; ++i) { EXPECT_EQ(data.a[i], out.a[i]) << "i=" << i; @@ -1148,9 +1147,8 @@ template struct TheTest return *this; } -#endif -#if CV_SIMD_FP16 +#if 0 TheTest & test_loadstore_fp16() { printf("test_loadstore_fp16 ...\n"); @@ -1165,7 +1163,7 @@ template struct TheTest // check some initialization methods R r1 = data.u; - R r2 = vx_load_f16(data.a.d); + R r2 = vx_load_expand((const float16_t*)data.a.d); R r3(r2); EXPECT_EQ(data.u[0], r1.get0()); EXPECT_EQ(data.a[0], r2.get0()); diff --git a/modules/core/test/test_math.cpp b/modules/core/test/test_math.cpp index 68dfc3c969..610d16252a 100644 --- a/modules/core/test/test_math.cpp +++ b/modules/core/test/test_math.cpp @@ -3230,6 +3230,22 @@ softdouble naiveExp(softdouble x) } } +static float makeFP32(int sign, int exponent, int significand) +{ + Cv32suf x; + x.u = (unsigned)(((sign & 1) << 31) | ((exponent&255) << 23) | (significand & 0x7fffff)); + 
return x.f; +} + +static float makeRandomFP32(RNG& rng, int sign, int exprange) +{ + if( sign == -1 ) + sign = rng() % 2; + int exponent = rng() % exprange; + int significand = rng() % (1 << 23); + return makeFP32(sign, exponent, significand); +} + TEST(Core_SoftFloat, exp32) { //special cases @@ -3246,13 +3262,11 @@ TEST(Core_SoftFloat, exp32) inputs.push_back(softfloat::min()); for(int i = 0; i < 50000; i++) { - Cv32suf x; - x.fmt.sign = rng() % 2; - x.fmt.exponent = rng() % (10 + 127); //bigger exponent will produce inf - x.fmt.significand = rng() % (1 << 23); - if(softfloat(x.f) > ln_max) - x.f = rng.uniform(0.0f, (float)ln_max); - inputs.push_back(softfloat(x.f)); + float x = makeRandomFP32(rng, -1, 10+127 //bigger exponent will produce inf + ); + if(softfloat(x) > ln_max) + x = rng.uniform(0.0f, (float)ln_max); + inputs.push_back(softfloat(x)); } for(size_t i = 0; i < inputs.size(); i++) @@ -3323,11 +3337,7 @@ TEST(Core_SoftFloat, log32) EXPECT_TRUE(log(softfloat::nan()).isNaN()); for(int i = 0; i < nValues; i++) { - Cv32suf x; - x.fmt.sign = 1; - x.fmt.exponent = rng() % 255; - x.fmt.significand = rng() % (1 << 23); - softfloat x32(x.f); + softfloat x32(makeRandomFP32(rng, 1, 255)); ASSERT_TRUE(log(x32).isNaN()); } EXPECT_TRUE(log(softfloat::zero()).isInf()); @@ -3340,11 +3350,7 @@ TEST(Core_SoftFloat, log32) inputs.push_back(softfloat::max()); for(int i = 0; i < nValues; i++) { - Cv32suf x; - x.fmt.sign = 0; - x.fmt.exponent = rng() % 255; - x.fmt.significand = rng() % (1 << 23); - inputs.push_back(softfloat(x.f)); + inputs.push_back(softfloat(makeRandomFP32(rng, 0, 255))); } for(size_t i = 0; i < inputs.size(); i++) @@ -3426,11 +3432,7 @@ TEST(Core_SoftFloat, cbrt32) inputs.push_back(softfloat::min()); for(int i = 0; i < 50000; i++) { - Cv32suf x; - x.fmt.sign = rng() % 2; - x.fmt.exponent = rng() % 255; - x.fmt.significand = rng() % (1 << 23); - inputs.push_back(softfloat(x.f)); + inputs.push_back(softfloat(makeRandomFP32(rng, -1, 255))); } for(size_t i = 0; i < inputs.size(); i++) @@ -3522,11 +3524,8 @@ TEST(Core_SoftFloat, pow32) // inf ** y == inf, if y > 0 for(size_t i = 0; i < nValues; i++) { - Cv32suf x; - x.fmt.sign = 0; - x.fmt.exponent = rng() % 255; - x.fmt.significand = rng() % (1 << 23); - softfloat x32 = softfloat(x.f); + float x = makeRandomFP32(rng, 0, 255); + softfloat x32 = softfloat(x); ASSERT_TRUE(pow( inf, x32).isInf()); ASSERT_TRUE(pow(-inf, x32).isInf()); ASSERT_EQ(pow( inf, -x32), zero); @@ -3538,17 +3537,9 @@ TEST(Core_SoftFloat, pow32) // x ** y == nan, if x < 0 and y is not integer for(size_t i = 0; i < nValues; i++) { - Cv32suf x; - x.fmt.sign = 1; - x.fmt.exponent = rng() % 255; - x.fmt.significand = rng() % (1 << 23); - softfloat x32(x.f); - Cv32suf y; - y.fmt.sign = rng() % 2; - //bigger exponent produces integer numbers only - y.fmt.exponent = rng() % (23 + 127); - y.fmt.significand = rng() % (1 << 23); - softfloat y32(y.f); + softfloat x32(makeRandomFP32(rng, 1, 255)); + softfloat y32(makeRandomFP32(rng, -1, 23+127 //bigger exponent produces integer numbers only + )); int yi = cvRound(y32); if(y32 != softfloat(yi)) ASSERT_TRUE(pow(x32, y32).isNaN()); @@ -3565,11 +3556,7 @@ TEST(Core_SoftFloat, pow32) // 0 ** y == 0, if y > 0 for(size_t i = 0; i < nValues; i++) { - Cv32suf x; - x.fmt.sign = 0; - x.fmt.exponent = rng() % 255; - x.fmt.significand = rng() % (1 << 23); - softfloat x32(x.f); + softfloat x32(makeRandomFP32(rng, 0, 255)); ASSERT_TRUE(pow(zero, -x32).isInf()); if(x32 != one) { diff --git a/platforms/ios/build_framework.py 
b/platforms/ios/build_framework.py old mode 100644 new mode 100755 index 32305f9a08..d624e08d90 --- a/platforms/ios/build_framework.py +++ b/platforms/ios/build_framework.py @@ -183,7 +183,7 @@ class Builder: cmakecmd = self.getCMakeArgs(arch, target) + \ (["-DCMAKE_TOOLCHAIN_FILE=%s" % toolchain] if toolchain is not None else []) if target.lower().startswith("iphoneos"): - cmakecmd.append("-DENABLE_NEON=ON") + cmakecmd.append("-DCPU_BASELINE=NEON;FP16") cmakecmd.append(self.opencv) cmakecmd.extend(cmakeargs) execute(cmakecmd, cwd = builddir)
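A usage-level note (illustrative only, not part of the patch): the kernels above sit behind Mat::convertTo() and cv::convertScaleAbs(), so a typical call that now dispatches through cvt_32f / cvtabs_32f (via the cvtScale32f8u / cvtScaleAbs32f8u entries in the tables) looks like this:

#include <opencv2/core.hpp>

int main()
{
    cv::Mat src(480, 640, CV_32FC1);
    cv::randu(src, cv::Scalar::all(-1000), cv::Scalar::all(1000));

    // dst8u = saturate_cast<uchar>(src*alpha + beta), row by row
    cv::Mat dst8u;
    src.convertTo(dst8u, CV_8U, 0.1, 128.0);

    // same scaling, but the absolute value is taken before saturating
    cv::Mat dstAbs;
    cv::convertScaleAbs(src, dstAbs, 0.1, 128.0);
    return 0;
}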