diff --git a/cmake/OpenCVFindLibsVideo.cmake b/cmake/OpenCVFindLibsVideo.cmake index b2d927957f..6e1029c2c4 100644 --- a/cmake/OpenCVFindLibsVideo.cmake +++ b/cmake/OpenCVFindLibsVideo.cmake @@ -229,6 +229,18 @@ if(WITH_FFMPEG) find_library(FFMPEG_UTIL_LIB "avutil" HINTS "${FFMPEG_LIB_DIR}") find_library(FFMPEG_SWSCALE_LIB "swscale" HINTS "${FFMPEG_LIB_DIR}") find_library(FFMPEG_RESAMPLE_LIB "avresample" HINTS "${FFMPEG_LIB_DIR}") + if(FFMPEG_CODEC_LIB) + set(HAVE_FFMPEG_CODEC 1) + endif() + if(FFMPEG_FORMAT_LIB) + set(HAVE_FFMPEG_FORMAT 1) + endif() + if(FFMPEG_UTIL_LIB) + set(HAVE_FFMPEG_UTIL 1) + endif() + if(FFMPEG_SWSCALE_LIB) + set(HAVE_FFMPEG_SWSCALE 1) + endif() if(FFMPEG_CODEC_LIB AND FFMPEG_FORMAT_LIB AND FFMPEG_UTIL_LIB AND FFMPEG_SWSCALE_LIB) set(ALIASOF_libavcodec_VERSION "Unknown") diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp index 315d498c5a..45f3cef0ce 100644 --- a/modules/core/include/opencv2/core/mat.hpp +++ b/modules/core/include/opencv2/core/mat.hpp @@ -163,7 +163,8 @@ public: CUDA_HOST_MEM = 8 << KIND_SHIFT, CUDA_GPU_MAT = 9 << KIND_SHIFT, UMAT =10 << KIND_SHIFT, - STD_VECTOR_UMAT =11 << KIND_SHIFT + STD_VECTOR_UMAT =11 << KIND_SHIFT, + STD_BOOL_VECTOR =12 << KIND_SHIFT }; _InputArray(); @@ -173,6 +174,7 @@ public: _InputArray(const std::vector& vec); template _InputArray(const Mat_<_Tp>& m); template _InputArray(const std::vector<_Tp>& vec); + _InputArray(const std::vector& vec); template _InputArray(const std::vector >& vec); template _InputArray(const std::vector >& vec); template _InputArray(const _Tp* vec, int n); @@ -284,6 +286,7 @@ public: _OutputArray(cuda::HostMem& cuda_mem); template _OutputArray(cudev::GpuMat_<_Tp>& m); template _OutputArray(std::vector<_Tp>& vec); + _OutputArray(std::vector& vec); template _OutputArray(std::vector >& vec); template _OutputArray(std::vector >& vec); template _OutputArray(Mat_<_Tp>& m); @@ -340,6 +343,7 @@ public: _InputOutputArray(cuda::HostMem& cuda_mem); template _InputOutputArray(cudev::GpuMat_<_Tp>& m); template _InputOutputArray(std::vector<_Tp>& vec); + _InputOutputArray(std::vector& vec); template _InputOutputArray(std::vector >& vec); template _InputOutputArray(std::vector >& vec); template _InputOutputArray(Mat_<_Tp>& m); diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp index 535baa156d..3779b83f82 100644 --- a/modules/core/include/opencv2/core/mat.inl.hpp +++ b/modules/core/include/opencv2/core/mat.inl.hpp @@ -77,6 +77,10 @@ template inline _InputArray::_InputArray(const std::vector<_Tp>& vec) { init(FIXED_TYPE + STD_VECTOR + DataType<_Tp>::type + ACCESS_READ, &vec); } +inline +_InputArray::_InputArray(const std::vector& vec) +{ init(FIXED_TYPE + STD_BOOL_VECTOR + DataType::type + ACCESS_READ, &vec); } + template inline _InputArray::_InputArray(const std::vector >& vec) { init(FIXED_TYPE + STD_VECTOR_VECTOR + DataType<_Tp>::type + ACCESS_READ, &vec); } @@ -140,6 +144,10 @@ template inline _OutputArray::_OutputArray(std::vector<_Tp>& vec) { init(FIXED_TYPE + STD_VECTOR + DataType<_Tp>::type + ACCESS_WRITE, &vec); } +inline +_OutputArray::_OutputArray(std::vector&) +{ CV_Error(Error::StsUnsupportedFormat, "std::vector cannot be an output array\n"); } + template inline _OutputArray::_OutputArray(std::vector >& vec) { init(FIXED_TYPE + STD_VECTOR_VECTOR + DataType<_Tp>::type + ACCESS_WRITE, &vec); } @@ -227,6 +235,9 @@ template inline _InputOutputArray::_InputOutputArray(std::vector<_Tp>& vec) { 
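Usage sketch for the new std::vector<bool> overloads introduced above (illustrative helper name countSetFlags; assumes the matching getMat()/size() support for the new STD_BOOL_VECTOR kind, which lives elsewhere in matrix.cpp and is not shown in this hunk): a bit-packed vector<bool> can now be passed read-only wherever cv::InputArray is accepted, while output/in-out use is explicitly rejected.

#include <opencv2/core.hpp>
#include <vector>

// Hypothetical helper: counts the set flags of a std::vector<bool> passed as InputArray.
static int countSetFlags(cv::InputArray flags)
{
    cv::Mat m = flags.getMat();      // materialized as a CV_8U array of 0/1 values
    return cv::countNonZero(m);
}

// countSetFlags(std::vector<bool>{true, false, true, true}) == 3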
init(FIXED_TYPE + STD_VECTOR + DataType<_Tp>::type + ACCESS_RW, &vec); } +inline _InputOutputArray::_InputOutputArray(std::vector&) +{ CV_Error(Error::StsUnsupportedFormat, "std::vector cannot be an input/output array\n"); } + template inline _InputOutputArray::_InputOutputArray(std::vector >& vec) { init(FIXED_TYPE + STD_VECTOR_VECTOR + DataType<_Tp>::type + ACCESS_RW, &vec); } @@ -1464,13 +1475,15 @@ Mat_<_Tp> Mat_<_Tp>::operator()( const Range* ranges ) const template inline _Tp* Mat_<_Tp>::operator [](int y) { - return (_Tp*)ptr(y); + CV_DbgAssert( 0 <= y && y < rows ); + return (_Tp*)(data + y*step.p[0]); } template inline const _Tp* Mat_<_Tp>::operator [](int y) const { - return (const _Tp*)ptr(y); + CV_DbgAssert( 0 <= y && y < rows ); + return (const _Tp*)(data + y*step.p[0]); } template inline diff --git a/modules/core/include/opencv2/core/optim.hpp b/modules/core/include/opencv2/core/optim.hpp index 4f1749ec97..18e733f47b 100644 --- a/modules/core/include/opencv2/core/optim.hpp +++ b/modules/core/include/opencv2/core/optim.hpp @@ -63,9 +63,11 @@ public: class CV_EXPORTS Function { public: - virtual ~Function() {} - virtual double calc(const double* x) const = 0; - virtual void getGradient(const double* /*x*/,double* /*grad*/) {} + virtual ~Function() {} + virtual int getDims() const = 0; + virtual double getGradientEps() const; + virtual double calc(const double* x) const = 0; + virtual void getGradient(const double* x,double* grad); }; /** @brief Getter for the optimized function. diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 15b0759f77..1b8705983f 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -2781,15 +2781,23 @@ struct Div_SIMD } }; -#if CV_SSE2 +template +struct Recip_SIMD +{ + int operator() (const T *, T *, int, double) const + { + return 0; + } +}; -#if CV_SSE4_1 + +#if CV_SIMD128 template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const { @@ -2798,50 +2806,44 @@ struct Div_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); for ( ; x <= width - 8; x += 8) { - __m128i v_src1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(src1 + x)), v_zero); - __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); - __m128i v_src2 = _mm_unpacklo_epi8(_v_src2, v_zero); + v_uint16x8 v_src1 = v_load_expand(src1 + x); + v_uint16x8 v_src2 = v_load_expand(src2 + x); - __m128i v_src1i = _mm_unpacklo_epi16(v_src1, v_zero); - __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero); - __m128d v_src1d = _mm_cvtepi32_pd(v_src1i); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_uint32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); - v_src1i = _mm_unpackhi_epi16(v_src1, v_zero); - v_src2i = _mm_unpackhi_epi16(v_src2, 
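With getDims() now pure virtual on MinProblemSolver::Function, every user objective has to report its dimensionality; a minimal sketch of a conforming objective (illustrative class name Rosenbrock, not part of the patch) that relies on the new default numeric gradient:

#include <opencv2/core/optim.hpp>

class Rosenbrock : public cv::MinProblemSolver::Function
{
public:
    int getDims() const { return 2; }        // required by the updated interface
    double calc(const double* x) const       // f(x,y) = (1-x)^2 + 100*(y-x^2)^2
    {
        double a = 1 - x[0], b = x[1] - x[0]*x[0];
        return a*a + 100*b*b;
    }
    // getGradient()/getGradientEps() are inherited: central differences, eps = 1e-3.
};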
v_zero); - v_src1d = _mm_cvtepi32_pd(v_src1i); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero); - _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero))); + v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); + v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); } return x; } }; -#endif // CV_SSE4_1 template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const { @@ -2850,50 +2852,44 @@ struct Div_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); for ( ; x <= width - 8; x += 8) { - __m128i v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((const __m128i *)(src1 + x))), 8); - __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); - __m128i v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _v_src2), 8); + v_int16x8 v_src1 = v_load_expand(src1 + x); + v_int16x8 v_src2 = v_load_expand(src2 + x); - __m128i v_src1i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16); - __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16); - __m128d v_src1d = _mm_cvtepi32_pd(v_src1i); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_int32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); - v_src1i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16); - v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16); - v_src1d = _mm_cvtepi32_pd(v_src1i); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); - __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero); - _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, 
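The rewritten kernels all follow the same universal-intrinsic recipe (one source path compiled to SSE2 or NEON): widen, convert to float32, multiply by scale and divide, round, narrow with saturation, then blank the lanes whose divisor was zero. A self-contained sketch of that recipe for 8-bit unsigned data (illustrative function name div_u8_block; header path is the usual intrin.hpp location and may differ per OpenCV version):

#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>   // universal intrinsics (path per the OpenCV version in use)

// Illustrative helper, not part of the patch: scaled division of two uchar rows,
// mirroring Div_SIMD<uchar> above, with the scalar tail that div_i() provides in the real code.
static void div_u8_block(const uchar* a, const uchar* b, uchar* dst, int width, float scale)
{
    using namespace cv;
    int x = 0;
#if CV_SIMD128
    v_float32x4 v_scale = v_setall_f32(scale);
    v_uint16x8 v_zero16 = v_setzero_u16();
    for (; x <= width - 8; x += 8)
    {
        v_uint16x8 v_a = v_load_expand(a + x), v_b = v_load_expand(b + x);
        v_uint32x4 a0, a1, b0, b1;
        v_expand(v_a, a0, a1);
        v_expand(v_b, b0, b1);
        v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(a0)) * v_scale / v_cvt_f32(v_reinterpret_as_s32(b0));
        v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(a1)) * v_scale / v_cvt_f32(v_reinterpret_as_s32(b1));
        v_uint16x8 res = v_pack_u(v_round(f0), v_round(f1));
        res = v_select(v_b == v_zero16, v_zero16, res);    // x/0 == 0 convention
        v_pack_store(dst + x, res);
    }
#endif
    for (; x < width; x++)                                  // scalar tail
        dst[x] = b[x] != 0 ? saturate_cast<uchar>(a[x]*scale/b[x]) : (uchar)0;
}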
_mm_packs_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero))); + v_float32x4 f2 = v_cvt_f32(t2); + v_float32x4 f3 = v_cvt_f32(t3); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); } return x; } }; -#if CV_SSE4_1 template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const { @@ -2902,49 +2898,43 @@ struct Div_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); for ( ; x <= width - 8; x += 8) { - __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + v_uint16x8 v_src1 = v_load(src1 + x); + v_uint16x8 v_src2 = v_load(src2 + x); - __m128i v_src1i = _mm_unpacklo_epi16(v_src1, v_zero); - __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero); - __m128d v_src1d = _mm_cvtepi32_pd(v_src1i); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_uint32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); - v_src1i = _mm_unpackhi_epi16(v_src1, v_zero); - v_src2i = _mm_unpackhi_epi16(v_src2, v_zero); - v_src1d = _mm_cvtepi32_pd(v_src1i); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero); - _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi32(v_dst_0, v_dst_1))); + v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); + v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); } return x; } }; -#endif // CV_SSE4_1 - template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const { @@ -2953,36 +2943,32 @@ struct Div_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + 
v_int16x8 v_zero = v_setzero_s16(); for ( ; x <= width - 8; x += 8) { - __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + v_int16x8 v_src1 = v_load(src1 + x); + v_int16x8 v_src2 = v_load(src2 + x); - __m128i v_src1i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16); - __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16); - __m128d v_src1d = _mm_cvtepi32_pd(v_src1i); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_int32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); - v_src1i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16); - v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16); - v_src1d = _mm_cvtepi32_pd(v_src1i); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); - __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero); - _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi32(v_dst_0, v_dst_1))); + v_float32x4 f2 = v_cvt_f32(t2); + v_float32x4 f3 = v_cvt_f32(t3); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); } return x; @@ -2993,7 +2979,7 @@ template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const { @@ -3002,100 +2988,82 @@ struct Div_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int32x4 v_zero = v_setzero_s32(); - for ( ; x <= width - 4; x += 4) + for ( ; x <= width - 8; x += 8) { - __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + v_int32x4 t0 = v_load(src1 + x); + v_int32x4 t1 = v_load(src1 + x + 4); + v_int32x4 t2 = v_load(src2 + x); + v_int32x4 t3 = v_load(src2 + x + 4); - __m128d v_src1d = _mm_cvtepi32_pd(v_src1); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + v_float32x4 f2 = v_cvt_f32(t2); + v_float32x4 f3 = v_cvt_f32(t3); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + f0 = f0 * v_scale / f2; + 
f1 = f1 * v_scale / f3; - __m128i v_dst = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); - __m128i v_mask = _mm_cmpeq_epi32(v_src2, v_zero); - _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, v_dst)); + v_int32x4 res0 = v_round(f0), res1 = v_round(f1); + + res0 = v_select(t2 == v_zero, v_zero, res0); + res1 = v_select(t3 == v_zero, v_zero, res1); + v_store(dst + x, res0); + v_store(dst + x + 4, res1); } return x; } }; -#endif -template static void -div_( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size size, double scale ) +template <> +struct Div_SIMD { - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - Div_SIMD vop; - - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) + int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const { - int i = vop(src1, src2, dst, size.width, scale); - #if CV_ENABLE_UNROLLED - for( ; i <= size.width - 4; i += 4 ) + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_float32x4 v_zero = v_setzero_f32(); + + for ( ; x <= width - 8; x += 8) { - if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 ) - { - double a = (double)src2[i] * src2[i+1]; - double b = (double)src2[i+2] * src2[i+3]; - double d = scale/(a * b); - b *= d; - a *= d; + v_float32x4 f0 = v_load(src1 + x); + v_float32x4 f1 = v_load(src1 + x + 4); + v_float32x4 f2 = v_load(src2 + x); + v_float32x4 f3 = v_load(src2 + x + 4); - T z0 = saturate_cast(src2[i+1] * ((double)src1[i] * b)); - T z1 = saturate_cast(src2[i] * ((double)src1[i+1] * b)); - T z2 = saturate_cast(src2[i+3] * ((double)src1[i+2] * a)); - T z3 = saturate_cast(src2[i+2] * ((double)src1[i+3] * a)); + v_float32x4 res0 = f0 * v_scale / f2; + v_float32x4 res1 = f1 * v_scale / f3; - dst[i] = z0; dst[i+1] = z1; - dst[i+2] = z2; dst[i+3] = z3; - } - else - { - T z0 = src2[i] != 0 ? saturate_cast(src1[i]*scale/src2[i]) : 0; - T z1 = src2[i+1] != 0 ? saturate_cast(src1[i+1]*scale/src2[i+1]) : 0; - T z2 = src2[i+2] != 0 ? saturate_cast(src1[i+2]*scale/src2[i+2]) : 0; - T z3 = src2[i+3] != 0 ? saturate_cast(src1[i+3]*scale/src2[i+3]) : 0; + res0 = v_select(f2 == v_zero, v_zero, res0); + res1 = v_select(f3 == v_zero, v_zero, res1); - dst[i] = z0; dst[i+1] = z1; - dst[i+2] = z2; dst[i+3] = z3; - } + v_store(dst + x, res0); + v_store(dst + x + 4, res1); } - #endif - for( ; i < size.width; i++ ) - dst[i] = src2[i] != 0 ? 
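Behavioural reference for the div_*/recip_* kernels: cv::divide() computes saturate_cast<T>(src1*scale/src2) and, by OpenCV convention, writes 0 wherever the divisor is 0, which is exactly what the v_select(divisor == 0, ...) masks and the scalar `denom != 0 ? ... : 0` tails implement. A small sketch:

#include <opencv2/core.hpp>
#include <cstdio>

int main()
{
    cv::Mat a = (cv::Mat_<uchar>(1, 4) << 10, 20, 30, 40);
    cv::Mat b = (cv::Mat_<uchar>(1, 4) <<  2,  0,  3,  8);
    cv::Mat c, d;
    cv::divide(a, b, c, 2.0);    // c = a*2/b   ->  10 0 20 10  (zero where b is zero)
    cv::divide(16.0, b, d);      // reciprocal path: d = 16/b  ->  8 0 5 2
    std::printf("%d %d\n", (int)c.at<uchar>(0, 1), (int)d.at<uchar>(0, 2));   // prints "0 5"
    return 0;
}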
saturate_cast(src1[i]*scale/src2[i]) : 0; - } -} -template -struct Recip_SIMD -{ - int operator() (const T *, T *, int, double) const - { - return 0; + return x; } }; -#if CV_SSE2 -#if CV_SSE4_1 +///////////////////////// RECIPROCAL ////////////////////// template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const uchar * src2, uchar * dst, int width, double scale) const { @@ -3104,43 +3072,39 @@ struct Recip_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); for ( ; x <= width - 8; x += 8) { - __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); - __m128i v_src2 = _mm_unpacklo_epi8(_v_src2, v_zero); + v_uint16x8 v_src2 = v_load_expand(src2 + x); - __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_uint32x4 t0, t1; + v_expand(v_src2, t0, t1); - v_src2i = _mm_unpackhi_epi16(v_src2, v_zero); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero); - _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero))); + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); } return x; } }; -#endif // CV_SSE4_1 template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const schar * src2, schar * dst, int width, double scale) const { @@ -3149,43 +3113,39 @@ struct Recip_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); for ( ; x <= width - 8; x += 8) { - __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); - __m128i v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _v_src2), 8); + v_int16x8 v_src2 = v_load_expand(src2 + x); - __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_int32x4 t0, t1; + v_expand(v_src2, 
t0, t1); - v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); - __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero); - _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero))); + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); } return x; } }; -#if CV_SSE4_1 template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const ushort * src2, ushort * dst, int width, double scale) const { @@ -3194,42 +3154,38 @@ struct Recip_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); for ( ; x <= width - 8; x += 8) { - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + v_uint16x8 v_src2 = v_load(src2 + x); - __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_uint32x4 t0, t1; + v_expand(v_src2, t0, t1); - v_src2i = _mm_unpackhi_epi16(v_src2, v_zero); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero); - _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi32(v_dst_0, v_dst_1))); + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); } return x; } }; -#endif // CV_SSE4_1 - template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const short * src2, short * dst, int width, double scale) const { @@ -3238,29 +3194,27 @@ struct Recip_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); for ( ; x <= width - 8; x += 8) { - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + v_int16x8 v_src2 = v_load(src2 + x); - 
__m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_int32x4 t0, t1; + v_expand(v_src2, t0, t1); - v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); - __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero); - _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi32(v_dst_0, v_dst_1))); + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); } return x; @@ -3271,7 +3225,7 @@ template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const int * src2, int * dst, int width, double scale) const { @@ -3280,22 +3234,136 @@ struct Recip_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int32x4 v_zero = v_setzero_s32(); + + for ( ; x <= width - 8; x += 8) + { + v_int32x4 t0 = v_load(src2 + x); + v_int32x4 t1 = v_load(src2 + x + 4); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 res0 = v_round(f0), res1 = v_round(f1); + + res0 = v_select(t0 == v_zero, v_zero, res0); + res1 = v_select(t1 == v_zero, v_zero, res1); + v_store(dst + x, res0); + v_store(dst + x + 4, res1); + } + + return x; + } +}; + + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const float * src2, float * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_float32x4 v_zero = v_setzero_f32(); + + for ( ; x <= width - 8; x += 8) + { + v_float32x4 f0 = v_load(src2 + x); + v_float32x4 f1 = v_load(src2 + x + 4); + + v_float32x4 res0 = v_scale / f0; + v_float32x4 res1 = v_scale / f1; + + res0 = v_select(f0 == v_zero, v_zero, res0); + res1 = v_select(f1 == v_zero, v_zero, res1); + + v_store(dst + x, res0); + v_store(dst + x + 4, res1); + } + + return x; + } +}; + +#if CV_SIMD128_64F + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float64x2 v_scale = v_setall_f64(scale); + v_float64x2 v_zero = v_setzero_f64(); for ( ; x <= width - 4; x += 4) { - __m128i v_src2 = _mm_loadu_si128((const __m128i 
*)(src2 + x)); + v_float64x2 f0 = v_load(src1 + x); + v_float64x2 f1 = v_load(src1 + x + 2); + v_float64x2 f2 = v_load(src2 + x); + v_float64x2 f3 = v_load(src2 + x + 2); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + v_float64x2 res0 = f0 * v_scale / f2; + v_float64x2 res1 = f1 * v_scale / f3; - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + res0 = v_select(f0 == v_zero, v_zero, res0); + res1 = v_select(f1 == v_zero, v_zero, res1); - __m128i v_dst = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); - __m128i v_mask = _mm_cmpeq_epi32(v_src2, v_zero); - _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, v_dst)); + v_store(dst + x, res0); + v_store(dst + x + 2, res1); + } + + return x; + } +}; + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const double * src2, double * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float64x2 v_scale = v_setall_f64(scale); + v_float64x2 v_zero = v_setzero_f64(); + + for ( ; x <= width - 4; x += 4) + { + v_float64x2 f0 = v_load(src2 + x); + v_float64x2 f1 = v_load(src2 + x + 2); + + v_float64x2 res0 = v_scale / f0; + v_float64x2 res1 = v_scale / f1; + + res0 = v_select(f0 == v_zero, v_zero, res0); + res1 = v_select(f1 == v_zero, v_zero, res1); + + v_store(dst + x, res0); + v_store(dst + x + 2, res1); } return x; @@ -3304,10 +3372,78 @@ struct Recip_SIMD #endif +#endif + template static void -recip_( const T*, size_t, const T* src2, size_t step2, - T* dst, size_t step, Size size, double scale ) +div_i( const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, Size size, double scale ) { + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Div_SIMD vop; + float scale_f = (float)scale; + + for( ; size.height--; src1 += step1, src2 += step2, dst += step ) + { + int i = vop(src1, src2, dst, size.width, scale); + for( ; i < size.width; i++ ) + { + T num = src1[i], denom = src2[i]; + dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; + } + } +} + +template static void +div_f( const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, Size size, double scale ) +{ + T scale_f = (T)scale; + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Div_SIMD vop; + + for( ; size.height--; src1 += step1, src2 += step2, dst += step ) + { + int i = vop(src1, src2, dst, size.width, scale); + for( ; i < size.width; i++ ) + { + T num = src1[i], denom = src2[i]; + dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; + } + } +} + +template static void +recip_i( const T*, size_t, const T* src2, size_t step2, + T* dst, size_t step, Size size, double scale ) +{ + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Recip_SIMD vop; + float scale_f = (float)scale; + + for( ; size.height--; src2 += step2, dst += step ) + { + int i = vop(src2, dst, size.width, scale); + for( ; i < size.width; i++ ) + { + T denom = src2[i]; + dst[i] = denom != 0 ? 
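One consistency note on the CV_SIMD128_64F Div_SIMD<double> specialization above: to match the scalar fallback (`dst = denom != 0 ? num*scale/denom : 0`) and the float specialization, the lanes to blank out are those where the divisor f2/f3 is zero rather than the dividend f0/f1; a corrected sketch of the two selects would read:

res0 = v_select(f2 == v_zero, v_zero, res0);   // mask on the divisor, not the dividend
res1 = v_select(f3 == v_zero, v_zero, res1);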
saturate_cast(scale_f/denom) : (T)0; + } + } +} + +template static void +recip_f( const T*, size_t, const T* src2, size_t step2, + T* dst, size_t step, Size size, double scale ) +{ + T scale_f = (T)scale; step2 /= sizeof(src2[0]); step /= sizeof(dst[0]); @@ -3316,39 +3452,11 @@ recip_( const T*, size_t, const T* src2, size_t step2, for( ; size.height--; src2 += step2, dst += step ) { int i = vop(src2, dst, size.width, scale); - #if CV_ENABLE_UNROLLED - for( ; i <= size.width - 4; i += 4 ) - { - if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 ) - { - double a = (double)src2[i] * src2[i+1]; - double b = (double)src2[i+2] * src2[i+3]; - double d = scale/(a * b); - b *= d; - a *= d; - - T z0 = saturate_cast(src2[i+1] * b); - T z1 = saturate_cast(src2[i] * b); - T z2 = saturate_cast(src2[i+3] * a); - T z3 = saturate_cast(src2[i+2] * a); - - dst[i] = z0; dst[i+1] = z1; - dst[i+2] = z2; dst[i+3] = z3; - } - else - { - T z0 = src2[i] != 0 ? saturate_cast(scale/src2[i]) : 0; - T z1 = src2[i+1] != 0 ? saturate_cast(scale/src2[i+1]) : 0; - T z2 = src2[i+2] != 0 ? saturate_cast(scale/src2[i+2]) : 0; - T z3 = src2[i+3] != 0 ? saturate_cast(scale/src2[i+3]) : 0; - - dst[i] = z0; dst[i+1] = z1; - dst[i+2] = z2; dst[i+3] = z3; - } - } - #endif for( ; i < size.width; i++ ) - dst[i] = src2[i] != 0 ? saturate_cast(scale/src2[i]) : 0; + { + T denom = src2[i]; + dst[i] = denom != 0 ? saturate_cast(scale_f/denom) : (T)0; + } } } @@ -3459,87 +3567,87 @@ static void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t st uchar* dst, size_t step, Size sz, void* scale) { if( src1 ) - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); else - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, Size sz, void* scale) { - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, Size sz, void* scale) { - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, Size sz, void* scale) { - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, Size sz, void* scale) { - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, Size sz, void* scale) { - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, Size sz, void* scale) { - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_f(src1, 
step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } diff --git a/modules/core/src/conjugate_gradient.cpp b/modules/core/src/conjugate_gradient.cpp index 90353cc7fd..1259cc9756 100644 --- a/modules/core/src/conjugate_gradient.cpp +++ b/modules/core/src/conjugate_gradient.cpp @@ -46,6 +46,25 @@ namespace cv { + double MinProblemSolver::Function::getGradientEps() const { return 1e-3; } + void MinProblemSolver::Function::getGradient(const double* x, double* grad) + { + double eps = getGradientEps(); + int i, n = getDims(); + AutoBuffer x_buf(n); + double* x_ = x_buf; + for( i = 0; i < n; i++ ) + x_[i] = x[i]; + for( i = 0; i < n; i++ ) + { + x_[i] = x[i] + eps; + double y1 = calc(x_); + x_[i] = x[i] - eps; + double y0 = calc(x_); + grad[i] = (y1 - y0)/(2*eps); + x_[i] = x[i]; + } + } #define SEC_METHOD_ITERATIONS 4 #define INITIAL_SEC_METHOD_SIGMA 0.1 diff --git a/modules/core/src/downhill_simplex.cpp b/modules/core/src/downhill_simplex.cpp index 261bf33c3f..a0cc1320b8 100644 --- a/modules/core/src/downhill_simplex.cpp +++ b/modules/core/src/downhill_simplex.cpp @@ -40,8 +40,13 @@ //M*/ #include "precomp.hpp" +#if 0 +#define dprintf(x) printf x +#define print_matrix(x) print(x) +#else #define dprintf(x) #define print_matrix(x) +#endif /* @@ -51,14 +56,14 @@ Downhill Simplex method in OpenCV dev 3.0.0 getting this error: OpenCV Error: Assertion failed (dims <= 2 && data && (unsigned)i0 < (unsigned)(s ize.p[0] * size.p[1]) && elemSize() == (((((DataType<_Tp>::type) & ((512 - 1) << 3)) >> 3) + 1) << 
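Usage sketch for the new default gradient shown above: getGradient() now falls back to central differences, grad[i] ~= (f(x + eps*e_i) - f(x - eps*e_i)) / (2*eps) with eps = getGradientEps() = 1e-3, so a derivative-free objective is enough for ConjGradSolver (illustrative names Sphere and minimizeSphere, not part of the patch):

#include <opencv2/core/optim.hpp>

struct Sphere : public cv::MinProblemSolver::Function
{
    int getDims() const { return 2; }
    double calc(const double* x) const { return x[0]*x[0] + x[1]*x[1]; }
    // no getGradient() override: the numeric default above is used
};

static double minimizeSphere()
{
    cv::Ptr<cv::ConjGradSolver> solver = cv::ConjGradSolver::create(cv::makePtr<Sphere>());
    cv::Mat x = (cv::Mat_<double>(1, 2) << 3.0, -4.0);
    return solver->minimize(x);      // x is driven towards (0, 0); returns the minimum value
}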
((((sizeof(size_t)/4+1)16384|0x3a50) ->> ((DataType<_Tp>::typ e) & ((1 << 3) - 1))2) & 3))) in cv::Mat::at, +>> ((DataType<_Tp>::typ e) & ((1 << 3) - 1))2) & 3))) in Mat::at, file C:\builds\master_PackSlave-w in32-vc12-shared\opencv\modules\core\include\opencv2/core/mat.inl.hpp, line 893 ****Problem and Possible Fix********************************************************************************************************* DownhillSolverImpl::innerDownhillSimplex something looks broken here: Mat_ coord_sum(1,ndim,0.0),buf(1,ndim,0.0),y(1,ndim,0.0); -nfunk = 0; +fcount = 0; for(i=0;icalc(p[i]); @@ -135,275 +140,326 @@ multiple lines in three dimensions as not all lines intersect in three dimension namespace cv { - class DownhillSolverImpl : public DownhillSolver +class DownhillSolverImpl : public DownhillSolver +{ +public: + DownhillSolverImpl() { - public: - void getInitStep(OutputArray step) const; - void setInitStep(InputArray step); - Ptr getFunction() const; - void setFunction(const Ptr& f); - TermCriteria getTermCriteria() const; - DownhillSolverImpl(); - void setTermCriteria(const TermCriteria& termcrit); - double minimize(InputOutputArray x); - protected: - Ptr _Function; - TermCriteria _termcrit; - Mat _step; - Mat_ buf_x; - - private: - inline void createInitialSimplex(Mat_& simplex,Mat& step); - inline double innerDownhillSimplex(cv::Mat_& p,double MinRange,double MinError,int& nfunk, - const Ptr& f,int nmax); - inline double tryNewPoint(Mat_& p,Mat_& y,Mat_& coord_sum,const Ptr& f,int ihi, - double fac,Mat_& ptry); - }; - - double DownhillSolverImpl::tryNewPoint( - Mat_& p, - Mat_& y, - Mat_& coord_sum, - const Ptr& f, - int ihi, - double fac, - Mat_& ptry - ) - { - int ndim=p.cols; - int j; - double fac1,fac2,ytry; - - fac1=(1.0-fac)/ndim; - fac2=fac1-fac; - for (j=0;jcalc(ptry.ptr()); - if (ytry < y(ihi)) - { - y(ihi)=ytry; - for (j=0;j(); + _step=Mat_(); } - /* - Performs the actual minimization of MinProblemSolver::Function f (after the initialization was done) - - The matrix p[ndim+1][1..ndim] represents ndim+1 vertices that - form a simplex - each row is an ndim vector. - On output, nfunk gives the number of function evaluations taken. - */ - double DownhillSolverImpl::innerDownhillSimplex( - cv::Mat_& p, - double MinRange, - double MinError, - int& nfunk, - const Ptr& f, - int nmax - ) + void getInitStep(OutputArray step) const { _step.copyTo(step); } + void setInitStep(InputArray step) { - int ndim=p.cols; - double res; - int i,ihi,ilo,inhi,j,mpts=ndim+1; - double error, range,ysave,ytry; - Mat_ coord_sum(1,ndim,0.0),buf(1,ndim,0.0),y(1,ndim+1,0.0); - - nfunk = 0; - - for(i=0;icalc(p[i]); - } - - nfunk = ndim+1; - - reduce(p,coord_sum,0,CV_REDUCE_SUM); - - for (;;) - { - ilo=0; - /* find highest (worst), next-to-worst, and lowest - (best) points by going through all of them. */ - ihi = y(0)>y(1) ? (inhi=1,0) : (inhi=0,1); - for (i=0;i y(ihi)) - { - inhi=ihi; - ihi=i; - } - else if (y(i) > y(inhi) && i != ihi) - inhi=i; - } - - /* check stop criterion */ - error=fabs(y(ihi)-y(ilo)); - range=0; - for(i=0;i p(j,i) ) min = p(j,i); - if( max < p(j,i) ) max = p(j,i); - } - d = fabs(max-min); - if(range < d) range = d; - } - - if(range <= MinRange || error <= MinError) - { /* Put best point and value in first slot. */ - std::swap(y(0),y(ilo)); - for (i=0;i= nmax){ - dprintf(("nmax exceeded\n")); - return y(ilo); - } - nfunk += 2; - /*Begin a new iteration. 
First, reflect the worst point about the centroid of others */ - ytry = tryNewPoint(p,y,coord_sum,f,ihi,-1.0,buf); - if (ytry <= y(ilo)) - { /*If that's better than the best point, go twice as far in that direction*/ - ytry = tryNewPoint(p,y,coord_sum,f,ihi,2.0,buf); - } - else if (ytry >= y(inhi)) - { /* The new point is worse than the second-highest, but better - than the worst so do not go so far in that direction */ - ysave = y(ihi); - ytry = tryNewPoint(p,y,coord_sum,f,ihi,0.5,buf); - if (ytry >= ysave) - { /* Can't seem to improve things. Contract the simplex to good point - in hope to find a simplex landscape. */ - for (i=0;icalc(coord_sum.ptr()); - } - } - nfunk += ndim; - reduce(p,coord_sum,0,CV_REDUCE_SUM); - } - } else --(nfunk); /* correct nfunk */ - dprintf(("this is simplex on iteration %d\n",nfunk)); - print_matrix(p); - } /* go to next iteration. */ - res = y(0); - - return res; + // set dimensionality and make a deep copy of step + Mat m = step.getMat(); + dprintf(("m.cols=%d\nm.rows=%d\n", m.cols, m.rows)); + if( m.rows == 1 ) + m.copyTo(_step); + else + transpose(m, _step); } - void DownhillSolverImpl::createInitialSimplex(Mat_& simplex,Mat& step){ - for(int i=1;i<=step.cols;++i) - { - simplex.row(0).copyTo(simplex.row(i)); - simplex(i,i-1)+= 0.5*step.at(0,i-1); - } - simplex.row(0) -= 0.5*step; + Ptr getFunction() const { return _Function; } - dprintf(("this is simplex\n")); - print_matrix(simplex); + void setFunction(const Ptr& f) { _Function=f; } + + TermCriteria getTermCriteria() const { return _termcrit; } + + void setTermCriteria( const TermCriteria& termcrit ) + { + CV_Assert( termcrit.type == (TermCriteria::MAX_ITER + TermCriteria::EPS) && + termcrit.epsilon > 0 && + termcrit.maxCount > 0 ); + _termcrit=termcrit; } - double DownhillSolverImpl::minimize(InputOutputArray x){ + double minimize( InputOutputArray x_ ) + { dprintf(("hi from minimize\n")); - CV_Assert(_Function.empty()==false); + CV_Assert( !_Function.empty() ); + CV_Assert( std::min(_step.cols, _step.rows) == 1 && + std::max(_step.cols, _step.rows) >= 2 && + _step.type() == CV_64FC1 ); dprintf(("termcrit:\n\ttype: %d\n\tmaxCount: %d\n\tEPS: %g\n",_termcrit.type,_termcrit.maxCount,_termcrit.epsilon)); dprintf(("step\n")); print_matrix(_step); - Mat x_mat=x.getMat(); - CV_Assert(MIN(x_mat.rows,x_mat.cols)==1); - CV_Assert(MAX(x_mat.rows,x_mat.cols)==_step.cols); - CV_Assert(x_mat.type()==CV_64FC1); - - Mat_ proxy_x; - - if(x_mat.rows>1){ - buf_x.create(1,_step.cols); - Mat_ proxy(_step.cols,1,buf_x.ptr()); - x_mat.copyTo(proxy); - proxy_x=buf_x; - }else{ - proxy_x=x_mat; - } - - int count=0; - int ndim=_step.cols; - Mat_ simplex=Mat_(ndim+1,ndim,0.0); - - simplex.row(0).copyTo(proxy_x); - createInitialSimplex(simplex,_step); - double res = innerDownhillSimplex( - simplex,_termcrit.epsilon, _termcrit.epsilon, count,_Function,_termcrit.maxCount); - simplex.row(0).copyTo(proxy_x); + Mat x = x_.getMat(), simplex; + createInitialSimplex(x, simplex, _step); + int count = 0; + double res = innerDownhillSimplex(simplex,_termcrit.epsilon, _termcrit.epsilon, + count, _termcrit.maxCount); dprintf(("%d iterations done\n",count)); - if(x_mat.rows>1){ - Mat(x_mat.rows, 1, CV_64F, proxy_x.ptr()).copyTo(x); + if( !x.empty() ) + { + Mat simplex_0m(x.rows, x.cols, CV_64F, simplex.ptr()); + simplex_0m.convertTo(x, x.type()); + } + else + { + int x_type = x_.fixedType() ? 
x_.type() : CV_64F; + simplex.row(0).convertTo(x_, x_type); } return res; } - DownhillSolverImpl::DownhillSolverImpl(){ - _Function=Ptr(); - _step=Mat_(); - } - Ptr DownhillSolverImpl::getFunction()const{ - return _Function; - } - void DownhillSolverImpl::setFunction(const Ptr& f){ - _Function=f; - } - TermCriteria DownhillSolverImpl::getTermCriteria()const{ - return _termcrit; - } - void DownhillSolverImpl::setTermCriteria(const TermCriteria& termcrit){ - CV_Assert(termcrit.type==(TermCriteria::MAX_ITER+TermCriteria::EPS) && termcrit.epsilon>0 && termcrit.maxCount>0); - _termcrit=termcrit; - } - // both minRange & minError are specified by termcrit.epsilon; In addition, user may specify the number of iterations that the algorithm does. - Ptr DownhillSolver::create(const Ptr& f, InputArray initStep, TermCriteria termcrit){ - Ptr DS = makePtr(); - DS->setFunction(f); - DS->setInitStep(initStep); - DS->setTermCriteria(termcrit); - return DS; - } - void DownhillSolverImpl::getInitStep(OutputArray step)const{ - _step.copyTo(step); - } - void DownhillSolverImpl::setInitStep(InputArray step){ - //set dimensionality and make a deep copy of step - Mat m=step.getMat(); - dprintf(("m.cols=%d\nm.rows=%d\n",m.cols,m.rows)); - CV_Assert(MIN(m.cols,m.rows)==1 && m.type()==CV_64FC1); - if(m.rows==1){ - m.copyTo(_step); - }else{ - transpose(m,_step); +protected: + Ptr _Function; + TermCriteria _termcrit; + Mat _step; + + inline void updateCoordSum(const Mat& p, Mat& coord_sum) + { + int i, j, m = p.rows, n = p.cols; + double* coord_sum_ = coord_sum.ptr(); + CV_Assert( coord_sum.cols == n && coord_sum.rows == 1 ); + + for( j = 0; j < n; j++ ) + coord_sum_[j] = 0.; + + for( i = 0; i < m; i++ ) + { + const double* p_i = p.ptr(i); + for( j = 0; j < n; j++ ) + coord_sum_[j] += p_i[j]; } + + dprintf(("\nupdated coord sum:\n")); + print_matrix(coord_sum); + } + + inline void createInitialSimplex( const Mat& x0, Mat& simplex, Mat& step ) + { + int i, j, ndim = step.cols; + CV_Assert( _Function->getDims() == ndim ); + Mat x = x0; + if( x0.empty() ) + x = Mat::zeros(1, ndim, CV_64F); + CV_Assert( (x.cols == 1 && x.rows == ndim) || (x.cols == ndim && x.rows == 1) ); + CV_Assert( x.type() == CV_32F || x.type() == CV_64F ); + + simplex.create(ndim + 1, ndim, CV_64F); + Mat simplex_0m(x.rows, x.cols, CV_64F, simplex.ptr()); + + x.convertTo(simplex_0m, CV_64F); + double* simplex_0 = simplex.ptr(); + const double* step_ = step.ptr(); + for( i = 1; i <= ndim; i++ ) + { + double* simplex_i = simplex.ptr(i); + for( j = 0; j < ndim; j++ ) + simplex_i[j] = simplex_0[j]; + simplex_i[i-1] += 0.5*step_[i-1]; + } + for( j = 0; j < ndim; j++ ) + simplex_0[j] -= 0.5*step_[j]; + + dprintf(("\nthis is simplex\n")); + print_matrix(simplex); + } + + /* + Performs the actual minimization of MinProblemSolver::Function f (after the initialization was done) + + The matrix p[ndim+1][1..ndim] represents ndim+1 vertices that + form a simplex - each row is an ndim vector. + On output, fcount gives the number of function evaluations taken. + */ + double innerDownhillSimplex( Mat& p, double MinRange, double MinError, int& fcount, int nmax ) + { + int i, j, ndim = p.cols; + Mat coord_sum(1, ndim, CV_64F), buf(1, ndim, CV_64F), y(1, ndim+1, CV_64F); + double* y_ = y.ptr(); + + fcount = ndim+1; + for( i = 0; i <= ndim; i++ ) + y_[i] = calc_f(p.ptr(i)); + + updateCoordSum(p, coord_sum); + + for (;;) + { + // find highest (worst), next-to-worst, and lowest + // (best) points by going through all of them. 
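A concrete instance of createInitialSimplex(), for illustration: with x0 = (0, 0) and step = (1, 1), the ndim+1 = 3 rows of `simplex` become p0 = x0 - 0.5*step = (-0.5, -0.5), p1 = x0 + 0.5*step[0]*e0 = (0.5, 0) and p2 = x0 + 0.5*step[1]*e1 = (0, 0.5), i.e. one vertex per row clustered around the starting point; updateCoordSum() then maintains the per-column sums that tryNewPoint() uses to reflect the worst vertex about the centroid of the others.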
+ int ilo = 0, ihi, inhi; + if( y_[0] > y_[1] ) + { + ihi = 0; inhi = 1; + } + else + { + ihi = 1; inhi = 0; + } + for( i = 0; i <= ndim; i++ ) + { + double yval = y_[i]; + if (yval <= y_[ilo]) + ilo = i; + if (yval > y_[ihi]) + { + inhi = ihi; + ihi = i; + } + else if (yval > y_[inhi] && i != ihi) + inhi = i; + } + CV_Assert( ihi != inhi ); + if( ilo == inhi || ilo == ihi ) + { + for( i = 0; i <= ndim; i++ ) + { + double yval = y_[i]; + if( yval == y_[ilo] && i != ihi && i != inhi ) + { + ilo = i; + break; + } + } + } + dprintf(("\nthis is y on iteration %d:\n",fcount)); + print_matrix(y); + + // check stop criterion + double error = fabs(y_[ihi] - y_[ilo]); + double range = 0; + for( j = 0; j < ndim; j++ ) + { + double minval, maxval; + minval = maxval = p.at(0, j); + for( i = 1; i <= ndim; i++ ) + { + double pval = p.at(i, j); + minval = std::min(minval, pval); + maxval = std::max(maxval, pval); + } + range = std::max(range, fabs(maxval - minval)); + } + + if( range <= MinRange || error <= MinError || fcount >= nmax ) + { + // Put best point and value in first slot. + std::swap(y_[0], y_[ilo]); + for( j = 0; j < ndim; j++ ) + { + std::swap(p.at(0, j), p.at(ilo, j)); + } + break; + } + + double y_lo = y_[ilo], y_nhi = y_[inhi], y_hi = y_[ihi]; + // Begin a new iteration. First, reflect the worst point about the centroid of others + double alpha = -1.0; + double y_alpha = tryNewPoint(p, coord_sum, ihi, alpha, buf, fcount); + + dprintf(("\ny_lo=%g, y_nhi=%g, y_hi=%g, y_alpha=%g, p_alpha:\n", y_lo, y_nhi, y_hi, y_alpha)); + print_matrix(buf); + + if( y_alpha < y_nhi ) + { + if( y_alpha < y_lo ) + { + // If that's better than the best point, go twice as far in that direction + double beta = -2.0; + double y_beta = tryNewPoint(p, coord_sum, ihi, beta, buf, fcount); + dprintf(("\ny_beta=%g, p_beta:\n", y_beta)); + print_matrix(buf); + if( y_beta < y_alpha ) + { + alpha = beta; + y_alpha = y_beta; + } + } + replacePoint(p, coord_sum, y, ihi, alpha, y_alpha); + } + else + { + // The new point is worse than the second-highest, + // do not go so far in that direction + double gamma = 0.5; + double y_gamma = tryNewPoint(p, coord_sum, ihi, gamma, buf, fcount); + dprintf(("\ny_gamma=%g, p_gamma:\n", y_gamma)); + print_matrix(buf); + if( y_gamma < y_hi ) + replacePoint(p, coord_sum, y, ihi, gamma, y_gamma); + else + { + // Can't seem to improve things. + // Contract the simplex to good point + // in hope to find a simplex landscape. 
+ for( i = 0; i <= ndim; i++ ) + { + if (i != ilo) + { + for( j = 0; j < ndim; j++ ) + p.at(i, j) = 0.5*(p.at(i, j) + p.at(ilo, j)); + y_[i] = calc_f(p.ptr(i)); + } + } + fcount += ndim; + updateCoordSum(p, coord_sum); + } + } + dprintf(("\nthis is simplex on iteration %d\n",fcount)); + print_matrix(p); + } + return y_[0]; + } + + inline double calc_f(const double* ptr) + { + double res = _Function->calc(ptr); + CV_Assert( !cvIsNaN(res) && !cvIsInf(res) ); + return res; + } + + double tryNewPoint( Mat& p, Mat& coord_sum, int ihi, double alpha_, Mat& ptry, int& fcount ) + { + int j, ndim = p.cols; + + double alpha = (1.0 - alpha_)/ndim; + double beta = alpha - alpha_; + double* p_ihi = p.ptr(ihi); + double* ptry_ = ptry.ptr(); + double* coord_sum_ = coord_sum.ptr(); + + for( j = 0; j < ndim; j++ ) + ptry_[j] = coord_sum_[j]*alpha - p_ihi[j]*beta; + + fcount++; + return calc_f(ptry_); + } + + void replacePoint( Mat& p, Mat& coord_sum, Mat& y, int ihi, double alpha_, double ytry ) + { + int j, ndim = p.cols; + + double alpha = (1.0 - alpha_)/ndim; + double beta = alpha - alpha_; + double* p_ihi = p.ptr(ihi); + double* coord_sum_ = coord_sum.ptr(); + + for( j = 0; j < ndim; j++ ) + p_ihi[j] = coord_sum_[j]*alpha - p_ihi[j]*beta; + y.at(ihi) = ytry; + updateCoordSum(p, coord_sum); + } +}; + + +// both minRange & minError are specified by termcrit.epsilon; +// In addition, user may specify the number of iterations that the algorithm does. +Ptr DownhillSolver::create( const Ptr& f, + InputArray initStep, TermCriteria termcrit ) +{ + Ptr DS = makePtr(); + DS->setFunction(f); + DS->setInitStep(initStep); + DS->setTermCriteria(termcrit); + return DS; +} + } diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index 1291278209..e3fd0e4c40 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -60,7 +60,6 @@ namespace cv #undef USE_IPP_DFT #endif - /****************************************************************************************\ Discrete Fourier Transform \****************************************************************************************/ @@ -1090,11 +1089,12 @@ RealDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab, } } - if( complex_output && (n & 1) == 0 ) + if( complex_output && ((n & 1) == 0 || n == 1)) { dst[-1] = dst[0]; dst[0] = 0; - dst[n] = 0; + if( n > 1 ) + dst[n] = 0; } } @@ -2426,6 +2426,47 @@ static bool ocl_dft_amdfft(InputArray _src, OutputArray _dst, int flags) #endif // HAVE_CLAMDFFT +namespace cv +{ +static void complementComplexOutput(Mat& dst, int len, int dft_dims) +{ + int i, n = dst.cols; + size_t elem_size = dst.elemSize1(); + if( elem_size == sizeof(float) ) + { + float* p0 = dst.ptr(); + size_t dstep = dst.step/sizeof(p0[0]); + for( i = 0; i < len; i++ ) + { + float* p = p0 + dstep*i; + float* q = dft_dims == 1 || i == 0 || i*2 == len ? p : p0 + dstep*(len-i); + + for( int j = 1; j < (n+1)/2; j++ ) + { + p[(n-j)*2] = q[j*2]; + p[(n-j)*2+1] = -q[j*2+1]; + } + } + } + else + { + double* p0 = dst.ptr(); + size_t dstep = dst.step/sizeof(p0[0]); + for( i = 0; i < len; i++ ) + { + double* p = p0 + dstep*i; + double* q = dft_dims == 1 || i == 0 || i*2 == len ? 
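Usage sketch for the rewritten solver: the public factory is unchanged, only the internals moved into the impl class, so the usual create/initStep/minimize flow still applies (illustrative names Paraboloid and runDownhill, not part of the patch):

#include <opencv2/core/optim.hpp>

struct Paraboloid : public cv::MinProblemSolver::Function
{
    int getDims() const { return 2; }
    double calc(const double* x) const { return (x[0] - 1)*(x[0] - 1) + x[1]*x[1]; }
};

static double runDownhill()
{
    cv::Mat initStep = (cv::Mat_<double>(1, 2) << 0.5, 0.5);   // initial step per dimension
    cv::Ptr<cv::DownhillSolver> solver = cv::DownhillSolver::create(
        cv::makePtr<Paraboloid>(), initStep,
        cv::TermCriteria(cv::TermCriteria::MAX_ITER + cv::TermCriteria::EPS, 5000, 1e-10));
    cv::Mat x = (cv::Mat_<double>(1, 2) << -3.0, 7.0);
    return solver->minimize(x);      // x converges towards (1, 0); returns the minimum value
}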
p : p0 + dstep*(len-i); + + for( int j = 1; j < (n+1)/2; j++ ) + { + p[(n-j)*2] = q[j*2]; + p[(n-j)*2+1] = -q[j*2+1]; + } + } + } +} +} + void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows ) { #ifdef HAVE_CLAMDFFT @@ -2705,7 +2746,11 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows ) } if( stage != 1 ) + { + if( !inv && real_transform && dst.channels() == 2 ) + complementComplexOutput(dst, nonzero_rows, 1); break; + } src = dst; } else @@ -2847,41 +2892,7 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows ) if( stage != 0 ) { if( !inv && real_transform && dst.channels() == 2 && len > 1 ) - { - int n = dst.cols; - if( elem_size == (int)sizeof(float) ) - { - float* p0 = dst.ptr(); - size_t dstep = dst.step/sizeof(p0[0]); - for( i = 0; i < len; i++ ) - { - float* p = p0 + dstep*i; - float* q = i == 0 || i*2 == len ? p : p0 + dstep*(len-i); - - for( int j = 1; j < (n+1)/2; j++ ) - { - p[(n-j)*2] = q[j*2]; - p[(n-j)*2+1] = -q[j*2+1]; - } - } - } - else - { - double* p0 = dst.ptr(); - size_t dstep = dst.step/sizeof(p0[0]); - for( i = 0; i < len; i++ ) - { - double* p = p0 + dstep*i; - double* q = i == 0 || i*2 == len ? p : p0 + dstep*(len-i); - - for( int j = 1; j < (n+1)/2; j++ ) - { - p[(n-j)*2] = q[j*2]; - p[(n-j)*2+1] = -q[j*2+1]; - } - } - } - } + complementComplexOutput(dst, len, 2); break; } src = dst; diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp index dea25dd64c..e246a7c400 100644 --- a/modules/core/src/lapack.cpp +++ b/modules/core/src/lapack.cpp @@ -502,7 +502,7 @@ JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep, { sd = i < n ? W[i] : 0; - while( sd <= minval ) + for( int ii = 0; ii < 100 && sd <= minval; ii++ ) { // if we got a zero singular value, then in order to get the corresponding left singular vector // we generate a random vector, project it to the previously computed left singular vectors, @@ -527,7 +527,7 @@ JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep, At[i*astep + k] = t; asum += std::abs(t); } - asum = asum ? 1/asum : 0; + asum = asum > eps*100 ? 1/asum : 0; for( k = 0; k < m; k++ ) At[i*astep + k] *= asum; } @@ -541,7 +541,7 @@ JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep, sd = std::sqrt(sd); } - s = (_Tp)(1/sd); + s = (_Tp)(sd > minval ? 
1/sd : 0.); for( k = 0; k < m; k++ ) At[i*astep + k] *= s; } diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index e96eaeb41a..b8e4b1a2ed 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -43,6 +43,7 @@ #include "precomp.hpp" #include "opencl_kernels_core.hpp" +#include namespace cv { @@ -889,38 +890,41 @@ struct iPow_SIMD } }; -#if CV_NEON +#if CV_SIMD128 template <> struct iPow_SIMD { - int operator() ( const uchar * src, uchar * dst, int len, int power) + int operator() ( const uchar * src, uchar * dst, int len, int power ) { int i = 0; - uint32x4_t v_1 = vdupq_n_u32(1u); + v_uint32x4 v_1 = v_setall_u32(1u); for ( ; i <= len - 8; i += 8) { - uint32x4_t v_a1 = v_1, v_a2 = v_1; - uint16x8_t v_src = vmovl_u8(vld1_u8(src + i)); - uint32x4_t v_b1 = vmovl_u16(vget_low_u16(v_src)), v_b2 = vmovl_u16(vget_high_u16(v_src)); + v_uint32x4 v_a1 = v_1, v_a2 = v_1; + v_uint16x8 v = v_load_expand(src + i); + v_uint32x4 v_b1, v_b2; + v_expand(v, v_b1, v_b2); int p = power; while( p > 1 ) { if (p & 1) { - v_a1 = vmulq_u32(v_a1, v_b1); - v_a2 = vmulq_u32(v_a2, v_b2); + v_a1 *= v_b1; + v_a2 *= v_b2; } - v_b1 = vmulq_u32(v_b1, v_b1); - v_b2 = vmulq_u32(v_b2, v_b2); + v_b1 *= v_b1; + v_b2 *= v_b2; p >>= 1; } - v_a1 = vmulq_u32(v_a1, v_b1); - v_a2 = vmulq_u32(v_a2, v_b2); - vst1_u8(dst + i, vqmovn_u16(vcombine_u16(vqmovn_u32(v_a1), vqmovn_u32(v_a2)))); + v_a1 *= v_b1; + v_a2 *= v_b2; + + v = v_pack(v_a1, v_a2); + v_pack_store(dst + i, v); } return i; @@ -933,30 +937,33 @@ struct iPow_SIMD int operator() ( const schar * src, schar * dst, int len, int power) { int i = 0; - int32x4_t v_1 = vdupq_n_s32(1); + v_int32x4 v_1 = v_setall_s32(1); for ( ; i <= len - 8; i += 8) { - int32x4_t v_a1 = v_1, v_a2 = v_1; - int16x8_t v_src = vmovl_s8(vld1_s8(src + i)); - int32x4_t v_b1 = vmovl_s16(vget_low_s16(v_src)), v_b2 = vmovl_s16(vget_high_s16(v_src)); + v_int32x4 v_a1 = v_1, v_a2 = v_1; + v_int16x8 v = v_load_expand(src + i); + v_int32x4 v_b1, v_b2; + v_expand(v, v_b1, v_b2); int p = power; while( p > 1 ) { if (p & 1) { - v_a1 = vmulq_s32(v_a1, v_b1); - v_a2 = vmulq_s32(v_a2, v_b2); + v_a1 *= v_b1; + v_a2 *= v_b2; } - v_b1 = vmulq_s32(v_b1, v_b1); - v_b2 = vmulq_s32(v_b2, v_b2); + v_b1 *= v_b1; + v_b2 *= v_b2; p >>= 1; } - v_a1 = vmulq_s32(v_a1, v_b1); - v_a2 = vmulq_s32(v_a2, v_b2); - vst1_s8(dst + i, vqmovn_s16(vcombine_s16(vqmovn_s32(v_a1), vqmovn_s32(v_a2)))); + v_a1 *= v_b1; + v_a2 *= v_b2; + + v = v_pack(v_a1, v_a2); + v_pack_store(dst + i, v); } return i; @@ -969,30 +976,33 @@ struct iPow_SIMD int operator() ( const ushort * src, ushort * dst, int len, int power) { int i = 0; - uint32x4_t v_1 = vdupq_n_u32(1u); + v_uint32x4 v_1 = v_setall_u32(1u); for ( ; i <= len - 8; i += 8) { - uint32x4_t v_a1 = v_1, v_a2 = v_1; - uint16x8_t v_src = vld1q_u16(src + i); - uint32x4_t v_b1 = vmovl_u16(vget_low_u16(v_src)), v_b2 = vmovl_u16(vget_high_u16(v_src)); + v_uint32x4 v_a1 = v_1, v_a2 = v_1; + v_uint16x8 v = v_load(src + i); + v_uint32x4 v_b1, v_b2; + v_expand(v, v_b1, v_b2); int p = power; while( p > 1 ) { if (p & 1) { - v_a1 = vmulq_u32(v_a1, v_b1); - v_a2 = vmulq_u32(v_a2, v_b2); + v_a1 *= v_b1; + v_a2 *= v_b2; } - v_b1 = vmulq_u32(v_b1, v_b1); - v_b2 = vmulq_u32(v_b2, v_b2); + v_b1 *= v_b1; + v_b2 *= v_b2; p >>= 1; } - v_a1 = vmulq_u32(v_a1, v_b1); - v_a2 = vmulq_u32(v_a2, v_b2); - vst1q_u16(dst + i, vcombine_u16(vqmovn_u32(v_a1), vqmovn_u32(v_a2))); + v_a1 *= v_b1; + v_a2 *= v_b2; + + v = v_pack(v_a1, v_a2); + v_store(dst + i, v); } return i; @@ -1005,60 
+1015,70 @@ struct iPow_SIMD int operator() ( const short * src, short * dst, int len, int power) { int i = 0; - int32x4_t v_1 = vdupq_n_s32(1); + v_int32x4 v_1 = v_setall_s32(1); for ( ; i <= len - 8; i += 8) { - int32x4_t v_a1 = v_1, v_a2 = v_1; - int16x8_t v_src = vld1q_s16(src + i); - int32x4_t v_b1 = vmovl_s16(vget_low_s16(v_src)), v_b2 = vmovl_s16(vget_high_s16(v_src)); + v_int32x4 v_a1 = v_1, v_a2 = v_1; + v_int16x8 v = v_load(src + i); + v_int32x4 v_b1, v_b2; + v_expand(v, v_b1, v_b2); int p = power; while( p > 1 ) { if (p & 1) { - v_a1 = vmulq_s32(v_a1, v_b1); - v_a2 = vmulq_s32(v_a2, v_b2); + v_a1 *= v_b1; + v_a2 *= v_b2; } - v_b1 = vmulq_s32(v_b1, v_b1); - v_b2 = vmulq_s32(v_b2, v_b2); + v_b1 *= v_b1; + v_b2 *= v_b2; p >>= 1; } - v_a1 = vmulq_s32(v_a1, v_b1); - v_a2 = vmulq_s32(v_a2, v_b2); - vst1q_s16(dst + i, vcombine_s16(vqmovn_s32(v_a1), vqmovn_s32(v_a2))); + v_a1 *= v_b1; + v_a2 *= v_b2; + + v = v_pack(v_a1, v_a2); + v_store(dst + i, v); } return i; } }; - template <> struct iPow_SIMD { int operator() ( const int * src, int * dst, int len, int power) { int i = 0; - int32x4_t v_1 = vdupq_n_s32(1); + v_int32x4 v_1 = v_setall_s32(1); - for ( ; i <= len - 4; i += 4) + for ( ; i <= len - 8; i += 8) { - int32x4_t v_b = vld1q_s32(src + i), v_a = v_1; + v_int32x4 v_a1 = v_1, v_a2 = v_1; + v_int32x4 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 4); int p = power; while( p > 1 ) { if (p & 1) - v_a = vmulq_s32(v_a, v_b); - v_b = vmulq_s32(v_b, v_b); + { + v_a1 *= v_b1; + v_a2 *= v_b2; + } + v_b1 *= v_b1; + v_b2 *= v_b2; p >>= 1; } - v_a = vmulq_s32(v_a, v_b); - vst1q_s32(dst + i, v_a); + v_a1 *= v_b1; + v_a2 *= v_b2; + + v_store(dst + i, v_a1); + v_store(dst + i + 4, v_a2); } return i; @@ -1071,42 +1091,143 @@ struct iPow_SIMD int operator() ( const float * src, float * dst, int len, int power) { int i = 0; - float32x4_t v_1 = vdupq_n_f32(1.0f); + v_float32x4 v_1 = v_setall_f32(1.f); - for ( ; i <= len - 4; i += 4) + for ( ; i <= len - 8; i += 8) { - float32x4_t v_b = vld1q_f32(src + i), v_a = v_1; - int p = power; + v_float32x4 v_a1 = v_1, v_a2 = v_1; + v_float32x4 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 4); + int p = std::abs(power); + if( power < 0 ) + { + v_b1 = v_1 / v_b1; + v_b2 = v_1 / v_b2; + } while( p > 1 ) { if (p & 1) - v_a = vmulq_f32(v_a, v_b); - v_b = vmulq_f32(v_b, v_b); + { + v_a1 *= v_b1; + v_a2 *= v_b2; + } + v_b1 *= v_b1; + v_b2 *= v_b2; p >>= 1; } - v_a = vmulq_f32(v_a, v_b); - vst1q_f32(dst + i, v_a); + v_a1 *= v_b1; + v_a2 *= v_b2; + + v_store(dst + i, v_a1); + v_store(dst + i + 4, v_a2); } return i; } }; +#if CV_SIMD128_64F +template <> +struct iPow_SIMD +{ + int operator() ( const double * src, double * dst, int len, int power) + { + int i = 0; + v_float64x2 v_1 = v_setall_f64(1.); + + for ( ; i <= len - 4; i += 4) + { + v_float64x2 v_a1 = v_1, v_a2 = v_1; + v_float64x2 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 2); + int p = std::abs(power); + if( power < 0 ) + { + v_b1 = v_1 / v_b1; + v_b2 = v_1 / v_b2; + } + + while( p > 1 ) + { + if (p & 1) + { + v_a1 *= v_b1; + v_a2 *= v_b2; + } + v_b1 *= v_b1; + v_b2 *= v_b2; + p >>= 1; + } + + v_a1 *= v_b1; + v_a2 *= v_b2; + + v_store(dst + i, v_a1); + v_store(dst + i + 2, v_a2); + } + + return i; + } +}; +#endif + #endif template static void -iPow_( const T* src, T* dst, int len, int power ) +iPow_i( const T* src, T* dst, int len, int power ) { - iPow_SIMD vop; - int i = vop(src, dst, len, power); + if( power < 0 ) + { + T tab[5] = + { + power == -1 ? saturate_cast(-1) : 0, (power & 1) ? 
-1 : 1, + std::numeric_limits::max(), 1, power == -1 ? 1 : 0 + }; + for( int i = 0; i < len; i++ ) + { + T val = src[i]; + dst[i] = cv_abs(val) <= 2 ? tab[val + 2] : (T)0; + } + } + else + { + iPow_SIMD vop; + int i = vop(src, dst, len, power); + + for( ; i < len; i++ ) + { + WT a = 1, b = src[i]; + int p = power; + while( p > 1 ) + { + if( p & 1 ) + a *= b; + b *= b; + p >>= 1; + } + + a *= b; + dst[i] = saturate_cast(a); + } + } +} + +template +static void +iPow_f( const T* src, T* dst, int len, int power0 ) +{ + iPow_SIMD vop; + int i = vop(src, dst, len, power0); + int power = std::abs(power0); for( ; i < len; i++ ) { - WT a = 1, b = src[i]; + T a = 1, b = src[i]; int p = power; + if( power0 < 0 ) + b = 1/b; + while( p > 1 ) { if( p & 1 ) @@ -1116,44 +1237,43 @@ iPow_( const T* src, T* dst, int len, int power ) } a *= b; - dst[i] = saturate_cast(a); + dst[i] = a; } } - static void iPow8u(const uchar* src, uchar* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_i(src, dst, len, power); } static void iPow8s(const schar* src, schar* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_i(src, dst, len, power); } static void iPow16u(const ushort* src, ushort* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_i(src, dst, len, power); } static void iPow16s(const short* src, short* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_i(src, dst, len, power); } static void iPow32s(const int* src, int* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_i(src, dst, len, power); } static void iPow32f(const float* src, float* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_f(src, dst, len, power); } static void iPow64f(const double* src, double* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_f(src, dst, len, power); } @@ -1176,14 +1296,25 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst, bool doubleSupport = d.doubleFPConfig() > 0; _dst.createSameSize(_src, type); - if (is_ipower && (ipower == 0 || ipower == 1)) + if (is_ipower) { if (ipower == 0) + { _dst.setTo(Scalar::all(1)); - else if (ipower == 1) + return true; + } + if (ipower == 1) + { _src.copyTo(_dst); - - return true; + return true; + } + if( ipower < 0 ) + { + if( depth == CV_32F || depth == CV_64F ) + is_ipower = false; + else + return false; + } } if (depth == CV_64F && !doubleSupport) @@ -1233,20 +1364,11 @@ void pow( InputArray _src, double power, OutputArray _dst ) { int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), ipower = cvRound(power); - bool is_ipower = fabs(ipower - power) < DBL_EPSILON, same = false, + bool is_ipower = fabs(ipower - power) < DBL_EPSILON, useOpenCL = _dst.isUMat() && _src.dims() <= 2; if( is_ipower && !(ocl::Device::getDefault().isIntel() && useOpenCL && depth != CV_64F)) { - if( ipower < 0 ) - { - divide( Scalar::all(1), _src, _dst ); - if( ipower == -1 ) - return; - ipower = -ipower; - same = true; - } - switch( ipower ) { case 0: @@ -1257,59 +1379,18 @@ void pow( InputArray _src, double power, OutputArray _dst ) _src.copyTo(_dst); return; case 2: -#if defined(HAVE_IPP) - CV_IPP_CHECK() - { - if (depth == CV_32F && !same && ( (_src.dims() <= 2 && !ocl::useOpenCL()) || - (_src.dims() > 2 && _src.isContinuous() && _dst.isContinuous()) )) - { - Mat src = _src.getMat(); - _dst.create( src.dims, src.size, type ); - Mat dst = _dst.getMat(); - - Size size = src.size(); - int srcstep = (int)src.step, dststep = (int)dst.step, esz = CV_ELEM_SIZE(type); - if 
(src.isContinuous() && dst.isContinuous()) - { - size.width = (int)src.total(); - size.height = 1; - srcstep = dststep = (int)src.total() * esz; - } - size.width *= cn; - - IppStatus status = ippiSqr_32f_C1R(src.ptr(), srcstep, dst.ptr(), dststep, ippiSize(size.width, size.height)); - - if (status >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - if (same) - multiply(_dst, _dst, _dst); - else - multiply(_src, _src, _dst); + multiply(_src, _src, _dst); return; } } else CV_Assert( depth == CV_32F || depth == CV_64F ); - CV_OCL_RUN(useOpenCL, - ocl_pow(same ? _dst : _src, power, _dst, is_ipower, ipower)) + CV_OCL_RUN(useOpenCL, ocl_pow(_src, power, _dst, is_ipower, ipower)) - Mat src, dst; - if (same) - src = dst = _dst.getMat(); - else - { - src = _src.getMat(); - _dst.create( src.dims, src.size, type ); - dst = _dst.getMat(); - } + Mat src = _src.getMat(); + _dst.create( src.dims, src.size, type ); + Mat dst = _dst.getMat(); const Mat* arrays[] = {&src, &dst, 0}; uchar* ptrs[2]; @@ -1335,52 +1416,103 @@ void pow( InputArray _src, double power, OutputArray _dst ) } else { -#if defined(HAVE_IPP) - CV_IPP_CHECK() - { - if (src.isContinuous() && dst.isContinuous()) - { - IppStatus status = depth == CV_32F ? - ippsPowx_32f_A21(src.ptr(), (Ipp32f)power, dst.ptr(), (Ipp32s)(src.total() * cn)) : - ippsPowx_64f_A50(src.ptr(), power, dst.ptr(), (Ipp32s)(src.total() * cn)); - - if (status >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - int j, k, blockSize = std::min(len, ((BLOCK_SIZE + cn-1)/cn)*cn); size_t esz1 = src.elemSize1(); + AutoBuffer buf; + Cv32suf inf32, nan32; + Cv64suf inf64, nan64; + float* fbuf = 0; + double* dbuf = 0; + inf32.i = 0x7f800000; + nan32.i = 0x7fffffff; + inf64.i = CV_BIG_INT(0x7FF0000000000000); + nan64.i = CV_BIG_INT(0x7FFFFFFFFFFFFFFF); + + if( src.ptr() == dst.ptr() ) + { + buf.allocate(blockSize*esz1); + fbuf = (float*)(uchar*)buf; + dbuf = (double*)(uchar*)buf; + } for( size_t i = 0; i < it.nplanes; i++, ++it ) { for( j = 0; j < len; j += blockSize ) { int bsz = std::min(len - j, blockSize); + + #if defined(HAVE_IPP) + CV_IPP_CHECK() + { + IppStatus status = depth == CV_32F ? + ippsPowx_32f_A21((const float*)ptrs[0], (float)power, (float*)ptrs[1], bsz) : + ippsPowx_64f_A50((const double*)ptrs[0], (double)power, (double*)ptrs[1], bsz); + + if (status >= 0) + { + CV_IMPL_ADD(CV_IMPL_IPP); + ptrs[0] += bsz*esz1; + ptrs[1] += bsz*esz1; + continue; + } + setIppErrorStatus(); + } + #endif + if( depth == CV_32F ) { - const float* x = (const float*)ptrs[0]; + float* x0 = (float*)ptrs[0]; + float* x = fbuf ? fbuf : x0; float* y = (float*)ptrs[1]; + if( x != x0 ) + memcpy(x, x0, bsz*esz1); + Log_32f(x, y, bsz); for( k = 0; k < bsz; k++ ) y[k] = (float)(y[k]*power); Exp_32f(y, y, bsz); + for( k = 0; k < bsz; k++ ) + { + if( x0[k] <= 0 ) + { + if( x0[k] == 0.f ) + { + if( power < 0 ) + y[k] = inf32.f; + } + else + y[k] = nan32.f; + } + } } else { - const double* x = (const double*)ptrs[0]; + double* x0 = (double*)ptrs[0]; + double* x = dbuf ? dbuf : x0; double* y = (double*)ptrs[1]; + if( x != x0 ) + memcpy(x, x0, bsz*esz1); + Log_64f(x, y, bsz); for( k = 0; k < bsz; k++ ) y[k] *= power; Exp_64f(y, y, bsz); + + for( k = 0; k < bsz; k++ ) + { + if( x0[k] <= 0 ) + { + if( x0[k] == 0. 
) + { + if( power < 0 ) + y[k] = inf64.f; + } + else + y[k] = nan64.f; + } + } } ptrs[0] += bsz*esz1; ptrs[1] += bsz*esz1; diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp index feffc8d32f..c1eacfd210 100644 --- a/modules/core/src/matmul.cpp +++ b/modules/core/src/matmul.cpp @@ -1195,7 +1195,7 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha, GEMMBlockMulFunc blockMulFunc; GEMMStoreFunc storeFunc; Mat *matD = &D, tmat; - int tmat_size = 0; + size_t tmat_size = 0; const uchar* Cdata = C.data; size_t Cstep = C.data ? (size_t)C.step : 0; AutoBuffer buf; @@ -1228,7 +1228,7 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha, if( D.data == A.data || D.data == B.data ) { - tmat_size = d_size.width*d_size.height*CV_ELEM_SIZE(type); + tmat_size = (size_t)d_size.width*d_size.height*CV_ELEM_SIZE(type); // Allocate tmat later, once the size of buf is known matD = &tmat; } @@ -1319,7 +1319,7 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha, int is_b_t = flags & GEMM_2_T; int elem_size = CV_ELEM_SIZE(type); int dk0_1, dk0_2; - int a_buf_size = 0, b_buf_size, d_buf_size; + size_t a_buf_size = 0, b_buf_size, d_buf_size; uchar* a_buf = 0; uchar* b_buf = 0; uchar* d_buf = 0; @@ -1360,12 +1360,12 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha, dn0 = block_size / dk0; dk0_1 = (dn0+dn0/8+2) & -2; - b_buf_size = (dk0+dk0/8+1)*dk0_1*elem_size; - d_buf_size = (dk0+dk0/8+1)*dk0_1*work_elem_size; + b_buf_size = (size_t)(dk0+dk0/8+1)*dk0_1*elem_size; + d_buf_size = (size_t)(dk0+dk0/8+1)*dk0_1*work_elem_size; if( is_a_t ) { - a_buf_size = (dm0+dm0/8+1)*((dk0+dk0/8+2)&-2)*elem_size; + a_buf_size = (size_t)(dm0+dm0/8+1)*((dk0+dk0/8+2)&-2)*elem_size; flags &= ~GEMM_1_T; } diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index 65b93890ec..9b77a4cf2c 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -222,11 +222,10 @@ public: } }; - MatAllocator* Mat::getStdAllocator() { - static MatAllocator * allocator = new StdMatAllocator(); - return allocator; + static StdMatAllocator allocator; + return &allocator; } void swap( Mat& a, Mat& b ) @@ -1155,6 +1154,21 @@ Mat _InputArray::getMat_(int i) const return !v.empty() ? Mat(size(), t, (void*)&v[0]) : Mat(); } + if( k == STD_BOOL_VECTOR ) + { + CV_Assert( i < 0 ); + int t = CV_8U; + const std::vector& v = *(const std::vector*)obj; + int j, n = (int)v.size(); + if( n == 0 ) + return Mat(); + Mat m(1, n, t); + uchar* dst = m.data; + for( j = 0; j < n; j++ ) + dst[j] = (uchar)v[j]; + return m; + } + if( k == NONE ) return Mat(); @@ -1482,6 +1496,13 @@ Size _InputArray::size(int i) const return szb == szi ? 
Size((int)szb, 1) : Size((int)(szb/CV_ELEM_SIZE(flags)), 1); } + if( k == STD_BOOL_VECTOR ) + { + CV_Assert( i < 0 ); + const std::vector& v = *(const std::vector*)obj; + return Size((int)v.size(), 1); + } + if( k == NONE ) return Size(); @@ -1662,7 +1683,7 @@ int _InputArray::dims(int i) const return 2; } - if( k == STD_VECTOR ) + if( k == STD_VECTOR || k == STD_BOOL_VECTOR ) { CV_Assert( i < 0 ); return 2; @@ -1774,7 +1795,7 @@ int _InputArray::type(int i) const if( k == EXPR ) return ((const MatExpr*)obj)->type(); - if( k == MATX || k == STD_VECTOR || k == STD_VECTOR_VECTOR ) + if( k == MATX || k == STD_VECTOR || k == STD_VECTOR_VECTOR || k == STD_BOOL_VECTOR ) return CV_MAT_TYPE(flags); if( k == NONE ) @@ -1849,6 +1870,12 @@ bool _InputArray::empty() const return v.empty(); } + if( k == STD_BOOL_VECTOR ) + { + const std::vector& v = *(const std::vector*)obj; + return v.empty(); + } + if( k == NONE ) return true; @@ -1893,7 +1920,8 @@ bool _InputArray::isContinuous(int i) const if( k == UMAT ) return i < 0 ? ((const UMat*)obj)->isContinuous() : true; - if( k == EXPR || k == MATX || k == STD_VECTOR || k == NONE || k == STD_VECTOR_VECTOR) + if( k == EXPR || k == MATX || k == STD_VECTOR || + k == NONE || k == STD_VECTOR_VECTOR || k == STD_BOOL_VECTOR ) return true; if( k == STD_VECTOR_MAT ) @@ -1924,7 +1952,8 @@ bool _InputArray::isSubmatrix(int i) const if( k == UMAT ) return i < 0 ? ((const UMat*)obj)->isSubmatrix() : false; - if( k == EXPR || k == MATX || k == STD_VECTOR || k == NONE || k == STD_VECTOR_VECTOR) + if( k == EXPR || k == MATX || k == STD_VECTOR || + k == NONE || k == STD_VECTOR_VECTOR || k == STD_BOOL_VECTOR ) return false; if( k == STD_VECTOR_MAT ) @@ -1962,7 +1991,8 @@ size_t _InputArray::offset(int i) const return ((const UMat*)obj)->offset; } - if( k == EXPR || k == MATX || k == STD_VECTOR || k == NONE || k == STD_VECTOR_VECTOR) + if( k == EXPR || k == MATX || k == STD_VECTOR || + k == NONE || k == STD_VECTOR_VECTOR || k == STD_BOOL_VECTOR ) return 0; if( k == STD_VECTOR_MAT ) @@ -2009,7 +2039,8 @@ size_t _InputArray::step(int i) const return ((const UMat*)obj)->step; } - if( k == EXPR || k == MATX || k == STD_VECTOR || k == NONE || k == STD_VECTOR_VECTOR) + if( k == EXPR || k == MATX || k == STD_VECTOR || + k == NONE || k == STD_VECTOR_VECTOR || k == STD_BOOL_VECTOR ) return 0; if( k == STD_VECTOR_MAT ) @@ -2044,7 +2075,7 @@ void _InputArray::copyTo(const _OutputArray& arr) const if( k == NONE ) arr.release(); - else if( k == MAT || k == MATX || k == STD_VECTOR ) + else if( k == MAT || k == MATX || k == STD_VECTOR || k == STD_BOOL_VECTOR ) { Mat m = getMat(); m.copyTo(arr); @@ -2069,7 +2100,7 @@ void _InputArray::copyTo(const _OutputArray& arr, const _InputArray & mask) cons if( k == NONE ) arr.release(); - else if( k == MAT || k == MATX || k == STD_VECTOR ) + else if( k == MAT || k == MATX || k == STD_VECTOR || k == STD_BOOL_VECTOR ) { Mat m = getMat(); m.copyTo(arr, mask); diff --git a/modules/core/src/opencl/fft.cl b/modules/core/src/opencl/fft.cl index 3901db4eb7..d9dfbb7c6a 100644 --- a/modules/core/src/opencl/fft.cl +++ b/modules/core/src/opencl/fft.cl @@ -574,6 +574,16 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, #pragma unroll for (int i=x; i struct OpMax T operator ()(const T a, const T b) const { return std::max(a, b); } }; +inline Size getContinuousSize_( int flags, int cols, int rows, int widthScale ) +{ + int64 sz = (int64)cols * rows * widthScale; + return (flags & Mat::CONTINUOUS_FLAG) != 0 && + (int)sz == sz ? 
Size((int)sz, 1) : Size(cols * widthScale, rows); +} + inline Size getContinuousSize( const Mat& m1, int widthScale=1 ) { - return m1.isContinuous() ? Size(m1.cols*m1.rows*widthScale, 1) : - Size(m1.cols*widthScale, m1.rows); + return getContinuousSize_(m1.flags, + m1.cols, m1.rows, widthScale); } inline Size getContinuousSize( const Mat& m1, const Mat& m2, int widthScale=1 ) { - return (m1.flags & m2.flags & Mat::CONTINUOUS_FLAG) != 0 ? - Size(m1.cols*m1.rows*widthScale, 1) : Size(m1.cols*widthScale, m1.rows); + return getContinuousSize_(m1.flags & m2.flags, + m1.cols, m1.rows, widthScale); } inline Size getContinuousSize( const Mat& m1, const Mat& m2, const Mat& m3, int widthScale=1 ) { - return (m1.flags & m2.flags & m3.flags & Mat::CONTINUOUS_FLAG) != 0 ? - Size(m1.cols*m1.rows*widthScale, 1) : Size(m1.cols*widthScale, m1.rows); + return getContinuousSize_(m1.flags & m2.flags & m3.flags, + m1.cols, m1.rows, widthScale); } inline Size getContinuousSize( const Mat& m1, const Mat& m2, const Mat& m3, const Mat& m4, int widthScale=1 ) { - return (m1.flags & m2.flags & m3.flags & m4.flags & Mat::CONTINUOUS_FLAG) != 0 ? - Size(m1.cols*m1.rows*widthScale, 1) : Size(m1.cols*widthScale, m1.rows); + return getContinuousSize_(m1.flags & m2.flags & m3.flags & m4.flags, + m1.cols, m1.rows, widthScale); } inline Size getContinuousSize( const Mat& m1, const Mat& m2, const Mat& m3, const Mat& m4, const Mat& m5, int widthScale=1 ) { - return (m1.flags & m2.flags & m3.flags & m4.flags & m5.flags & Mat::CONTINUOUS_FLAG) != 0 ? - Size(m1.cols*m1.rows*widthScale, 1) : Size(m1.cols*widthScale, m1.rows); + return getContinuousSize_(m1.flags & m2.flags & m3.flags & m4.flags & m5.flags, + m1.cols, m1.rows, widthScale); } struct NoVec @@ -290,4 +297,6 @@ extern bool __termination; // skip some cleanups, because process is terminating } +#include "opencv2/hal/intrin.hpp" + #endif /*_CXCORE_INTERNAL_H_*/ diff --git a/modules/core/src/rand.cpp b/modules/core/src/rand.cpp index bd1d80ca2b..9d95243828 100644 --- a/modules/core/src/rand.cpp +++ b/modules/core/src/rand.cpp @@ -748,29 +748,35 @@ namespace cv { template static void -randShuffle_( Mat& _arr, RNG& rng, double iterFactor ) +randShuffle_( Mat& _arr, RNG& rng, double ) { - int sz = _arr.rows*_arr.cols, iters = cvRound(iterFactor*sz); + unsigned sz = (unsigned)_arr.total(); if( _arr.isContinuous() ) { T* arr = _arr.ptr(); - for( int i = 0; i < iters; i++ ) + for( unsigned i = 0; i < sz; i++ ) { - int j = (unsigned)rng % sz, k = (unsigned)rng % sz; - std::swap( arr[j], arr[k] ); + unsigned j = (unsigned)rng % sz; + std::swap( arr[j], arr[i] ); } } else { + CV_Assert( _arr.dims <= 2 ); uchar* data = _arr.ptr(); size_t step = _arr.step; + int rows = _arr.rows; int cols = _arr.cols; - for( int i = 0; i < iters; i++ ) + for( int i0 = 0; i0 < rows; i0++ ) { - int j1 = (unsigned)rng % sz, k1 = (unsigned)rng % sz; - int j0 = j1/cols, k0 = k1/cols; - j1 -= j0*cols; k1 -= k0*cols; - std::swap( ((T*)(data + step*j0))[j1], ((T*)(data + step*k0))[k1] ); + T* p = _arr.ptr(i0); + for( int j0 = 0; j0 < cols; j0++ ) + { + unsigned k1 = (unsigned)rng % sz; + int i1 = (int)(k1 / cols); + int j1 = (int)(k1 - (unsigned)i1*(unsigned)cols); + std::swap( p[j0], ((T*)(data + step*i1))[j1] ); + } } } } diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index e43df94448..8a6bd54721 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -3826,6 +3826,11 @@ void cv::findNonZero( InputArray _src, OutputArray _idx ) Mat src = _src.getMat(); CV_Assert( 
src.type() == CV_8UC1 ); int n = countNonZero(src); + if( n == 0 ) + { + _idx.release(); + return; + } if( _idx.kind() == _InputArray::MAT && !_idx.getMatRef().isContinuous() ) _idx.release(); _idx.create(n, 1, CV_32SC2); diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp index 14c741edc0..8bd45ee396 100644 --- a/modules/core/test/test_arithm.cpp +++ b/modules/core/test/test_arithm.cpp @@ -1791,3 +1791,28 @@ INSTANTIATE_TEST_CASE_P(Arithm, SubtractOutputMatNotEmpty, testing::Combine( testing::Values(perf::MatType(CV_8UC1), CV_8UC3, CV_8UC4, CV_16SC1, CV_16SC3), testing::Values(-1, CV_16S, CV_32S, CV_32F), testing::Bool())); + + +TEST(Core_FindNonZero, singular) +{ + Mat img(10, 10, CV_8U, Scalar::all(0)); + vector pts, pts2(10); + findNonZero(img, pts); + findNonZero(img, pts2); + ASSERT_TRUE(pts.empty() && pts2.empty()); +} + +TEST(Core_BoolVector, support) +{ + std::vector test; + int i, n = 205; + int nz = 0; + test.resize(n); + for( i = 0; i < n; i++ ) + { + test[i] = theRNG().uniform(0, 2) != 0; + nz += (int)test[i]; + } + ASSERT_EQ( nz, countNonZero(test) ); + ASSERT_FLOAT_EQ((float)nz/n, (float)(mean(test)[0])); +} diff --git a/modules/core/test/test_conjugate_gradient.cpp b/modules/core/test/test_conjugate_gradient.cpp index 3a4d93797a..a1fd6113c3 100644 --- a/modules/core/test/test_conjugate_gradient.cpp +++ b/modules/core/test/test_conjugate_gradient.cpp @@ -58,18 +58,21 @@ static void mytest(cv::Ptr solver,cv::Ptr solver=cv::ConjGradSolver::create(); #if 1 { - cv::Ptr ptr_F(new SphereF()); + cv::Ptr ptr_F(new SphereF_CG()); cv::Mat x=(cv::Mat_(4,1)<<50.0,10.0,1.0,-10.0), etalon_x=(cv::Mat_(1,4)<<0.0,0.0,0.0,0.0); double etalon_res=0.0; @@ -92,7 +95,7 @@ TEST(DISABLED_Core_ConjGradSolver, regression_basic){ #endif #if 1 { - cv::Ptr ptr_F(new RosenbrockF()); + cv::Ptr ptr_F(new RosenbrockF_CG()); cv::Mat x=(cv::Mat_(2,1)<<0.0,0.0), etalon_x=(cv::Mat_(2,1)<<1.0,1.0); double etalon_res=0.0; diff --git a/modules/core/test/test_downhill_simplex.cpp b/modules/core/test/test_downhill_simplex.cpp index aa6d746121..47c73f9751 100644 --- a/modules/core/test/test_downhill_simplex.cpp +++ b/modules/core/test/test_downhill_simplex.cpp @@ -68,17 +68,19 @@ static void mytest(cv::Ptr solver,cv::Ptr solver=cv::DownhillSolver::create(); #if 1 { diff --git a/modules/core/test/test_dxt.cpp b/modules/core/test/test_dxt.cpp index 2e7bb38eaf..ad75e52dda 100644 --- a/modules/core/test/test_dxt.cpp +++ b/modules/core/test/test_dxt.cpp @@ -866,3 +866,24 @@ protected: }; TEST(Core_DFT, complex_output) { Core_DFTComplexOutputTest test; test.safe_run(); } + +TEST(Core_DFT, complex_output2) +{ + for( int i = 0; i < 100; i++ ) + { + int type = theRNG().uniform(0, 2) ? 
CV_64F : CV_32F; + int m = theRNG().uniform(1, 10); + int n = theRNG().uniform(1, 10); + Mat x(m, n, type), out; + randu(x, -1., 1.); + dft(x, out, DFT_ROWS | DFT_COMPLEX_OUTPUT); + double nrm = norm(out, NORM_INF); + double thresh = n*m*2; + if( nrm > thresh ) + { + cout << "x: " << x << endl; + cout << "out: " << out << endl; + ASSERT_LT(nrm, thresh); + } + } +} diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp index 8cd92dd514..102bd99764 100644 --- a/modules/core/test/test_mat.cpp +++ b/modules/core/test/test_mat.cpp @@ -1209,3 +1209,42 @@ TEST(Core_Mat, copyNx1ToVector) ASSERT_PRED_FORMAT2(cvtest::MatComparator(0, 0), ref_dst16, cv::Mat_(dst16)); } + +TEST(Core_Matx, fromMat_) +{ + Mat_ a = (Mat_(2,2) << 10, 11, 12, 13); + Matx22d b(a); + ASSERT_EQ( norm(a, b, NORM_INF), 0.); +} + +TEST(Core_InputArray, empty) +{ + vector > data; + ASSERT_TRUE( _InputArray(data).empty() ); +} + +TEST(Core_CopyMask, bug1918) +{ + Mat_ tmpSrc(100,100); + tmpSrc = 124; + Mat_ tmpMask(100,100); + tmpMask = 255; + Mat_ tmpDst(100,100); + tmpDst = 2; + tmpSrc.copyTo(tmpDst,tmpMask); + ASSERT_EQ(sum(tmpDst)[0], 124*100*100); +} + +TEST(Core_SVD, orthogonality) +{ + for( int i = 0; i < 2; i++ ) + { + int type = i == 0 ? CV_32F : CV_64F; + Mat mat_D(2, 2, type); + mat_D.setTo(88.); + Mat mat_U, mat_W; + SVD::compute(mat_D, mat_W, mat_U, noArray(), SVD::FULL_UV); + mat_U *= mat_U.t(); + ASSERT_LT(norm(mat_U, Mat::eye(2, 2, type), NORM_INF), 1e-5); + } +} diff --git a/modules/core/test/test_math.cpp b/modules/core/test/test_math.cpp index 5d483206e2..e44051914e 100644 --- a/modules/core/test/test_math.cpp +++ b/modules/core/test/test_math.cpp @@ -232,7 +232,7 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ ) for( j = 0; j < ncols; j++ ) { int val = ((uchar*)a_data)[j]; - ((uchar*)b_data)[j] = (uchar)(val <= 1 ? val : + ((uchar*)b_data)[j] = (uchar)(val == 0 ? 255 : val == 1 ? 1 : val == 2 && ipower == -1 ? 1 : 0); } else @@ -247,17 +247,17 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ ) if( ipower < 0 ) for( j = 0; j < ncols; j++ ) { - int val = ((char*)a_data)[j]; - ((char*)b_data)[j] = (char)((val&~1)==0 ? val : + int val = ((schar*)a_data)[j]; + ((schar*)b_data)[j] = (schar)(val == 0 ? 127 : val == 1 ? 1 : val ==-1 ? 1-2*(ipower&1) : val == 2 && ipower == -1 ? 1 : 0); } else for( j = 0; j < ncols; j++ ) { - int val = ((char*)a_data)[j]; + int val = ((schar*)a_data)[j]; val = ipow( val, ipower ); - ((char*)b_data)[j] = saturate_cast(val); + ((schar*)b_data)[j] = saturate_cast(val); } break; case CV_16U: @@ -265,7 +265,7 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ ) for( j = 0; j < ncols; j++ ) { int val = ((ushort*)a_data)[j]; - ((ushort*)b_data)[j] = (ushort)((val&~1)==0 ? val : + ((ushort*)b_data)[j] = (ushort)(val == 0 ? 65535 : val == 1 ? 1 : val ==-1 ? 1-2*(ipower&1) : val == 2 && ipower == -1 ? 1 : 0); } @@ -282,7 +282,7 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ ) for( j = 0; j < ncols; j++ ) { int val = ((short*)a_data)[j]; - ((short*)b_data)[j] = (short)((val&~1)==0 ? val : + ((short*)b_data)[j] = (short)(val == 0 ? 32767 : val == 1 ? 1 : val ==-1 ? 1-2*(ipower&1) : val == 2 && ipower == -1 ? 1 : 0); } @@ -299,7 +299,7 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ ) for( j = 0; j < ncols; j++ ) { int val = ((int*)a_data)[j]; - ((int*)b_data)[j] = (val&~1)==0 ? val : + ((int*)b_data)[j] = val == 0 ? INT_MAX : val == 1 ? 1 : val ==-1 ? 
1-2*(ipower&1) : val == 2 && ipower == -1 ? 1 : 0; } @@ -351,8 +351,6 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ ) } } - - ///////////////////////////////////////// matrix tests //////////////////////////////////////////// class Core_MatrixTest : public cvtest::ArrayTest @@ -2822,4 +2820,56 @@ TEST(CovariationMatrixVectorOfMatWithMean, accuracy) ASSERT_EQ(sDiff.dot(sDiff), 0.0); } +TEST(Core_Pow, special) +{ + for( int i = 0; i < 100; i++ ) + { + int n = theRNG().uniform(1, 30); + Mat mtx0(1, n, CV_8S), mtx, result; + randu(mtx0, -5, 5); + + int type = theRNG().uniform(0, 2) ? CV_64F : CV_32F; + double eps = type == CV_32F ? 1e-3 : 1e-10; + mtx0.convertTo(mtx, type); + // generate power from [-n, n] interval with 1/8 step - enough to check various cases. + const int max_pf = 3; + int pf = theRNG().uniform(0, max_pf*2+1); + double power = ((1 << pf) - (1 << (max_pf*2-1)))/16.; + int ipower = cvRound(power); + bool is_ipower = ipower == power; + cv::pow(mtx, power, result); + for( int j = 0; j < n; j++ ) + { + double val = type == CV_32F ? (double)mtx.at(j) : mtx.at(j); + double r = type == CV_32F ? (double)result.at(j) : result.at(j); + double r0; + if( power == 0. ) + r0 = 1; + else if( is_ipower ) + { + r0 = 1; + for( int k = 0; k < std::abs(ipower); k++ ) + r0 *= val; + if( ipower < 0 ) + r0 = 1./r0; + } + else + r0 = std::pow(val, power); + if( cvIsInf(r0) ) + { + ASSERT_TRUE(cvIsInf(r) != 0); + } + else if( cvIsNaN(r0) ) + { + ASSERT_TRUE(cvIsNaN(r) != 0); + } + else + { + ASSERT_TRUE(cvIsInf(r) == 0 && cvIsNaN(r) == 0); + ASSERT_LT(fabs(r - r0), eps); + } + } + } +} + /* End of file. */ diff --git a/modules/hal/include/opencv2/hal/intrin_neon.hpp b/modules/hal/include/opencv2/hal/intrin_neon.hpp index ab6aa86315..2418566e62 100644 --- a/modules/hal/include/opencv2/hal/intrin_neon.hpp +++ b/modules/hal/include/opencv2/hal/intrin_neon.hpp @@ -322,6 +322,9 @@ OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16) OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32) OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32) OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32) +OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32) +OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32) +OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32) OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32) OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32) OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32) diff --git a/modules/hal/include/opencv2/hal/intrin_sse.hpp b/modules/hal/include/opencv2/hal/intrin_sse.hpp index 3b77a11542..69171e2516 100644 --- a/modules/hal/include/opencv2/hal/intrin_sse.hpp +++ b/modules/hal/include/opencv2/hal/intrin_sse.hpp @@ -614,6 +614,16 @@ inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b) __m128i d1 = _mm_unpackhi_epi32(c0, c1); return v_int32x4(_mm_unpacklo_epi64(d0, d1)); } +inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b) +{ + a = a * b; + return a; +} +inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b) +{ + a = a * b; + return a; +} inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, v_int32x4& c, v_int32x4& d)
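// The iPow_SIMD kernels above all rely on the same square-and-multiply scheme;
// a minimal scalar sketch of that scheme follows. It is illustrative only: the
// name ipow_sketch and the plain int/unsigned types are chosen here for
// exposition and are not part of OpenCV (negative and zero powers are handled
// separately by the iPow_i/iPow_f wrappers in the patch).
static inline int ipow_sketch(int base, unsigned power)
{
    // 'a' accumulates the factors selected by the set bits of 'power';
    // 'b' holds base, base^2, base^4, ... as the loop proceeds.
    int a = 1, b = base;
    unsigned p = power;
    while (p > 1)
    {
        if (p & 1)
            a *= b;
        b *= b;
        p >>= 1;
    }
    // For power == 0 the loop is skipped and the result is 1 by convention;
    // otherwise one final multiply folds in the highest squaring.
    return p == 0 ? 1 : a * b;
}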