Mirror of https://github.com/opencv/opencv.git (synced 2025-06-07 09:25:45 +08:00)

Commit 65573784c4: Merge remote-tracking branch 'upstream/3.4' into merge-3.4
@@ -477,6 +477,7 @@ OCV_OPTION(CV_DISABLE_OPTIMIZATION "Disable explicit optimized code (dispatch
OCV_OPTION(CV_TRACE "Enable OpenCV code trace" ON)
OCV_OPTION(OPENCV_GENERATE_SETUPVARS "Generate setup_vars* scripts" ON IF (NOT ANDROID AND NOT APPLE_FRAMEWORK) )
OCV_OPTION(ENABLE_CONFIG_VERIFICATION "Fail build if actual configuration doesn't match requested (WITH_XXX != HAVE_XXX)" OFF)
OCV_OPTION(OPENCV_ENABLE_MEMALIGN "Enable posix_memalign or memalign usage" ON)
OCV_OPTION(ENABLE_PYLINT "Add target with Pylint checks" (BUILD_DOCS OR BUILD_EXAMPLES) IF (NOT CMAKE_CROSSCOMPILING AND NOT APPLE_FRAMEWORK) )
OCV_OPTION(ENABLE_FLAKE8 "Add target with Python flake8 checker" (BUILD_DOCS OR BUILD_EXAMPLES) IF (NOT CMAKE_CROSSCOMPILING AND NOT APPLE_FRAMEWORK) )
@@ -625,10 +626,15 @@ if(UNIX)
  set(HAVE_PTHREAD 1)
endif()

CHECK_SYMBOL_EXISTS(posix_memalign stdlib.h HAVE_POSIX_MEMALIGN)
CHECK_INCLUDE_FILE(malloc.h HAVE_MALLOC_H)
if(HAVE_MALLOC_H)
  CHECK_SYMBOL_EXISTS(memalign malloc.h HAVE_MEMALIGN)
if(OPENCV_ENABLE_MEMALIGN)
  CHECK_SYMBOL_EXISTS(posix_memalign stdlib.h HAVE_POSIX_MEMALIGN)
  CHECK_INCLUDE_FILE(malloc.h HAVE_MALLOC_H)
  if(HAVE_MALLOC_H)
    CHECK_SYMBOL_EXISTS(memalign malloc.h HAVE_MEMALIGN)
  endif()
  # TODO:
  # - _aligned_malloc() on Win32
  # - std::aligned_alloc() C++17 / C11
endif()
endif()
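The new OPENCV_ENABLE_MEMALIGN option gates the posix_memalign/memalign probes, and the TODO lists _aligned_malloc (Win32) and std::aligned_alloc (C11/C++17) as future candidates. As a rough illustration of how the resulting HAVE_* macros are typically consumed on the C++ side (a hypothetical sketch, not OpenCV's actual allocator; the fallback order is an assumption):

#include <cstddef>
#include <cstdlib>
#ifdef HAVE_MALLOC_H
#include <malloc.h>   // for memalign() on platforms that still provide it
#endif

// Hypothetical helper: use whichever aligned-allocation primitive the build detected.
static void* alignedAlloc(std::size_t size, std::size_t alignment)
{
#if defined(HAVE_POSIX_MEMALIGN)
    void* ptr = NULL;
    return posix_memalign(&ptr, alignment, size) == 0 ? ptr : NULL;
#elif defined(HAVE_MEMALIGN)
    return memalign(alignment, size);
#else
    (void)alignment;
    return malloc(size);  // last resort: no alignment guarantee beyond malloc's default
#endif
}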
@@ -6,5 +6,6 @@ void test()
{
    int data[8] = {0,0,0,0, 0,0,0,0};
    __m256i a = _mm256_loadu_si256((const __m256i *)data);
    __m256i b = _mm256_bslli_epi128(a, 1); // available in GCC 4.9.3+
}
int main() { return 0; }
@@ -6,7 +6,7 @@ ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3)
ocv_add_dispatched_file(convert SSE2 AVX2 VSX3)
ocv_add_dispatched_file(convert_scale SSE2 AVX2)
ocv_add_dispatched_file(count_non_zero SSE2 AVX2)
ocv_add_dispatched_file(matmul SSE2 AVX2)
ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX)
ocv_add_dispatched_file(mean SSE2 AVX2)
ocv_add_dispatched_file(merge SSE2 AVX2)
ocv_add_dispatched_file(split SSE2 AVX2)
@@ -342,4 +342,4 @@ struct VZeroUpperGuard {

#ifndef CV_WASM_SIMD
# define CV_WASM_SIMD 0
#endif
#endif
@@ -1431,6 +1431,28 @@ inline v_float64x4 v_cvt_f64(const v_float32x8& a)
inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_high(a.val))); }

// from (Mysticial and wim) https://stackoverflow.com/q/41144668
inline v_float64x4 v_cvt_f64(const v_int64x4& v)
{
    // constants encoded as floating-point
    __m256i magic_i_lo = _mm256_set1_epi64x(0x4330000000000000); // 2^52
    __m256i magic_i_hi32 = _mm256_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
    __m256i magic_i_all = _mm256_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
    __m256d magic_d_all = _mm256_castsi256_pd(magic_i_all);

    // Blend the 32 lowest significant bits of v with magic_int_lo
    __m256i v_lo = _mm256_blend_epi32(magic_i_lo, v.val, 0x55);
    // Extract the 32 most significant bits of v
    __m256i v_hi = _mm256_srli_epi64(v.val, 32);
    // Flip the msb of v_hi and blend with 0x45300000
    v_hi = _mm256_xor_si256(v_hi, magic_i_hi32);
    // Compute in double precision
    __m256d v_hi_dbl = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all);
    // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition
    __m256d result = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo));
    return v_float64x4(result);
}
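The magic constants above implement the int64-to-double conversion without AVX-512: the low 32 bits of each lane are dropped into the mantissa of 2^52 and the sign-flipped high 32 bits into the mantissa of 2^84, so one exact subtraction and one final addition reassemble the value. A scalar model of the same bit trick, shown only as an illustration (the helper names are not OpenCV's; an IEEE-754 double representation is assumed):

#include <cstdint>
#include <cstring>

static double bits_to_double(uint64_t bits)
{
    double d;
    std::memcpy(&d, &bits, sizeof(d));  // well-defined bit cast
    return d;
}

static double int64_to_double_magic(int64_t x)
{
    uint64_t u = (uint64_t)x;
    // low 32 bits in the mantissa of 2^52  ->  2^52 + lo
    double d_lo = bits_to_double(0x4330000000000000ULL | (u & 0xFFFFFFFFULL));
    // high 32 bits, sign bit flipped, in the mantissa of 2^84  ->  2^84 + 2^63 + (int32)hi * 2^32
    double d_hi = bits_to_double(0x4530000000000000ULL | ((u >> 32) ^ 0x80000000ULL));
    // (d_hi - (2^84 + 2^63 + 2^52)) is exact; only the final addition rounds
    return (d_hi - bits_to_double(0x4530000080100000ULL)) + d_lo;
}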
////////////// Lookup table access ////////////////////

inline v_int8x32 v256_lut(const schar* tab, const int* idx)
@ -1638,12 +1660,165 @@ inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
|
||||
|
||||
////////// Matrix operations /////////
|
||||
|
||||
//////// Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
|
||||
{ return v_int32x8(_mm256_madd_epi16(a.val, b.val)); }
|
||||
|
||||
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
|
||||
{
|
||||
__m256i even = _mm256_mul_epi32(a.val, b.val);
|
||||
__m256i odd = _mm256_mul_epi32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32));
|
||||
return v_int64x4(_mm256_add_epi64(even, odd));
|
||||
}
|
||||
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
|
||||
{
|
||||
__m256i even_m = _mm256_set1_epi32(0xFF00FF00);
|
||||
__m256i even_a = _mm256_blendv_epi8(a.val, _mm256_setzero_si256(), even_m);
|
||||
__m256i odd_a = _mm256_srli_epi16(a.val, 8);
|
||||
|
||||
__m256i even_b = _mm256_blendv_epi8(b.val, _mm256_setzero_si256(), even_m);
|
||||
__m256i odd_b = _mm256_srli_epi16(b.val, 8);
|
||||
|
||||
__m256i prod0 = _mm256_madd_epi16(even_a, even_b);
|
||||
__m256i prod1 = _mm256_madd_epi16(odd_a, odd_b);
|
||||
return v_uint32x8(_mm256_add_epi32(prod0, prod1));
|
||||
}
|
||||
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
|
||||
{
|
||||
__m256i even_a = _mm256_srai_epi16(_mm256_bslli_epi128(a.val, 1), 8);
|
||||
__m256i odd_a = _mm256_srai_epi16(a.val, 8);
|
||||
|
||||
__m256i even_b = _mm256_srai_epi16(_mm256_bslli_epi128(b.val, 1), 8);
|
||||
__m256i odd_b = _mm256_srai_epi16(b.val, 8);
|
||||
|
||||
__m256i prod0 = _mm256_madd_epi16(even_a, even_b);
|
||||
__m256i prod1 = _mm256_madd_epi16(odd_a, odd_b);
|
||||
return v_int32x8(_mm256_add_epi32(prod0, prod1));
|
||||
}
|
||||
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
|
||||
{
|
||||
__m256i mullo = _mm256_mullo_epi16(a.val, b.val);
|
||||
__m256i mulhi = _mm256_mulhi_epu16(a.val, b.val);
|
||||
__m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi);
|
||||
__m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi);
|
||||
|
||||
__m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA);
|
||||
__m256i p13 = _mm256_srli_epi64(mul0, 32);
|
||||
__m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA);
|
||||
__m256i p57 = _mm256_srli_epi64(mul1, 32);
|
||||
|
||||
__m256i p15_ = _mm256_add_epi64(p02, p13);
|
||||
__m256i p9d_ = _mm256_add_epi64(p46, p57);
|
||||
|
||||
return v_uint64x4(_mm256_add_epi64(
|
||||
_mm256_unpacklo_epi64(p15_, p9d_),
|
||||
_mm256_unpackhi_epi64(p15_, p9d_)
|
||||
));
|
||||
}
|
||||
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
|
||||
{
|
||||
__m256i prod = _mm256_madd_epi16(a.val, b.val);
|
||||
__m256i sign = _mm256_srai_epi32(prod, 31);
|
||||
|
||||
__m256i lo = _mm256_unpacklo_epi32(prod, sign);
|
||||
__m256i hi = _mm256_unpackhi_epi32(prod, sign);
|
||||
|
||||
return v_int64x4(_mm256_add_epi64(
|
||||
_mm256_unpacklo_epi64(lo, hi),
|
||||
_mm256_unpackhi_epi64(lo, hi)
|
||||
));
|
||||
}
|
||||
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
|
||||
{ return v_cvt_f64(v_dotprod(a, b)); }
|
||||
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
|
||||
{ return v_dotprod(a, b, c); }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
|
||||
{ return v_dotprod(a, b, c); }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b)
|
||||
{
|
||||
__m256i mullo = _mm256_mullo_epi16(a.val, b.val);
|
||||
__m256i mulhi = _mm256_mulhi_epu16(a.val, b.val);
|
||||
__m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi);
|
||||
__m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi);
|
||||
|
||||
__m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA);
|
||||
__m256i p13 = _mm256_srli_epi64(mul0, 32);
|
||||
__m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA);
|
||||
__m256i p57 = _mm256_srli_epi64(mul1, 32);
|
||||
|
||||
__m256i p15_ = _mm256_add_epi64(p02, p13);
|
||||
__m256i p9d_ = _mm256_add_epi64(p46, p57);
|
||||
|
||||
return v_uint64x4(_mm256_add_epi64(p15_, p9d_));
|
||||
}
|
||||
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
|
||||
{
|
||||
__m256i prod = _mm256_madd_epi16(a.val, b.val);
|
||||
__m256i sign = _mm256_srai_epi32(prod, 31);
|
||||
__m256i lo = _mm256_unpacklo_epi32(prod, sign);
|
||||
__m256i hi = _mm256_unpackhi_epi32(prod, sign);
|
||||
return v_int64x4(_mm256_add_epi64(lo, hi));
|
||||
}
|
||||
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
#define OPENCV_HAL_AVX_SPLAT2_PS(a, im) \
|
||||
v_float32x8(_mm256_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))
|
||||
|
||||
|
@ -1473,6 +1473,32 @@ inline v_float64x8 v_cvt_f64(const v_float32x16& a)
|
||||
inline v_float64x8 v_cvt_f64_high(const v_float32x16& a)
|
||||
{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_high(a.val))); }
|
||||
|
||||
// from (Mysticial and wim) https://stackoverflow.com/q/41144668
|
||||
inline v_float64x8 v_cvt_f64(const v_int64x8& v)
|
||||
{
|
||||
#if CV_AVX_512DQ
|
||||
return v_float64x8(_mm512_cvtepi64_pd(v.val));
|
||||
#else
|
||||
// constants encoded as floating-point
|
||||
__m512i magic_i_lo = _mm512_set1_epi64x(0x4330000000000000); // 2^52
|
||||
__m512i magic_i_hi32 = _mm512_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
|
||||
__m512i magic_i_all = _mm512_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
|
||||
__m512d magic_d_all = _mm512_castsi512_pd(magic_i_all);
|
||||
|
||||
// Blend the 32 lowest significant bits of v with magic_int_lo
|
||||
__m512i v_lo = _mm512_blend_epi32(magic_i_lo, v.val, 0x55);
|
||||
// Extract the 32 most significant bits of v
|
||||
__m512i v_hi = _mm512_srli_epi64(v.val, 32);
|
||||
// Flip the msb of v_hi and blend with 0x45300000
|
||||
v_hi = _mm512_xor_si512(v_hi, magic_i_hi32);
|
||||
// Compute in double precision
|
||||
__m512d v_hi_dbl = _mm512_sub_pd(_mm512_castsi512_pd(v_hi), magic_d_all);
|
||||
// (v_hi - magic_d_all) + v_lo Do not assume associativity of floating point addition
|
||||
__m512d result = _mm512_add_pd(v_hi_dbl, _mm512_castsi512_pd(v_lo));
|
||||
return v_float64x8(result);
|
||||
#endif
|
||||
}
|
||||
|
||||
////////////// Lookup table access ////////////////////
|
||||
|
||||
inline v_int8x64 v512_lut(const schar* tab, const int* idx)
|
||||
@ -1672,12 +1698,152 @@ inline v_float32x16 v_pack_triplets(const v_float32x16& vec)
|
||||
|
||||
////////// Matrix operations /////////
|
||||
|
||||
//////// Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b)
|
||||
{ return v_int32x16(_mm512_madd_epi16(a.val, b.val)); }
|
||||
|
||||
inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b)
|
||||
{
|
||||
__m512i even = _mm512_mul_epi32(a.val, b.val);
|
||||
__m512i odd = _mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32));
|
||||
return v_int64x8(_mm512_add_epi64(even, odd));
|
||||
}
|
||||
inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b)
|
||||
{
|
||||
__m512i even_a = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, _mm512_setzero_si512());
|
||||
__m512i odd_a = _mm512_srli_epi16(a.val, 8);
|
||||
|
||||
__m512i even_b = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, _mm512_setzero_si512());
|
||||
__m512i odd_b = _mm512_srli_epi16(b.val, 8);
|
||||
|
||||
__m512i prod0 = _mm512_madd_epi16(even_a, even_b);
|
||||
__m512i prod1 = _mm512_madd_epi16(odd_a, odd_b);
|
||||
return v_uint32x16(_mm512_add_epi32(prod0, prod1));
|
||||
}
|
||||
inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b)
|
||||
{
|
||||
__m512i even_a = _mm512_srai_epi16(_mm512_bslli_epi128(a.val, 1), 8);
|
||||
__m512i odd_a = _mm512_srai_epi16(a.val, 8);
|
||||
|
||||
__m512i even_b = _mm512_srai_epi16(_mm512_bslli_epi128(b.val, 1), 8);
|
||||
__m512i odd_b = _mm512_srai_epi16(b.val, 8);
|
||||
|
||||
__m512i prod0 = _mm512_madd_epi16(even_a, even_b);
|
||||
__m512i prod1 = _mm512_madd_epi16(odd_a, odd_b);
|
||||
return v_int32x16(_mm512_add_epi32(prod0, prod1));
|
||||
}
|
||||
inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b)
|
||||
{
|
||||
__m512i mullo = _mm512_mullo_epi16(a.val, b.val);
|
||||
__m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);
|
||||
__m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi);
|
||||
__m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi);
|
||||
|
||||
__m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512());
|
||||
__m512i p13 = _mm512_srli_epi64(mul0, 32);
|
||||
__m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
|
||||
__m512i p57 = _mm512_srli_epi64(mul1, 32);
|
||||
|
||||
__m512i p15_ = _mm512_add_epi64(p02, p13);
|
||||
__m512i p9d_ = _mm512_add_epi64(p46, p57);
|
||||
|
||||
return v_uint64x8(_mm512_add_epi64(
|
||||
_mm512_unpacklo_epi64(p15_, p9d_),
|
||||
_mm512_unpackhi_epi64(p15_, p9d_)
|
||||
));
|
||||
}
|
||||
inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b)
|
||||
{
|
||||
__m512i prod = _mm512_madd_epi16(a.val, b.val);
|
||||
__m512i even = _mm512_srai_epi64(_mm512_bslli_epi128(prod, 4), 32);
|
||||
__m512i odd = _mm512_srai_epi64(prod, 32);
|
||||
return v_int64x8(_mm512_add_epi64(even, odd));
|
||||
}
|
||||
inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b)
|
||||
{ return v_cvt_f64(v_dotprod(a, b)); }
|
||||
inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
|
||||
{ return v_dotprod(a, b, c); }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
|
||||
{ return v_dotprod(a, b, c); }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b)
|
||||
{
|
||||
__m512i mullo = _mm512_mullo_epi16(a.val, b.val);
|
||||
__m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);
|
||||
__m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi);
|
||||
__m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi);
|
||||
|
||||
__m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512());
|
||||
__m512i p13 = _mm512_srli_epi64(mul0, 32);
|
||||
__m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
|
||||
__m512i p57 = _mm512_srli_epi64(mul1, 32);
|
||||
|
||||
__m512i p15_ = _mm512_add_epi64(p02, p13);
|
||||
__m512i p9d_ = _mm512_add_epi64(p46, p57);
|
||||
return v_uint64x8(_mm512_add_epi64(p15_, p9d_));
|
||||
}
|
||||
inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
|
||||
#define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \
|
||||
v_float32x16(_mm512_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))
|
||||
|
||||
|
@@ -171,7 +171,8 @@ Different type conversions and casts:

### Matrix operations

In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_matmul, @ref v_transpose4x4
In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_dotprod_fast,
@ref v_dotprod_expand, @ref v_dotprod_expand_fast, @ref v_matmul, @ref v_transpose4x4

### Usability

@@ -195,7 +196,10 @@ Regular integers:
|mul_expand | x | x | x | x | x | |
|compare | x | x | x | x | x | x |
|shift | | | x | x | x | x |
|dotprod | | | | x | | |
|dotprod | | | | x | | x |
|dotprod_fast | | | | x | | x |
|dotprod_expand | x | x | x | x | | x |
|dotprod_expand_fast| x | x | x | x | | x |
|logical | x | x | x | x | x | x |
|min, max | x | x | x | x | x | x |
|absdiff | x | x | x | x | x | x |
@@ -222,6 +226,7 @@ Big integers:
|logical | x | x |
|extract | x | x |
|rotate (lanes) | x | x |
|cvt_flt64 | | x |

Floating point:
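Since the table now advertises dotprod/dotprod_fast/dotprod_expand on more element types, a minimal usage sketch may help. It assumes an OpenCV build with CV_SIMD enabled and a version where the v_int16/v_int32 aliases and their nlanes constants are available; the helper name and data layout are illustrative:

#include <opencv2/core/hal/intrin.hpp>

// Dot product of two int16 arrays; n is assumed to be a multiple of cv::v_int16::nlanes.
static int dot_i16(const short* a, const short* b, int n)
{
    cv::v_int32 acc = cv::vx_setzero_s32();
    for (int i = 0; i < n; i += cv::v_int16::nlanes)
    {
        cv::v_int16 va = cv::vx_load(a + i);
        cv::v_int16 vb = cv::vx_load(b + i);
        acc = cv::v_dotprod(va, vb, acc);  // 16-bit x 16-bit -> 32-bit pairwise products, accumulated
    }
    return cv::v_reduce_sum(acc);          // collapse the vector accumulator to a scalar
}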
@@ -853,17 +858,18 @@ inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
/** @brief Dot product of elements

Multiply values in two registers and sum adjacent result pairs.

Scheme:
@code
  {A1 A2 ...} // 16-bit
x {B1 B2 ...} // 16-bit
-------------
{A1B1+A2B2 ...} // 32-bit

@endcode
Implemented only for 16-bit signed source type (v_int16x8).
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    typedef typename V_TypeTraits<_Tp>::w_type w_type;
    v_reg<w_type, n/2> c;
@@ -881,12 +887,11 @@ Scheme:
x {B1 B2 ...} // 16-bit
-------------
{A1B1+A2B2+C1 ...} // 32-bit

@endcode
Implemented only for 16-bit signed source type (v_int16x8).
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
          const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
{
    typedef typename V_TypeTraits<_Tp>::w_type w_type;
    v_reg<w_type, n/2> s;
@@ -895,6 +900,95 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n
    return s;
}

/** @brief Fast Dot product of elements

Same as cv::v_dotprod, but it may perform an unordered sum between result pairs on some platforms.
This intrinsic can be used when only the sum over all lanes matters,
and it may yield better performance on the affected platforms.
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{ return v_dotprod(a, b); }

/** @brief Fast Dot product of elements

Same as cv::v_dotprod_fast, but adds a third element to the sum of adjacent pairs.
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
               const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
{ return v_dotprod(a, b, c); }
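To make that contract concrete: only the total across all lanes of the fast variant is guaranteed to match the ordered v_dotprod; the per-lane pairing may differ by platform. A hypothetical check, not part of OpenCV, assuming a CV_SIMD build:

inline bool dotprod_fast_total_matches(const cv::v_int16& a, const cv::v_int16& b)
{
    int ordered = cv::v_reduce_sum(cv::v_dotprod(a, b));
    int fast    = cv::v_reduce_sum(cv::v_dotprod_fast(a, b));
    return ordered == fast;  // lane-by-lane equality is NOT guaranteed
}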
/** @brief Dot product of elements and expand

Multiply values in two registers and expand the sum of adjacent result pairs.

Scheme:
@code
  {A1 A2 A3 A4 ...} // 8-bit
x {B1 B2 B3 B4 ...} // 8-bit
-------------
{A1B1+A2B2+A3B3+A4B4 ...} // 32-bit

@endcode
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    typedef typename V_TypeTraits<_Tp>::q_type q_type;
    v_reg<q_type, n/4> s;
    for( int i = 0; i < (n/4); i++ )
        s.s[i] = (q_type)a.s[i*4    ]*b.s[i*4    ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
                 (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3];
    return s;
}
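For concreteness: with 8-bit lanes a = {1, 2, 3, 4, ...} and b = {5, 6, 7, 8, ...}, the first output lane is 1*5 + 2*6 + 3*7 + 4*8 = 70. Each output lane collapses four input products, which is why the result uses q_type (a type four times wider) and has n/4 lanes.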
/** @brief Dot product of elements

Same as cv::v_dotprod_expand, but adds a third element to the sum of adjacent pairs.
Scheme:
@code
  {A1 A2 A3 A4 ...} // 8-bit
x {B1 B2 B3 B4 ...} // 8-bit
-------------
{A1B1+A2B2+A3B3+A4B4+C1 ...} // 32-bit
@endcode
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                 const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
{
    typedef typename V_TypeTraits<_Tp>::q_type q_type;
    v_reg<q_type, n/4> s;
    for( int i = 0; i < (n/4); i++ )
        s.s[i] = (q_type)a.s[i*4    ]*b.s[i*4    ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
                 (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3] + c.s[i];
    return s;
}
/** @brief Fast Dot product of elements and expand

Multiply values in two registers and expand the sum of adjacent result pairs.

Same as cv::v_dotprod_expand, but it may perform an unordered sum between result pairs on some platforms.
This intrinsic can be used when only the sum over all lanes matters,
and it may yield better performance on the affected platforms.
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{ return v_dotprod_expand(a, b); }

/** @brief Fast Dot product of elements

Same as cv::v_dotprod_expand_fast, but adds a third element to the sum of adjacent pairs.
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                      const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
{ return v_dotprod_expand(a, b, c); }

/** @brief Multiply and expand

Multiply values in two registers and store results in two registers with wider pack type.
@@ -1810,6 +1904,17 @@ template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
    return c;
}

/** @brief Convert to double

Supported input type is cv::v_int64x2. */
template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int64, n>& a)
{
    v_reg<double, n> c;
    for( int i = 0; i < n; i++ )
        c.s[i] = (double)a.s[i];
    return c;
}

template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx)
{
    v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
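This scalar fallback mirrors what the platform-specific versions earlier in the commit do with native instructions or bit tricks. A minimal usage sketch, assuming a CV_SIMD build with 64-bit float support (CV_SIMD_64F); values and variable names are illustrative:

cv::v_int32 a = cv::vx_setall_s32(100000);
cv::v_int32 b = cv::vx_setall_s32(300000);
cv::v_int64 wide = cv::v_dotprod(a, b);    // 32-bit x 32-bit -> 64-bit pairwise products
cv::v_float64 f = cv::v_cvt_f64(wide);     // the conversion declared above
double buf[cv::v_float64::nlanes];
cv::v_store(buf, f);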
@@ -160,6 +160,16 @@ void v_mul_expand(const __CV_V_UINT32&, const __CV_V_UINT32&, __CV_V_UINT64&, __
void v_mul_expand(const __CV_V_INT32&, const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&);
#endif

// Conversions
__CV_V_FLOAT32 v_cvt_f32(const __CV_V_INT32& a);
__CV_V_FLOAT32 v_cvt_f32(const __CV_V_FLOAT64& a);
__CV_V_FLOAT32 v_cvt_f32(const __CV_V_FLOAT64& a, const __CV_V_FLOAT64& b);
__CV_V_FLOAT64 v_cvt_f64(const __CV_V_INT32& a);
__CV_V_FLOAT64 v_cvt_f64_high(const __CV_V_INT32& a);
__CV_V_FLOAT64 v_cvt_f64(const __CV_V_FLOAT32& a);
__CV_V_FLOAT64 v_cvt_f64_high(const __CV_V_FLOAT32& a);
__CV_V_FLOAT64 v_cvt_f64(const __CV_V_INT64& a);

/** Cleanup **/
#undef CV__SIMD_FORWARD
#undef __CV_VX
@ -461,15 +461,124 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
|
||||
msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi)), 16));
|
||||
}
|
||||
|
||||
//////// Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_int32x4(msa_dotp_s_w(a.val, b.val)); }
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{ return v_int32x4(msa_dpadd_s_w(c.val , a.val, b.val)); }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_int64x2(msa_dotp_s_d(a.val, b.val)); }
|
||||
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{ return v_int64x2(msa_dpadd_s_d(c.val , a.val, b.val)); }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{
|
||||
return v_int32x4(msa_dotp_s_w(a.val, b.val));
|
||||
v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8);
|
||||
v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8);
|
||||
v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8);
|
||||
v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8);
|
||||
v4u32 prod = msa_dotp_u_w(even_a, even_b);
|
||||
return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b));
|
||||
}
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{
|
||||
v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8);
|
||||
v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8);
|
||||
v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8);
|
||||
v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8);
|
||||
v4u32 prod = msa_dpadd_u_w(c.val, even_a, even_b);
|
||||
return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b));
|
||||
}
|
||||
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
return v_int32x4(msa_dpadd_s_w(c.val , a.val, b.val));
|
||||
v8i16 prod = msa_dotp_s_h(a.val, b.val);
|
||||
return v_int32x4(msa_hadd_s32(prod, prod));
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
|
||||
const v_int32x4& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{
|
||||
v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16);
|
||||
v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16);
|
||||
v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16);
|
||||
v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16);
|
||||
v2u64 prod = msa_dotp_u_d(even_a, even_b);
|
||||
return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b));
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b,
|
||||
const v_uint64x2& c)
|
||||
{
|
||||
v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16);
|
||||
v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16);
|
||||
v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16);
|
||||
v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16);
|
||||
v2u64 prod = msa_dpadd_u_d(c.val, even_a, even_b);
|
||||
return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b));
|
||||
}
|
||||
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
v4i32 prod = msa_dotp_s_w(a.val, b.val);
|
||||
return v_int64x2(msa_hadd_s64(prod, prod));
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_cvt_f64(v_dotprod(a, b)); }
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{ return v_dotprod(a, b, c); }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{ return v_dotprod(a, b, c); }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
#define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \
|
||||
@@ -1311,6 +1420,11 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
    return v_float64x2(msa_cvtfhq_f64_f32(a.val));
}

inline v_float64x2 v_cvt_f64(const v_int64x2& a)
{
    return v_float64x2(msa_cvtfintq_f64_s64(a.val));
}

////////////// Lookup table access ////////////////////
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
@ -62,23 +62,63 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
|
||||
#define CV_SIMD128_64F 0
|
||||
#endif
|
||||
|
||||
// TODO
|
||||
#define CV_NEON_DOT 0
|
||||
|
||||
//////////// Utils ////////////
|
||||
|
||||
#if CV_SIMD128_64F
|
||||
#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
|
||||
inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
|
||||
{ c = vuzp1q_##suffix(a, b); d = vuzp2q_##suffix(a, b); }
|
||||
#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
|
||||
inline void _v128_unzip(const _Tpv&a, const _Tpv&b, _Tpv& c, _Tpv& d) \
|
||||
{ c = vuzp1_##suffix(a, b); d = vuzp2_##suffix(a, b); }
|
||||
#else
|
||||
#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
|
||||
inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
|
||||
{ _Tpvx2 ab = vuzpq_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
|
||||
#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
|
||||
inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
|
||||
{ _Tpvx2 ab = vuzp_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
|
||||
#endif
|
||||
|
||||
#if CV_SIMD128_64F
|
||||
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
|
||||
template <typename T> static inline \
|
||||
_Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
|
||||
template <typename T> static inline \
|
||||
float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint8x16_t, u8)
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(int8x16_t, s8)
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint16x8_t, u16)
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(int16x8_t, s16)
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint32x4_t, u32)
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(int32x4_t, s32)
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint64x2_t, u64)
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(int64x2_t, s64)
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(float32x4_t, f32)
|
||||
template <typename T> static inline \
|
||||
_Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
|
||||
template <typename T> static inline \
|
||||
float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
|
||||
#else
|
||||
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix)
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(_Tpv, _Tpvl, suffix) \
|
||||
OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix) \
|
||||
OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpvl##_t, _Tpvl##x2_t, suffix) \
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(_Tpv, _Tpvl, suffix) \
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(_Tpv, _Tpvl, suffix) \
|
||||
OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix)
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint8x16, uint8x8, u8)
|
||||
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int8x16, int8x8, s8)
|
||||
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint16x8, uint16x4, u16)
|
||||
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int16x8, int16x4, s16)
|
||||
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint32x4, uint32x2, u32)
|
||||
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int32x4, int32x2, s32)
|
||||
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float32x4, float32x2, f32)
|
||||
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(uint64x2, uint64x1, u64)
|
||||
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2, int64x1, s64)
|
||||
#if CV_SIMD128_64F
|
||||
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1,f64)
|
||||
#endif
|
||||
|
||||
//////////// Types ////////////
|
||||
|
||||
struct v_uint8x16
|
||||
{
|
||||
typedef uchar lane_type;
|
||||
@ -528,20 +568,272 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
|
||||
));
|
||||
}
|
||||
|
||||
//////// Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
|
||||
int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
|
||||
int32x4x2_t cd = vuzpq_s32(c, d);
|
||||
return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
|
||||
int16x8_t uzp1, uzp2;
|
||||
_v128_unzip(a.val, b.val, uzp1, uzp2);
|
||||
int16x4_t a0 = vget_low_s16(uzp1);
|
||||
int16x4_t b0 = vget_high_s16(uzp1);
|
||||
int16x4_t a1 = vget_low_s16(uzp2);
|
||||
int16x4_t b1 = vget_high_s16(uzp2);
|
||||
int32x4_t p = vmull_s16(a0, b0);
|
||||
return v_int32x4(vmlal_s16(p, a1, b1));
|
||||
}
|
||||
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{
|
||||
v_int32x4 s = v_dotprod(a, b);
|
||||
return v_int32x4(vaddq_s32(s.val , c.val));
|
||||
int16x8_t uzp1, uzp2;
|
||||
_v128_unzip(a.val, b.val, uzp1, uzp2);
|
||||
int16x4_t a0 = vget_low_s16(uzp1);
|
||||
int16x4_t b0 = vget_high_s16(uzp1);
|
||||
int16x4_t a1 = vget_low_s16(uzp2);
|
||||
int16x4_t b1 = vget_high_s16(uzp2);
|
||||
int32x4_t p = vmlal_s16(c.val, a0, b0);
|
||||
return v_int32x4(vmlal_s16(p, a1, b1));
|
||||
}
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
|
||||
{
|
||||
int32x4_t uzp1, uzp2;
|
||||
_v128_unzip(a.val, b.val, uzp1, uzp2);
|
||||
int32x2_t a0 = vget_low_s32(uzp1);
|
||||
int32x2_t b0 = vget_high_s32(uzp1);
|
||||
int32x2_t a1 = vget_low_s32(uzp2);
|
||||
int32x2_t b1 = vget_high_s32(uzp2);
|
||||
int64x2_t p = vmull_s32(a0, b0);
|
||||
return v_int64x2(vmlal_s32(p, a1, b1));
|
||||
}
|
||||
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{
|
||||
int32x4_t uzp1, uzp2;
|
||||
_v128_unzip(a.val, b.val, uzp1, uzp2);
|
||||
int32x2_t a0 = vget_low_s32(uzp1);
|
||||
int32x2_t b0 = vget_high_s32(uzp1);
|
||||
int32x2_t a1 = vget_low_s32(uzp2);
|
||||
int32x2_t b1 = vget_high_s32(uzp2);
|
||||
int64x2_t p = vmlal_s32(c.val, a0, b0);
|
||||
return v_int64x2(vmlal_s32(p, a1, b1));
|
||||
}
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{
|
||||
#if CV_NEON_DOT
|
||||
return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
|
||||
#else
|
||||
const uint8x16_t zero = vreinterpretq_u8_u32(vdupq_n_u32(0));
|
||||
const uint8x16_t mask = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF));
|
||||
const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0));
|
||||
const uint16x8_t mask32 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));
|
||||
|
||||
uint16x8_t even = vmulq_u16(vreinterpretq_u16_u8(vbslq_u8(mask, a.val, zero)),
|
||||
vreinterpretq_u16_u8(vbslq_u8(mask, b.val, zero)));
|
||||
uint16x8_t odd = vmulq_u16(vshrq_n_u16(vreinterpretq_u16_u8(a.val), 8),
|
||||
vshrq_n_u16(vreinterpretq_u16_u8(b.val), 8));
|
||||
|
||||
uint32x4_t s0 = vaddq_u32(vreinterpretq_u32_u16(vbslq_u16(mask32, even, zero32)),
|
||||
vreinterpretq_u32_u16(vbslq_u16(mask32, odd, zero32)));
|
||||
uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16),
|
||||
vshrq_n_u32(vreinterpretq_u32_u16(odd), 16));
|
||||
return v_uint32x4(vaddq_u32(s0, s1));
|
||||
#endif
|
||||
}
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
|
||||
const v_uint32x4& c)
|
||||
{
|
||||
#if CV_NEON_DOT
|
||||
return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
|
||||
#else
|
||||
return v_dotprod_expand(a, b) + c;
|
||||
#endif
|
||||
}
|
||||
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
#if CV_NEON_DOT
|
||||
return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
|
||||
#else
|
||||
int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
|
||||
int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
|
||||
int16x8_t uzp1, uzp2;
|
||||
_v128_unzip(p0, p1, uzp1, uzp2);
|
||||
int16x8_t sum = vaddq_s16(uzp1, uzp2);
|
||||
int16x4_t uzpl1, uzpl2;
|
||||
_v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2);
|
||||
return v_int32x4(vaddl_s16(uzpl1, uzpl2));
|
||||
#endif
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
|
||||
const v_int32x4& c)
|
||||
{
|
||||
#if CV_NEON_DOT
|
||||
return v_int32x4(vdotq_s32(c.val, a.val, b.val));
|
||||
#else
|
||||
return v_dotprod_expand(a, b) + c;
|
||||
#endif
|
||||
}
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{
|
||||
const uint16x8_t zero = vreinterpretq_u16_u32(vdupq_n_u32(0));
|
||||
const uint16x8_t mask = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));
|
||||
|
||||
uint32x4_t even = vmulq_u32(vreinterpretq_u32_u16(vbslq_u16(mask, a.val, zero)),
|
||||
vreinterpretq_u32_u16(vbslq_u16(mask, b.val, zero)));
|
||||
uint32x4_t odd = vmulq_u32(vshrq_n_u32(vreinterpretq_u32_u16(a.val), 16),
|
||||
vshrq_n_u32(vreinterpretq_u32_u16(b.val), 16));
|
||||
uint32x4_t uzp1, uzp2;
|
||||
_v128_unzip(even, odd, uzp1, uzp2);
|
||||
uint64x2_t s0 = vaddl_u32(vget_low_u32(uzp1), vget_high_u32(uzp1));
|
||||
uint64x2_t s1 = vaddl_u32(vget_low_u32(uzp2), vget_high_u32(uzp2));
|
||||
return v_uint64x2(vaddq_u64(s0, s1));
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
int32x4_t p0 = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
|
||||
int32x4_t p1 = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
|
||||
|
||||
int32x4_t uzp1, uzp2;
|
||||
_v128_unzip(p0, p1, uzp1, uzp2);
|
||||
int32x4_t sum = vaddq_s32(uzp1, uzp2);
|
||||
|
||||
int32x2_t uzpl1, uzpl2;
|
||||
_v128_unzip(vget_low_s32(sum), vget_high_s32(sum), uzpl1, uzpl2);
|
||||
return v_int64x2(vaddl_s32(uzpl1, uzpl2));
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
|
||||
const v_int64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
#if CV_SIMD128_64F
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_cvt_f64(v_dotprod(a, b)); }
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
|
||||
const v_float64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
#endif
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
int16x4_t a0 = vget_low_s16(a.val);
|
||||
int16x4_t a1 = vget_high_s16(a.val);
|
||||
int16x4_t b0 = vget_low_s16(b.val);
|
||||
int16x4_t b1 = vget_high_s16(b.val);
|
||||
int32x4_t p = vmull_s16(a0, b0);
|
||||
return v_int32x4(vmlal_s16(p, a1, b1));
|
||||
}
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{
|
||||
int16x4_t a0 = vget_low_s16(a.val);
|
||||
int16x4_t a1 = vget_high_s16(a.val);
|
||||
int16x4_t b0 = vget_low_s16(b.val);
|
||||
int16x4_t b1 = vget_high_s16(b.val);
|
||||
int32x4_t p = vmlal_s16(c.val, a0, b0);
|
||||
return v_int32x4(vmlal_s16(p, a1, b1));
|
||||
}
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{
|
||||
int32x2_t a0 = vget_low_s32(a.val);
|
||||
int32x2_t a1 = vget_high_s32(a.val);
|
||||
int32x2_t b0 = vget_low_s32(b.val);
|
||||
int32x2_t b1 = vget_high_s32(b.val);
|
||||
int64x2_t p = vmull_s32(a0, b0);
|
||||
return v_int64x2(vmlal_s32(p, a1, b1));
|
||||
}
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{
|
||||
int32x2_t a0 = vget_low_s32(a.val);
|
||||
int32x2_t a1 = vget_high_s32(a.val);
|
||||
int32x2_t b0 = vget_low_s32(b.val);
|
||||
int32x2_t b1 = vget_high_s32(b.val);
|
||||
int64x2_t p = vmlal_s32(c.val, a0, b0);
|
||||
return v_int64x2(vmlal_s32(p, a1, b1));
|
||||
}
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{
|
||||
#if CV_NEON_DOT
|
||||
return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
|
||||
#else
|
||||
uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
|
||||
uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
|
||||
uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1));
|
||||
uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1));
|
||||
return v_uint32x4(vaddq_u32(s0, s1));
|
||||
#endif
|
||||
}
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{
|
||||
#if CV_NEON_DOT
|
||||
return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
|
||||
#else
|
||||
return v_dotprod_expand_fast(a, b) + c;
|
||||
#endif
|
||||
}
|
||||
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
#if CV_NEON_DOT
|
||||
return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
|
||||
#else
|
||||
int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
|
||||
prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
|
||||
return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));
|
||||
#endif
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{
|
||||
#if CV_NEON_DOT
|
||||
return v_int32x4(vdotq_s32(c.val, a.val, b.val));
|
||||
#else
|
||||
return v_dotprod_expand_fast(a, b) + c;
|
||||
#endif
|
||||
}
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{
|
||||
uint32x4_t p0 = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
|
||||
uint32x4_t p1 = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
|
||||
uint64x2_t s0 = vaddl_u32(vget_low_u32(p0), vget_high_u32(p0));
|
||||
uint64x2_t s1 = vaddl_u32(vget_low_u32(p1), vget_high_u32(p1));
|
||||
return v_uint64x2(vaddq_u64(s0, s1));
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
int32x4_t prod = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
|
||||
prod = vmlal_s16(prod, vget_high_s16(a.val), vget_high_s16(b.val));
|
||||
return v_int64x2(vaddl_s32(vget_low_s32(prod), vget_high_s32(prod)));
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
#if CV_SIMD128_64F
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_cvt_f64(v_dotprod_fast(a, b)); }
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
#endif
|
||||
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
|
||||
@ -1593,6 +1885,10 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
|
||||
{
|
||||
return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val)));
|
||||
}
|
||||
|
||||
inline v_float64x2 v_cvt_f64(const v_int64x2& a)
|
||||
{ return v_float64x2(vcvtq_f64_s64(a.val)); }
|
||||
|
||||
#endif
|
||||
|
||||
////////////// Lookup table access ////////////////////
|
||||
|
@@ -225,9 +225,13 @@ struct v_uint64x2
    }
    uint64 get0() const
    {
    #if !defined(__x86_64__) && !defined(_M_X64)
        int a = _mm_cvtsi128_si32(val);
        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
        return (unsigned)a | ((uint64)(unsigned)b << 32);
    #else
        return (uint64)_mm_cvtsi128_si64(val);
    #endif
    }

    __m128i val;
@@ -247,9 +251,13 @@ struct v_int64x2
    }
    int64 get0() const
    {
    #if !defined(__x86_64__) && !defined(_M_X64)
        int a = _mm_cvtsi128_si32(val);
        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
        return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
    #else
        return _mm_cvtsi128_si64(val);
    #endif
    }

    __m128i val;
@@ -791,15 +799,195 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { return v_int16x8(_mm_mulhi_epi16(a.val, b.val)); }
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_uint16x8(_mm_mulhi_epu16(a.val, b.val)); }

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    return v_int32x4(_mm_madd_epi16(a.val, b.val));
}
//////// Dot Product ////////

// 16 >> 32
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(_mm_madd_epi16(a.val, b.val)); }
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_dotprod(a, b) + c; }

// 32 >> 64
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
    return v_int32x4(_mm_add_epi32(_mm_madd_epi16(a.val, b.val), c.val));
#if CV_SSE4_1
    __m128i even = _mm_mul_epi32(a.val, b.val);
    __m128i odd = _mm_mul_epi32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    return v_int64x2(_mm_add_epi64(even, odd));
#else
    __m128i even_u = _mm_mul_epu32(a.val, b.val);
    __m128i odd_u = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    // convert unsigned to signed high multiplication (from: Agner Fog (veclib) and H. S. Warren: Hacker's Delight, 2003, p. 132)
    __m128i a_sign = _mm_srai_epi32(a.val, 31);
    __m128i b_sign = _mm_srai_epi32(b.val, 31);
    // |x * sign of x
    __m128i axb = _mm_and_si128(a.val, b_sign);
    __m128i bxa = _mm_and_si128(b.val, a_sign);
    // sum of sign corrections
    __m128i ssum = _mm_add_epi32(bxa, axb);
    __m128i even_ssum = _mm_slli_epi64(ssum, 32);
    __m128i odd_ssum = _mm_and_si128(ssum, _mm_set_epi32(-1, 0, -1, 0));
    // convert to signed and prod
    return v_int64x2(_mm_add_epi64(_mm_sub_epi64(even_u, even_ssum), _mm_sub_epi64(odd_u, odd_ssum)));
#endif
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_dotprod(a, b) + c; }
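The #else branch relies on the identity that a signed 32x32->64 product can be recovered from the unsigned product by subtracting the wrap-around terms contributed by negative inputs (the Hacker's Delight / Agner Fog reference in the comment). A scalar model of that correction, as an illustration only:

#include <cstdint>

static int64_t mul32x32_signed_via_unsigned(int32_t a, int32_t b)
{
    uint64_t prod_u = (uint64_t)(uint32_t)a * (uint32_t)b;  // what _mm_mul_epu32 computes per lane
    uint64_t corr = 0;
    if (a < 0) corr += (uint32_t)b;  // b * [a < 0]
    if (b < 0) corr += (uint32_t)a;  // a * [b < 0]
    // (uint)a * (uint)b == a*b + 2^32 * corr   (mod 2^64)
    return (int64_t)(prod_u - (corr << 32));    // two's-complement wrap yields the signed product
}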
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{
|
||||
__m128i a0 = _mm_srli_epi16(_mm_slli_si128(a.val, 1), 8); // even
|
||||
__m128i a1 = _mm_srli_epi16(a.val, 8); // odd
|
||||
__m128i b0 = _mm_srli_epi16(_mm_slli_si128(b.val, 1), 8);
|
||||
__m128i b1 = _mm_srli_epi16(b.val, 8);
|
||||
__m128i p0 = _mm_madd_epi16(a0, b0);
|
||||
__m128i p1 = _mm_madd_epi16(a1, b1);
|
||||
return v_uint32x4(_mm_add_epi32(p0, p1));
|
||||
}
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
__m128i a0 = _mm_srai_epi16(_mm_slli_si128(a.val, 1), 8); // even
|
||||
__m128i a1 = _mm_srai_epi16(a.val, 8); // odd
|
||||
__m128i b0 = _mm_srai_epi16(_mm_slli_si128(b.val, 1), 8);
|
||||
__m128i b1 = _mm_srai_epi16(b.val, 8);
|
||||
__m128i p0 = _mm_madd_epi16(a0, b0);
|
||||
__m128i p1 = _mm_madd_epi16(a1, b1);
|
||||
return v_int32x4(_mm_add_epi32(p0, p1));
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{
|
||||
v_uint32x4 c, d;
|
||||
v_mul_expand(a, b, c, d);
|
||||
|
||||
v_uint64x2 c0, c1, d0, d1;
|
||||
v_expand(c, c0, c1);
|
||||
v_expand(d, d0, d1);
|
||||
|
||||
c0 += c1; d0 += d1;
|
||||
return v_uint64x2(_mm_add_epi64(
|
||||
_mm_unpacklo_epi64(c0.val, d0.val),
|
||||
_mm_unpackhi_epi64(c0.val, d0.val)
|
||||
));
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
v_int32x4 prod = v_dotprod(a, b);
|
||||
v_int64x2 c, d;
|
||||
v_expand(prod, c, d);
|
||||
return v_int64x2(_mm_add_epi64(
|
||||
_mm_unpacklo_epi64(c.val, d.val),
|
||||
_mm_unpackhi_epi64(c.val, d.val)
|
||||
));
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
|
||||
{
|
||||
#if CV_SSE4_1
|
||||
return v_cvt_f64(v_dotprod(a, b));
|
||||
#else
|
||||
v_float64x2 c = v_cvt_f64(a) * v_cvt_f64(b);
|
||||
v_float64x2 d = v_cvt_f64_high(a) * v_cvt_f64_high(b);
|
||||
|
||||
return v_float64x2(_mm_add_pd(
|
||||
_mm_unpacklo_pd(c.val, d.val),
|
||||
_mm_unpackhi_pd(c.val, d.val)
|
||||
));
|
||||
#endif
|
||||
}
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{ return v_dotprod_fast(a, b) + c; }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{
|
||||
__m128i a0 = v_expand_low(a).val;
|
||||
__m128i a1 = v_expand_high(a).val;
|
||||
__m128i b0 = v_expand_low(b).val;
|
||||
__m128i b1 = v_expand_high(b).val;
|
||||
__m128i p0 = _mm_madd_epi16(a0, b0);
|
||||
__m128i p1 = _mm_madd_epi16(a1, b1);
|
||||
return v_uint32x4(_mm_add_epi32(p0, p1));
|
||||
}
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
#if CV_SSE4_1
|
||||
__m128i a0 = _mm_cvtepi8_epi16(a.val);
|
||||
__m128i a1 = v_expand_high(a).val;
|
||||
__m128i b0 = _mm_cvtepi8_epi16(b.val);
|
||||
__m128i b1 = v_expand_high(b).val;
|
||||
__m128i p0 = _mm_madd_epi16(a0, b0);
|
||||
__m128i p1 = _mm_madd_epi16(a1, b1);
|
||||
return v_int32x4(_mm_add_epi32(p0, p1));
|
||||
#else
|
||||
return v_dotprod_expand(a, b);
|
||||
#endif
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{
|
||||
v_uint32x4 c, d;
|
||||
v_mul_expand(a, b, c, d);
|
||||
|
||||
v_uint64x2 c0, c1, d0, d1;
|
||||
v_expand(c, c0, c1);
|
||||
v_expand(d, d0, d1);
|
||||
|
||||
c0 += c1; d0 += d1;
|
||||
return c0 + d0;
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
v_int32x4 prod = v_dotprod(a, b);
|
||||
v_int64x2 c, d;
|
||||
v_expand(prod, c, d);
|
||||
return c + d;
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c);
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
|
||||
|
||||
#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
|
||||
@ -2739,6 +2927,32 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
|
||||
return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val)));
|
||||
}
|
||||
|
||||
// from (Mysticial and wim) https://stackoverflow.com/q/41144668
inline v_float64x2 v_cvt_f64(const v_int64x2& v)
{
    // constants encoded as floating-point
    __m128i magic_i_hi32 = _mm_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
    __m128i magic_i_all  = _mm_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
    __m128d magic_d_all  = _mm_castsi128_pd(magic_i_all);
    // Blend the 32 lowest significant bits of v with magic_i_lo
#if CV_SSE4_1
    __m128i magic_i_lo   = _mm_set1_epi64x(0x4330000000000000); // 2^52
    __m128i v_lo         = _mm_blend_epi16(v.val, magic_i_lo, 0xcc);
#else
    __m128i magic_i_lo   = _mm_set1_epi32(0x43300000); // 2^52
    __m128i v_lo         = _mm_unpacklo_epi32(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(0, 0, 2, 0)), magic_i_lo);
#endif
    // Extract the 32 most significant bits of v
    __m128i v_hi = _mm_srli_epi64(v.val, 32);
    // Flip the msb of v_hi and blend with 0x45300000
    v_hi = _mm_xor_si128(v_hi, magic_i_hi32);
    // Compute in double precision
    __m128d v_hi_dbl = _mm_sub_pd(_mm_castsi128_pd(v_hi), magic_d_all);
    // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition
    __m128d result = _mm_add_pd(v_hi_dbl, _mm_castsi128_pd(v_lo));
    return v_float64x2(result);
}
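// Why the magic constants work (a sketch, assuming IEEE-754 binary64):
//  * v_lo reinterpreted as double equals 2^52 + lo, where lo is the low 32 bits
//    of v; values below 2^32 sit exactly in the low mantissa bits of 2^52.
//  * After the XOR, v_hi reinterpreted as double equals
//    2^84 + 2^32*(hi + 2^31) = 2^84 + 2^63 + 2^32*hi, with hi the signed high
//    32 bits of v.
//  * Subtracting magic_d_all = 2^84 + 2^63 + 2^52 is exact and leaves
//    2^32*hi - 2^52, and the final addition of v_lo yields 2^32*hi + lo, i.e.
//    the original signed 64-bit value rounded once to double precision.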
|
||||
|
||||
////////////// Lookup table access ////////////////////
|
||||
|
||||
inline v_int8x16 v_lut(const schar* tab, const int* idx)
|
||||
|
@ -499,12 +499,6 @@ inline void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d)
|
||||
v_zip(p0, p1, c, d);
|
||||
}
|
||||
|
||||
inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c, v_uint64x2& d)
|
||||
{
|
||||
c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val));
|
||||
d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val));
|
||||
}
|
||||
|
||||
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
vec_int4 p0 = vec_mule(a.val, b.val);
|
||||
@ -1043,14 +1037,8 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a)
|
||||
inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
|
||||
{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
|
||||
|
||||
// The altivec intrinsic is missing for this 2.06 insn
|
||||
inline v_float64x2 v_cvt_f64(const v_int64x2& a)
|
||||
{
|
||||
vec_double2 out;
|
||||
|
||||
__asm__ ("xvcvsxddp %x0,%x1" : "=wa"(out) : "wa"(a.val));
|
||||
return v_float64x2(out);
|
||||
}
|
||||
{ return v_float64x2(vec_ctd(a.val)); }
|
||||
|
||||
////////////// Lookup table access ////////////////////
|
||||
|
||||
@ -1322,12 +1310,134 @@ inline void v_cleanup() {}
|
||||
|
||||
////////// Matrix operations /////////
|
||||
|
||||
//////// Dot Product ////////
|
||||
// 16 >> 32
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); }
|
||||
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{ return v_int32x4(vec_msum(a.val, b.val, c.val)); }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
|
||||
{
|
||||
vec_dword2 even = vec_mule(a.val, b.val);
|
||||
vec_dword2 odd = vec_mulo(a.val, b.val);
|
||||
return v_int64x2(vec_add(even, odd));
|
||||
}
|
||||
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{ return v_uint32x4(vec_msum(a.val, b.val, c.val)); }
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{ return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)); }
|
||||
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
const vec_ushort8 eight = vec_ushort8_sp(8);
|
||||
vec_short8 a0 = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), eight); // even
|
||||
vec_short8 a1 = vec_sra((vec_short8)a.val, eight); // odd
|
||||
vec_short8 b0 = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), eight);
|
||||
vec_short8 b1 = vec_sra((vec_short8)b.val, eight);
|
||||
return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z)));
|
||||
}
|
||||
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{
|
||||
const vec_ushort8 eight = vec_ushort8_sp(8);
|
||||
vec_short8 a0 = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), eight); // even
|
||||
vec_short8 a1 = vec_sra((vec_short8)a.val, eight); // odd
|
||||
vec_short8 b0 = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), eight);
|
||||
vec_short8 b1 = vec_sra((vec_short8)b.val, eight);
|
||||
return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, c.val)));
|
||||
}
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{
|
||||
const vec_uint4 zero = vec_uint4_z;
|
||||
vec_uint4 even = vec_mule(a.val, b.val);
|
||||
vec_uint4 odd = vec_mulo(a.val, b.val);
|
||||
vec_udword2 e0 = (vec_udword2)vec_mergee(even, zero);
|
||||
vec_udword2 e1 = (vec_udword2)vec_mergeo(even, zero);
|
||||
vec_udword2 o0 = (vec_udword2)vec_mergee(odd, zero);
|
||||
vec_udword2 o1 = (vec_udword2)vec_mergeo(odd, zero);
|
||||
vec_udword2 s0 = vec_add(e0, o0);
|
||||
vec_udword2 s1 = vec_add(e1, o1);
|
||||
return v_uint64x2(vec_add(s0, s1));
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
v_int32x4 prod = v_dotprod(a, b);
|
||||
v_int64x2 c, d;
|
||||
v_expand(prod, c, d);
|
||||
return v_int64x2(vec_add(vec_mergeh(c.val, d.val), vec_mergel(c.val, d.val)));
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_cvt_f64(v_dotprod(a, b)); }
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)) + c; }
|
||||
// 32 >> 64
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{ return v_dotprod(a, b, c); }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{ return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)) + c; }
|
||||
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
vec_short8 a0 = vec_unpackh(a.val);
|
||||
vec_short8 a1 = vec_unpackl(a.val);
|
||||
vec_short8 b0 = vec_unpackh(b.val);
|
||||
vec_short8 b1 = vec_unpackl(b.val);
|
||||
return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z)));
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
v_int32x4 prod = v_dotprod(a, b);
|
||||
v_int64x2 c, d;
|
||||
v_expand(prod, c, d);
|
||||
return c + d;
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
|
||||
const v_float32x4& m1, const v_float32x4& m2,
|
||||
const v_float32x4& m3)
|
||||
|
@ -682,6 +682,29 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n
|
||||
return s;
|
||||
}
|
||||
|
||||
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    typedef typename V_TypeTraits<_Tp>::q_type q_type;
    v_reg<q_type, n/4> s;
    for( int i = 0; i < (n/4); i++ )
        s.s[i] = (q_type)a.s[i*4    ]*b.s[i*4    ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
                 (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3];
    return s;
}
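// Worked example of the reference semantics above (illustrative): for two
// v_uint8x16 inputs, lane 0 of the resulting v_uint32x4 is
//   a0*b0 + a1*b1 + a2*b2 + a3*b3
// evaluated in the widened q_type, lane 1 covers elements 4..7, and so on;
// the three-argument overload below just adds c to each lane.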
|
||||
|
||||
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
|
||||
v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
|
||||
const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
|
||||
{
|
||||
typedef typename V_TypeTraits<_Tp>::q_type q_type;
|
||||
v_reg<q_type, n/4> s;
|
||||
for( int i = 0; i < (n/4); i++ )
|
||||
s.s[i] = (q_type)a.s[i*4 ]*b.s[i*4 ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
|
||||
(q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3] + c.s[i];
|
||||
return s;
|
||||
}
|
||||
|
||||
template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
|
||||
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
|
||||
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
|
||||
@ -1282,6 +1305,14 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
|
||||
return c;
|
||||
}
|
||||
|
||||
inline v_float64x2 v_cvt_f64(const v_int64x2& a)
|
||||
{
|
||||
v_float64x2 c;
|
||||
for( int i = 0; i < 2; i++ )
|
||||
c.s[i] = (double)a.s[i];
|
||||
return c;
|
||||
}
|
||||
|
||||
template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx)
|
||||
{
|
||||
v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
|
||||
@ -2398,6 +2429,8 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
|
||||
return v_uint16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31));
|
||||
}
|
||||
|
||||
//////// Dot Product ////////
|
||||
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
|
||||
@ -2410,15 +2443,140 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
|
||||
}
|
||||
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
|
||||
{
|
||||
v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
|
||||
v128_t a1 = wasm_i32x4_shr(a.val, 16);
|
||||
v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
|
||||
v128_t b1 = wasm_i32x4_shr(b.val, 16);
|
||||
v128_t d = wasm_i32x4_mul(a0, b0);
|
||||
v128_t e = wasm_i32x4_mul(a1, b1);
|
||||
return v_int32x4(wasm_i32x4_add(wasm_i32x4_add(d, e), c.val));
|
||||
#ifdef __wasm_unimplemented_simd128__
|
||||
v128_t a0 = wasm_i64x2_shr(wasm_i64x2_shl(a.val, 32), 32);
|
||||
v128_t a1 = wasm_i64x2_shr(a.val, 32);
|
||||
v128_t b0 = wasm_i64x2_shr(wasm_i64x2_shl(b.val, 32), 32);
|
||||
v128_t b1 = wasm_i64x2_shr(b.val, 32);
|
||||
v128_t c = (v128_t)((__i64x2)a0 * (__i64x2)b0);
|
||||
v128_t d = (v128_t)((__i64x2)a1 * (__i64x2)b1);
|
||||
return v_int64x2(wasm_i64x2_add(c, d));
|
||||
#else
|
||||
fallback::v_int32x4 a_(a);
|
||||
fallback::v_int32x4 b_(b);
|
||||
return fallback::v_dotprod(a_, b_);
|
||||
#endif
|
||||
}
|
||||
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{
|
||||
#ifdef __wasm_unimplemented_simd128__
|
||||
return v_dotprod(a, b) + c;
|
||||
#else
|
||||
fallback::v_int32x4 a_(a);
|
||||
fallback::v_int32x4 b_(b);
|
||||
fallback::v_int64x2 c_(c);
|
||||
return fallback::v_dotprod(a_, b_, c_);
|
||||
#endif
|
||||
}
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{
|
||||
v128_t a0 = wasm_u16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
|
||||
v128_t a1 = wasm_u16x8_shr(a.val, 8);
|
||||
v128_t b0 = wasm_u16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
|
||||
v128_t b1 = wasm_u16x8_shr(b.val, 8);
|
||||
return v_uint32x4((
|
||||
v_dotprod(v_int16x8(a0), v_int16x8(b0)) +
|
||||
v_dotprod(v_int16x8(a1), v_int16x8(b1))).val
|
||||
);
|
||||
}
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
v128_t a0 = wasm_i16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
|
||||
v128_t a1 = wasm_i16x8_shr(a.val, 8);
|
||||
v128_t b0 = wasm_i16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
|
||||
v128_t b1 = wasm_i16x8_shr(b.val, 8);
|
||||
return v_int32x4(
|
||||
v_dotprod(v_int16x8(a0), v_int16x8(b0)) +
|
||||
v_dotprod(v_int16x8(a1), v_int16x8(b1))
|
||||
);
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{
|
||||
fallback::v_uint16x8 a_(a);
|
||||
fallback::v_uint16x8 b_(b);
|
||||
return fallback::v_dotprod_expand(a_, b_);
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{
|
||||
fallback::v_uint16x8 a_(a);
|
||||
fallback::v_uint16x8 b_(b);
|
||||
fallback::v_uint64x2 c_(c);
|
||||
return fallback::v_dotprod_expand(a_, b_, c_);
|
||||
}
|
||||
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
fallback::v_int16x8 a_(a);
|
||||
fallback::v_int16x8 b_(b);
|
||||
return fallback::v_dotprod_expand(a_, b_);
|
||||
}
|
||||
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{
|
||||
fallback::v_int16x8 a_(a);
|
||||
fallback::v_int16x8 b_(b);
|
||||
fallback::v_int64x2 c_(c);
|
||||
return fallback::v_dotprod_expand(a_, b_, c_);
|
||||
}
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_cvt_f64(v_dotprod(a, b)); }
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{ return v_dotprod(a, b, c); }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{ return v_dotprod(a, b, c); }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
#define OPENCV_HAL_IMPL_WASM_LOGIC_OP(_Tpvec) \
|
||||
OPENCV_HAL_IMPL_WASM_BIN_OP(&, _Tpvec, wasm_v128_and) \
|
||||
@ -3815,6 +3973,16 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
|
||||
return fallback::v_cvt_f64_high(a_);
|
||||
}
|
||||
|
||||
inline v_float64x2 v_cvt_f64(const v_int64x2& a)
|
||||
{
|
||||
#ifdef __wasm_unimplemented_simd128__
|
||||
return v_float64x2(wasm_convert_f64x2_i64x2(a.val));
|
||||
#else
|
||||
fallback::v_int64x2 a_(a);
|
||||
return fallback::v_cvt_f64(a_);
|
||||
#endif
|
||||
}
|
||||
|
||||
////////////// Lookup table access ////////////////////
|
||||
|
||||
inline v_int8x16 v_lut(const schar* tab, const int* idx)
|
||||
|
@ -72,7 +72,9 @@
|
||||
# include "opencv2/core/cuda_stream_accessor.hpp"
|
||||
# include "opencv2/core/cuda/common.hpp"
|
||||
|
||||
# ifndef NPP_VERSION
|
||||
# define NPP_VERSION (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD)
|
||||
# endif
|
||||
|
||||
# define CUDART_MINIMUM_REQUIRED_VERSION 6050
|
||||
|
||||
|
@ -144,10 +144,10 @@ VSX_FINLINE(rt) fnm(const rg& a, const rg& b) \
VSX_REDIRECT_2RG(vec_uint4, vec_ushort8, vec_mulo, __builtin_vec_mulo)

// dword2 support arrived in ISA 2.07 and GCC 8+
VSX_IMPL_2VRG(vec_dword2, vec_int4, vmulesw, vec_mule)
VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmuleuw, vec_mule)
VSX_IMPL_2VRG(vec_dword2, vec_int4, vmulosw, vec_mulo)
VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmulouw, vec_mulo)
VSX_IMPL_2VRG(vec_dword2, vec_int4, vmulosw, vec_mule)
VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmulouw, vec_mule)
VSX_IMPL_2VRG(vec_dword2, vec_int4, vmulesw, vec_mulo)
VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmuleuw, vec_mulo)

#endif
|
||||
|
@ -9,7 +9,7 @@ typedef TestBaseWithParam<MatType_Length_t> MatType_Length;
|
||||
|
||||
PERF_TEST_P( MatType_Length, dot,
|
||||
testing::Combine(
|
||||
testing::Values( CV_8UC1, CV_32SC1, CV_32FC1 ),
|
||||
testing::Values( CV_8UC1, CV_8SC1, CV_16SC1, CV_16UC1, CV_32SC1, CV_32FC1 ),
|
||||
testing::Values( 32, 64, 128, 256, 512, 1024 )
|
||||
))
|
||||
{
|
||||
|
@ -46,6 +46,7 @@
|
||||
#undef CV_LOG_STRIP_LEVEL
|
||||
#define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_VERBOSE + 1
|
||||
#include <opencv2/core/utils/logger.hpp>
|
||||
#include <opencv2/core/utils/configuration.private.hpp>
|
||||
|
||||
#define CV__ALLOCATOR_STATS_LOG(...) CV_LOG_VERBOSE(NULL, 0, "alloc.cpp: " << __VA_ARGS__)
|
||||
#include "opencv2/core/utils/allocator_stats.impl.hpp"
|
||||
@ -81,6 +82,38 @@ cv::utils::AllocatorStatisticsInterface& getAllocatorStatistics()
|
||||
return allocator_stats;
|
||||
}
|
||||
|
||||
#if defined HAVE_POSIX_MEMALIGN || defined HAVE_MEMALIGN
static bool readMemoryAlignmentParameter()
{
    bool value = true;
#if defined(__GLIBC__) && defined(__linux__) \
    && !defined(CV_STATIC_ANALYSIS) \
    && !defined(OPENCV_ENABLE_MEMORY_SANITIZER) \
    && !defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) /* oss-fuzz */ \
    && !defined(_WIN32) /* MinGW? */
    {
        // https://github.com/opencv/opencv/issues/15526
        value = false;
    }
#endif
    value = cv::utils::getConfigurationParameterBool("OPENCV_ENABLE_MEMALIGN", value); // should not call fastMalloc() internally
    // TODO add checks for valgrind, ASAN if value == false
    return value;
}
static inline
bool isAlignedAllocationEnabled()
{
    static bool initialized = false;
    static bool useMemalign = true;
    if (!initialized)
    {
        initialized = true; // set before reading the parameter so a re-entrant allocation does not loop (works only if allocations are scope based)
        useMemalign = readMemoryAlignmentParameter();
    }
    return useMemalign;
}
#endif
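// Usage sketch (assuming OpenCV's usual environment-variable configuration
// mechanism; ./app stands for any OpenCV-based program). The parameter is read
// once, on the first allocation, and cached afterwards:
//   OPENCV_ENABLE_MEMALIGN=0 ./app   // force the plain malloc()+alignPtr() fallback
//   OPENCV_ENABLE_MEMALIGN=1 ./app   // use posix_memalign()/memalign() when available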
|
||||
|
||||
#ifdef OPENCV_ALLOC_ENABLE_STATISTICS
|
||||
static inline
|
||||
void* fastMalloc_(size_t size)
|
||||
@ -89,25 +122,30 @@ void* fastMalloc(size_t size)
|
||||
#endif
|
||||
{
|
||||
#ifdef HAVE_POSIX_MEMALIGN
|
||||
void* ptr = NULL;
|
||||
if(posix_memalign(&ptr, CV_MALLOC_ALIGN, size))
|
||||
ptr = NULL;
|
||||
if(!ptr)
|
||||
return OutOfMemoryError(size);
|
||||
return ptr;
|
||||
if (isAlignedAllocationEnabled())
|
||||
{
|
||||
void* ptr = NULL;
|
||||
if(posix_memalign(&ptr, CV_MALLOC_ALIGN, size))
|
||||
ptr = NULL;
|
||||
if(!ptr)
|
||||
return OutOfMemoryError(size);
|
||||
return ptr;
|
||||
}
|
||||
#elif defined HAVE_MEMALIGN
|
||||
void* ptr = memalign(CV_MALLOC_ALIGN, size);
|
||||
if(!ptr)
|
||||
return OutOfMemoryError(size);
|
||||
return ptr;
|
||||
#else
|
||||
if (isAlignedAllocationEnabled())
|
||||
{
|
||||
void* ptr = memalign(CV_MALLOC_ALIGN, size);
|
||||
if(!ptr)
|
||||
return OutOfMemoryError(size);
|
||||
return ptr;
|
||||
}
|
||||
#endif
|
||||
uchar* udata = (uchar*)malloc(size + sizeof(void*) + CV_MALLOC_ALIGN);
|
||||
if(!udata)
|
||||
return OutOfMemoryError(size);
|
||||
uchar** adata = alignPtr((uchar**)udata + 1, CV_MALLOC_ALIGN);
|
||||
adata[-1] = udata;
|
||||
return adata;
|
||||
#endif
|
||||
}
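// Bookkeeping of the fallback path above (descriptive note): malloc() reserves
// sizeof(void*) + CV_MALLOC_ALIGN extra bytes, alignPtr() rounds the address just
// past one void* slot up to a CV_MALLOC_ALIGN boundary, and the raw pointer is
// stashed at adata[-1] so that fastFree() below can recover it and call free()
// on the original allocation.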
|
||||
|
||||
#ifdef OPENCV_ALLOC_ENABLE_STATISTICS
|
||||
@ -118,8 +156,12 @@ void fastFree(void* ptr)
|
||||
#endif
|
||||
{
|
||||
#if defined HAVE_POSIX_MEMALIGN || defined HAVE_MEMALIGN
|
||||
free(ptr);
|
||||
#else
|
||||
if (isAlignedAllocationEnabled())
|
||||
{
|
||||
free(ptr);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
if(ptr)
|
||||
{
|
||||
uchar* udata = ((uchar**)ptr)[-1];
|
||||
@ -127,7 +169,6 @@ void fastFree(void* ptr)
|
||||
((uchar*)ptr - udata) <= (ptrdiff_t)(sizeof(void*)+CV_MALLOC_ALIGN));
|
||||
free(udata);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef OPENCV_ALLOC_ENABLE_STATISTICS
|
||||
|
@ -782,36 +782,10 @@ void flip( InputArray _src, OutputArray _dst, int flip_mode )
|
||||
flipHoriz( dst.ptr(), dst.step, dst.ptr(), dst.step, dst.size(), esz );
|
||||
}
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
static bool ocl_rotate(InputArray _src, OutputArray _dst, int rotateMode)
|
||||
{
|
||||
switch (rotateMode)
|
||||
{
|
||||
case ROTATE_90_CLOCKWISE:
|
||||
transpose(_src, _dst);
|
||||
flip(_dst, _dst, 1);
|
||||
break;
|
||||
case ROTATE_180:
|
||||
flip(_src, _dst, -1);
|
||||
break;
|
||||
case ROTATE_90_COUNTERCLOCKWISE:
|
||||
transpose(_src, _dst);
|
||||
flip(_dst, _dst, 0);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
void rotate(InputArray _src, OutputArray _dst, int rotateMode)
|
||||
{
|
||||
CV_Assert(_src.dims() <= 2);
|
||||
|
||||
CV_OCL_RUN(_dst.isUMat(), ocl_rotate(_src, _dst, rotateMode))
|
||||
|
||||
switch (rotateMode)
|
||||
{
|
||||
case ROTATE_90_CLOCKWISE:
|
||||
|
@ -2320,26 +2320,22 @@ double dotProd_8u(const uchar* src1, const uchar* src2, int len)
|
||||
while (i < len0)
|
||||
{
|
||||
blockSize = std::min(len0 - i, blockSize0);
|
||||
v_int32 v_sum = vx_setzero_s32();
|
||||
v_uint32 v_sum = vx_setzero_u32();
|
||||
const int cWidth = v_uint16::nlanes;
|
||||
|
||||
int j = 0;
|
||||
for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
|
||||
{
|
||||
v_uint16 v_src10, v_src20, v_src11, v_src21;
|
||||
v_expand(vx_load(src1 + j), v_src10, v_src11);
|
||||
v_expand(vx_load(src2 + j), v_src20, v_src21);
|
||||
|
||||
v_sum += v_dotprod(v_reinterpret_as_s16(v_src10), v_reinterpret_as_s16(v_src20));
|
||||
v_sum += v_dotprod(v_reinterpret_as_s16(v_src11), v_reinterpret_as_s16(v_src21));
|
||||
v_uint8 v_src1 = vx_load(src1 + j);
|
||||
v_uint8 v_src2 = vx_load(src2 + j);
|
||||
v_sum = v_dotprod_expand_fast(v_src1, v_src2, v_sum);
|
||||
}
|
||||
|
||||
for (; j <= blockSize - cWidth; j += cWidth)
|
||||
{
|
||||
v_int16 v_src10 = v_reinterpret_as_s16(vx_load_expand(src1 + j));
|
||||
v_int16 v_src20 = v_reinterpret_as_s16(vx_load_expand(src2 + j));
|
||||
|
||||
v_sum += v_dotprod(v_src10, v_src20);
|
||||
v_sum += v_reinterpret_as_u32(v_dotprod_fast(v_src10, v_src20));
|
||||
}
|
||||
r += (double)v_reduce_sum(v_sum);
|
||||
|
||||
@ -2348,48 +2344,6 @@ double dotProd_8u(const uchar* src1, const uchar* src2, int len)
|
||||
i += blockSize;
|
||||
}
|
||||
vx_cleanup();
|
||||
#elif CV_NEON
|
||||
if( cv::checkHardwareSupport(CV_CPU_NEON) )
|
||||
{
|
||||
int len0 = len & -8, blockSize0 = (1 << 15), blockSize;
|
||||
uint32x4_t v_zero = vdupq_n_u32(0u);
|
||||
CV_DECL_ALIGNED(16) uint buf[4];
|
||||
|
||||
while( i < len0 )
|
||||
{
|
||||
blockSize = std::min(len0 - i, blockSize0);
|
||||
uint32x4_t v_sum = v_zero;
|
||||
|
||||
int j = 0;
|
||||
for( ; j <= blockSize - 16; j += 16 )
|
||||
{
|
||||
uint8x16_t v_src1 = vld1q_u8(src1 + j), v_src2 = vld1q_u8(src2 + j);
|
||||
|
||||
uint16x8_t v_src10 = vmovl_u8(vget_low_u8(v_src1)), v_src20 = vmovl_u8(vget_low_u8(v_src2));
|
||||
v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20));
|
||||
v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20));
|
||||
|
||||
v_src10 = vmovl_u8(vget_high_u8(v_src1));
|
||||
v_src20 = vmovl_u8(vget_high_u8(v_src2));
|
||||
v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20));
|
||||
v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20));
|
||||
}
|
||||
|
||||
for( ; j <= blockSize - 8; j += 8 )
|
||||
{
|
||||
uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)), v_src2 = vmovl_u8(vld1_u8(src2 + j));
|
||||
v_sum = vmlal_u16(v_sum, vget_low_u16(v_src1), vget_low_u16(v_src2));
|
||||
v_sum = vmlal_u16(v_sum, vget_high_u16(v_src1), vget_high_u16(v_src2));
|
||||
}
|
||||
|
||||
vst1q_u32(buf, v_sum);
|
||||
r += buf[0] + buf[1] + buf[2] + buf[3];
|
||||
|
||||
src1 += blockSize;
|
||||
src2 += blockSize;
|
||||
i += blockSize;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return r + dotProd_(src1, src2, len - i);
|
||||
}
|
||||
@ -2412,20 +2366,16 @@ double dotProd_8s(const schar* src1, const schar* src2, int len)
|
||||
int j = 0;
|
||||
for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
|
||||
{
|
||||
v_int16 v_src10, v_src20, v_src11, v_src21;
|
||||
v_expand(vx_load(src1 + j), v_src10, v_src11);
|
||||
v_expand(vx_load(src2 + j), v_src20, v_src21);
|
||||
|
||||
v_sum += v_dotprod(v_src10, v_src20);
|
||||
v_sum += v_dotprod(v_src11, v_src21);
|
||||
v_int8 v_src1 = vx_load(src1 + j);
|
||||
v_int8 v_src2 = vx_load(src2 + j);
|
||||
v_sum = v_dotprod_expand_fast(v_src1, v_src2, v_sum);
|
||||
}
|
||||
|
||||
for (; j <= blockSize - cWidth; j += cWidth)
|
||||
{
|
||||
v_int16 v_src10 = vx_load_expand(src1 + j);
|
||||
v_int16 v_src20 = vx_load_expand(src2 + j);
|
||||
|
||||
v_sum += v_dotprod(v_src10, v_src20);
|
||||
v_int16 v_src1 = vx_load_expand(src1 + j);
|
||||
v_int16 v_src2 = vx_load_expand(src2 + j);
|
||||
v_sum = v_dotprod_fast(v_src1, v_src2, v_sum);
|
||||
}
|
||||
r += (double)v_reduce_sum(v_sum);
|
||||
|
||||
@ -2434,87 +2384,6 @@ double dotProd_8s(const schar* src1, const schar* src2, int len)
|
||||
i += blockSize;
|
||||
}
|
||||
vx_cleanup();
|
||||
#elif CV_NEON
|
||||
if( cv::checkHardwareSupport(CV_CPU_NEON) )
|
||||
{
|
||||
int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
|
||||
int32x4_t v_zero = vdupq_n_s32(0);
|
||||
CV_DECL_ALIGNED(16) int buf[4];
|
||||
|
||||
while( i < len0 )
|
||||
{
|
||||
blockSize = std::min(len0 - i, blockSize0);
|
||||
int32x4_t v_sum = v_zero;
|
||||
|
||||
int j = 0;
|
||||
for( ; j <= blockSize - 16; j += 16 )
|
||||
{
|
||||
int8x16_t v_src1 = vld1q_s8(src1 + j), v_src2 = vld1q_s8(src2 + j);
|
||||
|
||||
int16x8_t v_src10 = vmovl_s8(vget_low_s8(v_src1)), v_src20 = vmovl_s8(vget_low_s8(v_src2));
|
||||
v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20));
|
||||
v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20));
|
||||
|
||||
v_src10 = vmovl_s8(vget_high_s8(v_src1));
|
||||
v_src20 = vmovl_s8(vget_high_s8(v_src2));
|
||||
v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20));
|
||||
v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20));
|
||||
}
|
||||
|
||||
for( ; j <= blockSize - 8; j += 8 )
|
||||
{
|
||||
int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + j)), v_src2 = vmovl_s8(vld1_s8(src2 + j));
|
||||
v_sum = vmlal_s16(v_sum, vget_low_s16(v_src1), vget_low_s16(v_src2));
|
||||
v_sum = vmlal_s16(v_sum, vget_high_s16(v_src1), vget_high_s16(v_src2));
|
||||
}
|
||||
|
||||
vst1q_s32(buf, v_sum);
|
||||
r += buf[0] + buf[1] + buf[2] + buf[3];
|
||||
|
||||
src1 += blockSize;
|
||||
src2 += blockSize;
|
||||
i += blockSize;
|
||||
}
|
||||
}
|
||||
#elif CV_MSA
|
||||
int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
|
||||
v4i32 v_zero = msa_dupq_n_s32(0);
|
||||
CV_DECL_ALIGNED(16) int buf[4];
|
||||
|
||||
while( i < len0 )
|
||||
{
|
||||
blockSize = std::min(len0 - i, blockSize0);
|
||||
v4i32 v_sum = v_zero;
|
||||
|
||||
int j = 0;
|
||||
for( ; j <= blockSize - 16; j += 16 )
|
||||
{
|
||||
v16i8 v_src1 = msa_ld1q_s8(src1 + j), v_src2 = msa_ld1q_s8(src2 + j);
|
||||
|
||||
v8i16 v_src10 = msa_movl_s8(msa_get_low_s8(v_src1)), v_src20 = msa_movl_s8(msa_get_low_s8(v_src2));
|
||||
v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src10), msa_get_low_s16(v_src20));
|
||||
v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src10), msa_get_high_s16(v_src20));
|
||||
|
||||
v_src10 = msa_movl_s8(msa_get_high_s8(v_src1));
|
||||
v_src20 = msa_movl_s8(msa_get_high_s8(v_src2));
|
||||
v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src10), msa_get_low_s16(v_src20));
|
||||
v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src10), msa_get_high_s16(v_src20));
|
||||
}
|
||||
|
||||
for( ; j <= blockSize - 8; j += 8 )
|
||||
{
|
||||
v8i16 v_src1 = msa_movl_s8(msa_ld1_s8(src1 + j)), v_src2 = msa_movl_s8(msa_ld1_s8(src2 + j));
|
||||
v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src1), msa_get_low_s16(v_src2));
|
||||
v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src1), msa_get_high_s16(v_src2));
|
||||
}
|
||||
|
||||
msa_st1q_s32(buf, v_sum);
|
||||
r += buf[0] + buf[1] + buf[2] + buf[3];
|
||||
|
||||
src1 += blockSize;
|
||||
src2 += blockSize;
|
||||
i += blockSize;
|
||||
}
|
||||
#endif
|
||||
|
||||
return r + dotProd_(src1, src2, len - i);
|
||||
@ -2522,42 +2391,97 @@ double dotProd_8s(const schar* src1, const schar* src2, int len)
|
||||
|
||||
double dotProd_16u(const ushort* src1, const ushort* src2, int len)
|
||||
{
|
||||
return dotProd_(src1, src2, len);
|
||||
double r = 0.0;
|
||||
int i = 0;
|
||||
|
||||
#if CV_SIMD
|
||||
int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 24), blockSize;
|
||||
|
||||
while (i < len0)
|
||||
{
|
||||
blockSize = std::min(len0 - i, blockSize0);
|
||||
v_uint64 v_sum = vx_setzero_u64();
|
||||
const int cWidth = v_uint16::nlanes;
|
||||
|
||||
int j = 0;
|
||||
for (; j <= blockSize - cWidth; j += cWidth)
|
||||
{
|
||||
v_uint16 v_src1 = vx_load(src1 + j);
|
||||
v_uint16 v_src2 = vx_load(src2 + j);
|
||||
v_sum = v_dotprod_expand_fast(v_src1, v_src2, v_sum);
|
||||
}
|
||||
r += (double)v_reduce_sum(v_sum);
|
||||
|
||||
src1 += blockSize;
|
||||
src2 += blockSize;
|
||||
i += blockSize;
|
||||
}
|
||||
vx_cleanup();
|
||||
#endif
|
||||
return r + dotProd_(src1, src2, len - i);
|
||||
}
|
||||
|
||||
double dotProd_16s(const short* src1, const short* src2, int len)
|
||||
{
|
||||
return dotProd_(src1, src2, len);
|
||||
double r = 0.0;
|
||||
int i = 0;
|
||||
|
||||
#if CV_SIMD
|
||||
int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 24), blockSize;
|
||||
|
||||
while (i < len0)
|
||||
{
|
||||
blockSize = std::min(len0 - i, blockSize0);
|
||||
v_int64 v_sum = vx_setzero_s64();
|
||||
const int cWidth = v_int16::nlanes;
|
||||
|
||||
int j = 0;
|
||||
for (; j <= blockSize - cWidth; j += cWidth)
|
||||
{
|
||||
v_int16 v_src1 = vx_load(src1 + j);
|
||||
v_int16 v_src2 = vx_load(src2 + j);
|
||||
v_sum = v_dotprod_expand_fast(v_src1, v_src2, v_sum);
|
||||
}
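// Overflow note (sketch based on the constants above): a block processes at most
// blockSize0 = 2^24 elements, and one product is bounded by |-32768|^2 = 2^30,
// so each v_int64 lane accumulates well under 2^24 * 2^30 = 2^54, far below the
// signed 64-bit limit; the accumulator cannot overflow before the reduction below.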
|
||||
r += (double)v_reduce_sum(v_sum);
|
||||
|
||||
src1 += blockSize;
|
||||
src2 += blockSize;
|
||||
i += blockSize;
|
||||
}
|
||||
vx_cleanup();
|
||||
#endif
|
||||
return r + dotProd_(src1, src2, len - i);
|
||||
}
|
||||
|
||||
double dotProd_32s(const int* src1, const int* src2, int len)
|
||||
{
|
||||
#if CV_SIMD128_64F
|
||||
double r = 0.0;
|
||||
#if CV_SIMD_64F
|
||||
double r = .0;
|
||||
int i = 0;
|
||||
int lenAligned = len & -v_int32x4::nlanes;
|
||||
v_float64x2 a(0.0, 0.0);
|
||||
v_float64x2 b(0.0, 0.0);
|
||||
|
||||
for( i = 0; i < lenAligned; i += v_int32x4::nlanes )
|
||||
{
|
||||
v_int32x4 s1 = v_load(src1);
|
||||
v_int32x4 s2 = v_load(src2);
|
||||
|
||||
#if CV_VSX
|
||||
// Do 32x32->64 multiplies, convert/round to double, accumulate
|
||||
// Potentially less precise than FMA, but 1.5x faster than fma below.
|
||||
a += v_cvt_f64(v_int64(vec_mule(s1.val, s2.val)));
|
||||
b += v_cvt_f64(v_int64(vec_mulo(s1.val, s2.val)));
|
||||
#else
|
||||
a = v_fma(v_cvt_f64(s1), v_cvt_f64(s2), a);
|
||||
b = v_fma(v_cvt_f64_high(s1), v_cvt_f64_high(s2), b);
|
||||
const int step = v_int32::nlanes;
|
||||
v_float64 v_sum0 = vx_setzero_f64();
|
||||
#if CV_SIMD_WIDTH == 16
|
||||
const int wstep = step * 2;
|
||||
v_float64 v_sum1 = vx_setzero_f64();
|
||||
for (; i < len - wstep; i += wstep, src1 += wstep, src2 += wstep)
|
||||
{
|
||||
v_int32 v_src10 = vx_load(src1);
|
||||
v_int32 v_src20 = vx_load(src2);
|
||||
v_int32 v_src11 = vx_load(src1 + step);
|
||||
v_int32 v_src21 = vx_load(src2 + step);
|
||||
v_sum0 = v_dotprod_expand_fast(v_src10, v_src20, v_sum0);
|
||||
v_sum1 = v_dotprod_expand_fast(v_src11, v_src21, v_sum1);
|
||||
}
|
||||
v_sum0 += v_sum1;
|
||||
#endif
|
||||
src1 += v_int32x4::nlanes;
|
||||
src2 += v_int32x4::nlanes;
|
||||
}
|
||||
a += b;
|
||||
r = v_reduce_sum(a);
|
||||
for (; i < len - step; i += step, src1 += step, src2 += step)
|
||||
{
|
||||
v_int32 v_src1 = vx_load(src1);
|
||||
v_int32 v_src2 = vx_load(src2);
|
||||
v_sum0 = v_dotprod_expand_fast(v_src1, v_src2, v_sum0);
|
||||
}
|
||||
r = v_reduce_sum(v_sum0);
|
||||
vx_cleanup();
|
||||
return r + dotProd_(src1, src2, len - i);
|
||||
#else
|
||||
return dotProd_(src1, src2, len);
|
||||
|
@ -45,6 +45,13 @@
|
||||
#ifdef HAVE_OPENGL
|
||||
# include "gl_core_3_1.hpp"
|
||||
# ifdef HAVE_CUDA
|
||||
# if (defined(__arm__) || defined(__aarch64__)) \
|
||||
&& !defined(OPENCV_SKIP_CUDA_OPENGL_ARM_WORKAROUND)
|
||||
# include <GL/gl.h>
|
||||
# ifndef GL_VERSION
|
||||
# define GL_VERSION 0x1F02
|
||||
# endif
|
||||
# endif
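// Presumably cuda_gl_interop.h on these ARM platforms expects the classic
// <GL/gl.h> definitions; GL_VERSION is only defined here when the GL header
// does not provide it, and the whole workaround can be skipped by defining
// OPENCV_SKIP_CUDA_OPENGL_ARM_WORKAROUND.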
|
||||
# include <cuda_gl_interop.h>
|
||||
# endif
|
||||
#else // HAVE_OPENGL
|
||||
|
@ -603,12 +603,14 @@ template<typename R> struct TheTest
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_dot_prod()
|
||||
TheTest & test_dotprod()
|
||||
{
|
||||
typedef typename V_RegTraits<R>::w_reg Rx2;
|
||||
typedef typename Rx2::lane_type w_type;
|
||||
|
||||
Data<R> dataA, dataB(2);
|
||||
Data<R> dataA, dataB;
|
||||
dataA += std::numeric_limits<LaneType>::max() - R::nlanes;
|
||||
dataB += std::numeric_limits<LaneType>::min() + R::nlanes;
|
||||
R a = dataA, b = dataB;
|
||||
|
||||
Data<Rx2> dataC;
|
||||
@ -621,12 +623,95 @@ template<typename R> struct TheTest
|
||||
resE = v_dotprod(a, b, c);
|
||||
|
||||
const int n = R::nlanes / 2;
|
||||
w_type sumAB = 0, sumABC = 0, tmp_sum;
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
SCOPED_TRACE(cv::format("i=%d", i));
|
||||
EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], resD[i]);
|
||||
EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1] + dataC[i], resE[i]);
|
||||
|
||||
tmp_sum = (w_type)dataA[i*2] * (w_type)dataB[i*2] +
|
||||
(w_type)dataA[i*2 + 1] * (w_type)dataB[i*2 + 1];
|
||||
sumAB += tmp_sum;
|
||||
EXPECT_EQ(tmp_sum, resD[i]);
|
||||
|
||||
tmp_sum = tmp_sum + dataC[i];
|
||||
sumABC += tmp_sum;
|
||||
EXPECT_EQ(tmp_sum, resE[i]);
|
||||
}
|
||||
|
||||
w_type resF = v_reduce_sum(v_dotprod_fast(a, b)),
|
||||
resG = v_reduce_sum(v_dotprod_fast(a, b, c));
|
||||
EXPECT_EQ(sumAB, resF);
|
||||
EXPECT_EQ(sumABC, resG);
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_dotprod_expand()
|
||||
{
|
||||
typedef typename V_RegTraits<R>::q_reg Rx4;
|
||||
typedef typename Rx4::lane_type l4_type;
|
||||
|
||||
Data<R> dataA, dataB;
|
||||
dataA += std::numeric_limits<LaneType>::max() - R::nlanes;
|
||||
dataB += std::numeric_limits<LaneType>::min() + R::nlanes;
|
||||
R a = dataA, b = dataB;
|
||||
|
||||
Data<Rx4> dataC;
|
||||
Rx4 c = dataC;
|
||||
|
||||
Data<Rx4> resD = v_dotprod_expand(a, b),
|
||||
resE = v_dotprod_expand(a, b, c);
|
||||
|
||||
l4_type sumAB = 0, sumABC = 0, tmp_sum;
|
||||
for (int i = 0; i < Rx4::nlanes; ++i)
|
||||
{
|
||||
SCOPED_TRACE(cv::format("i=%d", i));
|
||||
tmp_sum = (l4_type)dataA[i*4] * (l4_type)dataB[i*4] +
|
||||
(l4_type)dataA[i*4 + 1] * (l4_type)dataB[i*4 + 1] +
|
||||
(l4_type)dataA[i*4 + 2] * (l4_type)dataB[i*4 + 2] +
|
||||
(l4_type)dataA[i*4 + 3] * (l4_type)dataB[i*4 + 3];
|
||||
sumAB += tmp_sum;
|
||||
EXPECT_EQ(tmp_sum, resD[i]);
|
||||
|
||||
tmp_sum = tmp_sum + dataC[i];
|
||||
sumABC += tmp_sum;
|
||||
EXPECT_EQ(tmp_sum, resE[i]);
|
||||
}
|
||||
|
||||
l4_type resF = v_reduce_sum(v_dotprod_expand_fast(a, b)),
|
||||
resG = v_reduce_sum(v_dotprod_expand_fast(a, b, c));
|
||||
EXPECT_EQ(sumAB, resF);
|
||||
EXPECT_EQ(sumABC, resG);
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_dotprod_expand_f64()
|
||||
{
|
||||
#if CV_SIMD_64F
|
||||
Data<R> dataA, dataB;
|
||||
dataA += std::numeric_limits<LaneType>::max() - R::nlanes;
|
||||
dataB += std::numeric_limits<LaneType>::min();
|
||||
R a = dataA, b = dataB;
|
||||
|
||||
Data<v_float64> dataC;
|
||||
v_float64 c = dataC;
|
||||
|
||||
Data<v_float64> resA = v_dotprod_expand(a, a),
|
||||
resB = v_dotprod_expand(b, b),
|
||||
resC = v_dotprod_expand(a, b, c);
|
||||
|
||||
const int n = R::nlanes / 2;
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
SCOPED_TRACE(cv::format("i=%d", i));
|
||||
EXPECT_EQ((double)dataA[i*2] * (double)dataA[i*2] +
|
||||
(double)dataA[i*2 + 1] * (double)dataA[i*2 + 1], resA[i]);
|
||||
EXPECT_EQ((double)dataB[i*2] * (double)dataB[i*2] +
|
||||
(double)dataB[i*2 + 1] * (double)dataB[i*2 + 1], resB[i]);
|
||||
EXPECT_EQ((double)dataA[i*2] * (double)dataB[i*2] +
|
||||
(double)dataA[i*2 + 1] * (double)dataB[i*2 + 1] + dataC[i], resC[i]);
|
||||
}
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -1165,6 +1250,29 @@ template<typename R> struct TheTest
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_cvt64_double()
|
||||
{
|
||||
#if CV_SIMD_64F
|
||||
Data<R> dataA(std::numeric_limits<LaneType>::max()),
|
||||
dataB(std::numeric_limits<LaneType>::min());
|
||||
dataB += R::nlanes;
|
||||
|
||||
R a = dataA, b = dataB;
|
||||
v_float64 c = v_cvt_f64(a), d = v_cvt_f64(b);
|
||||
|
||||
Data<v_float64> resC = c;
|
||||
Data<v_float64> resD = d;
|
||||
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
SCOPED_TRACE(cv::format("i=%d", i));
|
||||
EXPECT_EQ((double)dataA[i], resC[i]);
|
||||
EXPECT_EQ((double)dataB[i], resD[i]);
|
||||
}
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_matmul()
|
||||
{
|
||||
Data<R> dataV, dataA, dataB, dataC, dataD;
|
||||
@ -1341,6 +1449,7 @@ void test_hal_intrin_uint8()
|
||||
.test_mul_expand()
|
||||
.test_cmp()
|
||||
.test_logic()
|
||||
.test_dotprod_expand()
|
||||
.test_min_max()
|
||||
.test_absdiff()
|
||||
.test_reduce_sad()
|
||||
@ -1378,6 +1487,7 @@ void test_hal_intrin_int8()
|
||||
.test_mul_expand()
|
||||
.test_cmp()
|
||||
.test_logic()
|
||||
.test_dotprod_expand()
|
||||
.test_min_max()
|
||||
.test_absdiff()
|
||||
.test_absdiffs()
|
||||
@ -1408,6 +1518,7 @@ void test_hal_intrin_uint16()
|
||||
.test_cmp()
|
||||
.test_shift<1>()
|
||||
.test_shift<8>()
|
||||
.test_dotprod_expand()
|
||||
.test_logic()
|
||||
.test_min_max()
|
||||
.test_absdiff()
|
||||
@ -1437,7 +1548,8 @@ void test_hal_intrin_int16()
|
||||
.test_cmp()
|
||||
.test_shift<1>()
|
||||
.test_shift<8>()
|
||||
.test_dot_prod()
|
||||
.test_dotprod()
|
||||
.test_dotprod_expand()
|
||||
.test_logic()
|
||||
.test_min_max()
|
||||
.test_absdiff()
|
||||
@ -1497,6 +1609,8 @@ void test_hal_intrin_int32()
|
||||
.test_cmp()
|
||||
.test_popcount()
|
||||
.test_shift<1>().test_shift<8>()
|
||||
.test_dotprod()
|
||||
.test_dotprod_expand_f64()
|
||||
.test_logic()
|
||||
.test_min_max()
|
||||
.test_absdiff()
|
||||
@ -1538,6 +1652,7 @@ void test_hal_intrin_int64()
|
||||
.test_logic()
|
||||
.test_extract<0>().test_extract<1>()
|
||||
.test_rotate<0>().test_rotate<1>()
|
||||
.test_cvt64_double()
|
||||
;
|
||||
}
|
||||
|
||||
|
@ -50,6 +50,8 @@ namespace cv
|
||||
|
||||
////////////////// Helper functions //////////////////////
|
||||
|
||||
#define CV_CLAMP_INT(v, vmin, vmax) (v < vmin ? vmin : (vmax < v ? vmax : v))
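// e.g. CV_CLAMP_INT(-3, 0, 255) == 0, CV_CLAMP_INT(300, 0, 255) == 255,
// CV_CLAMP_INT(42, 0, 255) == 42 (illustrative values)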
|
||||
|
||||
static const size_t OUT_OF_RANGE = (size_t)1 << (sizeof(size_t)*8 - 2);
|
||||
|
||||
static void
|
||||
@ -71,15 +73,18 @@ calcHistLookupTables_8u( const Mat& hist, const SparseMat& shist,
|
||||
int sz = !issparse ? hist.size[i] : shist.size(i);
|
||||
size_t step = !issparse ? hist.step[i] : 1;
|
||||
|
||||
double v_lo = ranges[i][0];
|
||||
double v_hi = ranges[i][1];
|
||||
|
||||
for( j = low; j < high; j++ )
|
||||
{
|
||||
int idx = cvFloor(j*a + b);
|
||||
size_t written_idx;
|
||||
if( (unsigned)idx < (unsigned)sz )
|
||||
size_t written_idx = OUT_OF_RANGE;
|
||||
if (j >= v_lo && j < v_hi)
|
||||
{
|
||||
idx = CV_CLAMP_INT(idx, 0, sz - 1);
|
||||
written_idx = idx*step;
|
||||
else
|
||||
written_idx = OUT_OF_RANGE;
|
||||
|
||||
}
|
||||
tab[i*(high - low) + j - low] = written_idx;
|
||||
}
|
||||
}
|
||||
@ -197,6 +202,10 @@ static void histPrepareImages( const Mat* images, int nimages, const int* channe
|
||||
double t = histSize[i]/(high - low);
|
||||
uniranges[i*2] = t;
|
||||
uniranges[i*2+1] = -t*low;
|
||||
#if 0 // This should be true by math, but it is not accurate numerically
|
||||
CV_Assert(cvFloor(low * uniranges[i*2] + uniranges[i*2+1]) == 0);
|
||||
CV_Assert((high * uniranges[i*2] + uniranges[i*2+1]) < histSize[i]);
|
||||
#endif
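// Worked example (illustrative numbers): histSize = 32 over the range [0, 256)
// gives t = 0.125 and offset 0, so a value of 255 falls into bin
// cvFloor(255*0.125) = 31 (the last bin), while values at or above 256 are
// rejected later by the explicit v >= v_hi range checks.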
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -243,22 +252,33 @@ calcHist_( std::vector<uchar*>& _ptrs, const std::vector<int>& _deltas,
|
||||
int sz = size[0], d0 = deltas[0], step0 = deltas[1];
|
||||
const T* p0 = (const T*)ptrs[0];
|
||||
|
||||
double v0_lo = _ranges[0][0];
|
||||
double v0_hi = _ranges[0][1];
|
||||
|
||||
for( ; imsize.height--; p0 += step0, mask += mstep )
|
||||
{
|
||||
if( !mask )
|
||||
for( x = 0; x < imsize.width; x++, p0 += d0 )
|
||||
{
|
||||
int idx = cvFloor(*p0*a + b);
|
||||
if( (unsigned)idx < (unsigned)sz )
|
||||
((int*)H)[idx]++;
|
||||
double v0 = (double)*p0;
|
||||
int idx = cvFloor(v0*a + b);
|
||||
if (v0 < v0_lo || v0 >= v0_hi)
|
||||
continue;
|
||||
idx = CV_CLAMP_INT(idx, 0, sz - 1);
|
||||
CV_DbgAssert((unsigned)idx < (unsigned)sz);
|
||||
((int*)H)[idx]++;
|
||||
}
|
||||
else
|
||||
for( x = 0; x < imsize.width; x++, p0 += d0 )
|
||||
if( mask[x] )
|
||||
{
|
||||
int idx = cvFloor(*p0*a + b);
|
||||
if( (unsigned)idx < (unsigned)sz )
|
||||
((int*)H)[idx]++;
|
||||
double v0 = (double)*p0;
|
||||
int idx = cvFloor(v0*a + b);
|
||||
if (v0 < v0_lo || v0 >= v0_hi)
|
||||
continue;
|
||||
idx = CV_CLAMP_INT(idx, 0, sz - 1);
|
||||
CV_DbgAssert((unsigned)idx < (unsigned)sz);
|
||||
((int*)H)[idx]++;
|
||||
}
|
||||
}
|
||||
return;
|
||||
@ -273,24 +293,45 @@ calcHist_( std::vector<uchar*>& _ptrs, const std::vector<int>& _deltas,
|
||||
const T* p0 = (const T*)ptrs[0];
|
||||
const T* p1 = (const T*)ptrs[1];
|
||||
|
||||
double v0_lo = _ranges[0][0];
|
||||
double v0_hi = _ranges[0][1];
|
||||
double v1_lo = _ranges[1][0];
|
||||
double v1_hi = _ranges[1][1];
|
||||
|
||||
for( ; imsize.height--; p0 += step0, p1 += step1, mask += mstep )
|
||||
{
|
||||
if( !mask )
|
||||
for( x = 0; x < imsize.width; x++, p0 += d0, p1 += d1 )
|
||||
{
|
||||
int idx0 = cvFloor(*p0*a0 + b0);
|
||||
int idx1 = cvFloor(*p1*a1 + b1);
|
||||
if( (unsigned)idx0 < (unsigned)sz0 && (unsigned)idx1 < (unsigned)sz1 )
|
||||
((int*)(H + hstep0*idx0))[idx1]++;
|
||||
double v0 = (double)*p0;
|
||||
double v1 = (double)*p1;
|
||||
int idx0 = cvFloor(v0*a0 + b0);
|
||||
int idx1 = cvFloor(v1*a1 + b1);
|
||||
if (v0 < v0_lo || v0 >= v0_hi)
|
||||
continue;
|
||||
if (v1 < v1_lo || v1 >= v1_hi)
|
||||
continue;
|
||||
idx0 = CV_CLAMP_INT(idx0, 0, sz0 - 1);
|
||||
idx1 = CV_CLAMP_INT(idx1, 0, sz1 - 1);
|
||||
CV_DbgAssert((unsigned)idx0 < (unsigned)sz0 && (unsigned)idx1 < (unsigned)sz1);
|
||||
((int*)(H + hstep0*idx0))[idx1]++;
|
||||
}
|
||||
else
|
||||
for( x = 0; x < imsize.width; x++, p0 += d0, p1 += d1 )
|
||||
if( mask[x] )
|
||||
{
|
||||
int idx0 = cvFloor(*p0*a0 + b0);
|
||||
int idx1 = cvFloor(*p1*a1 + b1);
|
||||
if( (unsigned)idx0 < (unsigned)sz0 && (unsigned)idx1 < (unsigned)sz1 )
|
||||
((int*)(H + hstep0*idx0))[idx1]++;
|
||||
double v0 = (double)*p0;
|
||||
double v1 = (double)*p1;
|
||||
int idx0 = cvFloor(v0*a0 + b0);
|
||||
int idx1 = cvFloor(v1*a1 + b1);
|
||||
if (v0 < v0_lo || v0 >= v0_hi)
|
||||
continue;
|
||||
if (v1 < v1_lo || v1 >= v1_hi)
|
||||
continue;
|
||||
idx0 = CV_CLAMP_INT(idx0, 0, sz0 - 1);
|
||||
idx1 = CV_CLAMP_INT(idx1, 0, sz1 - 1);
|
||||
CV_DbgAssert((unsigned)idx0 < (unsigned)sz0 && (unsigned)idx1 < (unsigned)sz1);
|
||||
((int*)(H + hstep0*idx0))[idx1]++;
|
||||
}
|
||||
}
|
||||
return;
|
||||
@ -309,30 +350,63 @@ calcHist_( std::vector<uchar*>& _ptrs, const std::vector<int>& _deltas,
|
||||
const T* p1 = (const T*)ptrs[1];
|
||||
const T* p2 = (const T*)ptrs[2];
|
||||
|
||||
double v0_lo = _ranges[0][0];
|
||||
double v0_hi = _ranges[0][1];
|
||||
double v1_lo = _ranges[1][0];
|
||||
double v1_hi = _ranges[1][1];
|
||||
double v2_lo = _ranges[2][0];
|
||||
double v2_hi = _ranges[2][1];
|
||||
|
||||
for( ; imsize.height--; p0 += step0, p1 += step1, p2 += step2, mask += mstep )
|
||||
{
|
||||
if( !mask )
|
||||
for( x = 0; x < imsize.width; x++, p0 += d0, p1 += d1, p2 += d2 )
|
||||
{
|
||||
int idx0 = cvFloor(*p0*a0 + b0);
|
||||
int idx1 = cvFloor(*p1*a1 + b1);
|
||||
int idx2 = cvFloor(*p2*a2 + b2);
|
||||
if( (unsigned)idx0 < (unsigned)sz0 &&
|
||||
double v0 = (double)*p0;
|
||||
double v1 = (double)*p1;
|
||||
double v2 = (double)*p2;
|
||||
int idx0 = cvFloor(v0*a0 + b0);
|
||||
int idx1 = cvFloor(v1*a1 + b1);
|
||||
int idx2 = cvFloor(v2*a2 + b2);
|
||||
if (v0 < v0_lo || v0 >= v0_hi)
|
||||
continue;
|
||||
if (v1 < v1_lo || v1 >= v1_hi)
|
||||
continue;
|
||||
if (v2 < v2_lo || v2 >= v2_hi)
|
||||
continue;
|
||||
idx0 = CV_CLAMP_INT(idx0, 0, sz0 - 1);
|
||||
idx1 = CV_CLAMP_INT(idx1, 0, sz1 - 1);
|
||||
idx2 = CV_CLAMP_INT(idx2, 0, sz2 - 1);
|
||||
CV_DbgAssert(
|
||||
(unsigned)idx0 < (unsigned)sz0 &&
|
||||
(unsigned)idx1 < (unsigned)sz1 &&
|
||||
(unsigned)idx2 < (unsigned)sz2 )
|
||||
((int*)(H + hstep0*idx0 + hstep1*idx1))[idx2]++;
|
||||
(unsigned)idx2 < (unsigned)sz2);
|
||||
((int*)(H + hstep0*idx0 + hstep1*idx1))[idx2]++;
|
||||
}
|
||||
else
|
||||
for( x = 0; x < imsize.width; x++, p0 += d0, p1 += d1, p2 += d2 )
|
||||
if( mask[x] )
|
||||
{
|
||||
int idx0 = cvFloor(*p0*a0 + b0);
|
||||
int idx1 = cvFloor(*p1*a1 + b1);
|
||||
int idx2 = cvFloor(*p2*a2 + b2);
|
||||
if( (unsigned)idx0 < (unsigned)sz0 &&
|
||||
(unsigned)idx1 < (unsigned)sz1 &&
|
||||
(unsigned)idx2 < (unsigned)sz2 )
|
||||
((int*)(H + hstep0*idx0 + hstep1*idx1))[idx2]++;
|
||||
double v0 = (double)*p0;
|
||||
double v1 = (double)*p1;
|
||||
double v2 = (double)*p2;
|
||||
int idx0 = cvFloor(v0*a0 + b0);
|
||||
int idx1 = cvFloor(v1*a1 + b1);
|
||||
int idx2 = cvFloor(v2*a2 + b2);
|
||||
if (v0 < v0_lo || v0 >= v0_hi)
|
||||
continue;
|
||||
if (v1 < v1_lo || v1 >= v1_hi)
|
||||
continue;
|
||||
if (v2 < v2_lo || v2 >= v2_hi)
|
||||
continue;
|
||||
idx0 = CV_CLAMP_INT(idx0, 0, sz0 - 1);
|
||||
idx1 = CV_CLAMP_INT(idx1, 0, sz1 - 1);
|
||||
idx2 = CV_CLAMP_INT(idx2, 0, sz2 - 1);
|
||||
CV_DbgAssert(
|
||||
(unsigned)idx0 < (unsigned)sz0 &&
|
||||
(unsigned)idx1 < (unsigned)sz1 &&
|
||||
(unsigned)idx2 < (unsigned)sz2);
|
||||
((int*)(H + hstep0*idx0 + hstep1*idx1))[idx2]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -346,9 +420,14 @@ calcHist_( std::vector<uchar*>& _ptrs, const std::vector<int>& _deltas,
|
||||
uchar* Hptr = H;
|
||||
for( i = 0; i < dims; i++ )
|
||||
{
|
||||
int idx = cvFloor(*ptrs[i]*uniranges[i*2] + uniranges[i*2+1]);
|
||||
if( (unsigned)idx >= (unsigned)size[i] )
|
||||
double v_lo = _ranges[i][0];
|
||||
double v_hi = _ranges[i][1];
|
||||
double v = *ptrs[i];
|
||||
if (v < v_lo || v >= v_hi)
|
||||
break;
|
||||
int idx = cvFloor(v*uniranges[i*2] + uniranges[i*2+1]);
|
||||
idx = CV_CLAMP_INT(idx, 0, size[i] - 1);
|
||||
CV_DbgAssert((unsigned)idx < (unsigned)size[i]);
|
||||
ptrs[i] += deltas[i*2];
|
||||
Hptr += idx*hstep[i];
|
||||
}
|
||||
@ -367,9 +446,14 @@ calcHist_( std::vector<uchar*>& _ptrs, const std::vector<int>& _deltas,
|
||||
if( mask[x] )
|
||||
for( ; i < dims; i++ )
|
||||
{
|
||||
int idx = cvFloor(*ptrs[i]*uniranges[i*2] + uniranges[i*2+1]);
|
||||
if( (unsigned)idx >= (unsigned)size[i] )
|
||||
double v_lo = _ranges[i][0];
|
||||
double v_hi = _ranges[i][1];
|
||||
double v = *ptrs[i];
|
||||
if (v < v_lo || v >= v_hi)
|
||||
break;
|
||||
int idx = cvFloor(v*uniranges[i*2] + uniranges[i*2+1]);
|
||||
idx = CV_CLAMP_INT(idx, 0, size[i] - 1);
|
||||
CV_DbgAssert((unsigned)idx < (unsigned)size[i]);
|
||||
ptrs[i] += deltas[i*2];
|
||||
Hptr += idx*hstep[i];
|
||||
}
|
||||
|
@ -1307,9 +1307,18 @@ cvTsCalcHist( const vector<Mat>& images, CvHistogram* hist, Mat mask, const vect
|
||||
for( k = 0; k < cdims; k++ )
|
||||
{
|
||||
double v = val[k], lo = hist->thresh[k][0], hi = hist->thresh[k][1];
|
||||
idx[k] = cvFloor((v - lo)*dims[k]/(hi - lo));
|
||||
if( idx[k] < 0 || idx[k] >= dims[k] )
|
||||
if (v < lo || v >= hi)
|
||||
break;
|
||||
double idx_ = (v - lo)*dims[k]/(hi - lo);
|
||||
idx[k] = cvFloor(idx_);
|
||||
if (idx[k] < 0)
|
||||
{
|
||||
idx[k] = 0;
|
||||
}
|
||||
if (idx[k] >= dims[k])
|
||||
{
|
||||
idx[k] = dims[k] - 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else